In [1]:
# imports
import warnings
warnings.filterwarnings('ignore')
import os
import math
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.dpi'] = 125

# machine learning
from sklearn.model_selection import KFold,cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB,MultinomialNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import *
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
import xgboost as xgb

In [2]:
# Load the pre-cleaned train/test splits from the working directory.
train_df = pd.read_csv('train_clean.csv')
test_df = pd.read_csv('test_clean.csv')

# pop() removes the "Survived" column from train_df, so the remaining
# columns are exactly the feature matrix.
y = train_df.pop("Survived").to_numpy()
X = train_df.to_numpy()

print(f"X shape: {X.shape}", f"y shape: {y.shape}", sep="\n")

X shape: (891, 9)
y shape: (891,)


In [3]:
# Grid search on multiple models
# Initialize the estimators

# One shared seed for every estimator that accepts random_state.
RANDOM_STATE = 42

knn = KNeighborsClassifier()
mnb = MultinomialNB()
lr = LogisticRegression(random_state=RANDOM_STATE)
# NOTE(review): newer scikit-learn releases (1.1+) spell this loss
# 'log_loss' -- confirm against the installed version before upgrading.
sgd = SGDClassifier(random_state=RANDOM_STATE, loss='log')
# probability=True is set so SVC exposes predict_proba.
svc = SVC(probability=True, random_state=RANDOM_STATE)
dt = DecisionTreeClassifier(random_state=RANDOM_STATE)
rf = RandomForestClassifier(random_state=RANDOM_STATE)
gb = GradientBoostingClassifier(random_state=RANDOM_STATE)
mlp = MLPClassifier(random_state=RANDOM_STATE)
xgb_cl = xgb.XGBClassifier(random_state=RANDOM_STATE)

# Display name -> estimator instance.
estimators = {
    "K Nearest neighbors": knn,
    "Multi Nomial Naive Bayes": mnb,
    "Logitic Regression": lr,
    "SGD": sgd,
    "Support Vectors": svc,
    "Decision Trees": dt,
    "Random Forests": rf,
    "Gradient Boosting": gb,
    "Neural Network": mlp,
    "XGB": xgb_cl,
}

In [4]:
# Preview the sizes of the 10 cross-validation folds.
# KFold.split() yields one (train_indices, test_indices) pair per fold.
# random_state is deliberately not passed: without shuffle=True it has no
# effect on the (contiguous, unshuffled) folds, and scikit-learn >= 0.24
# raises ValueError when random_state is combined with shuffle=False.
kfold = KFold(n_splits=10).split(X, y)

for i, (train_index, test_index) in enumerate(kfold, start=1):
    print(f'Fold:{i}, Train set: {len(train_index)}, Test set:{len(test_index)}')

Fold:1, Train set: 801, Test set:90
Fold:2, Train set: 802, Test set:89
Fold:3, Train set: 802, Test set:89
Fold:4, Train set: 802, Test set:89
Fold:5, Train set: 802, Test set:89
Fold:6, Train set: 802, Test set:89
Fold:7, Train set: 802, Test set:89
Fold:8, Train set: 802, Test set:89
Fold:9, Train set: 802, Test set:89
Fold:10, Train set: 802, Test set:89


In [5]:
def fit_multi(X, y, metric):
    """Cross-validate every model in the module-level ``estimators`` dict.

    Each estimator is scored with 10-fold cross-validation (unshuffled
    ``KFold``) and the mean score over the folds is reported.

    Parameters
    ----------
    X : array-like
        Feature matrix, forwarded to ``cross_val_score``.
    y : array-like
        Target vector, forwarded to ``cross_val_score``.
    metric : {"accuracy", "logloss"}
        ``"accuracy"`` reports the mean accuracy; ``"logloss"`` reports the
        mean log loss as a positive number (the sign of sklearn's
        ``neg_log_loss`` scorer is flipped).

    Returns
    -------
    pandas.DataFrame
        Columns ``["Estimator", metric]``, one row per estimator, sorted by
        ``metric`` in descending order. Index labels are the insertion order
        of the estimators.

    Raises
    ------
    ValueError
        If ``metric`` is not one of the supported names.
    """
    # metric name -> (sklearn scoring string, sign applied to the mean score)
    scoring_by_metric = {
        "accuracy": ("accuracy", 1),
        "logloss": ("neg_log_loss", -1),
    }
    # Validate up front; previously an unknown metric left the row values
    # unbound and failed with a NameError inside the loop.
    if metric not in scoring_by_metric:
        raise ValueError(
            f"Unsupported metric {metric!r}; expected one of {sorted(scoring_by_metric)}"
        )
    scoring, sign = scoring_by_metric[metric]

    # random_state is omitted: it is ignored without shuffle=True and
    # scikit-learn >= 0.24 raises ValueError for that combination. The
    # splitter object is stateless, so one instance serves every estimator.
    cv = KFold(n_splits=10)

    rows = []
    for name, estimator in estimators.items():
        scores = cross_val_score(estimator, X, y, cv=cv, scoring=scoring, n_jobs=-1)
        rows.append([name, sign * scores.mean()])

    # Build the frame once and sort once instead of growing and re-sorting
    # it on every iteration.
    results = pd.DataFrame(rows, columns=["Estimator", metric])
    return results.sort_values(by=metric, ascending=False)

# Score every model on both metrics, join the two tables on the estimator
# name, and rank by log loss (ascending, so the lowest loss comes first).
accuracy_results = fit_multi(X, y, metric='accuracy')
logloss_results = fit_multi(X, y, metric='logloss')
results = (
    accuracy_results
    .merge(logloss_results, on='Estimator')
    .sort_values(by='logloss', ascending=True)
)
results

Unnamed: 0,Estimator,accuracy,logloss
0,Gradient Boosting,0.816005,0.425819
6,Neural Network,0.795755,0.465074
3,Support Vectors,0.802459,0.479043
1,XGB,0.816005,0.482913
7,Logitic Regression,0.793508,0.496929
9,Multi Nomial Naive Bayes,0.699338,0.685384
2,Random Forests,0.814869,0.84834
8,SGD,0.774569,1.897962
4,K Nearest neighbors,0.800325,1.898817
5,Decision Trees,0.79804,3.279904


In [6]:
# `results` is ordered by ascending log loss, so its first five rows are the
# five best models; these are the candidates for tuning.
print(results["Estimator"].head(5).tolist())

['Gradient Boosting', 'Neural Network', 'Support Vectors', 'XGB', 'Logitic Regression']


In [7]:
# plot_confusion_matrix(gs, X_test, y_test,cmap='Blues')  
# plt.show()

In [8]:
# plot_roc_curve(gs, X_test, y_test,response_method='predict_proba')  
# plt.plot([0, 1], [0, 1], color="red", lw=1, linestyle="--")
# plt.xlim([0.0, 1.0])
# plt.ylim([0.0, 1.05])
# plt.xlabel("False Positive Rate")
# plt.ylabel("True Positive Rate")
# plt.show()