<a href="https://colab.research.google.com/github/inspire007/KaggleCompetitions/blob/main/Titanic%20-%20Machine%20Learning%20from%20Disaster%20-%20Kaggle/Titanic_Machine_Learning_from_Disaster_kfoldcv_gridsearch_Kaggle.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [28]:
import numpy as np
import pandas as pd
from sklearn.compose import make_column_selector
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB

data = pd.read_csv('train.csv')
X = data.iloc[:, [2,4,5,6,7,11]]
y = data.iloc[:, 1].values

num_cols = make_column_selector(dtype_include='number')
cat_cols = make_column_selector(dtype_include='object')

num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
      ('num', num_pipe, num_cols),
      ('cat', cat_pipe, cat_cols)
], remainder = 'passthrough')

X = preprocessor.fit_transform(X)

models = {
    'SVC': SVC(kernel='rbf'),
    'RandomForest': RandomForestClassifier(n_estimators=500, random_state=42),
    'DecTree': DecisionTreeClassifier(),
    'LogisticR': LogisticRegression(),
    'XGBClassifier': XGBClassifier(),
    'GaussianNB': GaussianNB()
}

max_acc = 0
selected_model = False

for name,model in models.items():
  cv_score = cross_val_score(estimator=model, X = X, y = y, cv = 20)
  mean = cv_score.mean()
  std = cv_score.std()
  print(f'{model} accuracy: {mean*100 : .2f}, {std*100 : .2f}')
  if(mean > max_acc):
    max_acc = mean
    selected_model = model

print('Selected model based on max accuracy: ', selected_model)

#since SVC is selected, we would do parameter tuning using gridsearchcv
#now grid searchCV
params = [
    {'C': list(np.arange(.25, 1, .25)), 'kernel': ['linear']},
    {'C': list(np.arange(.25, 1, .25)), 'kernel': ['rbf'], 'gamma': list(np.arange(0.1,0.9,0.1))}
]

gridCV = GridSearchCV(estimator=selected_model, param_grid=params, scoring = 'accuracy', cv = 20)
gridCVRun = gridCV.fit(X, y)

accuracy = gridCVRun.best_score_
best_param = gridCVRun.best_params_

print("New accuracy: ", accuracy, "\nParams: ", best_param, '\n')

#model tuning based on gridsearchcv
final_model = SVC(kernel='rbf', C=0.5, gamma=0.2)
final_model.fit(X, y)

#now predict given data sample
data = pd.read_csv('test.csv')
X_test_actual = data.iloc[:, [1,3,4,5,6,10]]
X_test_actual = preprocessor.transform(X_test_actual)

y_pred_actual = final_model.predict(X_test_actual)
pid = data['PassengerId'].values

np.savetxt('output.csv', np.column_stack((pid,y_pred_actual)), header="PassengerId,Survived", fmt ="%d", delimiter=',', comments='')

SVC() accuracy:  82.04,  4.63
RandomForestClassifier(n_estimators=500, random_state=42) accuracy:  80.93,  6.79
DecisionTreeClassifier() accuracy:  78.79,  5.22
LogisticRegression() accuracy:  80.14,  4.09
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...) accuracy:  80.69,  4.92
GaussianNB()