# Titanic - Hyperparameter tuning with GridSearchCV

![](https://storage.googleapis.com/kaggle-competitions/kaggle/3136/logos/header.png)

<a id="top"></a>

<div class="list-group" id="list-tab" role="tablist">
<h3 class="list-group-item list-group-item-action active" data-toggle="list" style='background:#7ca4cd; border:0' role="tab" aria-controls="home"><center>Quick Navigation</center></h3>

* [1. Data load and feature engineering](#1)
* [2. Decision Tree](#2)
* [3. Random Forest](#3)
* [4. XGBoost](#4)
* [5. LightGBM](#5)
* [6. CatBoost](#6)
* [7. Submission](#7)

In [99]:
import os
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import lightgbm as lgbm
import catboost as cb
from sklearn.linear_model import LinearRegression

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

In [100]:
def set_seed(seed_value):
    random.seed(seed_value)
    np.random.seed(seed_value)
    os.environ['PYTHONHASHSEED'] = str(seed_value)
    

SEED = 42
set_seed(SEED)

<a id="1"></a>
<h2 style='background:#7ca4cd; border:0; color:white'><center>Data load and feature engineering<center><h2>

In [101]:
train_df = pd.read_csv('../input/train.csv')
test_df = pd.read_csv('../input/test.csv')

In [102]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [103]:
y_train = train_df['Survived'].values
y_train.shape

(891,)

In [104]:
used_columns = ['Pclass', 'Name', 'Sex', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Embarked']
full_df = pd.concat([train_df[used_columns], test_df[used_columns]])
full_df.shape

(1309, 8)

In [105]:
full_df.isna().sum()

Pclass      0
Name        0
Sex         0
SibSp       0
Parch       0
Ticket      0
Fare        1
Embarked    2
dtype: int64

In [106]:
full_df['Embarked'].value_counts()

S    914
C    270
Q    123
Name: Embarked, dtype: int64

In [107]:
full_df['Embarked'].fillna('S', inplace=True)
full_df['Fare'].fillna(full_df['Fare'].median(), inplace=True)

In [108]:
full_df['Title'] = full_df['Name'].str.extract(' ([A-Za-z]+)\.')
full_df['Title'] = full_df['Title'].replace(['Ms', 'Mlle'], 'Miss')
full_df['Title'] = full_df['Title'].replace(['Mme', 'Countess', 'Lady', 'Dona'], 'Mrs')
full_df['Title'] = full_df['Title'].replace(['Dr', 'Major', 'Col', 'Sir', 'Rev', 'Jonkheer', 'Capt', 'Don'], 'Mr')

In [109]:
full_df["Sex"] = full_df["Sex"].map({"male": 1, "female": 0}).astype(int)    
full_df["Embarked"] = full_df["Embarked"].map({"S": 1, "C": 2, "Q": 3}).astype(int)    
full_df['Title'] = full_df['Title'].map({'Mr': 0, 'Miss': 1, 'Mrs': 2, 'Master': 3}).astype(int)   

In [110]:
full_df['TicketNumber'] = full_df['Ticket'].str.split()
full_df['TicketNumber'] = full_df['TicketNumber'].str[-1]
full_df['TicketNumber'] = LabelEncoder().fit_transform(full_df['TicketNumber'])

In [111]:
full_df = full_df.drop(['Name', 'Ticket'], axis=1)

In [112]:
full_df['FamilySize'] = full_df['SibSp'] + full_df['Parch'] + 1
full_df['IsAlone'] = full_df['FamilySize'].apply(lambda x: 1 if x == 1 else 0)

In [113]:
full_df.head()

Unnamed: 0,Pclass,Sex,SibSp,Parch,Fare,Embarked,Title,TicketNumber,FamilySize,IsAlone
0,3,1,1,0,7.25,1,0,209,2,0
1,1,0,1,0,71.2833,2,2,166,2,0
2,3,0,0,0,7.925,1,1,466,1,1
3,1,0,1,0,53.1,1,2,67,2,0
4,3,1,0,0,8.05,1,0,832,1,1


In [114]:
categorical_columns = ['Pclass', 'Sex', 'Parch', 'Embarked', 'Title', 'TicketNumber', 'IsAlone']

In [115]:
X_train = full_df[:y_train.shape[0]]
X_test = full_df[y_train.shape[0]:]
X_train.shape, y_train.shape, X_test.shape

((891, 10), (891,), (418, 10))

<a id="2"></a>
<h2 style='background:#7ca4cd; border:0; color:white'><center>Decision Tree<center><h2>

In [116]:
%%time
parameters = {
    "max_depth": [3, 5, 7, 9, 11, 13],
}

model_desicion_tree = DecisionTreeClassifier(
    random_state=SEED,
    class_weight='balanced',
)

model_desicion_tree = GridSearchCV(
    model_desicion_tree, 
    parameters, 
    cv=5,
    scoring='accuracy',
)

model_desicion_tree.fit(X_train, y_train)

print('-----')
print(f'Best parameters {model_desicion_tree.best_params_}')
print(
    f'Mean cross-validated accuracy score of the best_estimator: ' + \
    f'{model_desicion_tree.best_score_:.3f}'
)
print('-----')

-----
Best parameters {'max_depth': 11}
Mean cross-validated accuracy score of the best_estimator: 0.815
-----
CPU times: user 210 ms, sys: 833 µs, total: 211 ms
Wall time: 210 ms


In [117]:
%%time
parameters = {
    "max_depth": [9, 10,11, 12,13],
}

model_desicion_tree = DecisionTreeClassifier(
    random_state=SEED,
    class_weight='balanced',
)

model_desicion_tree = GridSearchCV(
    model_desicion_tree, 
    parameters, 
    cv=5,
    scoring='accuracy',
)

model_desicion_tree.fit(X_train, y_train)

print('-----')
print(f'Best parameters {model_desicion_tree.best_params_}')
print(
    f'Mean cross-validated accuracy score of the best_estimator: ' + \
    f'{model_desicion_tree.best_score_:.3f}'
)
print('-----')

-----
Best parameters {'max_depth': 11}
Mean cross-validated accuracy score of the best_estimator: 0.815
-----
CPU times: user 183 ms, sys: 2.91 ms, total: 186 ms
Wall time: 182 ms


<a id="3"></a>
<h2 style='background:#7ca4cd; border:0; color:white'><center>Random Forest<center><h2>

In [118]:
%%time
parameters = {
    "n_estimators": [5, 10, 15, 20, 25], 
    "max_depth": [3, 5, 7, 9, 11, 13],
}

model_random_forest = RandomForestClassifier(
    random_state=SEED,
    class_weight='balanced',
)

model_random_forest = GridSearchCV(
    model_random_forest, 
    parameters, 
    cv=5,
    scoring='accuracy',
)

model_random_forest.fit(X_train, y_train)

print('-----')
print(f'Best parameters {model_random_forest.best_params_}')
print(
    f'Mean cross-validated accuracy score of the best_estimator: '+ \
    f'{model_random_forest.best_score_:.3f}'
)
print('-----')

-----
Best parameters {'max_depth': 9, 'n_estimators': 20}
Mean cross-validated accuracy score of the best_estimator: 0.843
-----
CPU times: user 4.38 s, sys: 2.99 ms, total: 4.38 s
Wall time: 4.38 s


In [119]:
%%time
parameters = {
    "n_estimators": [ 18,19,20,21,22,23,24, 25], 
    "max_depth": [ 8,9,10 ,11,12 ,13],
}

model_random_forest = RandomForestClassifier(
    random_state=SEED,
    class_weight='balanced',
)

model_random_forest = GridSearchCV(
    model_random_forest, 
    parameters, 
    cv=5,
    scoring='accuracy',
)

model_random_forest.fit(X_train, y_train)

print('-----')
print(f'Best parameters {model_random_forest.best_params_}')
print(
    f'Mean cross-validated accuracy score of the best_estimator: '+ \
    f'{model_random_forest.best_score_:.3f}'
)
print('-----')

-----
Best parameters {'max_depth': 10, 'n_estimators': 25}
Mean cross-validated accuracy score of the best_estimator: 0.850
-----
CPU times: user 9.95 s, sys: 0 ns, total: 9.95 s
Wall time: 9.96 s


<a id="4"></a>
<h2 style='background:#7ca4cd; border:0; color:white'><center>XGBoost<center><h2>

In [120]:
%%time
parameters = {
    'max_depth': [3, 5, 7, 9], 
    'n_estimators': [5, 10, 15, 20, 25, 50, 100],
    'learning_rate': [0.01, 0.05, 0.1]
}

model_xgb = xgb.XGBClassifier(
    random_state=SEED,
)

model_xgb = GridSearchCV(
    model_xgb, 
    parameters, 
    cv=5,
    scoring='accuracy',
)

model_xgb.fit(X_train, y_train)

print('-----')
print(f'Best parameters {model_xgb.best_params_}')
print(
    f'Mean cross-validated accuracy score of the best_estimator: ' + 
    f'{model_xgb.best_score_:.3f}'
)
print('-----')

-----
Best parameters {'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 50}
Mean cross-validated accuracy score of the best_estimator: 0.848
-----
CPU times: user 17.4 s, sys: 211 ms, total: 17.6 s
Wall time: 17.6 s


In [121]:
%%time
parameters = {
    'max_depth': [8, 9,10,11,12], 
    'n_estimators': [40, 50, 60,70,80,90 ,100],
    'learning_rate': [0.09, 0.1,.11,.12]
}

model_xgb = xgb.XGBClassifier(
    random_state=SEED,
)

model_xgb = GridSearchCV(
    model_xgb, 
    parameters, 
    cv=5,
    scoring='accuracy',
)

model_xgb.fit(X_train, y_train)

print('-----')
print(f'Best parameters {model_xgb.best_params_}')
print(
    f'Mean cross-validated accuracy score of the best_estimator: ' + 
    f'{model_xgb.best_score_:.3f}'
)
print('-----')

-----
Best parameters {'learning_rate': 0.12, 'max_depth': 12, 'n_estimators': 50}
Mean cross-validated accuracy score of the best_estimator: 0.852
-----
CPU times: user 1min 14s, sys: 1.24 s, total: 1min 15s
Wall time: 1min 15s


<a id="5"></a>
<h2 style='background:#7ca4cd; border:0; color:white'><center>LightGBM<center><h2>

In [122]:
%%time
parameters = {
    'n_estimators': [5, 10, 15, 20, 25, 50, 100],
    'learning_rate': [0.01, 0.05, 0.1],
    'num_leaves': [7, 15, 31],
}

model_lgbm = lgbm.LGBMClassifier(
    random_state=SEED,
    class_weight='balanced',
)

model_lgbm = GridSearchCV(
    model_lgbm, 
    parameters, 
    cv=5,
    scoring='accuracy',
)

model_lgbm.fit(
    X_train, 
    y_train, 
    categorical_feature=categorical_columns
)

print('-----')
print(f'Best parameters {model_lgbm.best_params_}')
print(
    f'Mean cross-validated accuracy score of the best_estimator: ' + 
    f'{model_lgbm.best_score_:.3f}'
)
print('-----')

New categorical_feature is ['Embarked', 'IsAlone', 'Parch', 'Pclass', 'Sex', 'TicketNumber', 'Title']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


-----
Best parameters {'learning_rate': 0.1, 'n_estimators': 100, 'num_leaves': 15}
Mean cross-validated accuracy score of the best_estimator: 0.829
-----
CPU times: user 9.55 s, sys: 522 ms, total: 10.1 s
Wall time: 10.1 s


In [123]:
%%time
parameters = {
    'n_estimators': [5, 10, 15, 20, 25, 50, 100],
    'learning_rate': [0.08, 0.09, 0.1,.11,.12,.13,.14,.15],
    'num_leaves': [7,10, 15,20,25 ,31],
}

model_lgbm = lgbm.LGBMClassifier(
    random_state=SEED,
    class_weight='balanced',
)

model_lgbm = GridSearchCV(
    model_lgbm, 
    parameters, 
    cv=5,
    scoring='accuracy',
)

model_lgbm.fit(
    X_train, 
    y_train, 
    categorical_feature=categorical_columns
)

print('-----')
print(f'Best parameters {model_lgbm.best_params_}')
print(
    f'Mean cross-validated accuracy score of the best_estimator: ' + 
    f'{model_lgbm.best_score_:.3f}'
)
print('-----')

New categorical_feature is ['Embarked', 'IsAlone', 'Parch', 'Pclass', 'Sex', 'TicketNumber', 'Title']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


-----
Best parameters {'learning_rate': 0.1, 'n_estimators': 100, 'num_leaves': 15}
Mean cross-validated accuracy score of the best_estimator: 0.829
-----
CPU times: user 53.4 s, sys: 2.88 s, total: 56.3 s
Wall time: 56.4 s


<a id="6"></a>
<h2 style='background:#7ca4cd; border:0; color:white'><center>model linear<center><h2>

In [124]:
%%time
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV

poly = PolynomialFeatures(interaction_only=True,include_bias = False)
X_train_poly = poly.fit_transform(X_train)

from sklearn.linear_model import LogisticRegression
parameters={"C":np.logspace(-3,3,7), "penalty":["l1","l2"]}# l1 lasso l2 ridge
logreg=LogisticRegression(solver = 'liblinear', max_iter=10000)


model_linear = GridSearchCV(
    logreg, 
    parameters,
    cv=5,
    scoring='accuracy'
)
model_linear.fit(X_train_poly, y_train)
print("tuned hpyerparameters :(best parameters) ",model_linear.best_params_)
print("accuracy :",model_linear.best_score_)


tuned hpyerparameters :(best parameters)  {'C': 0.001, 'penalty': 'l2'}
accuracy : 0.8226711560044894
CPU times: user 11.1 s, sys: 0 ns, total: 11.1 s
Wall time: 11.1 s


In [128]:
X_test_poly= poly.fit_transform(X_test)

catboost

In [132]:
%%time
parameters = {
    'iterations': [5, 10, 15, 20, 25, 50, 100],
    'learning_rate': [0.01, 0.05, 0.1],
    'depth': [3, 5, 7, 9, 11, 13],
}

model_catboost = cb.CatBoostClassifier(
    verbose=False,
)

model_catboost = GridSearchCV(
    model_catboost, 
    parameters, 
    cv=5,
    scoring='accuracy',
)

model_catboost.fit(X_train, y_train)

print('-----')
print(f'Best parameters {model_catboost.best_params_}')
print(
    f'Mean cross-validated accuracy score of the best_estimator: ' + 
    f'{model_catboost.best_score_:.3f}'
)
print('-----')

-----
Best parameters {'depth': 11, 'iterations': 100, 'learning_rate': 0.1}
Mean cross-validated accuracy score of the best_estimator: 0.842
-----
CPU times: user 4min 49s, sys: 48.1 s, total: 5min 37s
Wall time: 2min 46s


<a id="7"></a>
<h2 style='background:#7ca4cd; border:0; color:white'><center>Submission<center><h2>

In [133]:
def create_submission(model, X_test, test_passenger_id, model_name):
    y_pred_test = model.predict(X_test)
    submission = pd.DataFrame(
        {
            'PassengerId': test_passenger_id, 
            'Survived': y_pred_test,
        }
    )
    print(model_name)
    print(submission.head())
    submission.to_csv(f"submission_{model_name}.csv", index=False)

In [134]:
create_submission(model_desicion_tree, X_test, test_df['PassengerId'], 'decision_tree')
create_submission(model_random_forest, X_test, test_df['PassengerId'], 'random_forest')
create_submission(model_xgb, X_test, test_df['PassengerId'], 'xgboost')
create_submission(model_lgbm, X_test, test_df['PassengerId'], 'lightgbm')
create_submission(model_catboost, X_test, test_df['PassengerId'], 'catboost')
create_submission(model_linear, X_test_poly, test_df['PassengerId'], 'lm')

decision_tree
   PassengerId  Survived
0          892         0
1          893         1
2          894         0
3          895         0
4          896         1
random_forest
   PassengerId  Survived
0          892         0
1          893         1
2          894         0
3          895         0
4          896         0
xgboost
   PassengerId  Survived
0          892         0
1          893         1
2          894         0
3          895         0
4          896         1
lightgbm
   PassengerId  Survived
0          892         0
1          893         1
2          894         0
3          895         0
4          896         1
catboost
   PassengerId  Survived
0          892       0.0
1          893       1.0
2          894       0.0
3          895       0.0
4          896       1.0
lm
   PassengerId  Survived
0          892         0
1          893         1
2          894         0
3          895         0
4          896         1


In [135]:
tree_test_pred = model_desicion_tree.predict(X_test)
forest_test_pred = model_random_forest.predict(X_test)
xgb_test_pred = model_xgb.predict(X_test)
lgbm_test_pred = model_lgbm.predict(X_test)
cb_test_pred = model_catboost.predict(X_test)
lm_test_pred = model_catboost.predict(X_test_poly)

mean_test_pred = np.round((tree_test_pred + 
                           forest_test_pred + 
                           xgb_test_pred + 
                           lgbm_test_pred + 
                           cb_test_pred+
                          lm_test_pred) / 6)

submission = pd.DataFrame(
    { 
        'PassengerId': test_df['PassengerId'], 
        'Survived': mean_test_pred.astype(int)
    }
)
submission.to_csv("submission_mean.csv", index=False)

In [137]:
tree_train_pred = model_desicion_tree.predict(X_train)
forest_train_pred = model_random_forest.predict(X_train)
xgb_train_pred = model_xgb.predict(X_train)
lgbm_train_pred = model_lgbm.predict(X_train)
cb_train_pred = model_catboost.predict(X_train)
lm_train_pred = model_linear.predict(X_train_poly)

In [139]:
base_pred = pd.DataFrame({
    'tree':tree_train_pred.ravel(), 
    'forest':forest_train_pred.ravel(), 
    'xgb':xgb_train_pred.ravel(), 
    'lgbm':lgbm_train_pred.ravel(),
    'cb': cb_train_pred.ravel(),
    'lm': lm_train_pred.ravel()

})

test_pred = pd.DataFrame({
    'tree':tree_test_pred.ravel(), 
    'forest':forest_test_pred.ravel(), 
    'xgb':xgb_test_pred.ravel(), 
    'lgbm':lgbm_test_pred.ravel(),
    'cb': cb_test_pred.ravel(),
    'lm': lm_test_pred.ravel(),
})

In [140]:
%%time
from sklearn.svm import SVC
parameters = {
    'kernel': ['linear', 'poly', 'rbf'],
    'C': [0.1, 0.5, 1,10,100,1000], 
    'gamma': [1, 0.1, 0.001, 0.0001, 'auto'],
    'degree': [3, 4, 5]
}

final_model = GridSearchCV(SVC(), parameters, cv=5).fit(base_pred, y_train)
print(accuracy_score(y_train, final_model.predict(base_pred)))
print(final_model.best_score_)
# print(accuracy_score(y_val, model_xgb.predict(X_val)))
print(final_model.best_params_)
print(final_model.best_estimator_)

0.957351290684624
0.9562289562289562
{'C': 0.1, 'degree': 3, 'gamma': 1, 'kernel': 'rbf'}
SVC(C=0.1, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=1, kernel='rbf', max_iter=-1,
    probability=False, random_state=None, shrinking=True, tol=0.001,
    verbose=False)
CPU times: user 14.2 s, sys: 0 ns, total: 14.2 s
Wall time: 14.2 s


In [141]:
%%time
from sklearn.svm import SVC
parameters = {
    'kernel': ['linear', 'poly', 'rbf'],
    'C': [.8, 1,2,3,4,5], 
    'gamma': [1,.8,.9,1.1,1.2,1.3,1.4,1.5, 'auto'],
    'degree': [4]
}

final_model = GridSearchCV(SVC(), parameters, cv=5).fit(base_pred, y_train)
print(accuracy_score(y_train, final_model.predict(base_pred)))
print(final_model.best_score_)
# print(accuracy_score(y_val, model_xgb.predict(X_val)))
print(final_model.best_params_)
print(final_model.best_estimator_)

0.9595959595959596
0.9551066217732884
{'C': 5, 'degree': 4, 'gamma': 1, 'kernel': 'linear'}
SVC(C=5, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=4, gamma=1, kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
CPU times: user 5.76 s, sys: 0 ns, total: 5.76 s
Wall time: 5.76 s


In [142]:
final_pred = final_model.predict(test_pred)

submission = pd.DataFrame(
    { 
        'PassengerId': test_df['PassengerId'], 
        'Survived': final_pred
    }
)
submission.to_csv("submission_final.csv", index=False)