In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(palette='pastel')

In [2]:
df = pd.read_csv('data/train.csv')
rename_map = {column: column.lower() for column in df.columns}
df.rename(rename_map, axis=1, inplace=True)
df['family_size'] = df['sibsp'] + df['parch'] + 1

In [3]:
X = df.drop('survived', axis=1)
y = df['survived']

# Prepare data

In [4]:
X['sex'].replace({'female': 0, 'male': 1}, inplace=True)
X = pd.concat([X, pd.get_dummies(X['embarked'], prefix='embarked')], axis=1, sort=False)
X.drop(['name', 'cabin', 'ticket', 'embarked', 'passengerid'], axis=1, inplace=True)

# fill all NaN cells
X['age'] = X['age'].fillna(X['age'].mean())

# normalize
for col in X.columns:
    X[col] = X[col] / X[col].max()

In [5]:
X.isnull().sum()

pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
family_size    0
embarked_C     0
embarked_Q     0
embarked_S     0
dtype: int64

In [6]:
X.describe()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,family_size,embarked_C,embarked_Q,embarked_S
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.769547,0.647587,0.371239,0.065376,0.063599,0.062858,0.173146,0.188552,0.08642,0.722783
std,0.27869,0.47799,0.162525,0.137843,0.134343,0.096995,0.146678,0.391372,0.281141,0.447876
min,0.333333,0.0,0.00525,0.0,0.0,0.0,0.090909,0.0,0.0,0.0
25%,0.666667,0.0,0.275,0.0,0.0,0.01544,0.090909,0.0,0.0,0.0
50%,1.0,1.0,0.371239,0.0,0.0,0.028213,0.090909,0.0,0.0,1.0
75%,1.0,1.0,0.4375,0.125,0.0,0.060508,0.181818,0.0,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [7]:
X.shape, y.shape

((891, 10), (891,))

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X.values, y, test_size=0.3)

# KNeighborsClassifier

In [9]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)

In [29]:
knn.fit(X_train, y_train)
knn.score(X_test, y_test)

0.8022388059701493

# DecisionTreeClassifier

In [12]:
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(max_depth=3)

In [13]:
tree.fit(X_train, y_train)
tree.score(X_test, y_test)

0.8171641791044776

# Use cross validation

In [31]:
from sklearn.model_selection import cross_validate
def get_best_score_by_cv(estimator, X_train, y_train, X_test, y_test):
    cv_results = cross_validate(estimator, X_train, y_train, cv=5, return_train_score=True, return_estimator=True)
    best_try = cv_results['test_score'].argmax()
    best_estimator = cv_results['estimator'][best_try]
    return best_estimator.score(X_test, y_test)

In [32]:
# KNN
get_best_score_by_cv(knn, X_train, y_train, X_test, y_test)

0.8059701492537313

In [33]:
# tree
get_best_score_by_cv(tree, X_train, y_train, X_test, y_test)

0.8134328358208955

# Search best hyper-parameters
## KNN

In [17]:
from sklearn.model_selection import GridSearchCV

In [18]:
knn_params = {
    'n_neighbors': range(1, 30)
}
knn_grid = GridSearchCV(knn, param_grid=knn_params, cv=5, n_jobs=-1)

In [19]:
%%time
knn_grid.fit(X_train, y_train)

CPU times: user 298 ms, sys: 54.1 ms, total: 352 ms
Wall time: 3.91 s


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_neighbors': range(1, 30)}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring=None, verbose=0)

In [20]:
{
    'cross_validation_score': knn_grid.best_score_,
    'params': knn_grid.best_params_,
    'test_score': knn_grid.score(X_test, y_test)
}

{'cross_validation_score': 0.812199036918138,
 'params': {'n_neighbors': 8},
 'test_score': 0.8208955223880597}

# Tree

In [21]:
tree_params = {'max_depth': range(1, 20),
               'max_features': range(2, 10)}
tree_grid = GridSearchCV(tree, param_grid=tree_params, cv=5, n_jobs=-1)

In [22]:
%%time
tree_grid.fit(X_train, y_train)

CPU times: user 561 ms, sys: 8.15 ms, total: 569 ms
Wall time: 1.06 s




GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'max_depth': range(1, 20), 'max_features': range(2, 10)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [23]:
{
    'cross_validation_score': tree_grid.best_score_,
    'params': tree_grid.best_params_,
    'test_score': tree_grid.score(X_test, y_test)
}

{'cross_validation_score': 0.8234349919743178,
 'params': {'max_depth': 4, 'max_features': 6},
 'test_score': 0.8134328358208955}

# Random Forest

In [24]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

forest = RandomForestClassifier(n_estimators=100, n_jobs=-1)
forest.fit(X_train, y_train)
forest.score(X_test, y_test)

0.8134328358208955

In [25]:
forest_cv = cross_validate(forest, X_train, y_train, cv=5, return_train_score=True, return_estimator=True)
best_try = forest_cv['test_score'].argmax()
best_estimator = forest_cv['estimator'][best_try]
forest_cv['test_score'].max(), best_estimator.score(X_test, y_test)

(0.832, 0.8097014925373134)

In [26]:
forest_params = {'max_depth': range(1, 20),
                 'max_features': range(2, 10)}
forest_grid = GridSearchCV(forest, param_grid=forest_params, cv=5, n_jobs=-1)
forest_grid.fit(X_train, y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'max_depth': range(1, 20), 'max_features': range(2, 10)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [27]:
forest_grid.score(X_test, y_test), forest_grid.best_params_

(0.8283582089552238, {'max_depth': 7, 'max_features': 6})