In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(palette='pastel')

In [2]:
df = pd.read_csv('data/train.csv')
rename_map = {column: column.lower() for column in df.columns}
df.rename(rename_map, axis=1, inplace=True)
df['family_size'] = df['sibsp'] + df['parch'] + 1

In [3]:
X = df.drop('survived', axis=1)
y = df['survived']

# Prepare data

In [4]:
X['sex'].replace({'female': 0, 'male': 1}, inplace=True)
X = pd.concat([X, pd.get_dummies(X['embarked'], prefix='embarked')], axis=1, sort=False)
X.drop(['name', 'cabin', 'ticket', 'embarked', 'passengerid'], axis=1, inplace=True)

# fill all NaN cells
X['age'] = X['age'].fillna(X['age'].mean())

# normalize
for col in X.columns:
    X[col] = X[col] / X[col].max()

In [5]:
X.isnull().sum()

pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
family_size    0
embarked_C     0
embarked_Q     0
embarked_S     0
dtype: int64

In [6]:
X.describe()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,family_size,embarked_C,embarked_Q,embarked_S
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.769547,0.647587,0.371239,0.065376,0.063599,0.062858,0.173146,0.188552,0.08642,0.722783
std,0.27869,0.47799,0.162525,0.137843,0.134343,0.096995,0.146678,0.391372,0.281141,0.447876
min,0.333333,0.0,0.00525,0.0,0.0,0.0,0.090909,0.0,0.0,0.0
25%,0.666667,0.0,0.275,0.0,0.0,0.01544,0.090909,0.0,0.0,0.0
50%,1.0,1.0,0.371239,0.0,0.0,0.028213,0.090909,0.0,0.0,1.0
75%,1.0,1.0,0.4375,0.125,0.0,0.060508,0.181818,0.0,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# KNeighborsClassifier

In [7]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X.values, y, test_size=0.3)

In [9]:
knn = KNeighborsClassifier(n_neighbors=10)

In [10]:
%%time
knn.fit(X_train, y_train)

CPU times: user 1.77 ms, sys: 464 µs, total: 2.24 ms
Wall time: 2.33 ms


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=10, p=2,
           weights='uniform')

In [11]:
knn.score(X_test, y_test)

0.8171641791044776

# Use cross validation

In [12]:
from sklearn.model_selection import cross_validate

In [15]:
%%time
cv_results = cross_validate(knn, X, y, cv=5, return_train_score=True)

CPU times: user 246 ms, sys: 0 ns, total: 246 ms
Wall time: 246 ms


In [18]:
# best score after cross validation
cv_results['test_score'].max()

0.8370786516853933

# Search best hyper-parameters

In [19]:
from sklearn.model_selection import GridSearchCV

In [20]:
knn_params = {
    'n_neighbors': range(1, 20)
}
knn_grid = GridSearchCV()

TypeError: __init__() missing 2 required positional arguments: 'estimator' and 'param_grid'

In [None]:
range(1, 20) + range(20, )