In [56]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import sklearn.base as skbase
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

# GridSearchCV lives in sklearn.model_selection since scikit-learn 0.18; the
# legacy sklearn.grid_search module was removed in 0.20. Try the current
# location first and fall back only for very old installations.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV

plt.style.use("ggplot")

## Index

### load the training data

In [57]:
# Read the preprocessed training table, keyed by passenger id.
train_df = pd.read_csv("train_processed.csv", index_col="PassengerId")

# Columns fed to the classifier; the same list is reused for the test set.
feature_names = [
    "Pclass",
    "Age",
    "SibSp",
    "Parch",
    "Fare",
    "IsMale",
    "Ticket-4digit",
    "Ticket-5digit",
    "Ticket-6digit",
]

# Split into the feature matrix and the target column.
Xtrain = train_df.loc[:, feature_names]
ytrain = train_df.loc[:, "Survived"]

In [58]:
# Standardize the features; the fitted scaler is kept so the test set can be
# transformed with the same statistics later.
# NOTE(review): the scaler is fit on every training row before cross-validation,
# so each validation fold contributes to the scaling statistics. Wrapping the
# scaler and classifier in a Pipeline would avoid that -- confirm whether it matters here.
scaler = StandardScaler().fit(Xtrain)
Xtrain_scaled = scaler.transform(Xtrain)

### cross-validation to find the best number of neighbors

In [59]:
# Base estimator whose n_neighbors is tuned by the search below.
knn = KNeighborsClassifier()

# Candidate neighbor counts: 2 through 10 inclusive.
param_dist = {"n_neighbors": np.arange(2, 11)}

# Exhaustive search over the candidates with 10-fold cross-validation.
searchcv = GridSearchCV(
    estimator=knn,
    param_grid=param_dist,
    cv=10,
    verbose=1,
)

In [60]:
# Run the cross-validated search on the standardized training features.
searchcv.fit(Xtrain_scaled, ytrain)

[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done  50 jobs       | elapsed:    0.1s


Fitting 10 folds for each of 9 candidates, totalling 90 fits


[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed:    0.2s finished


GridSearchCV(cv=10, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_neighbors=5, p=2, weights='uniform'),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid={'n_neighbors': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10])},
       pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,
       verbose=1)

In [61]:
# Neighbor count selected by the grid search.
best_neighbors = searchcv.best_params_["n_neighbors"]
best_neighbors

3

In [62]:
# Cross-validated score of the selected candidate.
searchcv.best_score_

0.8058361391694725

### retrain on the whole training set

In [63]:
# Copy the winning estimator's hyper-parameters into a fresh estimator
# (rebinds `knn`, which previously held the default-parameter classifier).
knn = skbase.clone(searchcv.best_estimator_)
knn

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_neighbors=3, p=2, weights='uniform')

In [64]:
# Fit the final classifier on all standardized training rows.
knn.fit(Xtrain_scaled,ytrain)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_neighbors=3, p=2, weights='uniform')

In [65]:
# Score on the same rows the model was just fit on; this is a training-set
# figure, not a held-out estimate.
knn.score(Xtrain_scaled,ytrain)

0.88552188552188549

### save the result

In [66]:
# `common` is a project-local helper module (not shown here); it is also used
# by the submission cell.
import common
# Persist the fitted classifier -- presumably pickled to "knn.pkl"; verify in `common`.
common.dump_predictor("knn.pkl",knn)

In [67]:
# Load the preprocessed test table, keyed by passenger id.
test_df = pd.read_csv("test_processed.csv", index_col="PassengerId")

# Select the same feature columns used for training. `Xtest` was previously
# referenced below without ever being defined in this notebook, which raised
# NameError on a fresh kernel.
Xtest = test_df[feature_names]

# Reuse the scaler fitted on the training features; do not refit on test data.
Xtest_scaled = scaler.transform(Xtest)

predictions = knn.predict(Xtest_scaled)

# Hand the passenger ids and predicted labels to the project helper --
# presumably writes "submit_knn.csv"; verify in `common`.
common.make_submission(Xtest.index, predictions, "submit_knn.csv")