## Attempt KNN Classification Using Dummy Variables

Part 1: Creating dummy variables.

In [135]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV

In [136]:
#Load the training and test feature tables from the features/ directory.
train_path, test_path = "features/train_features.csv", "features/test_features.csv"
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

#Preview the first rows of the training table.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Company,Fare,Cabin,Embarked
0,1,0,3,male,22.0,Family,2.110213,M,S
1,2,1,1,female,38.0,Family,4.280593,Other,C
2,3,1,3,female,26.0,Alone,2.188856,M,S
3,4,1,1,female,35.0,Family,3.990834,Other,S
4,5,0,3,male,35.0,Alone,2.202765,M,S


In [137]:
#Preview the first rows of the test table; unlike train, it has no Survived column.
test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,Company,Fare,Cabin,Embarked
0,892,3,male,34.5,Alone,2.178064,M,Q
1,893,3,female,47.0,Family,2.079442,M,S
2,894,2,male,62.0,Alone,2.369075,M,Q
3,895,3,male,27.0,Alone,2.268252,M,S
4,896,3,female,22.0,Family,2.586824,M,S


In [138]:
def createDummies(dataframe):
    """One-hot encode the categorical feature columns of ``dataframe``.

    The columns 'Pclass', 'Sex', 'Company', 'Cabin' and 'Embarked' are replaced
    by indicator columns named ``<column>_<level>``. The first level of each
    column is dropped (``drop_first = True``), so a column with k levels
    yields k - 1 indicators. All other columns are passed through unchanged.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table containing the five categorical columns listed above.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``dataframe`` itself is not modified.

    Notes
    -----
    The indicator columns depend on the levels present in ``dataframe``, so
    two frames with different level sets produce different columns.
    """
    categorical_columns = ['Pclass', 'Sex', 'Company', 'Cabin', 'Embarked']
    encoded = pd.get_dummies(dataframe, columns = categorical_columns, drop_first = True)
    return encoded

#Labels: the Survived column as a plain Python list.
target = list(train['Survived'])

#Features: one-hot encode, then remove the label and the passenger identifier.
encoded_train = createDummies(train).drop(columns = ['Survived', 'PassengerId'])

#Standardise every feature column, keeping the column names.
scaled_values = StandardScaler().fit_transform(encoded_train.values)
inputs = pd.DataFrame(scaled_values, columns = encoded_train.columns)

inputs

Unnamed: 0,Age,Fare,Pclass_2,Pclass_3,Sex_male,Company_Family,Cabin_Other,Embarked_Q,Embarked_S
0,-0.592481,-0.879741,-0.510152,0.902587,0.737695,1.231645,-0.544925,-0.307562,0.619306
1,0.638789,1.361220,-0.510152,-1.107926,-1.355574,1.231645,1.835115,-0.307562,-1.614710
2,-0.284663,-0.798540,-0.510152,0.902587,-1.355574,-0.811922,-0.544925,-0.307562,0.619306
3,0.407926,1.062038,-0.510152,-1.107926,-1.355574,1.231645,1.835115,-0.307562,0.619306
4,0.407926,-0.784179,-0.510152,0.902587,0.737695,-0.811922,-0.544925,-0.307562,0.619306
...,...,...,...,...,...,...,...,...,...
886,-0.207709,-0.333698,1.960202,-1.107926,0.737695,-0.811922,-0.544925,-0.307562,0.619306
887,-0.823344,0.487082,-0.510152,-1.107926,-1.355574,-0.811922,1.835115,-0.307562,0.619306
888,0.000000,0.242007,-0.510152,0.902587,-1.355574,1.231645,-0.544925,-0.307562,0.619306
889,-0.284663,0.487082,-0.510152,-1.107926,0.737695,-0.811922,1.835115,-0.307562,-1.614710


In [139]:
#Candidate values for n_neighbors: every integer from 1 to 24.
grid = {'n_neighbors': np.arange(1, 25)}

#Base KNeighbours Classifier whose n_neighbors is tuned.
knn = KNeighborsClassifier()

#Score every candidate with 5-fold cross-validation; fit() returns the fitted search object.
knn_tune = GridSearchCV(estimator = knn, param_grid = grid, cv = 5).fit(inputs, target)

#Print the best K-value, then display its mean cross-validated score.
print(knn_tune.best_params_)
knn_tune.best_score_

{'n_neighbors': 5}


0.8126043562864854

In [140]:
#Keep the passenger identifiers for the results file. `test` itself is not
#overwritten, so this cell can be re-run without a KeyError on 'PassengerId'.
identification = test[['PassengerId']].reset_index(drop = True)

#Refit the scaler on the same encoded training columns that `inputs` was built
#from, so it learns the same per-feature mean and variance.
train_features = createDummies(train).drop(['Survived', 'PassengerId'], axis = 1)
scaler = StandardScaler().fit(train_features.astype(float).values)

#Encode the test set and align its columns with the training features:
#a dummy level absent from the test set becomes an all-zero column, and any
#column the model was not trained on is discarded.
test_dummies = (
    createDummies(test.drop(['PassengerId'], axis = 1))
    .reindex(columns = train_features.columns, fill_value = 0)
    .astype(float)
)

#Apply the training-set scaling. KNN is distance based, so unscaled test rows
#would be compared against standardised training rows.
test_features = pd.DataFrame(
    scaler.transform(test_dummies.values),
    columns = train_features.columns
)

#Create a KNeighbours Classifier with k = 5, the value selected by the grid search
knn_5 = KNeighborsClassifier(n_neighbors = 5)
knn_5.fit(inputs, target)
predictions = pd.DataFrame(knn_5.predict(test_features), columns = ['Survived'])

#Arrange results in a CSV file (both frames share a fresh 0..n-1 index, so the
#inner join keeps every passenger)
results = pd.concat([identification, predictions], axis = 1, join = 'inner')
results.to_csv('results/results-KNN.csv', index = False)