In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
datafile='titanic_train.csv'
titanic=pd.read_csv(datafile)
titanic[:5]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S


In [3]:
#function to format the data, given a list of predictors we want to use

def format_data(datafile,predictors):
    #read in the file
    titanic=pd.read_csv(datafile)
    #change 'male' and 'female' in the 'Sex' column to 0 and 1 respectively
    titanic.loc[titanic['Sex']=='male','Sex']=0
    titanic.loc[titanic['Sex']=='female','Sex']=1
    #creates columns 'Emb1' through 'Emb3' which will contain the point the passenger embarked at.  I.e. a passenger 
    #who embarks at 'C' will have a 1 in 'Emb1' column and 0 in 'Emb2' and 'Emb3', passengers who embarked at 'S' will
    #have a 1 in 'Emb2', and so on.
    titanic['Emb1']=0
    titanic['Emb2']=0
    titanic['Emb3']=0
    titanic.loc[titanic['Embarked']=='C','Emb1']=1
    titanic.loc[titanic['Embarked']=='S','Emb2']=1
    titanic.loc[titanic['Embarked']=='Q','Emb3']=1
    #dropping the original 'Embarked' column
    titanic=titanic.drop('Embarked',axis=1)
    #drop any columns whose name is not in the list 'predictors'
    for val in titanic.columns.values:
        if not(val in predictors+['Survived']):
            titanic=titanic.drop(val,axis=1)
    #in the columns from the list 'predictors', replace and NAN values with the median of the values in that column
    for val in predictors:
        titanic[val] = titanic[val].fillna(titanic[val].median())
    return titanic

In [4]:
predictors=["Pclass", "Sex", "Age", "SibSp", "Parch"]

In [5]:
titanic_training=format_data('titanic_train.csv',predictors)

In [6]:
titanic_training[:5]

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch
0,0,3,0,22,1,0
1,1,1,1,38,1,0
2,1,3,1,26,0,0
3,1,1,1,35,1,0
4,0,3,0,35,0,0


In [7]:
survival=titanic_training['Survived']
survival_array=survival.values
features=titanic_training.drop('Survived', 1)
features_array=features.values

In [8]:
#computing the means of each feature, so that we can subtract them off of the feature values which creates new variables with mean 0

features_averages=np.mean(features_array,axis=0, dtype=np.float64)
features_averages

array([  2.30864198,   0.35241302,  29.36158249,   0.52300786,   0.38159371])

In [9]:
for i in range(len(features_averages)):
    features_array[:,i]-=features_averages[i]
features_array

array([[ 0.69135802, -0.35241302, -7.36158249,  0.47699214, -0.38159371],
       [-1.30864198,  0.64758698,  8.63841751,  0.47699214, -0.38159371],
       [ 0.69135802,  0.64758698, -3.36158249, -0.52300786, -0.38159371],
       ..., 
       [ 0.69135802,  0.64758698, -1.36158249,  0.47699214,  1.61840629],
       [-1.30864198, -0.35241302, -3.36158249, -0.52300786, -0.38159371],
       [ 0.69135802, -0.35241302,  2.63841751, -0.52300786, -0.38159371]])

In [31]:
np.mean(features_array,axis=0, dtype=np.int)

array([0, 0, 0, 0, 0])

In [32]:
features_array.shape

(891, 5)

In [33]:
#computing the covariance of each of the variables, so that we can scale each variable so that it has standar deviation 1

covariance_matrix=1/890*features_array.T@features_array
covariance_matrix

array([[  6.99015120e-01,  -5.27118879e-02,  -3.69992717e+00,
          7.65986961e-02,   1.24289083e-02],
       [ -5.27118879e-02,   2.28474508e-01,  -5.05097643e-01,
          6.04219473e-02,   9.45837905e-02],
       [ -3.69992717e+00,  -5.05097643e-01,   1.69512498e+02,
         -3.34952521e+00,  -1.81013264e+00],
       [  7.65986961e-02,   6.04219473e-02,  -3.34952521e+00,
          1.21604308e+00,   3.68738572e-01],
       [  1.24289083e-02,   9.45837905e-02,  -1.81013264e+00,
          3.68738572e-01,   6.49728244e-01]])

In [13]:
norm_features_array=np.copy(features_array)
for i in range(len(features_array.T)):
    norm_features_array[:,i]=norm_features_array[:,i]/np.sqrt(covariance_matrix[i,i])

In [14]:
norm_covariance_matrix=1/890*norm_features_array.T@norm_features_array
norm_covariance_matrix

array([[ 1.        , -0.13190049, -0.33989833,  0.08308136,  0.01844267],
       [-0.13190049,  1.        , -0.08116254,  0.11463081,  0.24548896],
       [-0.33989833, -0.08116254,  1.        , -0.23329633, -0.17248195],
       [ 0.08308136,  0.11463081, -0.23329633,  1.        ,  0.4148377 ],
       [ 0.01844267,  0.24548896, -0.17248195,  0.4148377 ,  1.        ]])

In [15]:
np.shape(norm_features_array)

(891, 5)

In [16]:
train_data_array=np.zeros((891,6))

In [17]:
train_data_array[:,1:]=norm_features_array

In [18]:
train_data_array[:,0]=survival_array

In [19]:
train_data_array

array([[ 0.        ,  0.82691282, -0.73728105, -0.5654189 ,  0.43255043,
        -0.47340772],
       [ 1.        , -1.56522783,  1.35481262,  0.66348839,  0.43255043,
        -0.47340772],
       [ 1.        ,  0.82691282,  1.35481262, -0.25819208, -0.47427882,
        -0.47340772],
       ..., 
       [ 0.        ,  0.82691282,  1.35481262, -0.10457867,  0.43255043,
         2.0078057 ],
       [ 1.        , -1.56522783, -0.73728105, -0.25819208, -0.47427882,
        -0.47340772],
       [ 0.        ,  0.82691282, -0.73728105,  0.20264816, -0.47427882,
        -0.47340772]])

In [20]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(norm_features_array,survival_array) 

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=10, p=2,
           weights='uniform')

In [21]:
titanic_testing=format_data('titanic_test.csv',predictors)

In [22]:
predictions=knn.predict(titanic_testing)
predictions

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0,

In [23]:
titanic_test_data=format_data('titanic_test.csv',predictors+['PassengerId'])

In [24]:
formatted_titanic_test=pd.DataFrame({'PassengerId':titanic_test_data['PassengerId'],'Survived':predictions})

In [25]:
formatted_titanic_test.to_csv('norm_submission.csv',index=False)

In [26]:
full_covariance_matrix=1/890*train_data_array.T@train_data_array
full_covariance_matrix[0,:]

array([ 0.38426966, -0.16470232,  0.26439068, -0.03158492, -0.01718766,
        0.03972025])

In [27]:
scaled_features_array=np.copy(norm_features_array)
for i in range(len(scaled_features_array.T)):
    scaled_features_array[:,i]=scaled_features_array[:,i]/np.sqrt(np.absolute(full_covariance_matrix[0,i+1]))

In [28]:
knn2 = KNeighborsClassifier()
knn2.fit(scaled_features_array,survival_array) 

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [29]:
predictions2=knn2.predict(titanic_testing)
predictions2

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0,

In [30]:
formatted_titanic_test2=pd.DataFrame({'PassengerId':titanic_test_data['PassengerId'],'Survived':predictions2})

In [66]:
formatted_titanic_test2.to_csv('scaled_submission.csv',index=False)