In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier


In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
combined = [train, test]

In [3]:
#Feature Engineering - Extraction of Title in the name of passenger

for dataset in combined:
    dataset['Title'] = dataset['Name'].str.extract(' ([a-zA-z]+)\.', expand=False).str.strip()  #extracts string that ends with period
    


In [4]:
#Name is unnecessary as Title is now in the datasets
for dataset in combined:
    dataset.drop('Name', axis = 1, inplace=True)

In [5]:
#Remove Categorical Features and Useless Variables

from sklearn.preprocessing import LabelEncoder
lb_make = LabelEncoder()
for dataset in combined:
    dataset["Sex"] = lb_make.fit_transform(dataset["Sex"])
    
train = pd.concat([train, pd.get_dummies(train['Title'], prefix = 'Title')], axis=1)
test = pd.concat([test, pd.get_dummies(test['Title'], prefix = 'Title')], axis=1)

train.drop('Title', axis = 1, inplace=True)
test.drop('Title', axis = 1, inplace=True)

train.drop('PassengerId', axis = 1, inplace=True)

train.drop('Ticket', axis = 1, inplace=True)
test.drop('Ticket', axis = 1, inplace=True)


In [6]:
#Fill in Missing Data
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values= np.nan, strategy='mean')
imputer = imputer.fit(dataset['Age'].values.reshape(-1, 1))
train['Age'] = imputer.transform(train['Age'].values.reshape(-1, 1))
imputer = imputer.fit(test['Age'].values.reshape(-1, 1))
test['Age'] = imputer.transform(test['Age'].values.reshape(-1, 1))

imputer = imputer.fit(train['Fare'].values.reshape(-1, 1))
train['Fare'] = imputer.transform(train['Fare'].values.reshape(-1, 1))
train['Embarked'].fillna('S', inplace = True)

imputer = imputer.fit(test['Fare'].values.reshape(-1, 1))
test['Fare'] = imputer.transform(test['Fare'].values.reshape(-1, 1))
test['Embarked'].fillna('S', inplace = True)

In [7]:
#Cabin Enconding
train['Cabin'] = train['Cabin'].str[:1]
test['Cabin'] = test['Cabin'].str[:1]
train.Cabin.fillna('U', inplace=True)
test.Cabin.fillna('U', inplace=True)

In [8]:
train = pd.concat([train, pd.get_dummies(train['Cabin'], prefix = 'Cabin')], axis=1)
test = pd.concat([test, pd.get_dummies(test['Cabin'], prefix = 'Cabin')], axis=1)
train.drop('Cabin', axis = 1, inplace=True)
test.drop('Cabin', axis = 1, inplace=True)

In [9]:
#Embarked Encoding
train = pd.concat([train, pd.get_dummies(train['Embarked'], prefix = 'Embarked')], axis=1)
test = pd.concat([test, pd.get_dummies(test['Embarked'], prefix = 'Embarked')], axis=1)
train.drop('Embarked', axis = 1, inplace=True)
test.drop('Embarked', axis = 1, inplace=True)

In [10]:
train["FamilySize"] = train["SibSp"] + train["Parch"] + 1
test["FamilySize"] = test["SibSp"] + test["Parch"] + 1

In [11]:
#Remove these variables as they have been used to create family size 
train.drop('SibSp', axis = 1, inplace=True)
test.drop('SibSp', axis = 1, inplace=True)
train.drop('Parch', axis = 1, inplace=True)
test.drop('Parch', axis = 1, inplace=True)



In [12]:
#Remove Survived, as it is the target
train_data = train.drop('Survived', axis=1)
target = train['Survived']


In [13]:
#Remove PassengerID from test data set, must keep this in separate variable so we still have access to PassengerID for submission
test_data = test.drop("PassengerId", axis=1)

In [15]:
#Remove variables so test_data and train_data have the same number of variables
train_data.drop('Title_Capt', axis=1,inplace = True)
train_data.drop('Title_Countess', axis=1, inplace = True)
train_data.drop('Title_Lady', axis=1, inplace = True)
train_data.drop('Title_Jonkheer', axis=1, inplace = True)
train_data.drop('Title_Mme', axis=1, inplace = True)
train_data.drop('Title_Mlle', axis=1, inplace = True)
train_data.drop('Title_Sir', axis=1, inplace = True)
train_data.drop('Title_Don', axis=1,inplace = True)
train_data.drop('Cabin_T', axis=1,inplace = True)


In [18]:
#KNN
clf = KNeighborsClassifier(n_neighbors = 13)
clf.fit(train_data, target)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=13, p=2,
                     weights='uniform')

In [19]:
#Prediction
prediction = clf.predict(test_data)

In [20]:
submission = pd.DataFrame({
        "PassengerId": test["PassengerId"],
        "Survived": prediction })

submission.to_csv('predictions.csv', index=False)

In [21]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 25 columns):
Pclass          418 non-null int64
Sex             418 non-null int32
Age             418 non-null float64
Fare            418 non-null float64
Title_Col       418 non-null uint8
Title_Dona      418 non-null uint8
Title_Dr        418 non-null uint8
Title_Master    418 non-null uint8
Title_Miss      418 non-null uint8
Title_Mr        418 non-null uint8
Title_Mrs       418 non-null uint8
Title_Ms        418 non-null uint8
Title_Rev       418 non-null uint8
Cabin_A         418 non-null uint8
Cabin_B         418 non-null uint8
Cabin_C         418 non-null uint8
Cabin_D         418 non-null uint8
Cabin_E         418 non-null uint8
Cabin_F         418 non-null uint8
Cabin_G         418 non-null uint8
Cabin_U         418 non-null uint8
Embarked_C      418 non-null uint8
Embarked_Q      418 non-null uint8
Embarked_S      418 non-null uint8
FamilySize      418 non-null int64
dtypes: float64(