In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Load the labelled training set; the relative path is resolved against the
# current working directory.

train_data = pd.read_csv(filepath_or_buffer="train.csv")

In [3]:
#selecting relevant features
#encoding categorical data
#replacing missing data with mean

# Explicit copy so the column assignment below can never write through to train_data.
X = train_data[["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare"]].copy()

# Encode only the "Sex" column. A frame-wide replace() scans every column and relies
# on pandas' deprecated silent downcast of object columns to produce a numeric column.
# With map(), any value other than "male"/"female" becomes NaN and is then
# mean-imputed by the fillna below.
X["Sex"] = X["Sex"].map({"male": 0.5, "female": -0.5})

# Every remaining gap is filled with that column's mean.
X = X.fillna(X.mean())

In [4]:
#estimating the number of children per adult:
#(passengers with a non-zero Parch) / (passengers aged 23 or over)
#h_r and h_p read this module-level value.

# Counting with boolean masks instead of value_counts()[0]: the latter is a label
# lookup and raises KeyError when no passenger has Parch == 0.
children = (X["Parch"] != 0).sum() / (X["Age"] >= 23).sum()
print(children)

0.32272727272727275


In [5]:
#estimating help received from and help provided to siblings/spouses, parents, and children, based on age

def h_r(a, s, p):
    """Estimate the help a passenger receives from relatives.

    Args:
        a: passenger age.
        s: number of siblings/spouses aboard (the SibSp value).
        p: number of parents/children aboard (the Parch value).

    Returns:
        A non-negative score. Below age 20 it is (p + s/2) / (1 + s/2);
        from 20 up to (not including) 45 it is s / (s + p), or 0 when s is 0;
        otherwise it is the module-level ``children`` ratio.
    """
    if a < 20:
        half_sibs = 0.5 * s
        return (p + half_sibs) / (1 + half_sibs)
    if 20 <= a < 45:
        # s == 0 is handled separately so that s + p can never be a zero divisor.
        return s / (s + p) if s != 0 else 0
    # Age 45 and over (and any age that fails both comparisons, e.g. NaN).
    return children
def h_p(a, s, p):
    """Estimate the help a passenger provides to relatives.

    Args:
        a: passenger age.
        s: number of siblings/spouses aboard (the SibSp value).
        p: number of parents/children aboard (the Parch value).

    Returns:
        0 for ages below 12 or from 45 upwards; otherwise an age-banded score.
        The 23-to-45 band additionally reads the module-level ``children`` ratio.
    """
    if 12 <= a < 20:
        half_sibs = 0.5 * s
        return half_sibs / (1 + p + half_sibs)
    if 20 <= a < 23:
        return (p + s) / (1 + s)
    if 23 <= a < 45:
        return (p + s + children) / (1 + s)
    # Everything outside the three bands: under 12, 45 and over, or NaN.
    return 0
# Derive the three "help" features for every passenger in one pass.
# Building whole columns replaces the per-row .loc writes, which are slow and only
# address the right rows when the index is exactly 0..len(X)-1.
family_rows = list(zip(X["Age"], X["SibSp"], X["Parch"]))
# float() keeps both columns float64 even when a branch returns the integer 0.
X["Help received"] = [float(h_r(a, s, p)) for a, s, p in family_rows]
X["Help provided"] = [float(h_p(a, s, p)) for a, s, p in family_rows]
X["Net help received"] = X["Help received"] - X["Help provided"]

In [6]:
#replacing age with more relevant information
#feature normalization

# Age becomes its absolute distance from 35. This overwrites the column in place,
# so re-running the cell without rebuilding X applies the transform a second time.
X["Age"] = (X["Age"] - 35).abs()

def featnorm(s, M):
    """Mean-centre column ``s`` of ``M`` and scale it by its range.

    Args:
        s: column label.
        M: DataFrame holding that column.

    Returns:
        A Series computed as (M[s] - mean) / (max - min). When the column is
        constant (max == min), zeros are returned instead of dividing by zero,
        which would otherwise turn the whole column into NaN.
    """
    column = M[s]
    centred = column - column.mean()
    spread = column.max() - column.min()
    if spread == 0:
        # Multiplying by 0.0 gives exact zeros while keeping the index, and any NaN.
        return centred * 0.0
    return centred / spread
# Rescale each listed column in place with featnorm.
for column in ("Pclass", "Age", "Fare", "Net help received", "SibSp", "Parch"):
    X[column] = featnorm(column, X)

In [7]:
#creating the model

# One tuple per passenger; the column order fixes the feature order the model sees.
Xlist = list(
    X[["Pclass", "Age", "Fare", "Sex", "Net help received", "SibSp", "Parch"]]
    .itertuples(index=False, name=None)
)
ylist = train_data["Survived"].tolist()
# Neighbourhood size is the integer part of sqrt(number of training rows).
model = KNeighborsClassifier(n_neighbors=int(np.sqrt(len(X))))
model.fit(Xlist, ylist)

In [8]:
#applying the same modifications, feature creation, and rescaling process to the test data
#
# NOTE(review): the mean imputation, the `children` ratio and featnorm below are all
# computed from test.csv itself, so the test features are not on exactly the same
# scale as the training features the model was fitted on. Reusing the training
# statistics would be more consistent, but it changes the predictions, so it is
# deliberately left unchanged here.

test_data = pd.read_csv("test.csv")

# Explicit copy so the column assignment below can never write through to test_data.
A = test_data[["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare"]].copy()
# Encode only "Sex"; a frame-wide replace() relies on pandas' deprecated silent
# downcast of object columns to produce a numeric column.
A["Sex"] = A["Sex"].map({"male": 0.5, "female": -0.5})

A = A.fillna(A.mean())

# h_r / h_p read the module-level `children`, so it is reassigned before they run.
# Boolean counts avoid the KeyError value_counts()[0] raises when no Parch is 0.
children = (A["Parch"] != 0).sum() / (A["Age"] >= 23).sum()
print(children)

# Whole-column construction instead of per-row .loc writes (slow, and dependent on
# the index being exactly 0..len(A)-1).
test_family_rows = list(zip(A["Age"], A["SibSp"], A["Parch"]))
A["Help received"] = [float(h_r(a, s, p)) for a, s, p in test_family_rows]
A["Help provided"] = [float(h_p(a, s, p)) for a, s, p in test_family_rows]
A["Net help received"] = A["Help received"] - A["Help provided"]

# Age becomes its absolute distance from 35, as on the training side.
A["Age"] = (A["Age"] - 35).abs()

for column in ("Pclass", "Age", "Fare", "Net help received", "SibSp", "Parch"):
    A[column] = featnorm(column, A)

0.2984126984126984


In [9]:
#applying the model to the test data

test_list = list(zip(A["Pclass"], A["Age"], A["Fare"], A["Sex"], A["Net help received"], A["SibSp"], A["Parch"]))

prediction = model.predict(test_list)

# Build the submission frame in one step.
# - IDs are taken from the loaded test file rather than a hard-coded range(892, 1310),
#   so ids and predictions stay aligned whatever the file size.
#   (assumes test.csv carries a "PassengerId" column -- confirm against the file)
# - Filling "Survived" row by row through .loc left it as a float column (0.0 / 1.0);
#   it is now written as integers.
final = pd.DataFrame({
    "PassengerId": test_data["PassengerId"].to_numpy(),
    "Survived": prediction.astype(int),
})

In [10]:
#checks

# Sanity summary of the predicted labels: count, positives, and their ratio.
l = prediction.shape[0]
s = prediction.sum()
p = s / l

print(f"Total number: {l}")
print(f"Survived: {s}")
print(f"Survival rate: {p}")

Total number: 418
Survived: 137
Survival rate: 0.3277511961722488


In [11]:
#transferring predictions to csv file

# Comma is to_csv's default separator; index=False keeps the row index out of the file.
final.to_csv("titanic_predictions.csv", index=False)

In [12]:
#saving the model

import pickle

# Serialise the fitted classifier; pickle needs a binary-mode handle, and the
# with-block closes it even if dumping fails.
with open("Titanic prediction model", mode="wb") as model_file:
    pickle.dump(model, model_file)

In [13]:
#comparing performance to a random model (extra)

import random

L = len(train_data)
S = sum(train_data["Survived"])
# Survival probability = survivors / passengers. The ratio used to be inverted (L/S),
# which is greater than 1 and made the first weight (1-P) negative; random.choices
# requires non-negative weights.
P = S/L

state = [0, 1]
# Dedicated, seeded generator so the baseline file is reproducible between runs.
rng = random.Random(0)
randlist = rng.choices(state, weights=(1-P, P), k=len(test_data))

# Build the frame in one step, with ids from the loaded test file instead of a
# hard-coded range(892, 1310), and "Survived" kept as integers.
# (assumes test.csv carries a "PassengerId" column -- confirm against the file)
base = pd.DataFrame({
    "PassengerId": test_data["PassengerId"].to_numpy(),
    "Survived": randlist,
})
base.to_csv("baseline_predictions.csv", sep=",", index=False)

Final Results

Accuracy of random model: 53.349%
Accuracy of KNN model: 76.315%