# Title
We are attempting to solve this kaggle challenge: https://www.kaggle.com/competitions/titanic/data

### Imports and converting file to dataframe

In [29]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder


In [30]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor

In [34]:

train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

count = 0
for i in train["Survived"]:
    if i==1:
        count +=1 

print("Survival rate:", count/len(train["Survived"]))
print("The regular success rate should work fine")

le = len(train)
train_dropna = train.dropna()
print("data length", le, "-->", len(train_dropna), "if we dropna. Not a good idea.")
print()
print(train.head(5))
print(train.shape)
print(test.shape)

Survival rate: 0.3838383838383838
The regular success rate should work fine
data length 891 --> 183 if we dropna. Not a good idea.

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   

### Cleaning the data 
annoying!! (sadface)

I referred to https://www.kaggle.com/code/murtadhanajim/80-in-titanic-dataset-using-random-forests/notebook and used the mean / median as replacements for NaN

In [43]:
def process_data(df):
    df = df.drop(columns=["PassengerId", "Name", "Ticket"]) 
    # ^^ dropped because i don't know how to deal with these...
    # there's probably a way to get info from the ticket number though?

    #replace missing values
    replacements = {"Pclass":df['Pclass'].median(),
                    "Sex": "n",
                    "Age": df['Age'].median(),
                    "SibSp": df['SibSp'].mode()[0],
                    "Parch": df['Parch'].mode()[0],
                    "Fare": df['Fare'].mean(),
                    "Cabin":"unknown",
                    "Embarked":"unknown",
                    }
    
    df = df.fillna(value=replacements)  #this doesn't modify inplace!

    # Convert cabin to first letter 
    df['Cabin'] = df['Cabin'].map(lambda x: str(x)[0])
    df['Sex'] = df['Sex'].map(lambda x: x[0])

    # one-hot-encoding
    df = pd.get_dummies(df, columns=['Sex', 'Cabin', "Embarked"])

    return df

train1 = process_data(train)
test1 = process_data(test)
labels = train1["Survived"]
train1 = train1.drop(columns = ["Survived"]) #This is y

print(train1.head())
# print(test1.head(3))
# print(labels.head())
print(train1.shape)
print(test1.shape)
print(labels.shape)

   Pclass   Age  SibSp  Parch     Fare  Sex_f  Sex_m  Cabin_A  Cabin_B  \
0       3  22.0      1      0   7.2500  False   True    False    False   
1       1  38.0      1      0  71.2833   True  False    False    False   
2       3  26.0      0      0   7.9250   True  False    False    False   
3       1  35.0      1      0  53.1000   True  False    False    False   
4       3  35.0      0      0   8.0500  False   True    False    False   

   Cabin_C  Cabin_D  Cabin_E  Cabin_F  Cabin_G  Cabin_T  Cabin_u  Embarked_C  \
0    False    False    False    False    False    False     True       False   
1     True    False    False    False    False    False    False        True   
2    False    False    False    False    False    False     True       False   
3     True    False    False    False    False    False    False       False   
4    False    False    False    False    False    False     True       False   

   Embarked_Q  Embarked_S  Embarked_unknown  
0       False        True   

**Scaling**

In [46]:

from sklearn.preprocessing import StandardScaler
scalar_age = StandardScaler() 
scalar_fare = StandardScaler() #need two seperate scalars!
train2 = train1
test2 = test1
train2["Age"] = scalar_age.fit_transform(train1[["Age"]])
train2["Fare"] = scalar_fare.fit_transform(train1[["Fare"]])
test2["Age"] = scalar_age.transform(test1[["Age"]])
test2["Fare"] = scalar_fare.transform(test1[["Fare"]])
# print(train2, test2)
# print(train2["Age"].shape)

In [51]:
y = labels
X = train2
Z = test2
print(X.head(1))
print(y.head(5))
print(Z.head(1))

   Pclass       Age  SibSp  Parch      Fare  Sex_f  Sex_m  Cabin_A  Cabin_B  \
0       3 -0.565736      1      0 -0.502445  False   True    False    False   

   Cabin_C  Cabin_D  Cabin_E  Cabin_F  Cabin_G  Cabin_T  Cabin_u  Embarked_C  \
0    False    False    False    False    False    False     True       False   

   Embarked_Q  Embarked_S  Embarked_unknown  
0       False        True             False  
0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64
   Pclass       Age  SibSp  Parch      Fare  Sex_f  Sex_m  Cabin_A  Cabin_B  \
0       3  0.394887      0      0 -0.490783  False   True    False    False   

   Cabin_C  Cabin_D  Cabin_E  Cabin_F  Cabin_G  Cabin_u  Embarked_C  \
0    False    False    False    False    False     True       False   

   Embarked_Q  Embarked_S  
0        True       False  


## Fitting to model

### KNN

In [66]:
knn = KNeighborsRegressor()
knn_mod = GridSearchCV(estimator = knn,
             param_grid = {'n_neighbors': list(range(1, 20))})
knn_mod.fit(X, y)
results = pd.DataFrame(knn_mod.cv_results_)

In [71]:
# results[["param_n_neighbors","rank_test_score"]]
# best results when k=8
pred = knn_mod.predict(X)
pred = [int(x) for x in pred]
print(pred, y.head())

[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 

In [75]:
#Accuracy
correct = 0
total = len(pred)
for i in range(total-1):
    if round(pred[i]) == y[i]:
        correct += 1
print("The percentage accuracy is", f'{correct*100/total:.2f}')

The percentage accuracy is 68.80
