# Title
We are attempting to solve this kaggle challenge: https://www.kaggle.com/competitions/titanic/data

### Imports and converting file to dataframe

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder


In [2]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor

In [3]:

train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

count = 0
for i in train["Survived"]:
    if i==1:
        count +=1 

print("Survival rate:", count/len(train["Survived"]))
print("The regular success rate should work fine")

le = len(train)
train_dropna = train.dropna()
print("data length", le, "-->", len(train_dropna), "if we dropna. Not a good idea.")
print()
print(train.head(5))
print(train.shape)
print(test.shape)

Survival rate: 0.3838383838383838
The regular success rate should work fine
data length 891 --> 183 if we dropna. Not a good idea.

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   

### Cleaning the data 
annoying!! (sadface)

I referred to https://www.kaggle.com/code/murtadhanajim/80-in-titanic-dataset-using-random-forests/notebook and used the mean / median as replacements for NaN

In [4]:
def process_data(df):
    df = df.drop(columns=["PassengerId", "Name", "Ticket"]) 
    # ^^ dropped because i don't know how to deal with these...
    # there's probably a way to get info from the ticket number though?

    #replace missing values
    replacements = {"Pclass":df['Pclass'].median(),
                    "Sex": "n",
                    "Age": df['Age'].median(),
                    "SibSp": df['SibSp'].mode()[0],
                    "Parch": df['Parch'].mode()[0],
                    "Fare": df['Fare'].mean(),
                    "Cabin":"unknown",
                    "Embarked":"unknown",
                    }
    
    df = df.fillna(value=replacements)  #this doesn't modify inplace!

    # Convert cabin to first letter 
    df['Cabin'] = df['Cabin'].map(lambda x: str(x)[0])
    df['Sex'] = df['Sex'].map(lambda x: x[0])

    # one-hot-encoding
    df = pd.get_dummies(df, columns=['Sex', 'Cabin', "Embarked"])

    return df

train1 = process_data(train)
test1 = process_data(test)
labels = train1["Survived"]
train1 = train1.drop(columns = ["Survived"]) #This is y

print(train1.head())
print(test1.head())
# print(labels.head())
print(train1.shape)
print(test1.shape)
print(labels.shape)

   Pclass   Age  SibSp  Parch     Fare  Sex_f  Sex_m  Cabin_A  Cabin_B  \
0       3  22.0      1      0   7.2500  False   True    False    False   
1       1  38.0      1      0  71.2833   True  False    False    False   
2       3  26.0      0      0   7.9250   True  False    False    False   
3       1  35.0      1      0  53.1000   True  False    False    False   
4       3  35.0      0      0   8.0500  False   True    False    False   

   Cabin_C  Cabin_D  Cabin_E  Cabin_F  Cabin_G  Cabin_T  Cabin_u  Embarked_C  \
0    False    False    False    False    False    False     True       False   
1     True    False    False    False    False    False    False        True   
2    False    False    False    False    False    False     True       False   
3     True    False    False    False    False    False    False       False   
4    False    False    False    False    False    False     True       False   

   Embarked_Q  Embarked_S  Embarked_unknown  
0       False        True   

In [5]:
# oops the test dataset is missing columns... Let's add empty columns
length = len(test1)

Cabin_T = [False for i in range(length)]
test1.insert(train1.columns.get_loc("Cabin_T"), "Cabin_T", Cabin_T)

Embarked_unknown = [False for i in range(length)]
test1["Embarked_unknown"]=Embarked_unknown

print(train1.columns, train1.shape)
print(test1.columns, test1.shape)

Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_f', 'Sex_m', 'Cabin_A',
       'Cabin_B', 'Cabin_C', 'Cabin_D', 'Cabin_E', 'Cabin_F', 'Cabin_G',
       'Cabin_T', 'Cabin_u', 'Embarked_C', 'Embarked_Q', 'Embarked_S',
       'Embarked_unknown'],
      dtype='object') (891, 20)
Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_f', 'Sex_m', 'Cabin_A',
       'Cabin_B', 'Cabin_C', 'Cabin_D', 'Cabin_E', 'Cabin_F', 'Cabin_G',
       'Cabin_T', 'Cabin_u', 'Embarked_C', 'Embarked_Q', 'Embarked_S',
       'Embarked_unknown'],
      dtype='object') (418, 20)


**Scaling**

In [6]:

from sklearn.preprocessing import StandardScaler
scalar_age = StandardScaler() 
scalar_fare = StandardScaler() #need two seperate scalars!

train2 = train1
test2 = test1

train2["Age"] = scalar_age.fit_transform(train1[["Age"]])
train2["Fare"] = scalar_fare.fit_transform(train1[["Fare"]])

test2["Age"] = scalar_age.transform(test1[["Age"]])
test2["Fare"] = scalar_fare.transform(test1[["Fare"]])
# print(train2, test2)
# print(train2["Age"].shape)

In [7]:
y = labels
X = train2
Z = test2
print(X.columns)
print(y.head(5))
print(Z.columns)
print(X.shape)
print(Z.shape)

Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_f', 'Sex_m', 'Cabin_A',
       'Cabin_B', 'Cabin_C', 'Cabin_D', 'Cabin_E', 'Cabin_F', 'Cabin_G',
       'Cabin_T', 'Cabin_u', 'Embarked_C', 'Embarked_Q', 'Embarked_S',
       'Embarked_unknown'],
      dtype='object')
0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64
Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_f', 'Sex_m', 'Cabin_A',
       'Cabin_B', 'Cabin_C', 'Cabin_D', 'Cabin_E', 'Cabin_F', 'Cabin_G',
       'Cabin_T', 'Cabin_u', 'Embarked_C', 'Embarked_Q', 'Embarked_S',
       'Embarked_unknown'],
      dtype='object')
(891, 20)
(418, 20)


## Fitting to model

### KNN

In [8]:
knn = KNeighborsRegressor()
knn_mod = GridSearchCV(estimator = knn,
             param_grid = {'n_neighbors': list(range(1, 20))})
knn_mod.fit(X, y)
results = pd.DataFrame(knn_mod.cv_results_)

In [9]:
# results[["param_n_neighbors","rank_test_score"]]
# best results when k=8
pred_knn = knn_mod.predict(X)
pred_knn = [int(x) for x in pred_knn]
print(pred_knn, y.head())

[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 

In [10]:
#Accuracy
# correct = 0
# total = len(X)
# for i in range(total-1):
#     if round(pred_knn[i]) == y[i]:
#         correct += 1
# print("The percentage accuracy is", f'{correct*100/total:.2f}')

from sklearn.metrics import accuracy_score
print("The percentage accuracy is", f'{100* accuracy_score(pred_knn, y):.2f}')


The percentage accuracy is 68.91


### Neural Network

In [11]:
from sklearn.neural_network import MLPClassifier

# clf = MLPClassifier(solver='lbfgs', alpha=1,
#                     hidden_layer_sizes=(10), random_state=1, max_iter=1500)

In [12]:
clf = MLPClassifier(solver='lbfgs', alpha=1, random_state=1, max_iter=1500)
NN_mod_one_layer = GridSearchCV(estimator = clf,
             param_grid = {'hidden_layer_sizes': (range(5, 7))})  # Best is 6. (I'm decreasing the range here to reduce running time)
NN_mod_one_layer.fit(X, y)


In [None]:
def iterate_layers(min_i, max_i, min_j, max_j):
    for i in range(min_i, max_i):
        for j in range(min_j, max_j): 
            yield (i,j)

NN_mod_two_layer = GridSearchCV(estimator = clf,
             param_grid = {'hidden_layer_sizes': list(iterate_layers(7,10,2,4))}) #again, tweaked to decrease running time
NN_mod_two_layer.fit(X, y)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [14]:
# print(pd.DataFrame(NN_mod_one_layer.cv_results_))
print(pd.DataFrame(NN_mod_two_layer.cv_results_)[["param_hidden_layer_sizes", "rank_test_score"]])

  param_hidden_layer_sizes  rank_test_score
0                   (7, 2)                6
1                   (7, 3)                5
2                   (8, 2)                3
3                   (8, 3)                1
4                   (9, 2)                4
5                   (9, 3)                2


In [15]:
from sklearn.metrics import accuracy_score

pred_NN_mod = NN_mod_one_layer.predict(X)
print("The best percentage accuracy is", f'{100* accuracy_score(pred_NN_mod, y):.2f}', "one layer, with hidden_layer = 6")

pred_NN_mod = NN_mod_two_layer.predict(X)
print("The best percentage accuracy is", f'{100* accuracy_score(pred_NN_mod, y):.2f}', "two layers, with hidden_layer = (8, 3)")

The best percentage accuracy is 85.97 one layer, with hidden_layer = 6
The best percentage accuracy is 86.98 two layers, with hidden_layer = (8, 3)


## Decision Tree

In [16]:
#code

## Final Prediction

For now the best model is neural network with (8,3), so I'm submitting this to kaggle. We might have better results with other models tho?

In [20]:
pred_final = NN_mod_two_layer.predict(Z)
df = pd.DataFrame({ "PassengerId":range(892, 1310),
                   "Survived":pred_final

})
print(df)
df.to_csv('result.csv', index=False)  

     PassengerId  Survived
0            892         0
1            893         0
2            894         0
3            895         0
4            896         0
..           ...       ...
413         1305         0
414         1306         1
415         1307         0
416         1308         0
417         1309         0

[418 rows x 2 columns]
