# ML Spaceship Titanic
After solving titanic, we try solving https://www.kaggle.com/c/spaceship-titanic/data

### Imports and converting file to dataframe

In [31]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder


In [32]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [33]:

# Loading data into pandas dataframe
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

count = 0
for i in train["Transported"]:
    if i==1:
        count +=1 

# Figuring out some key characteristics of the data
print("Survival rate:", count/len(train["Transported"]))

le = len(train)
train_dropna = train.dropna()
print("data length", le, "-->", len(train_dropna), "if we drop NA")
print()
print(train.head())
print(train.shape)
print(test.shape)

Survival rate: 0.5036236051995858
data length 8693 --> 6606 if we drop NA

  PassengerId HomePlanet CryoSleep  Cabin  Destination   Age    VIP  \
0     0001_01     Europa     False  B/0/P  TRAPPIST-1e  39.0  False   
1     0002_01      Earth     False  F/0/S  TRAPPIST-1e  24.0  False   
2     0003_01     Europa     False  A/0/S  TRAPPIST-1e  58.0   True   
3     0003_02     Europa     False  A/0/S  TRAPPIST-1e  33.0  False   
4     0004_01      Earth     False  F/1/S  TRAPPIST-1e  16.0  False   

   RoomService  FoodCourt  ShoppingMall     Spa  VRDeck               Name  \
0          0.0        0.0           0.0     0.0     0.0    Maham Ofracculy   
1        109.0        9.0          25.0   549.0    44.0       Juanna Vines   
2         43.0     3576.0           0.0  6715.0    49.0      Altark Susent   
3          0.0     1283.0         371.0  3329.0   193.0       Solam Susent   
4        303.0       70.0         151.0   565.0     2.0  Willy Santantines   

   Transported  
0        Fal

### Preparing dataset

I filled in the NaN with the median/mean/mode, and converted categorical data to T/F data by using one-hot-encoding. I used the standard scalar (built into scikit learn) to scale the numercal data.


In [55]:
def process_data(df):
    #replace missing values
    replacements = {"HomePlanet":df['HomePlanet'].mode()[0],
                    "CryoSleep":df['CryoSleep'].mode()[0],
                    "Cabin":"U/0/U", #U for unknown
                    "Destination": df['CryoSleep'].mode()[0],
                    "Age": df['Age'].median(),
                    "VIP": df['VIP'].mode()[0], #using mode since many people consumed zero
                    "RoomService": df['RoomService'].mode()[0],
                    "FoodCourt": df['FoodCourt'].mode()[0],
                    "ShoppingMall": df['ShoppingMall'].mode()[0],
                    "Spa": df['Spa'].mode()[0],
                    "VRDeck": df['VRDeck'].mode()[0],
                    #If last name is the same they might be related... not sure how to handle that
                    }
    
    df = df.fillna(value=replacements)  #this doesn't modify inplace!

    df['Cabin'] = df['Cabin'].fillna('U/0/U').astype(str) #(says ChatGPT. I didn't know about "astype")
    # Convert cabin to 3 numbers
    
    df['deck'] = df['Cabin'].map(lambda x: str(x).split("/")[0])
    df['num'] = df['Cabin'].map(lambda x: str(x).split("/")[1])
    df['side'] = df['Cabin'].map(lambda x: str(x).split("/")[2])

    #Drop columns
    df = df.drop(columns=["PassengerId", "Name", "Cabin"]) 

    # one-hot-encoding
    df = pd.get_dummies(df, columns=['HomePlanet', 'Destination', "deck", "side"])

    return df

Passenger_ID_testing_data = test["PassengerId"]
train1 = process_data(train)
test1 = process_data(test)
labels = train1["Transported"]
train1 = train1.drop(columns = ["Transported"]) #This is y

print(train1.columns)
print(test1.columns)
# print(labels.head())
print(train1.shape)
print(test1.shape)
print(labels.shape)

Index(['CryoSleep', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall',
       'Spa', 'VRDeck', 'num', 'HomePlanet_Earth', 'HomePlanet_Europa',
       'HomePlanet_Mars', 'Destination_False', 'Destination_55 Cancri e',
       'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e', 'deck_A',
       'deck_B', 'deck_C', 'deck_D', 'deck_E', 'deck_F', 'deck_G', 'deck_T',
       'deck_U', 'side_P', 'side_S', 'side_U'],
      dtype='object')
Index(['CryoSleep', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall',
       'Spa', 'VRDeck', 'num', 'HomePlanet_Earth', 'HomePlanet_Europa',
       'HomePlanet_Mars', 'Destination_False', 'Destination_55 Cancri e',
       'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e', 'deck_A',
       'deck_B', 'deck_C', 'deck_D', 'deck_E', 'deck_F', 'deck_G', 'deck_T',
       'deck_U', 'side_P', 'side_S', 'side_U'],
      dtype='object')
(8693, 28)
(4277, 28)
(8693,)


  df = df.fillna(value=replacements)  #this doesn't modify inplace!
  df = df.fillna(value=replacements)  #this doesn't modify inplace!


**Scaling**
scaling numerical data using standard scalar

In [57]:

from sklearn.preprocessing import StandardScaler
scalar_age = StandardScaler() 
scalar_fare = StandardScaler() #need two seperate scalars!

train2 = train1
test2 = test1

# List of columns to scale
columns = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]

# Scale each column individually and store the scaler
for col in columns:
    scaler = StandardScaler()
    train2[col] = scaler.fit_transform(train1[[col]])

for col in columns:
    scaler = StandardScaler()
    test2[col] = scaler.fit_transform(test1[[col]])

In [58]:
y = labels
X = train2
Z = test2
print(X.columns)
print(y.head(5))
print(Z.columns)
print(X.shape)
print(Z.shape)

Index(['CryoSleep', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall',
       'Spa', 'VRDeck', 'num', 'HomePlanet_Earth', 'HomePlanet_Europa',
       'HomePlanet_Mars', 'Destination_False', 'Destination_55 Cancri e',
       'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e', 'deck_A',
       'deck_B', 'deck_C', 'deck_D', 'deck_E', 'deck_F', 'deck_G', 'deck_T',
       'deck_U', 'side_P', 'side_S', 'side_U'],
      dtype='object')
0    False
1     True
2    False
3    False
4     True
Name: Transported, dtype: bool
Index(['CryoSleep', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall',
       'Spa', 'VRDeck', 'num', 'HomePlanet_Earth', 'HomePlanet_Europa',
       'HomePlanet_Mars', 'Destination_False', 'Destination_55 Cancri e',
       'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e', 'deck_A',
       'deck_B', 'deck_C', 'deck_D', 'deck_E', 'deck_F', 'deck_G', 'deck_T',
       'deck_U', 'side_P', 'side_S', 'side_U'],
      dtype='object')
(8693, 28)
(4277, 28)


In [59]:
print(Passenger_ID_testing_data.shape)
print(Z.shape)

(4277,)
(4277, 28)


## Fitting to model

### KNN
The k-nearest neighbour method was used; GridSearchCV was used to search for the best k-parameter

In [60]:
knn = KNeighborsRegressor()
knn_mod = GridSearchCV(estimator = knn,
             param_grid = {'n_neighbors': list(range(14, 37))})
knn_mod.fit(X, y)
results = pd.DataFrame(knn_mod.cv_results_)

In [61]:
print(pd.DataFrame(results[["param_n_neighbors","rank_test_score"]]))
# best results when k=25
pred_knn = knn_mod.predict(X)
pred_knn = [int(x) for x in pred_knn]

print("The percentage accuracy is", f'{100* accuracy_score(pred_knn, y):.2f}')

    param_n_neighbors  rank_test_score
0                  14               23
1                  15               22
2                  16                5
3                  17                2
4                  18                1
5                  19                3
6                  20                4
7                  21               15
8                  22               18
9                  23               21
10                 24               20
11                 25               19
12                 26               17
13                 27               16
14                 28               14
15                 29               12
16                 30                7
17                 31               13
18                 32                6
19                 33                9
20                 34               10
21                 35                8
22                 36               11
The percentage accuracy is 50.20


In [62]:
best_mod_KNN = KNeighborsRegressor(n_neighbors= 25).fit(X,y)

### Neural Network

I tested nerual networks with 1 and 2 hidden layers

In [65]:
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(solver='lbfgs', alpha=1, random_state=1, max_iter=500)
NN_mod_one_layer = GridSearchCV(estimator = clf,
             param_grid = {'hidden_layer_sizes': (range(10, 20))}) 
NN_mod_one_layer.fit(X, y)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

In [None]:
def iterate_layers(min_i, max_i, min_j, max_j):
    for i in range(min_i, max_i):
        for j in range(min_j, max_j): 
            yield (i,j)

NN_mod_two_layer = GridSearchCV(estimator = clf,
             param_grid = {'hidden_layer_sizes': list(iterate_layers(20,25,5,10))})
NN_mod_one_layer.fit(X, y)
NN_mod_two_layer.fit(X, y)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

TypeError: 'MLPClassifier' object is not callable

In [68]:
print(pd.DataFrame(NN_mod_one_layer.cv_results_)[["param_hidden_layer_sizes", "rank_test_score"]])
print(pd.DataFrame(NN_mod_two_layer.cv_results_)[["param_hidden_layer_sizes", "rank_test_score"]])

   param_hidden_layer_sizes  rank_test_score
0                        10                8
1                        11                6
2                        12                7
3                        13               10
4                        14                4
5                        15                1
6                        16                2
7                        17                3
8                        18                9
9                        19                5
   param_hidden_layer_sizes  rank_test_score
0                   (20, 5)               10
1                   (20, 6)               20
2                   (20, 7)               16
3                   (20, 8)               24
4                   (20, 9)                7
5                   (21, 5)               12
6                   (21, 6)               11
7                   (21, 7)                6
8                   (21, 8)                4
9                   (21, 9)                2
10        

In [69]:
best_mod_one_layer_NN = MLPClassifier(hidden_layer_sizes=(15))
best_mod_one_layer_NN.fit(X,y)

In [72]:
best_mod_two_layer_NN = MLPClassifier(hidden_layer_sizes=(24, 7))
best_mod_two_layer_NN.fit(X,y)
pred_two_layer = best_mod_two_layer_NN.predict(X)
best_mod_two_layer_NN1 = MLPClassifier(hidden_layer_sizes=(25, 7))
best_mod_two_layer_NN1.fit(X,y)
pred_two_layer1 = best_mod_two_layer_NN1.predict(X)
# best_mod_two_layer_NN = MLPClassifier(hidden_layer_sizes=(24, 7))
# best_mod_two_layer_NN.fit(X,y)
print(accuracy_score(pred_two_layer, y))
print(accuracy_score(pred_two_layer1, y))

0.7966179684803865
0.7990337052801104


In [73]:
pred_NN_mod = NN_mod_one_layer.predict(X)
print("The best percentage accuracy is", f'{100* accuracy_score(pred_NN_mod, y):.2f}', "one layer, with hidden_layer = 13")

pred_NN_mod = NN_mod_two_layer.predict(X)
print("The best percentage accuracy is", f'{100* accuracy_score(pred_NN_mod, y):.2f}', "two layers, with hidden_layer = (22, 9)")

The best percentage accuracy is 79.39 one layer, with hidden_layer = 13
The best percentage accuracy is 78.96 two layers, with hidden_layer = (22, 9)


## Decision Tree

In [74]:
mod_dtree = DecisionTreeClassifier().fit(X, y)
pred_dtree_mod = mod_dtree.predict(X)
print("The accuracy score is", f'{100* accuracy_score(pred_dtree_mod, y):.2f}', ", high as expected")

The accuracy score is 99.94 , high as expected


Random Forest

In [94]:
from sklearn.ensemble import RandomForestClassifier

clf_forest = RandomForestClassifier(n_estimators=10)
mod_forest = GridSearchCV(estimator = clf_forest, param_grid={"n_estimators": list(range(5, 18))})
mod_forest.fit(X, y)
pred_forest = mod_forest.predict(X)


In [95]:
pd.DataFrame(mod_forest.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.072835,0.014874,0.006257,0.000849,5,{'n_estimators': 5},0.740081,0.728005,0.785509,0.788262,0.757192,0.75981,0.02399,13
1,0.078907,0.001866,0.007521,0.000796,6,{'n_estimators': 6},0.756182,0.743531,0.775733,0.804373,0.750288,0.766021,0.021979,12
2,0.093554,0.001856,0.007216,0.000847,7,{'n_estimators': 7},0.740656,0.752156,0.80276,0.807825,0.769275,0.774534,0.026763,6
3,0.104557,0.002749,0.007608,0.000658,8,{'n_estimators': 8},0.748131,0.756182,0.791834,0.795742,0.758918,0.770162,0.019653,10
4,0.114746,0.002112,0.008283,0.001074,9,{'n_estimators': 9},0.751006,0.756757,0.792409,0.8084,0.771001,0.775915,0.021629,5
5,0.132377,0.014097,0.008659,0.000553,10,{'n_estimators': 10},0.743531,0.736055,0.786084,0.805524,0.770426,0.768324,0.025922,11
6,0.144595,0.00827,0.008765,0.0005,11,{'n_estimators': 11},0.752156,0.758482,0.783209,0.81473,0.752014,0.772118,0.024196,9
7,0.151959,0.004309,0.009653,0.00074,12,{'n_estimators': 12},0.752731,0.747556,0.788959,0.811277,0.767549,0.773615,0.023689,8
8,0.166777,0.009795,0.009781,0.000798,13,{'n_estimators': 13},0.756757,0.752156,0.794135,0.810127,0.781358,0.778906,0.021994,3
9,0.170041,0.004453,0.010003,0.000448,14,{'n_estimators': 14},0.754457,0.751581,0.797585,0.810127,0.772152,0.77718,0.023233,4


In [76]:
print(pd.DataFrame(mod_forest.cv_results_)[["param_n_estimators", "rank_test_score"]])
print("The best percentage accuracy is", f'{100* accuracy_score(pred_forest, y):.2f}')

    param_n_estimators  rank_test_score
0                    3               12
1                    4               11
2                    5               10
3                    6                9
4                    7                7
5                    8                5
6                    9                8
7                   10                4
8                   11                6
9                   12                2
10                  13                1
11                  14                3
The best percentage accuracy is 99.19


In [77]:
#extra random

from sklearn.ensemble import ExtraTreesClassifier

clf_extraTree = ExtraTreesClassifier()
mod_extraTree = GridSearchCV(estimator = clf_extraTree, param_grid={"n_estimators": list(range(3, 10))})
mod_extraTree.fit(X, y)
pred_extraTree = mod_extraTree.predict(X)

In [78]:
print(pd.DataFrame(mod_extraTree.cv_results_)[["param_n_estimators", "rank_test_score"]])
print("The best percentage accuracy is", f'{100* accuracy_score(pred_extraTree, y):.2f}')

   param_n_estimators  rank_test_score
0                   3                7
1                   4                6
2                   5                5
3                   6                4
4                   7                1
5                   8                2
6                   9                3
The best percentage accuracy is 99.94


## Final Prediction
Export prediction in a CSV file to submit on kaggle

In [88]:
pred_final_knn = best_mod_KNN.predict(Z)
pred_final_knn = np.round(pred_final_knn).astype(int)  # This ensures this is an integer array (says ChatGPT)
pred_final_knn = pred_final_knn.astype(bool)

df = pd.DataFrame({ "PassengerId": Passenger_ID_testing_data,
                   "Transported": pred_final_knn
})
df.to_csv('resultKNN.csv', index=False) 

In [91]:
#generates a bunch of CSVs

models = [best_mod_one_layer_NN, best_mod_two_layer_NN, mod_dtree, mod_forest, mod_extraTree]
#do seperate thing for KNN

for i in range(len(models)):
    pred = models[i].predict(Z)
    df = pd.DataFrame({ "PassengerId":Passenger_ID_testing_data,
                   "Transported":pred})
    pred = pred.astype(bool)
    print(models[i], df)
    df.to_csv("result"+str(models[i])+".csv", index=False)  





MLPClassifier(hidden_layer_sizes=15)      PassengerId  Transported
0        0013_01         True
1        0018_01        False
2        0019_01         True
3        0021_01         True
4        0023_01         True
...          ...          ...
4272     9266_02         True
4273     9269_01        False
4274     9271_01         True
4275     9273_01         True
4276     9277_01         True

[4277 rows x 2 columns]
MLPClassifier(hidden_layer_sizes=(24, 7))      PassengerId  Transported
0        0013_01         True
1        0018_01        False
2        0019_01         True
3        0021_01         True
4        0023_01         True
...          ...          ...
4272     9266_02         True
4273     9269_01         True
4274     9271_01         True
4275     9273_01         True
4276     9277_01         True

[4277 rows x 2 columns]
DecisionTreeClassifier()      PassengerId  Transported
0        0013_01         True
1        0018_01        False
2        0019_01         True
3     