# ML Spaceship Titanic
After solving titanic, we try solving https://www.kaggle.com/c/spaceship-titanic/data

### Imports and converting file to dataframe

In [97]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

In [99]:

# Loading data into pandas dataframe
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

count = 0
for i in train["Transported"]:
    if i==1:
        count +=1 

# Figuring out some key characteristics of the data
print("Survival rate:", count/len(train["Transported"]))

le = len(train)
train_dropna = train.dropna()
print("data length", le, "-->", len(train_dropna), "if we drop NA")
print()
print(train.head())
print(train.shape)
print(test.shape)

Survival rate: 0.5036236051995858
data length 8693 --> 6606 if we drop NA

  PassengerId HomePlanet CryoSleep  Cabin  Destination   Age    VIP  \
0     0001_01     Europa     False  B/0/P  TRAPPIST-1e  39.0  False   
1     0002_01      Earth     False  F/0/S  TRAPPIST-1e  24.0  False   
2     0003_01     Europa     False  A/0/S  TRAPPIST-1e  58.0   True   
3     0003_02     Europa     False  A/0/S  TRAPPIST-1e  33.0  False   
4     0004_01      Earth     False  F/1/S  TRAPPIST-1e  16.0  False   

   RoomService  FoodCourt  ShoppingMall     Spa  VRDeck               Name  \
0          0.0        0.0           0.0     0.0     0.0    Maham Ofracculy   
1        109.0        9.0          25.0   549.0    44.0       Juanna Vines   
2         43.0     3576.0           0.0  6715.0    49.0      Altark Susent   
3          0.0     1283.0         371.0  3329.0   193.0       Solam Susent   
4        303.0       70.0         151.0   565.0     2.0  Willy Santantines   

   Transported  
0        Fal

### Preparing dataset

I filled in the NaN with the median/mean/mode, and converted categorical data to T/F data by using one-hot-encoding. I used the standard scalar (built into scikit learn) to scale the numercal data.


In [121]:
PassengerIDs_array = test["PassengerId"]
print(PassengerIDs_array)

0       0013_01
1       0018_01
2       0019_01
3       0021_01
4       0023_01
         ...   
4272    9266_02
4273    9269_01
4274    9271_01
4275    9273_01
4276    9277_01
Name: PassengerId, Length: 4277, dtype: object


In [101]:
def process_data(df):
    #replace missing values
    replacements = {"HomePlanet":df['HomePlanet'].mode()[0],
                    "CryoSleep":df['CryoSleep'].mode()[0],

                    "Destination": df['CryoSleep'].mode()[0],
                    "Age": df['Age'].median(),
                    "VIP": df['VIP'].mode()[0],
                    "RoomService": df['RoomService'].mode()[0],
                    "FoodCourt": df['FoodCourt'].mode()[0],
                    "ShoppingMall": df['ShoppingMall'].mode()[0],
                    "Spa": df['Spa'].mode()[0],
                    "VRDeck": df['VRDeck'].mode()[0],
                    #If last name is the same they might be related... not sure how to handle that
                    }
    
    df = df.fillna(value=replacements)  #this doesn't modify inplace!

    df['Cabin'] = df['Cabin'].fillna('U/0/U').astype(str)
    df[['deck', 'num', 'side']] = df['Cabin'].str.split('/', expand=True)


    #Drop columns
    df = df.drop(columns=["PassengerId", "Name", "Cabin"]) 

    # one-hot-encoding
    df = pd.get_dummies(df, columns=['HomePlanet', 'Destination', "deck", "side"])
    return df

train1 = process_data(train)
test1 = process_data(test)
labels = train1["Transported"]
train1 = train1.drop(columns = ["Transported"]) #This is y

print(train1.columns)
print(test1.columns)
# print(labels.head())
print(train1.shape)
print(test1.shape)
print(labels.shape)

Index(['CryoSleep', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall',
       'Spa', 'VRDeck', 'num', 'HomePlanet_Earth', 'HomePlanet_Europa',
       'HomePlanet_Mars', 'Destination_False', 'Destination_55 Cancri e',
       'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e', 'deck_A',
       'deck_B', 'deck_C', 'deck_D', 'deck_E', 'deck_F', 'deck_G', 'deck_T',
       'deck_U', 'side_P', 'side_S', 'side_U'],
      dtype='object')
Index(['CryoSleep', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall',
       'Spa', 'VRDeck', 'num', 'HomePlanet_Earth', 'HomePlanet_Europa',
       'HomePlanet_Mars', 'Destination_False', 'Destination_55 Cancri e',
       'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e', 'deck_A',
       'deck_B', 'deck_C', 'deck_D', 'deck_E', 'deck_F', 'deck_G', 'deck_T',
       'deck_U', 'side_P', 'side_S', 'side_U'],
      dtype='object')
(8693, 28)
(4277, 28)
(8693,)


  df = df.fillna(value=replacements)  #this doesn't modify inplace!
  df = df.fillna(value=replacements)  #this doesn't modify inplace!


**Scaling**
scaling numerical data using standard scalar

In [102]:

from sklearn.preprocessing import StandardScaler
scalar_age = StandardScaler() 
scalar_fare = StandardScaler() #need two seperate scalars!

train2 = train1
test2 = test1

# List of columns to scale
columns = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]

# Scale each column individually and store the scaler
for col in columns:
    scaler = StandardScaler()
    train2[col] = scaler.fit_transform(train1[[col]])

for col in columns:
    scaler = StandardScaler()
    test2[col] = scaler.fit_transform(test1[[col]])

In [106]:
y = labels
X = train2
Z = test2
print(X.columns)
print(y.head(5))
print(Z.columns)
print(X.shape)
print(Z.shape)

Index(['CryoSleep', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall',
       'Spa', 'VRDeck', 'num', 'HomePlanet_Earth', 'HomePlanet_Europa',
       'HomePlanet_Mars', 'Destination_False', 'Destination_55 Cancri e',
       'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e', 'deck_A',
       'deck_B', 'deck_C', 'deck_D', 'deck_E', 'deck_F', 'deck_G', 'deck_T',
       'deck_U', 'side_P', 'side_S', 'side_U'],
      dtype='object')
0    False
1     True
2    False
3    False
4     True
Name: Transported, dtype: bool
Index(['CryoSleep', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall',
       'Spa', 'VRDeck', 'num', 'HomePlanet_Earth', 'HomePlanet_Europa',
       'HomePlanet_Mars', 'Destination_False', 'Destination_55 Cancri e',
       'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e', 'deck_A',
       'deck_B', 'deck_C', 'deck_D', 'deck_E', 'deck_F', 'deck_G', 'deck_T',
       'deck_U', 'side_P', 'side_S', 'side_U'],
      dtype='object')
(8693, 28)
(4277, 28)


## Fitting to model

### KNN
The k-nearest neighbour method was used; GridSearchCV was used to search for the best k-parameter

In [105]:
knn = KNeighborsRegressor()
knn_mod = GridSearchCV(estimator = knn,
             param_grid = {'n_neighbors': list(range(14, 37))})
knn_mod.fit(X, y)
results = pd.DataFrame(knn_mod.cv_results_)

In [107]:
print(pd.DataFrame(results[["param_n_neighbors","rank_test_score"]]))

pred_knn = knn_mod.predict(X)
pred_knn = [int(x) for x in pred_knn]
from sklearn.metrics import accuracy_score
print("The percentage accuracy is", f'{100* accuracy_score(pred_knn, y):.2f}')

    param_n_neighbors  rank_test_score
0                  14               23
1                  15               22
2                  16                5
3                  17                2
4                  18                1
5                  19                3
6                  20                4
7                  21               15
8                  22               18
9                  23               21
10                 24               20
11                 25               19
12                 26               17
13                 27               16
14                 28               14
15                 29               12
16                 30                7
17                 31               13
18                 32                6
19                 33                9
20                 34               10
21                 35                8
22                 36               11
The percentage accuracy is 50.20


### Neural Network

I tested nerual networks with 1 and 2 hidden layers

In [110]:
clf = MLPClassifier(solver='lbfgs', alpha=1, random_state=1, max_iter=500)
NN_mod_one_layer = GridSearchCV(estimator = clf,
             param_grid = {'hidden_layer_sizes': (range(10, 20))})  
NN_mod_one_layer.fit(X, y)


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

In [111]:
def iterate_layers(min_i, max_i, min_j, max_j):
    for i in range(min_i, max_i):
        for j in range(min_j, max_j): 
            yield (i,j)

NN_mod_two_layer = GridSearchCV(estimator = clf,
             param_grid = {'hidden_layer_sizes': list(iterate_layers(20,25,5,10))})
NN_mod_two_layer.fit(X, y)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

In [112]:
print(pd.DataFrame(NN_mod_one_layer.cv_results_)[["param_hidden_layer_sizes", "rank_test_score"]])
print(pd.DataFrame(NN_mod_two_layer.cv_results_)[["param_hidden_layer_sizes", "rank_test_score"]])

   param_hidden_layer_sizes  rank_test_score
0                        10                8
1                        11                6
2                        12                7
3                        13               10
4                        14                4
5                        15                1
6                        16                2
7                        17                3
8                        18                9
9                        19                5
   param_hidden_layer_sizes  rank_test_score
0                   (20, 5)               10
1                   (20, 6)               20
2                   (20, 7)               16
3                   (20, 8)               24
4                   (20, 9)                7
5                   (21, 5)               12
6                   (21, 6)               11
7                   (21, 7)                6
8                   (21, 8)                4
9                   (21, 9)                2
10        

In [113]:
from sklearn.metrics import accuracy_score

pred_NN_mod = NN_mod_one_layer.predict(X)
print("The best percentage accuracy is", f'{100* accuracy_score(pred_NN_mod, y):.2f}', "one layer, with hidden_layer = 13")

pred_NN_mod = NN_mod_two_layer.predict(X)
print("The best percentage accuracy is", f'{100* accuracy_score(pred_NN_mod, y):.2f}', "two layers, with hidden_layer = (22, 9)")

The best percentage accuracy is 79.39 one layer, with hidden_layer = 13
The best percentage accuracy is 78.96 two layers, with hidden_layer = (22, 9)


## Decision Tree

In [114]:
mod_dtree = DecisionTreeClassifier().fit(X, y)
pred_dtree_mod = mod_dtree.predict(X)
print("The accuracy score is", f'{100* accuracy_score(pred_dtree_mod, y):.2f}', ", high as expected")

The accuracy score is 99.94 , high as expected


Random Forest

In [115]:
from sklearn.ensemble import RandomForestClassifier

clf_forest = RandomForestClassifier(n_estimators=10)
mod_forest = GridSearchCV(estimator = clf_forest, param_grid={"n_estimators": list(range(3, 15))})
mod_forest.fit(X, y)
pred_forest = mod_forest.predict(X)


In [116]:
print(pd.DataFrame(mod_forest.cv_results_)[["param_n_estimators", "rank_test_score"]])
print("The best percentage accuracy is", f'{100* accuracy_score(pred_forest, y):.2f}')

    param_n_estimators  rank_test_score
0                    3               12
1                    4               11
2                    5               10
3                    6                9
4                    7                8
5                    8                6
6                    9                7
7                   10                4
8                   11                5
9                   12                2
10                  13                3
11                  14                1
The best percentage accuracy is 99.11


In [117]:
#extra random

from sklearn.ensemble import ExtraTreesClassifier

clf_extraTree = ExtraTreesClassifier()
mod_extraTree = GridSearchCV(estimator = clf_extraTree, param_grid={"n_estimators": list(range(3, 10))})
mod_extraTree.fit(X, y)
pred_extraTree = mod_extraTree.predict(X)

In [118]:
print(pd.DataFrame(mod_extraTree.cv_results_)[["param_n_estimators", "rank_test_score"]])
print("The best percentage accuracy is", f'{100* accuracy_score(pred_extraTree, y):.2f}')

   param_n_estimators  rank_test_score
0                   3                6
1                   4                7
2                   5                3
3                   6                5
4                   7                4
5                   8                1
6                   9                2
The best percentage accuracy is 99.94


## Final Prediction
Loops through the models and exports the predictions as CSV files

In [126]:
predictions = [
    ("NN_one", NN_mod_one_layer),
    ("NN_two", NN_mod_two_layer),
    ("dtree", mod_dtree),
    ("forest", mod_forest),
    ("extra", mod_extraTree)
]

for name, pred in predictions:
    df = pd.DataFrame({
        "PassengerId": PassengerIDs_array,
        "Transported": pred.predict(Z)
    })
    filename = f"result_{name}.csv"
    df.to_csv(filename, index=False)



In [127]:
pred_final_knn = knn_mod.predict(Z)
pred_final_knn = np.round(pred_final_knn).astype(int)  # This ensures this is an integer array (says ChatGPT)
df = pd.DataFrame({ "PassengerId":PassengerIDs_array,
                   "Transported": pred_final_knn
})
df.to_csv('resultKNN.csv', index=False) 