# Load Library

In [60]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as mtp
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder 
import sklearn.metrics as sm
from scipy.optimize import curve_fit
from sklearn.model_selection import KFold, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Load Dataset

In [2]:
test_df = pd.read_csv('./Spaceship Titanic Kaggle/test.csv')
train_df = pd.read_csv('./Spaceship Titanic Kaggle/train.csv')

In [3]:
train_df.head()
# see the first 5 row of train data frame

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [4]:
test_df.head()
# see the first 5 row of test data frame

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [5]:
train_df.isnull().sum()
# checking null values of train

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [6]:
test_df.isnull().sum()
# checking null

PassengerId       0
HomePlanet       87
CryoSleep        93
Cabin           100
Destination      92
Age              91
VIP              93
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
Name             94
dtype: int64

In [7]:
train_df["Transported"]=train_df["Transported"].astype(int)

In [8]:
df= pd.concat([train_df,test_df],axis=0)


In [9]:
df[['Group','Number in Group']]= train_df['PassengerId'].str.split('_',expand=True)
# splitting into two new columns group and number in group 

In [10]:
df[['Block','Room Number','Side']]=train_df['Cabin'].str.split('/',expand=True)
# splitting cabin into three new columns deck, num, side

In [11]:
df

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Group,Number in Group,Block,Room Number,Side
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,0.0,0001,01,B,0,P
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,1.0,0002,01,F,0,S
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,0.0,0003,01,A,0,S
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,0.0,0003,02,A,0,S
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,1.0,0004,01,F,1,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,9266_02,Earth,True,G/1496/S,TRAPPIST-1e,34.0,False,0.0,0.0,0.0,0.0,0.0,Jeron Peter,,4549,01,E,298,S
4273,9269_01,Earth,False,,TRAPPIST-1e,42.0,False,0.0,847.0,17.0,10.0,144.0,Matty Scheron,,4550,01,F,853,S
4274,9271_01,Mars,True,D/296/P,55 Cancri e,,False,0.0,0.0,0.0,0.0,0.0,Jayrin Pore,,4552,01,F,937,P
4275,9273_01,Europa,False,D/297/P,,,False,0.0,2680.0,0.0,0.0,523.0,Kitakan Conale,,4553,01,C,143,P


In [12]:
df= df.drop('PassengerId',axis =1)
df= df.drop('Cabin',axis =1)

In [13]:
df['HomePlanet'].value_counts()

HomePlanet
Earth     6865
Europa    3133
Mars      2684
Name: count, dtype: int64

In [14]:
df['CryoSleep'].value_counts()

CryoSleep
False    8079
True     4581
Name: count, dtype: int64

In [15]:
df['Destination'].value_counts()

Destination
TRAPPIST-1e      8871
55 Cancri e      2641
PSO J318.5-22    1184
Name: count, dtype: int64

In [16]:
df['VIP'].value_counts()

VIP
False    12401
True       273
Name: count, dtype: int64

In [17]:
df['Transported'].value_counts()

Transported
1.0    4378
0.0    4315
Name: count, dtype: int64

In [18]:
df['Block'].value_counts ()

Block
F    4134
G    3833
E    1303
B    1168
C    1096
D     738
A     377
T       9
Name: count, dtype: int64

In [19]:
df['Side'].value_counts()

Side
S    6392
P    6266
Name: count, dtype: int64

In [20]:
df['HomePlanet']=df['HomePlanet'].fillna('Earth')
df['CryoSleep']=df['CryoSleep'].fillna('False')
df['Destination']=df['Destination'].fillna('TRAPPIST-1e')
df['VIP']=df['Destination'].fillna('False')
df['Transported']=df['Transported'].fillna(1.0)
df['Block']=df['Block'].fillna('F')
df['Side']=df['Side'].fillna('S')

In [21]:
df['Room Number'] = df['Room Number'].astype(float)
df['Group'] = df['Group'].astype(float)
df['Number in Group'] = df['Number in Group'].astype(float)

In [22]:
df['Age'].fillna(df['Age'].mean(), inplace=True)
df['RoomService'].fillna(df['RoomService'].mean(), inplace=True)
df['FoodCourt'].fillna(df['FoodCourt'].mean(), inplace=True)
df['ShoppingMall'].fillna(df['FoodCourt'].mean(), inplace=True)
df['Spa'].fillna(df['FoodCourt'].mean(), inplace=True)
df['VRDeck'].fillna(df['FoodCourt'].mean(), inplace=True)
df['Room Number'].fillna(df['Room Number'].mean(), inplace=True)

In [23]:
df.drop(['Name'],axis=1, inplace= True)

In [24]:
cat_cols=['HomePlanet','CryoSleep','Destination', 'VIP','Block','Side']

In [25]:
data_oh = pd.get_dummies(df, columns=cat_cols, dtype=int)

# Scaling

In [26]:
num_cols= ['Age','RoomService','FoodCourt','ShoppingMall','Spa', 'VRDeck','Room Number','Group','Number in Group']

In [27]:
data_oh[num_cols].describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Room Number,Group,Number in Group
count,12970.0,12970.0,12970.0,12970.0,12970.0,12970.0,12970.0,12970.0,12970.0
mean,28.771969,222.897852,451.961675,181.442581,311.618744,309.789185,499.159346,3860.621049,1.516114
std,14.23671,640.996685,1566.618306,585.063311,1118.032719,1168.023008,460.582913,2564.18912,1.050177
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
25%,20.0,0.0,0.0,0.0,0.0,0.0,128.0,1715.0,1.0
50%,27.0,0.0,0.0,0.0,0.0,0.0,337.0,3485.0,1.0
75%,37.0,79.0,126.0,47.0,87.0,64.0,739.0,5814.0,2.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0,1894.0,9280.0,8.0


In [28]:
data_std= data_oh.copy()

In [29]:
scaler =  StandardScaler()
data_std[num_cols] = scaler.fit_transform(data_std[num_cols])

In [30]:
data_oh[num_cols].describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Room Number,Group,Number in Group
count,12970.0,12970.0,12970.0,12970.0,12970.0,12970.0,12970.0,12970.0,12970.0
mean,28.771969,222.897852,451.961675,181.442581,311.618744,309.789185,499.159346,3860.621049,1.516114
std,14.23671,640.996685,1566.618306,585.063311,1118.032719,1168.023008,460.582913,2564.18912,1.050177
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
25%,20.0,0.0,0.0,0.0,0.0,0.0,128.0,1715.0,1.0
50%,27.0,0.0,0.0,0.0,0.0,0.0,337.0,3485.0,1.0
75%,37.0,79.0,126.0,47.0,87.0,64.0,739.0,5814.0,2.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0,1894.0,9280.0,8.0


In [31]:
data_std[num_cols].describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Room Number,Group,Number in Group
count,12970.0,12970.0,12970.0,12970.0,12970.0,12970.0,12970.0,12970.0,12970.0
mean,-1.884554e-16,-1.424372e-17,1.588723e-17,-1.972208e-17,-3.8348490000000004e-17,-3.46506e-17,3.506148e-17,0.0,1.068279e-16
std,1.000039,1.000039,1.000039,1.000039,1.000039,1.000039,1.000039,1.000039,1.000039
min,-2.021048,-0.3477498,-0.2885062,-0.3101367,-0.2787313,-0.2652355,-1.083797,-1.505259,-0.4914735
25%,-0.6161751,-0.3477498,-0.2885062,-0.3101367,-0.2787313,-0.2652355,-0.805878,-0.836796,-0.4914735
50%,-0.1244695,-0.3477498,-0.2885062,-0.3101367,-0.2787313,-0.2652355,-0.3520877,-0.146493,-0.4914735
75%,0.577967,-0.2244994,-0.2080751,-0.2298004,-0.2009131,-0.2104399,0.5207529,0.761821,0.460784
max,3.528201,22.00424,18.74239,39.84433,19.76439,20.39697,3.028541,2.113568,6.174329


# Training

In [32]:
data_std

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Group,Number in Group,Room Number,...,Block_A,Block_B,Block_C,Block_D,Block_E,Block_F,Block_G,Block_T,Side_P,Side_S
0,7.184543e-01,-0.347750,-0.288506,-0.310137,-0.278731,-0.265235,0.0,-1.505259,-0.491474,-1.083797,...,0,1,0,0,0,0,0,0,1,0
1,-3.352005e-01,-0.177696,-0.282761,-0.267405,0.212329,-0.227564,1.0,-1.504869,-0.491474,-1.083797,...,0,0,0,0,0,1,0,0,0,1
2,2.053084e+00,-0.280664,1.994205,-0.310137,5.727586,-0.223283,0.0,-1.504479,-0.491474,-1.083797,...,1,0,0,0,0,0,0,0,0,1
3,2.969924e-01,-0.347750,0.530487,0.324007,2.698935,-0.099993,0.0,-1.504479,0.460784,-1.083797,...,1,0,0,0,0,0,0,0,0,1
4,-8.971498e-01,0.124970,-0.243822,-0.052035,0.226640,-0.263523,1.0,-1.504089,-0.491474,-1.081626,...,0,0,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,3.672360e-01,-0.347750,-0.288506,-0.310137,-0.278731,-0.265235,1.0,0.268469,-0.491474,-0.436766,...,0,0,0,0,1,0,0,0,0,1
4273,9.291853e-01,-0.347750,0.252170,-0.281079,-0.269787,-0.141945,1.0,0.268859,-0.491474,0.768275,...,0,0,0,0,0,1,0,0,0,1
4274,-2.495556e-16,-0.347750,-0.288506,-0.310137,-0.278731,-0.265235,1.0,0.269639,-0.491474,0.950660,...,0,0,0,0,0,1,0,0,1,0
4275,-2.495556e-16,-0.347750,1.422251,-0.310137,-0.278731,0.182547,1.0,0.270029,-0.491474,-0.773309,...,0,0,1,0,0,0,0,0,1,0


### Splitting into training and testing

In [33]:
train= data_std[0:len(train_df)]
test= data_std[len(train_df):]

In [34]:
train

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Group,Number in Group,Room Number,...,Block_A,Block_B,Block_C,Block_D,Block_E,Block_F,Block_G,Block_T,Side_P,Side_S
0,0.718454,-0.347750,-0.288506,-0.310137,-0.278731,-0.265235,0.0,-1.505259,-0.491474,-1.083797,...,0,1,0,0,0,0,0,0,1,0
1,-0.335201,-0.177696,-0.282761,-0.267405,0.212329,-0.227564,1.0,-1.504869,-0.491474,-1.083797,...,0,0,0,0,0,1,0,0,0,1
2,2.053084,-0.280664,1.994205,-0.310137,5.727586,-0.223283,0.0,-1.504479,-0.491474,-1.083797,...,1,0,0,0,0,0,0,0,0,1
3,0.296992,-0.347750,0.530487,0.324007,2.698935,-0.099993,0.0,-1.504479,0.460784,-1.083797,...,1,0,0,0,0,0,0,0,0,1
4,-0.897150,0.124970,-0.243822,-0.052035,0.226640,-0.263523,1.0,-1.504089,-0.491474,-1.081626,...,0,0,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,0.858942,-0.347750,4.064349,-0.310137,1.190871,-0.201878,0.0,2.112008,-0.491474,-0.871015,...,1,0,0,0,0,0,0,0,1,0
8689,-0.756662,-0.347750,-0.288506,-0.310137,-0.278731,-0.265235,0.0,2.112788,-0.491474,2.170899,...,0,0,0,0,0,0,1,0,0,1
8690,-0.194713,-0.347750,-0.288506,2.889640,-0.277837,-0.265235,1.0,2.113178,-0.491474,2.173071,...,0,0,0,0,0,0,1,0,0,1
8691,0.226749,-0.347750,0.381115,-0.310137,0.037014,2.504509,0.0,2.113568,-0.491474,0.236320,...,0,0,0,0,1,0,0,0,0,1


In [35]:
test

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Group,Number in Group,Room Number,...,Block_A,Block_B,Block_C,Block_D,Block_E,Block_F,Block_G,Block_T,Side_P,Side_S
0,-1.244695e-01,-0.347750,-0.288506,-0.310137,-0.278731,-0.265235,1.0,-1.505259,-0.491474,-1.083797,...,0,1,0,0,0,0,0,0,1,0
1,-6.864188e-01,-0.347750,-0.282761,-0.310137,2.246337,-0.265235,1.0,-1.504869,-0.491474,-1.083797,...,0,0,0,0,0,1,0,0,0,1
2,1.565051e-01,-0.347750,-0.288506,-0.310137,-0.278731,-0.265235,1.0,-1.504479,-0.491474,-1.083797,...,1,0,0,0,0,0,0,0,0,1
3,6.482107e-01,-0.347750,3.957746,-0.310137,-0.116834,0.235630,1.0,-1.504479,0.460784,-1.083797,...,1,0,0,0,0,0,0,0,0,1
4,-6.161751e-01,-0.332148,-0.288506,0.775258,-0.278731,-0.265235,1.0,-1.504089,-0.491474,-1.081626,...,0,0,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,3.672360e-01,-0.347750,-0.288506,-0.310137,-0.278731,-0.265235,1.0,0.268469,-0.491474,-0.436766,...,0,0,0,0,1,0,0,0,0,1
4273,9.291853e-01,-0.347750,0.252170,-0.281079,-0.269787,-0.141945,1.0,0.268859,-0.491474,0.768275,...,0,0,0,0,0,1,0,0,0,1
4274,-2.495556e-16,-0.347750,-0.288506,-0.310137,-0.278731,-0.265235,1.0,0.269639,-0.491474,0.950660,...,0,0,0,0,0,1,0,0,1,0
4275,-2.495556e-16,-0.347750,1.422251,-0.310137,-0.278731,0.182547,1.0,0.270029,-0.491474,-0.773309,...,0,0,1,0,0,0,0,0,1,0


### Splitting features

In [36]:
y= train['Transported']
X= train.drop(['Transported'], axis=1)

In [37]:
X_test= test.drop('Transported', axis=1)

### Randomized Search

In [38]:
model= DecisionTreeClassifier(criterion= "entropy")
params= {"max_depth": range(5,33),
         "min_samples_split": range(2,11), "min_samples_leaf": range(1,11), 
         "max_leaf_nodes":range (10,101)}
grid= RandomizedSearchCV(estimator= model, param_distributions= params, cv=5, n_iter=100,scoring= "accuracy", verbose=1)

In [39]:
grid.fit(X,y)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [40]:
grid.best_score_,grid.best_params_

(0.7706215825795681,
 {'min_samples_split': 7,
  'min_samples_leaf': 10,
  'max_leaf_nodes': 11,
  'max_depth': 17})

In [41]:
best_params=grid.best_params_
grid.best_estimator_

# Cross Validation with submission sample csv


In [42]:
submission = pd.read_csv('./Spaceship Titanic Kaggle/sample_submission.csv')

In [43]:
submission

Unnamed: 0,PassengerId,Transported
0,0013_01,False
1,0018_01,False
2,0019_01,False
3,0021_01,False
4,0023_01,False
...,...,...
4272,9266_02,False
4273,9269_01,False
4274,9271_01,False
4275,9273_01,False


In [47]:
submission['total'] = 0


In [48]:
kfold=KFold(n_splits=5, shuffle=False)

In [49]:
scores = []

for (tr_idx, val_idx) in kfold.split(X):
    
    #training and validation subsets
    X_tr, y_tr, X_val, y_val = X.iloc[tr_idx], y.iloc[tr_idx], X.iloc[val_idx], y.iloc[val_idx]
    
    #training with optimized hyperparameters
    model = DecisionTreeClassifier(criterion="entropy", **best_params)
    model.fit(X_tr, y_tr)
    
    #predictions
    pred = model.predict(X_test)

    #Collect prediction of each fold
    submission['total'] += pred

In [50]:
submission["total"].value_counts()

total
5.0    2371
0.0    1562
2.0     153
4.0     100
3.0      52
1.0      39
Name: count, dtype: int64

In [51]:
submission['Transported'] = submission['total'].apply(lambda x:1 if x>2.5 else 0)

In [52]:
submission[(submission['total']!=5) & (submission['total']!=0)]

Unnamed: 0,PassengerId,Transported,total
5,0027_01,1,4.0
10,0037_01,1,3.0
18,0047_02,0,2.0
24,0054_03,0,2.0
98,0226_01,0,2.0
...,...,...,...
4209,9138_01,0,1.0
4217,9153_01,1,4.0
4219,9155_01,0,1.0
4227,9171_01,0,2.0


In [53]:
submission['Transported']=submission['Transported'].astype(bool)

In [80]:
submission

Unnamed: 0,PassengerId,Transported,total
0,0013_01,True,5.0
1,0018_01,False,0.0
2,0019_01,True,5.0
3,0021_01,True,5.0
4,0023_01,True,5.0
...,...,...,...
4272,9266_02,True,5.0
4273,9269_01,True,5.0
4274,9271_01,True,5.0
4275,9273_01,False,0.0


In [81]:
# Ensure X_submission has the same columns as the training dataframe
X_submission = submission.reindex(columns=X.columns, fill_value=0)

y_pred = model.predict(X_submission.values)

y_true = submission['Transported']
accuracy = accuracy_score(y_true, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.5898994622398878


