## Problem: https://www.kaggle.com/competitions/spaceship-titanic

## Solution by Rohan Jha

In [61]:
import pandas as pd
import numpy as np
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns
import optuna

In [156]:
#For Training
df = pd.read_csv("space-titanic.csv")  

In [157]:
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [158]:
df["PassengerId"] = df["PassengerId"].str.split('_' , expand=True)[1]
df["PassengerId"] = df["PassengerId"].astype(int)
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,1,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,1,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,1,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,2,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,1,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [159]:
df.isna().sum()

PassengerId        0
HomePlanet       288
CryoSleep        310
Cabin            299
Destination      274
Age              270
VIP              296
RoomService      263
FoodCourt        289
ShoppingMall     306
Spa              284
VRDeck           268
Name             294
Transported     4277
dtype: int64

In [160]:
df.dtypes

PassengerId       int32
HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
Transported      object
dtype: object

In [161]:
df.dtypes.value_counts()

object     7
float64    6
int32      1
dtype: int64

In [162]:
df["PassengerId"].value_counts()

1    9280
2    2135
3     840
4     338
5     184
6     108
7      66
8      19
Name: PassengerId, dtype: int64

# 1. Preprocessing

In [163]:
df.dropna(subset=["Transported"] , inplace=True) #drop empty values for target

In [164]:
df["Cabin"] = df["Cabin"].astype(str) #has deck/num/side
cab = df["Cabin"].str.split('/' , expand=True)
df["Deck"] = cab[0]
df["Num"] = cab[1]
df["Side"] = cab[2]

In [165]:
df['HomePlanet'].fillna("Earth" , inplace=True)
df['CryoSleep'].fillna("False" , inplace=True)
df['Destination'].fillna("TRAPPIST-1e" , inplace=True)
df['Age'].fillna(df["Age"].mean() , inplace=True)
df['VIP'].fillna("False" , inplace=True) 
df['RoomService'].fillna(df["RoomService"].mean() , inplace=True)
df['FoodCourt'].fillna(df["FoodCourt"].mean() , inplace=True)
df['ShoppingMall'].fillna(df["ShoppingMall"].mean() , inplace=True)
df['Spa'].fillna(df["Spa"].mean() , inplace=True)
df['VRDeck'].fillna(df["VRDeck"].mean() , inplace=True)

df['Num'].fillna('0', inplace=True)
df["Nums"] = df["Num"].astype(int)
df['Num'].replace(0,df["Num"].mean() , inplace=True)

df['Side'].fillna("missing" , inplace=True)


In [167]:
#Dealing catagoricals for Transported
Transported = {True: int(1) , False: int(0) , 'False':int(0), "False" : 0 }
df["Transported"] = df["Transported"].apply(lambda x: Transported.get(x))
df["CryoSleep"] = df["CryoSleep"].apply(lambda x: Transported.get(x)) 
df["VIP"] = df["VIP"].apply(lambda x: Transported.get(x)) 
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Deck,Num,Side,Nums
0,1,Europa,0,B/0/P,TRAPPIST-1e,39.0,0,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,0,B,0,P,0
1,1,Earth,0,F/0/S,TRAPPIST-1e,24.0,0,109.0,9.0,25.0,549.0,44.0,Juanna Vines,1,F,0,S,0
2,1,Europa,0,A/0/S,TRAPPIST-1e,58.0,1,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,0,A,0,S,0
3,2,Europa,0,A/0/S,TRAPPIST-1e,33.0,0,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,0,A,0,S,0
4,1,Earth,0,F/1/S,TRAPPIST-1e,16.0,0,303.0,70.0,151.0,565.0,2.0,Willy Santantines,1,F,1,S,1


In [168]:
# Dropping columns that are not required
df.drop(columns=["Cabin" , "Name"] , axis=1 , inplace=True) 
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Deck,Num,Side,Nums
0,1,Europa,0,TRAPPIST-1e,39.0,0,0.0,0.0,0.0,0.0,0.0,0,B,0,P,0
1,1,Earth,0,TRAPPIST-1e,24.0,0,109.0,9.0,25.0,549.0,44.0,1,F,0,S,0
2,1,Europa,0,TRAPPIST-1e,58.0,1,43.0,3576.0,0.0,6715.0,49.0,0,A,0,S,0
3,2,Europa,0,TRAPPIST-1e,33.0,0,0.0,1283.0,371.0,3329.0,193.0,0,A,0,S,0
4,1,Earth,0,TRAPPIST-1e,16.0,0,303.0,70.0,151.0,565.0,2.0,1,F,1,S,1


### Separating target column

In [169]:
X = df.drop(columns = ["Transported" ] , axis=1)
y = df["Transported"]

In [170]:
X.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Deck,Num,Side,Nums
0,1,Europa,0,TRAPPIST-1e,39.0,0,0.0,0.0,0.0,0.0,0.0,B,0,P,0
1,1,Earth,0,TRAPPIST-1e,24.0,0,109.0,9.0,25.0,549.0,44.0,F,0,S,0
2,1,Europa,0,TRAPPIST-1e,58.0,1,43.0,3576.0,0.0,6715.0,49.0,A,0,S,0
3,2,Europa,0,TRAPPIST-1e,33.0,0,0.0,1283.0,371.0,3329.0,193.0,A,0,S,0
4,1,Earth,0,TRAPPIST-1e,16.0,0,303.0,70.0,151.0,565.0,2.0,F,1,S,1


### Using OneHotEncoding for remaining catagorical data

In [171]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

one_hot = OneHotEncoder()
catagories = ["HomePlanet" , "Destination" , "Deck" , "Side" , "PassengerId"]
transform = ColumnTransformer([("one_hot" , one_hot , catagories)] , remainder="passthrough")
X_transform = transform.fit_transform(X)

In [172]:
len(X_transform)

8693

## ML model train and test

In [173]:
from sklearn.model_selection import train_test_split , cross_val_score

X_train , X_test , y_train , y_test = train_test_split(X_transform,y, test_size=0.2)


In [174]:
np.random.seed(40)

### Try randomforest

In [175]:
from sklearn.ensemble import RandomForestClassifier

random_clf = RandomForestClassifier(n_estimators=120)
random_clf.fit(X_train , y_train)
random_clf.score(X_test , y_test)

0.8062104657849338

### Try SVM

In [176]:
from sklearn import svm
svm_clf = svm.SVC()
svm_clf.fit(X_train , y_train)
svm_clf.score(X_test , y_test)

0.7740080506037953

### Try GradientBoostingClassifier

In [177]:
from sklearn.ensemble import GradientBoostingClassifier
gb_clf = GradientBoostingClassifier(n_estimators=120, learning_rate=1, max_depth=1, random_state=0).fit(X_train, y_train)
gb_clf.score(X_test , y_test)

0.7975848188614146

### Using Cat Boost

In [178]:
import catboost as ctb

In [198]:
best_params = {'learning_rate': 0.018049356549743555,
 'depth': 6,
 'l2_leaf_reg': 7.838880563296214,
 'border_count': 180,
 'verbose' : False} #got by using GridSearchCV

model_CBC = ctb.CatBoostClassifier(**best_params)
model_CBC.fit(X_train, y_train)
model_CBC.score(X_test , y_test)

0.8004600345025877

### XGBoost

In [180]:
import lightgbm as lgb
from xgboost import XGBClassifier

clf_xgb = XGBClassifier('binary:logistic',
    colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=10, 
                             min_child_weight=1.7817, n_estimators=1500,
                             reg_alpha=4.5, reg_lambda=8.5,
                             subsample=0.5213,
                             random_state=42)
x = X_train
y = y_train


clf_lgb = lgb.LGBMClassifier()

cv_xgb = cross_val_score(clf_xgb, x, y, cv=5, scoring='accuracy').mean()
cv_lgbm = cross_val_score(clf_lgb, x, y, cv=5, scoring='accuracy').mean()


clf_xgb.fit(x, y,
        
        verbose=False)

clf_lgb.fit(x, y,
        
        verbose=False)


print((clf_xgb.score(X_test , y_test) , clf_lgb.score(X_test , y_test)))



(0.8062104657849338, 0.8125359401955147)




### For iterating and finding best value (Tuning hyper params) No need to run this

In [199]:
best_sc = 0
for i in range(10,200 ,10):
#     clf = RandomForestClassifier(n_estimators=i)
    best_params = {'learning_rate': 0.018049356549743555,
                   'depth': 6,
                   'l2_leaf_reg': 7.838880563296214,
                   'border_count': 180,
                   'verbose' : False}
    
    model_CBC = ctb.CatBoostClassifier(**best_params)
    model_CBC.fit(X_train, y_train)
    sc = model_CBC.score(X_test , y_test)
    if sc > best_sc:
        best_sc = sc
        print(f"score = {sc} for n_estimators = {i}")
        print('*' * 10)

score = 0.8004600345025877 for n_estimators = 10
**********
score = 0.81196089706728 for n_estimators = 20
**********
score = 0.8142610695802185 for n_estimators = 30
**********
score = 0.816561242093157 for n_estimators = 70
**********
score = 0.8177113283496262 for n_estimators = 80
**********


## Predict

In [181]:
#For testing and generating Results
df = pd.read_csv("test.csv")
df_passenger = df["PassengerId"]

In [182]:
df["Cabin"] = df["Cabin"].astype(str) #has deck/num/side
cab = df["Cabin"].str.split('/' , expand=True)
df["Deck"] = cab[0]
df["Num"] = cab[1]
df["Side"] = cab[2]

df['HomePlanet'].fillna("Earth" , inplace=True)
df['CryoSleep'].fillna("False" , inplace=True)
df['Destination'].fillna("TRAPPIST-1e" , inplace=True)
df['Age'].fillna(df["Age"].mean() , inplace=True)
df['VIP'].fillna("False" , inplace=True) 
df['RoomService'].fillna(df["RoomService"].mean() , inplace=True)
df['FoodCourt'].fillna(df["FoodCourt"].mean() , inplace=True)
df['ShoppingMall'].fillna(df["ShoppingMall"].mean() , inplace=True)
df['Spa'].fillna(df["Spa"].mean() , inplace=True)
df['VRDeck'].fillna(df["VRDeck"].mean() , inplace=True)
df['Num'].fillna('0', inplace=True)
df["Nums"] = df["Num"].astype(int)
df['Num'].replace(0,df["Num"].mean() , inplace=True)
df['Side'].fillna("missing" , inplace=True)



Transported = {True: int(1) , False: int(0) , 'False':int(0), "False" : 0 }
df["CryoSleep"] = df["CryoSleep"].apply(lambda x: Transported.get(x)) 
df["VIP"] = df["VIP"].apply(lambda x: Transported.get(x)) 

df["PassengerId"] = df["PassengerId"].str.split('_' , expand=True)[1]
df["PassengerId"] = df["PassengerId"].astype(int)

df.drop(columns=["Cabin" , "Name" ] , axis=1 , inplace=True) 



In [183]:
one_hot = OneHotEncoder()
catagories = ["HomePlanet" , "Destination" , "Deck" , "Side" , "PassengerId"]
transform = ColumnTransformer([("one_hot" , one_hot , catagories)] , remainder="passthrough")
X_transform_test = transform.fit_transform(df)

In [184]:
op = model_CBC.predict(X_transform_test)

In [185]:
new_df = pd.DataFrame({"PassengerId": df_passenger,  "Transported": op})

In [186]:
Reverted = {1: 'True' , 0: 'False' }

In [187]:
new_df["Transported"] = new_df["Transported"].apply(lambda x: Reverted.get(x))


In [188]:
new_df.head()

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True


In [189]:
new_df.to_excel("predicted.xlsx" , index=False )