In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import KNNImputer

In [2]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [3]:
df_train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [4]:
df_test['Transported'] = False

In [5]:
df = pd.concat([df_train,df_test],sort= False)

In [6]:
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [7]:
df[['Deck','Num','Side']] = df['Cabin'].str.split('/', expand=True)

In [8]:
df.drop('Cabin',axis=1,inplace=True)

In [10]:
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Deck,Num,Side
0,0001_01,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,B,0,P
1,0002_01,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,F,0,S
2,0003_01,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,A,0,S
3,0003_02,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,A,0,S
4,0004_01,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,F,1,S


In [11]:
df['Deck'] = df['Deck'].fillna('U')

In [12]:
df['Side'] = df['Side'].fillna('U')

In [15]:
df['Deck'] = df['Deck'].map(deck_mapping)

In [16]:
df['Side'] = df['Side'].map(side_mapping)

In [17]:
df.drop('Name',axis=1, inplace=True)

In [18]:
df['Side'].value_counts()

Side
1    6381
2    6290
3     299
Name: count, dtype: int64

In [19]:
impute_list = ['CryoSleep','Age','VIP','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck','Transported','Num']
rest = ['HomePlanet','Destination','PassengerId','Deck','Side']

In [20]:
df_rest = df[rest]
df_rest.head()

Unnamed: 0,HomePlanet,Destination,PassengerId,Deck,Side
0,Europa,TRAPPIST-1e,0001_01,2,2
1,Earth,TRAPPIST-1e,0002_01,6,1
2,Europa,TRAPPIST-1e,0003_01,1,1
3,Europa,TRAPPIST-1e,0003_02,1,1
4,Earth,TRAPPIST-1e,0004_01,6,1


In [21]:
imp = KNNImputer()

In [22]:
df_imputed = imp.fit_transform(df[impute_list])

In [23]:
df_imputed = pd.DataFrame(df_imputed,columns=impute_list)

In [24]:
df_imputed.head()

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Num
0,0.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,24.0,0.0,109.0,9.0,25.0,549.0,44.0,1.0,0.0
2,0.0,58.0,1.0,43.0,3576.0,0.0,6715.0,49.0,0.0,0.0
3,0.0,33.0,0.0,0.0,1283.0,371.0,3329.0,193.0,0.0,0.0
4,0.0,16.0,0.0,303.0,70.0,151.0,565.0,2.0,1.0,1.0


In [25]:
new_df = pd.concat([df_imputed.reset_index(drop=True),df_rest.reset_index(drop=True)],axis=1)

In [26]:
new_df

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Num,HomePlanet,Destination,PassengerId,Deck,Side
0,0.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Europa,TRAPPIST-1e,0001_01,2,2
1,0.0,24.0,0.0,109.0,9.0,25.0,549.0,44.0,1.0,0.0,Earth,TRAPPIST-1e,0002_01,6,1
2,0.0,58.0,1.0,43.0,3576.0,0.0,6715.0,49.0,0.0,0.0,Europa,TRAPPIST-1e,0003_01,1,1
3,0.0,33.0,0.0,0.0,1283.0,371.0,3329.0,193.0,0.0,0.0,Europa,TRAPPIST-1e,0003_02,1,1
4,0.0,16.0,0.0,303.0,70.0,151.0,565.0,2.0,1.0,1.0,Earth,TRAPPIST-1e,0004_01,6,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12965,1.0,34.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1496.0,Earth,TRAPPIST-1e,9266_02,7,1
12966,0.0,42.0,0.0,0.0,847.0,17.0,10.0,144.0,0.0,429.2,Earth,TRAPPIST-1e,9269_01,9,3
12967,1.0,22.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,296.0,Mars,55 Cancri e,9271_01,4,2
12968,0.0,30.2,0.0,0.0,2680.0,0.0,0.0,523.0,0.0,297.0,Europa,,9273_01,4,2


In [13]:
deck_mapping = {
    'A' : 1,
    'B' : 2,
    'C' : 3,
    'D' : 4,
    'E' : 5,
    'F' : 6,
    'G' : 7,
    'T' : 8,
    'U' : 9
}

In [14]:
side_mapping = {
    'S' : 1,
    'P' : 2,
    'U' : 3
}

In [28]:
new_df.head()

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Num,HomePlanet,Destination,PassengerId,Deck,Side
0,0.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Europa,TRAPPIST-1e,0001_01,2,2
1,0.0,24.0,0.0,109.0,9.0,25.0,549.0,44.0,1.0,0.0,Earth,TRAPPIST-1e,0002_01,6,1
2,0.0,58.0,1.0,43.0,3576.0,0.0,6715.0,49.0,0.0,0.0,Europa,TRAPPIST-1e,0003_01,1,1
3,0.0,33.0,0.0,0.0,1283.0,371.0,3329.0,193.0,0.0,0.0,Europa,TRAPPIST-1e,0003_02,1,1
4,0.0,16.0,0.0,303.0,70.0,151.0,565.0,2.0,1.0,1.0,Earth,TRAPPIST-1e,0004_01,6,1


In [31]:
new_df.isna().sum()

CryoSleep       0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Transported     0
Num             0
HomePlanet      0
Destination     0
PassengerId     0
Deck            0
Side            0
dtype: int64

In [30]:
new_df['HomePlanet'] = new_df['HomePlanet'].fillna('U')
new_df['Destination'] = new_df['Destination'].fillna('U')

In [46]:
#feature engineering
bill_cols = ['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']

In [32]:
new_df['HomePlanet'].value_counts()

HomePlanet
Earth     6865
Europa    3133
Mars      2684
U          288
Name: count, dtype: int64

In [37]:
cols = ['HomePlanet','Destination']

In [41]:
for col in cols:
    new_df = pd.concat([new_df,pd.get_dummies(new_df[col],prefix=col)],axis=1)

In [42]:
new_df

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Num,...,Deck,Side,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,HomePlanet_U,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Destination_U
0,0.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2,2,False,True,False,False,False,False,True,False
1,0.0,24.0,0.0,109.0,9.0,25.0,549.0,44.0,1.0,0.0,...,6,1,True,False,False,False,False,False,True,False
2,0.0,58.0,1.0,43.0,3576.0,0.0,6715.0,49.0,0.0,0.0,...,1,1,False,True,False,False,False,False,True,False
3,0.0,33.0,0.0,0.0,1283.0,371.0,3329.0,193.0,0.0,0.0,...,1,1,False,True,False,False,False,False,True,False
4,0.0,16.0,0.0,303.0,70.0,151.0,565.0,2.0,1.0,1.0,...,6,1,True,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12965,1.0,34.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1496.0,...,7,1,True,False,False,False,False,False,True,False
12966,0.0,42.0,0.0,0.0,847.0,17.0,10.0,144.0,0.0,429.2,...,9,3,True,False,False,False,False,False,True,False
12967,1.0,22.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,296.0,...,4,2,False,False,True,False,True,False,False,False
12968,0.0,30.2,0.0,0.0,2680.0,0.0,0.0,523.0,0.0,297.0,...,4,2,False,True,False,False,False,False,False,True


In [43]:
new_df.drop(['HomePlanet','Destination'],axis=1,inplace=True)

In [50]:
new_df.head()

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Num,...,HomePlanet_Europa,HomePlanet_Mars,HomePlanet_U,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Destination_U,TotalBill,std_TotalBill,mean_TotalBill
0,0.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,True,False,False,False,False,True,False,0.0,0.0,0.0
1,0.0,24.0,0.0,109.0,9.0,25.0,549.0,44.0,1.0,0.0,...,False,False,False,False,False,True,False,736.0,227.807375,147.2
2,0.0,58.0,1.0,43.0,3576.0,0.0,6715.0,49.0,0.0,0.0,...,True,False,False,False,False,True,False,10383.0,3013.383198,2076.6
3,0.0,33.0,0.0,0.0,1283.0,371.0,3329.0,193.0,0.0,0.0,...,True,False,False,False,False,True,False,5176.0,1373.410427,1035.2
4,0.0,16.0,0.0,303.0,70.0,151.0,565.0,2.0,1.0,1.0,...,False,False,False,False,False,True,False,1091.0,223.988169,218.2


In [47]:
bill_cols

['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

In [49]:
new_df['TotalBill'] = new_df[bill_cols].sum(axis=1)
new_df['std_TotalBill'] = new_df[bill_cols].std(axis=1)
new_df['mean_TotalBill'] = new_df[bill_cols].mean(axis=1)

In [53]:
new_df.drop(bill_cols,axis=1,inplace=True)

In [54]:
new_df.head()

Unnamed: 0,CryoSleep,Age,VIP,Transported,Num,PassengerId,Deck,Side,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,HomePlanet_U,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Destination_U,TotalBill,std_TotalBill,mean_TotalBill
0,0.0,39.0,0.0,0.0,0.0,0001_01,2,2,False,True,False,False,False,False,True,False,0.0,0.0,0.0
1,0.0,24.0,0.0,1.0,0.0,0002_01,6,1,True,False,False,False,False,False,True,False,736.0,227.807375,147.2
2,0.0,58.0,1.0,0.0,0.0,0003_01,1,1,False,True,False,False,False,False,True,False,10383.0,3013.383198,2076.6
3,0.0,33.0,0.0,0.0,0.0,0003_02,1,1,False,True,False,False,False,False,True,False,5176.0,1373.410427,1035.2
4,0.0,16.0,0.0,1.0,1.0,0004_01,6,1,True,False,False,False,False,False,True,False,1091.0,223.988169,218.2


In [55]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12970 entries, 0 to 12969
Data columns (total 19 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   CryoSleep                  12970 non-null  float64
 1   Age                        12970 non-null  float64
 2   VIP                        12970 non-null  float64
 3   Transported                12970 non-null  float64
 4   Num                        12970 non-null  float64
 5   PassengerId                12970 non-null  object 
 6   Deck                       12970 non-null  int64  
 7   Side                       12970 non-null  int64  
 8   HomePlanet_Earth           12970 non-null  bool   
 9   HomePlanet_Europa          12970 non-null  bool   
 10  HomePlanet_Mars            12970 non-null  bool   
 11  HomePlanet_U               12970 non-null  bool   
 12  Destination_55 Cancri e    12970 non-null  bool   
 13  Destination_PSO J318.5-22  12970 non-null  boo

In [61]:
new_df.corr()['Transported'].sort_values(ascending=False)

Transported                  1.000000
CryoSleep                    0.325139
3_high_cols                  0.284643
HomePlanet_Europa            0.131977
Destination_55 Cancri e      0.083625
PassengerId                  0.014628
HomePlanet_U                 0.006403
HomePlanet_Mars              0.005643
Destination_PSO J318.5-22    0.000760
Destination_U               -0.000554
VIP                         -0.018721
Num                         -0.037529
Age                         -0.048634
Side                        -0.068138
Destination_TRAPPIST-1e     -0.072731
Deck                        -0.084981
HomePlanet_Earth            -0.119644
std_TotalBill               -0.121670
3_low_cols                  -0.137593
mean_TotalBill              -0.140844
TotalBill                   -0.140844
Name: Transported, dtype: float64

In [59]:
new_df['3_high_cols'] = new_df['CryoSleep'] + new_df['HomePlanet_Europa'] + new_df['Destination_55 Cancri e']

In [60]:
new_df['3_low_cols'] = new_df['TotalBill'] + new_df['mean_TotalBill'] + new_df['std_TotalBill']

## MODEL BUILDING

In [63]:
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier


In [65]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split