In [31]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as mp
import seaborn as sb
from sklearn.preprocessing import RobustScaler   
from sklearn.model_selection import train_test_split
import itertools as it
import xgboost as xgb
from datetime import datetime
import time
from xgboost import XGBClassifier
from sklearn import metrics

def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

In [32]:
test_data = pd.read_csv("test.csv")
train_data = pd.read_csv("train.csv")

In [33]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   4277 non-null   object 
 1   HomePlanet    4190 non-null   object 
 2   CryoSleep     4184 non-null   object 
 3   Cabin         4177 non-null   object 
 4   Destination   4185 non-null   object 
 5   Age           4186 non-null   float64
 6   VIP           4184 non-null   object 
 7   RoomService   4195 non-null   float64
 8   FoodCourt     4171 non-null   float64
 9   ShoppingMall  4179 non-null   float64
 10  Spa           4176 non-null   float64
 11  VRDeck        4197 non-null   float64
 12  Name          4183 non-null   object 
dtypes: float64(6), object(7)
memory usage: 434.5+ KB


In [34]:
train_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [35]:
for x in train_data.columns:
    print ("----------%s" % x)
    print (train_data[x].value_counts())

----------PassengerId
0001_01    1
6136_01    1
6141_01    1
6139_06    1
6139_05    1
          ..
3126_01    1
3124_03    1
3124_02    1
3124_01    1
9280_02    1
Name: PassengerId, Length: 8693, dtype: int64
----------HomePlanet
Earth     4602
Europa    2131
Mars      1759
Name: HomePlanet, dtype: int64
----------CryoSleep
False    5439
True     3037
Name: CryoSleep, dtype: int64
----------Cabin
G/734/S     8
G/109/P     7
B/201/P     7
G/1368/P    7
G/981/S     7
           ..
G/556/P     1
E/231/S     1
G/545/S     1
G/543/S     1
F/947/P     1
Name: Cabin, Length: 6560, dtype: int64
----------Destination
TRAPPIST-1e      5915
55 Cancri e      1800
PSO J318.5-22     796
Name: Destination, dtype: int64
----------Age
24.0    324
18.0    320
21.0    311
19.0    293
23.0    292
       ... 
72.0      4
78.0      3
79.0      3
76.0      2
77.0      2
Name: Age, Length: 80, dtype: int64
----------VIP
False    8291
True      199
Name: VIP, dtype: int64
----------RoomService
0.0       5577

In [36]:
all_data = [train_data, test_data]

for data in all_data:
    homeplanet_null_sum = pd.isnull(data['HomePlanet']).sum()
    cryosleep_null_sum = pd.isnull(data['CryoSleep']).sum()
    destination_null_sum = pd.isnull(data['Destination']).sum()
    vip_null_sum = pd.isnull(data['VIP']).sum()
    
    data['HomePlanet'] = data['HomePlanet'].fillna(value='Earth', limit=round(homeplanet_null_sum *
                                                                              round(data.HomePlanet.value_counts().Earth/(data.HomePlanet.value_counts().Earth +
                                                                                                                          data.HomePlanet.value_counts().Europa +
                                                                                                                          data.HomePlanet.value_counts().Mars), 2)))
    data['HomePlanet'] = data['HomePlanet'].fillna(value='Europa', limit=round(homeplanet_null_sum *
                                                                               round(data.HomePlanet.value_counts().Europa/(data.HomePlanet.value_counts().Earth +
                                                                                                                            data.HomePlanet.value_counts().Europa +
                                                                                                                            data.HomePlanet.value_counts().Mars), 2)))
    data['HomePlanet'] = data['HomePlanet'].fillna(value='Mars', limit=round(homeplanet_null_sum *
                                                                             round(data.HomePlanet.value_counts().Mars/(data.HomePlanet.value_counts().Earth +
                                                                                                                        data.HomePlanet.value_counts().Europa +
                                                                                                                        data.HomePlanet.value_counts().Mars), 2)))

    data['CryoSleep'] = data['CryoSleep'].fillna(value=False, limit=round(cryosleep_null_sum *
                                                                            round((data.CryoSleep == False).sum() /
                                                                                  ((data.CryoSleep == False).sum() +
                                                                                  (data.CryoSleep == True).sum()), 2)))
    data['CryoSleep'] = data['CryoSleep'].fillna(value=True, limit=round(cryosleep_null_sum *
                                                                           round((data.CryoSleep == True).sum() /
                                                                                 ((data.CryoSleep == False).sum() +
                                                                                 (data.CryoSleep == True).sum()), 2)))

    data['Destination'] = data['Destination'].fillna(value='TRAPPIST-1e', limit=round(destination_null_sum
                                                                                      * round((data.Destination == 'TRAPPIST-1e').sum()/((data.Destination == 'TRAPPIST-1e').sum() +
                                                                                                                                         (data.Destination == '55 Cancri e').sum() +
                                                                                                                                         (data.Destination == 'PSO J318.5-22').sum()), 2)))
    data['Destination'] = data['Destination'].fillna(value='55 Cancri e', limit=round(destination_null_sum
                                                                                      * round((data.Destination == '55 Cancri e').sum()/((data.Destination == 'TRAPPIST-1e').sum() +
                                                                                                                                         (data.Destination == '55 Cancri e').sum() +
                                                                                                                                         (data.Destination == 'PSO J318.5-22').sum()), 2)))
    data['Destination'] = data['Destination'].fillna(value='PSO J318.5-22', limit=round(destination_null_sum
                                                                                        * round((data.Destination == 'PSO J318.5-22').sum()/((data.Destination == 'TRAPPIST-1e').sum() +
                                                                                                                                             (data.Destination == '55 Cancri e').sum() +
                                                                                                                                             (data.Destination == 'PSO J318.5-22').sum()), 2)))

    data['VIP'] = data['VIP'].fillna(value=False, limit=round(vip_null_sum *
                                                                round((data.VIP == False).sum() /
                                                                      ((data.VIP == False).sum() +
                                                                      (data.VIP == True).sum()), 2)))
    data['VIP'] = data['VIP'].fillna(value=True, limit=round(vip_null_sum *
                                                               round((data.VIP == True).sum() /
                                                                     ((data.VIP == False).sum()+
                                                                      (data.VIP == True).sum()), 2)))
    data['Age'] = data['Age'].fillna(data['Age'].median())
    data['RoomService'] = data['RoomService'].fillna(
        data['RoomService'].median())
    data['FoodCourt'] = data['FoodCourt'].fillna(data['FoodCourt'].median())
    data['ShoppingMall'] = data['ShoppingMall'].fillna(
        data['ShoppingMall'].median())
    data['Spa'] = data['Spa'].fillna(data['Spa'].median())
    data['VRDeck'] = data['VRDeck'].fillna(data['VRDeck'].median())


In [37]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8691 non-null   object 
 2   CryoSleep     8691 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8691 non-null   object 
 5   Age           8693 non-null   float64
 6   VIP           8693 non-null   bool   
 7   RoomService   8693 non-null   float64
 8   FoodCourt     8693 non-null   float64
 9   ShoppingMall  8693 non-null   float64
 10  Spa           8693 non-null   float64
 11  VRDeck        8693 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(2), float64(6), object(6)
memory usage: 832.1+ KB


In [38]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   4277 non-null   object 
 1   HomePlanet    4277 non-null   object 
 2   CryoSleep     4276 non-null   object 
 3   Cabin         4177 non-null   object 
 4   Destination   4276 non-null   object 
 5   Age           4277 non-null   float64
 6   VIP           4277 non-null   bool   
 7   RoomService   4277 non-null   float64
 8   FoodCourt     4277 non-null   float64
 9   ShoppingMall  4277 non-null   float64
 10  Spa           4277 non-null   float64
 11  VRDeck        4277 non-null   float64
 12  Name          4183 non-null   object 
dtypes: bool(1), float64(6), object(6)
memory usage: 405.3+ KB


In [39]:
for data in all_data:
    data['HomePlanet'] = data['HomePlanet'].fillna('Earth')
    data['Destination'] = data['Destination'].fillna('TRAPPIST-1e')
    data['CryoSleep'] = data['CryoSleep'].fillna(False)

In [40]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8693 non-null   object 
 2   CryoSleep     8693 non-null   bool   
 3   Cabin         8494 non-null   object 
 4   Destination   8693 non-null   object 
 5   Age           8693 non-null   float64
 6   VIP           8693 non-null   bool   
 7   RoomService   8693 non-null   float64
 8   FoodCourt     8693 non-null   float64
 9   ShoppingMall  8693 non-null   float64
 10  Spa           8693 non-null   float64
 11  VRDeck        8693 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(3), float64(6), object(5)
memory usage: 772.6+ KB


In [41]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   4277 non-null   object 
 1   HomePlanet    4277 non-null   object 
 2   CryoSleep     4277 non-null   bool   
 3   Cabin         4177 non-null   object 
 4   Destination   4277 non-null   object 
 5   Age           4277 non-null   float64
 6   VIP           4277 non-null   bool   
 7   RoomService   4277 non-null   float64
 8   FoodCourt     4277 non-null   float64
 9   ShoppingMall  4277 non-null   float64
 10  Spa           4277 non-null   float64
 11  VRDeck        4277 non-null   float64
 12  Name          4183 non-null   object 
dtypes: bool(2), float64(6), object(5)
memory usage: 376.0+ KB


In [42]:
deck = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "U": 8}

for data in all_data:
    data['Cabin'] = data['Cabin'].fillna("U0")
    data['Deck'] = data['Cabin'].map(lambda x: re.compile("([a-zA-Z]+)").search(x).group())
    data['Deck'] = data['Deck'].map(deck)
    data['Deck'] = data['Deck'].fillna(0)
    data['Deck'] = data['Deck'].astype(int)
    
    '''data['Total_Spend'] = (  data['RoomService'] 
                        + data['FoodCourt'] 
                        + data['ShoppingMall']
                        + data['Spa'] 
                        + data['VRDeck']  )
    
    data['Any_Spend'] = np.where(data['Total_Spend'] > 0, 1, 0)
    
    data['Spend_Category'] = ''
    data.loc[data['Total_Spend'].between(0, 1, 'left'), 'Spend_Category'] = 0
    data.loc[data['Total_Spend'].between(1, 800, 'both'), 'Spend_Category'] = 1
    data.loc[data['Total_Spend'].between(800, 1200, 'right'), 'Spend_Category'] = 2
    data.loc[data['Total_Spend'].between(1200, 2700, 'right'), 'Spend_Category'] = 3
    data.loc[data['Total_Spend'].between(2700, 100000, 'right'), 'Spend_Category'] = 4
    
    data['Age_Category'] = ''
    data.loc[data['Age'].between(0, 18, 'both'), 'Age_Category'] = 0
    data.loc[data['Age'].between(18, 40, 'right'), 'Age_Category'] = 1
    data.loc[data['Age'].between(40, 60, 'right'), 'Age_Category'] = 2
    data.loc[data['Age'].between(60, 100, 'right'), 'Age_Category'] = 3'''


In [43]:
data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Deck
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning,7
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers,6
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus,3
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter,3
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez,6


In [44]:
train_data.drop(columns=['Cabin', 'Name'], inplace=True)
test_data.drop(columns=['Cabin', 'Name'], inplace=True)

In [45]:
train_data['Transported'] = train_data['Transported'].map({False: 0, True: 1}).astype(int)

In [46]:
for data in all_data:
    data['HomePlanet'] = data['HomePlanet'].map({'Earth': 0, 'Europa': 1, 'Mars': 2}).astype(int)
    data['CryoSleep'] = data['CryoSleep'].map({True: 1, False: 0}).astype(int)
    data['Destination'] = data['Destination'].map({'TRAPPIST-1e': 0, '55 Cancri e': 1, 'PSO J318.5-22': 2}).astype(int)
    data['VIP'] = data['VIP'].map({True: 1, False: 0}).astype(int)

In [47]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8693 non-null   int32  
 2   CryoSleep     8693 non-null   int32  
 3   Destination   8693 non-null   int32  
 4   Age           8693 non-null   float64
 5   VIP           8693 non-null   int32  
 6   RoomService   8693 non-null   float64
 7   FoodCourt     8693 non-null   float64
 8   ShoppingMall  8693 non-null   float64
 9   Spa           8693 non-null   float64
 10  VRDeck        8693 non-null   float64
 11  Transported   8693 non-null   int32  
 12  Deck          8693 non-null   int32  
dtypes: float64(6), int32(6), object(1)
memory usage: 679.3+ KB


In [48]:
test_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Deck
0,0013_01,0,1,0,27.0,0,0.0,0.0,0.0,0.0,0.0,7
1,0018_01,0,0,0,19.0,0,0.0,9.0,0.0,2823.0,0.0,6
2,0019_01,1,1,1,31.0,0,0.0,0.0,0.0,0.0,0.0,3
3,0021_01,1,0,0,38.0,0,0.0,6652.0,0.0,181.0,585.0,3
4,0023_01,0,0,0,20.0,0,10.0,0.0,635.0,0.0,0.0,6


In [49]:
train_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Deck
0,0001_01,1,0,0,39.0,0,0.0,0.0,0.0,0.0,0.0,0,2
1,0002_01,0,0,0,24.0,0,109.0,9.0,25.0,549.0,44.0,1,6
2,0003_01,1,0,0,58.0,1,43.0,3576.0,0.0,6715.0,49.0,0,1
3,0003_02,1,0,0,33.0,0,0.0,1283.0,371.0,3329.0,193.0,0,1
4,0004_01,0,0,0,16.0,0,303.0,70.0,151.0,565.0,2.0,1,6


In [50]:
X_train = train_data.drop(columns=['Transported', 'PassengerId']).copy()
X_test = test_data.drop(columns=['PassengerId']).copy()

y_train = train_data['Transported'].copy()

In [51]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    4277 non-null   int32  
 1   CryoSleep     4277 non-null   int32  
 2   Destination   4277 non-null   int32  
 3   Age           4277 non-null   float64
 4   VIP           4277 non-null   int32  
 5   RoomService   4277 non-null   float64
 6   FoodCourt     4277 non-null   float64
 7   ShoppingMall  4277 non-null   float64
 8   Spa           4277 non-null   float64
 9   VRDeck        4277 non-null   float64
 10  Deck          4277 non-null   int32  
dtypes: float64(6), int32(5)
memory usage: 284.1 KB


In [52]:
st_x= RobustScaler()    
X_train= st_x.fit_transform(X_train)    
X_test= st_x.transform(X_test)

In [53]:
params = {
    "learning_rate": [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    "max_depth": [3, 4, 5, 6, 8, 10, 12, 15],
    "min_child_weight": [1, 3, 5, 7],
    "gamma": [0.0, 0.1, 0.2, 0.3, 0.4],
    "colsample_bytree": [0.3, 0.4, 0.5, 0.7]
}

In [24]:
allNames = params
combinations = it.product(*(params[Name] for Name in allNames))
comb_params = list(combinations)

In [25]:
scores = []
start_time = timer(None) # timing starts from this point for "start_time" variable
for x in comb_params:
    XGB = XGBClassifier(learning_rate=x[0], max_depth=x[1],  min_child_weight=x[2], gamma=x[3], colsample_bytree=x[4])
    XGB.fit(X_train, y_train)
    scores.append(XGB.score(X_train, y_train))
timer(start_time) 


 Time taken: 0 hours 28 minutes and 57.85 seconds.


In [26]:
best_params = comb_params[scores.index(max(scores))]

In [27]:
best_params

(0.3, 15, 1, 0.0, 0.7)

In [54]:
#XGB = XGBClassifier(learning_rate=best_params[0], max_depth=best_params[1],  min_child_weight=best_params[2], gamma=best_params[3], colsample_bytree=best_params[4])
XGB = XGBClassifier(learning_rate=0.3, max_depth=15,  min_child_weight=1, gamma=0, colsample_bytree=0.7)
XGB.fit(X_train, y_train)
y_pred_test = XGB.predict(X_test)
XGB.score(X_train, y_train)

0.9318992292649259

In [55]:
PRED = []
for x in y_pred_test:
    if x == 0:
        PRED.append("False")
    else:
        PRED.append("True")

In [56]:
Submission = pd.DataFrame({ 'PassengerId': test_data['PassengerId'],
                            'Transported': PRED })
Submission.to_csv("Submission.csv", index=False)