## Libraries

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# modeling library
import sklearn.linear_model                          # linear modeling in scikit-learn

# other model building tools
from sklearn.model_selection import train_test_split # train-test split
from sklearn.metrics import roc_auc_score            # auc score

In [3]:
# Core

import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
import itertools
import warnings
warnings.filterwarnings('ignore')
import plotly.express as px
import time
import squarify  
import catboost as cb
from imblearn.over_sampling import SMOTE


# Sklearn
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score, f1_score
from sklearn.metrics import roc_auc_score, plot_confusion_matrix, plot_roc_curve, roc_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.feature_selection import mutual_info_classif
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.utils import resample
from sklearn.metrics import fbeta_score, make_scorer

# Models
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor
from catboost import CatBoostClassifier


## Reading

In [4]:
# reading in the .csv file with pandas
titanic_train    = pd.read_csv('./train.csv')
# importing the testing dataset
titanic_test = pd.read_csv('./test.csv')

In [5]:
titanic_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [6]:
print('Sum of nulls:')
titanic_train.isnull().sum()

Sum of nulls:


PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

## Merge

In [7]:
# Merge Dataframe
titanic_train['set'] = 'Training'
titanic_test ['set'] = 'Testing'

# concatenating both datasets together for mv and feature engineering
titanic_df = titanic_train.append(other = titanic_test)

# resetting index to avoid problems later in the code
titanic_df.reset_index(drop = False,
                       inplace = True)

### Clean and Fill Nulls

In [8]:
# Create mean imputer
mean_imputer = SimpleImputer(strategy='constant')

titanic_df[['RoomService']] = mean_imputer.fit_transform(titanic_df[['RoomService']])
titanic_df[['FoodCourt']] = mean_imputer.fit_transform(titanic_df[['FoodCourt']])
titanic_df[['ShoppingMall']] = mean_imputer.fit_transform(titanic_df[['ShoppingMall']])
titanic_df[['Spa']] = mean_imputer.fit_transform(titanic_df[['Spa']])
titanic_df[['VRDeck']] = mean_imputer.fit_transform(titanic_df[['VRDeck']])

# Create mean imputer
mean_imputer = SimpleImputer(strategy='mean')

titanic_df[['Age']] = mean_imputer.fit_transform(titanic_df[['Age']])

# Creating total ammenities expenditure
titanic_df['TotalExpenditure']= titanic_df['RoomService']+titanic_df['Spa']+titanic_df['VRDeck']+titanic_df['FoodCourt']+titanic_df['ShoppingMall']

# Make and fill out new Cryosleep
titanic_df['Cryosleep'] = 0

titanic_df.loc[titanic_df['TotalExpenditure'] == 0, 'Cryosleep'] = 1
titanic_df.loc[titanic_df.CryoSleep.astype('str') == 'True', 'Cryosleep'] = 1
titanic_df.loc[titanic_df.CryoSleep.astype('str') == 'False', 'Cryosleep'] = 0

titanic_df['Cryosleep'] = titanic_df['Cryosleep'].astype('bool')
titanic_df['CryoSleep'] = titanic_df['Cryosleep']
titanic_df.drop('Cryosleep',axis=1,inplace=True)

In [9]:
#Drop Name
titanic_df.drop('Name',axis=1,inplace=True)

In [10]:
# Fill Nulls in Amenities based on CryoSleep
titanic_df.loc[titanic_df.CryoSleep == True,['RoomService', 'FoodCourt','ShoppingMall', 'Spa', 'VRDeck']] = 0

In [11]:
# Create Adults and Spending Columns
titanic_df['Adults'] = titanic_df['Age'] >= 13
titanic_df['Adult_and_spending'] = (titanic_df['TotalExpenditure'] > 0) & (titanic_df['Age'] >=13)

In [12]:
# Create mean imputer
mean_imputer = SimpleImputer(strategy='constant')

titanic_df[['RoomService']] = mean_imputer.fit_transform(titanic_df[['RoomService']])
titanic_df[['FoodCourt']] = mean_imputer.fit_transform(titanic_df[['FoodCourt']])
titanic_df[['ShoppingMall']] = mean_imputer.fit_transform(titanic_df[['ShoppingMall']])
titanic_df[['Spa']] = mean_imputer.fit_transform(titanic_df[['Spa']])
titanic_df[['VRDeck']] = mean_imputer.fit_transform(titanic_df[['VRDeck']])

In [13]:
# Fill nulls Cabin
titanic_df['Cabin'] = titanic_df.Cabin.fillna(method='ffill')

In [14]:
# Create mean imputer
mode_imputer = SimpleImputer(strategy='most_frequent')

# imputation
titanic_df[['HomePlanet']] = mode_imputer.fit_transform(titanic_df[['HomePlanet']])
titanic_df[['Destination']] = mode_imputer.fit_transform(titanic_df[['Destination']])
titanic_df[['VIP']] = mode_imputer.fit_transform(titanic_df[['VIP']])
titanic_df.VIP = titanic_df.VIP.astype('bool')

In [15]:
titanic_df['Group_nums'] = titanic_df.PassengerId.apply(lambda x: x.split('_')).apply(lambda x: x[0])
titanic_df['Grouped'] = ((titanic_df['Group_nums'].value_counts() > 1).reindex(titanic_df['Group_nums'])).tolist()
titanic_df['Deck'] = titanic_df.Cabin.apply(lambda x: str(x).split('/')).apply(lambda x: x[0])
titanic_df['Side'] = titanic_df.Cabin.apply(lambda x: str(x).split('/')).apply(lambda x: x[2])
titanic_df['Has_expenses'] = titanic_df['TotalExpenditure'] > 0
titanic_df['Is_Embryo'] = titanic_df['Age'] == 0

In [16]:
titanic_df['Transported'] = titanic_df['Transported'].astype('bool')

In [17]:
print('Sum of nulls:')
titanic_df.isnull().sum()

Sum of nulls:


index                 0
PassengerId           0
HomePlanet            0
CryoSleep             0
Cabin                 0
Destination           0
Age                   0
VIP                   0
RoomService           0
FoodCourt             0
ShoppingMall          0
Spa                   0
VRDeck                0
Transported           0
set                   0
TotalExpenditure      0
Adults                0
Adult_and_spending    0
Group_nums            0
Grouped               0
Deck                  0
Side                  0
Has_expenses          0
Is_Embryo             0
dtype: int64

In [18]:


#Creating variables for food based costs
titanic_df['FoodExpenditure']= titanic_df['RoomService']+titanic_df['FoodCourt']+titanic_df['ShoppingMall']


# log of each expenditure variabl
titanic_df['log_TotalExpenditure'] = np.log(1+titanic_df['TotalExpenditure'])
titanic_df['log_RoomService'] = np.log(1 + titanic_df['RoomService'])
titanic_df['log_Spa'] = np.log(1+titanic_df['Spa'])
titanic_df['log_VRDeck'] = np.log(1+titanic_df['VRDeck'])
titanic_df['log_FoodCourt'] = np.log(1+titanic_df['FoodCourt'])
titanic_df['log_ShoppingMall'] = np.log(1+titanic_df['ShoppingMall'])
                                              
                                              

#Creating bins for Ages
# New features - training set
titanic_df['Age_group']=np.nan
titanic_df.loc[titanic_df['Age']<=9,'Age_group']=1
titanic_df.loc[(titanic_df['Age']>9) & (titanic_df['Age']<=17),'Age_group']=2
titanic_df.loc[(titanic_df['Age']>17) & (titanic_df['Age']<=23),'Age_group']=3
titanic_df.loc[(titanic_df['Age']>23) & (titanic_df['Age']<=30),'Age_group']=4
titanic_df.loc[(titanic_df['Age']>30) & (titanic_df['Age']<=40),'Age_group']=5
titanic_df.loc[(titanic_df['Age']>40) & (titanic_df['Age']<=50),'Age_group']=6
titanic_df.loc[titanic_df['Age']>50,'Age_group']=7


#Creating bins for Log Amentinies
# New features - training set
titanic_df['log_TotalExpenditure_null']=np.nan
titanic_df.loc[titanic_df['log_TotalExpenditure']<=3,'log_TotalExpenditure_null']=1
titanic_df.loc[(titanic_df['log_TotalExpenditure']>3) & (titanic_df['log_TotalExpenditure']<=6),'log_TotalExpenditure_null']=2
titanic_df.loc[(titanic_df['log_TotalExpenditure']>6) & (titanic_df['log_TotalExpenditure']<=8),'log_TotalExpenditure_null']=3
titanic_df.loc[titanic_df['log_TotalExpenditure']>8,'log_TotalExpenditure_null']=4




#FE with room service and food court
titanic_df['RoomService_to_FoodCourt'] = titanic_df['RoomService']/titanic_df['FoodCourt']

#FE with CryoSleep and total expenditure
titanic_df['cryosleep_totalexpenditure'] = titanic_df['CryoSleep']*titanic_df['TotalExpenditure']

#FE with CryoSleep and Room Service
titanic_df['cryosleep_roomservice'] = titanic_df['CryoSleep']*titanic_df['RoomService']


#FE with CryoSleep and Room Service
titanic_df['cryosleep_roomservice'] = titanic_df['CryoSleep']*titanic_df['RoomService']

#FE with Total Expenditure and Shopping Mall
titanic_df['ShoppingMall_TotalExpenditure'] = titanic_df['ShoppingMall']*titanic_df['TotalExpenditure']

#FE with Log Total Expenditure and FoodCourt
titanic_df['LogTotalExpenditure_FoodCourt'] = titanic_df['FoodCourt']*titanic_df['log_TotalExpenditure']

In [19]:
titanic_df['Group_nums'] = titanic_df['Group_nums'].astype('float')
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12970 entries, 0 to 12969
Data columns (total 38 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   index                          12970 non-null  int64  
 1   PassengerId                    12970 non-null  object 
 2   HomePlanet                     12970 non-null  object 
 3   CryoSleep                      12970 non-null  bool   
 4   Cabin                          12970 non-null  object 
 5   Destination                    12970 non-null  object 
 6   Age                            12970 non-null  float64
 7   VIP                            12970 non-null  bool   
 8   RoomService                    12970 non-null  float64
 9   FoodCourt                      12970 non-null  float64
 10  ShoppingMall                   12970 non-null  float64
 11  Spa                            12970 non-null  float64
 12  VRDeck                         12970 non-null 

## Split DataSets

In [20]:
# Split the DataFrame into two based on the 'fruit' column
titanic_train = titanic_df[titanic_df['set'] == 'Training']
titanic_test = titanic_df[titanic_df['set'] == 'Testing']

In [21]:
titanic_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8693 entries, 0 to 8692
Data columns (total 38 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   index                          8693 non-null   int64  
 1   PassengerId                    8693 non-null   object 
 2   HomePlanet                     8693 non-null   object 
 3   CryoSleep                      8693 non-null   bool   
 4   Cabin                          8693 non-null   object 
 5   Destination                    8693 non-null   object 
 6   Age                            8693 non-null   float64
 7   VIP                            8693 non-null   bool   
 8   RoomService                    8693 non-null   float64
 9   FoodCourt                      8693 non-null   float64
 10  ShoppingMall                   8693 non-null   float64
 11  Spa                            8693 non-null   float64
 12  VRDeck                         8693 non-null   f

In [22]:
# instantiating a correlation matrix
titanic_corr = titanic_train.corr(method = 'pearson').round(decimals = 2)

# transforming correlations to absolute values
titanic_corr.loc[ : , 'Transported' ].apply(func = abs).sort_values(ascending = False)

Transported                      1.00
Has_expenses                     0.48
Adult_and_spending               0.48
log_TotalExpenditure             0.47
CryoSleep                        0.47
log_TotalExpenditure_null        0.45
log_Spa                          0.36
log_RoomService                  0.36
log_VRDeck                       0.34
RoomService                      0.24
Spa                              0.22
VRDeck                           0.20
TotalExpenditure                 0.20
log_ShoppingMall                 0.18
log_FoodCourt                    0.14
Adults                           0.13
Grouped                          0.11
Is_Embryo                        0.09
Age_group                        0.08
RoomService_to_FoodCourt         0.07
Age                              0.07
FoodCourt                        0.05
LogTotalExpenditure_FoodCourt    0.04
FoodExpenditure                  0.04
VIP                              0.04
Group_nums                       0.02
index       

In [23]:
titanic_df['Group_nums'] = titanic_df['Group_nums'].astype('float')
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12970 entries, 0 to 12969
Data columns (total 38 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   index                          12970 non-null  int64  
 1   PassengerId                    12970 non-null  object 
 2   HomePlanet                     12970 non-null  object 
 3   CryoSleep                      12970 non-null  bool   
 4   Cabin                          12970 non-null  object 
 5   Destination                    12970 non-null  object 
 6   Age                            12970 non-null  float64
 7   VIP                            12970 non-null  bool   
 8   RoomService                    12970 non-null  float64
 9   FoodCourt                      12970 non-null  float64
 10  ShoppingMall                   12970 non-null  float64
 11  Spa                            12970 non-null  float64
 12  VRDeck                         12970 non-null 

In [24]:
titanic_df['Group_nums']

0           1.0
1           2.0
2           3.0
3           3.0
4           4.0
          ...  
12965    9266.0
12966    9269.0
12967    9271.0
12968    9273.0
12969    9277.0
Name: Group_nums, Length: 12970, dtype: float64

## Gradient Boosting

In [25]:
features = ['HomePlanet',
            'CryoSleep',
#            'Cabin',
            'Destination',
            'Age',
            'VIP',
            'RoomService',
            'FoodCourt',
            'ShoppingMall',
            'Spa',
            'VRDeck',
            'TotalExpenditure',
            'Adults',
            'Adult_and_spending',
            'Group_nums',
            'Grouped',
            'Deck',
            'Side',
            'Has_expenses',
            'Is_Embryo',
            'FoodExpenditure',
            'log_TotalExpenditure',
            'log_RoomService',
            'log_Spa',
            'log_VRDeck',
            'log_FoodCourt',
            'log_ShoppingMall',
            'Age_group',
            'log_TotalExpenditure_null',
            'cryosleep_totalexpenditure',
            'cryosleep_roomservice',
            'ShoppingMall_TotalExpenditure',
            'LogTotalExpenditure_FoodCourt'
           ]           

In [26]:
titanic_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8693 entries, 0 to 8692
Data columns (total 38 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   index                          8693 non-null   int64  
 1   PassengerId                    8693 non-null   object 
 2   HomePlanet                     8693 non-null   object 
 3   CryoSleep                      8693 non-null   bool   
 4   Cabin                          8693 non-null   object 
 5   Destination                    8693 non-null   object 
 6   Age                            8693 non-null   float64
 7   VIP                            8693 non-null   bool   
 8   RoomService                    8693 non-null   float64
 9   FoodCourt                      8693 non-null   float64
 10  ShoppingMall                   8693 non-null   float64
 11  Spa                            8693 non-null   float64
 12  VRDeck                         8693 non-null   f

In [27]:
pd.set_option("display.max_columns", 50)

In [28]:
titanic_train

Unnamed: 0,index,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,set,TotalExpenditure,Adults,Adult_and_spending,Group_nums,Grouped,Deck,Side,Has_expenses,Is_Embryo,FoodExpenditure,log_TotalExpenditure,log_RoomService,log_Spa,log_VRDeck,log_FoodCourt,log_ShoppingMall,Age_group,log_TotalExpenditure_null,RoomService_to_FoodCourt,cryosleep_totalexpenditure,cryosleep_roomservice,ShoppingMall_TotalExpenditure,LogTotalExpenditure_FoodCourt
0,0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,Training,0.0,True,False,1.0,False,B,P,False,False,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,5.0,1.0,,0.0,0.0,0.0,0.000000
1,1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,Training,736.0,True,True,2.0,False,F,S,True,False,143.0,6.602588,4.700480,6.309918,3.806662,2.302585,3.258097,4.0,3.0,12.111111,0.0,0.0,18400.0,59.423291
2,2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,Training,10383.0,True,True,3.0,True,A,S,True,False,3619.0,9.248021,3.784190,8.812248,3.912023,8.182280,0.000000,7.0,4.0,0.012025,0.0,0.0,0.0,33070.924666
3,3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,Training,5176.0,True,True,3.0,True,A,S,True,False,1654.0,8.551981,0.000000,8.110728,5.267858,7.157735,5.918894,5.0,4.0,0.000000,0.0,0.0,1920296.0,10972.191645
4,4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,Training,1091.0,True,True,4.0,False,F,S,True,False,524.0,6.995766,5.717028,6.338594,1.098612,4.262680,5.023881,2.0,3.0,4.328571,0.0,0.0,164741.0,489.703631
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,False,Training,8536.0,True,True,9276.0,False,A,P,True,False,6819.0,9.052165,0.000000,7.404888,4.317488,8.827615,0.000000,6.0,4.0,0.000000,0.0,0.0,0.0,61726.712705
8689,8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,False,Training,0.0,True,False,9278.0,False,G,S,False,False,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,3.0,1.0,,0.0,0.0,0.0,0.000000
8690,8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,True,Training,1873.0,True,True,9279.0,False,G,S,True,False,1872.0,7.535830,0.000000,0.693147,0.000000,0.000000,7.535297,4.0,3.0,,0.0,0.0,3506256.0,0.000000
8691,8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,False,Training,4637.0,True,True,9280.0,True,E,S,True,False,1049.0,8.442039,0.000000,5.869297,8.082093,6.956545,0.000000,5.0,4.0,0.000000,0.0,0.0,0.0,8855.698405


In [29]:
X = pd.get_dummies(titanic_train[features])
y = titanic_train['Transported']

X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=1)

In [30]:
X

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,TotalExpenditure,Adults,Adult_and_spending,Group_nums,Grouped,Has_expenses,Is_Embryo,FoodExpenditure,log_TotalExpenditure,log_RoomService,log_Spa,log_VRDeck,log_FoodCourt,log_ShoppingMall,Age_group,log_TotalExpenditure_null,cryosleep_totalexpenditure,cryosleep_roomservice,ShoppingMall_TotalExpenditure,LogTotalExpenditure_FoodCourt,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Side_P,Side_S
0,False,39.0,False,0.0,0.0,0.0,0.0,0.0,0.0,True,False,1.0,False,False,False,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,5.0,1.0,0.0,0.0,0.0,0.000000,0,1,0,0,0,1,0,1,0,0,0,0,0,0,1,0
1,False,24.0,False,109.0,9.0,25.0,549.0,44.0,736.0,True,True,2.0,False,True,False,143.0,6.602588,4.700480,6.309918,3.806662,2.302585,3.258097,4.0,3.0,0.0,0.0,18400.0,59.423291,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1
2,False,58.0,True,43.0,3576.0,0.0,6715.0,49.0,10383.0,True,True,3.0,True,True,False,3619.0,9.248021,3.784190,8.812248,3.912023,8.182280,0.000000,7.0,4.0,0.0,0.0,0.0,33070.924666,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,1
3,False,33.0,False,0.0,1283.0,371.0,3329.0,193.0,5176.0,True,True,3.0,True,True,False,1654.0,8.551981,0.000000,8.110728,5.267858,7.157735,5.918894,5.0,4.0,0.0,0.0,1920296.0,10972.191645,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,1
4,False,16.0,False,303.0,70.0,151.0,565.0,2.0,1091.0,True,True,4.0,False,True,False,524.0,6.995766,5.717028,6.338594,1.098612,4.262680,5.023881,2.0,3.0,0.0,0.0,164741.0,489.703631,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,False,41.0,True,0.0,6819.0,0.0,1643.0,74.0,8536.0,True,True,9276.0,False,True,False,6819.0,9.052165,0.000000,7.404888,4.317488,8.827615,0.000000,6.0,4.0,0.0,0.0,0.0,61726.712705,0,1,0,1,0,0,1,0,0,0,0,0,0,0,1,0
8689,True,18.0,False,0.0,0.0,0.0,0.0,0.0,0.0,True,False,9278.0,False,False,False,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,3.0,1.0,0.0,0.0,0.0,0.000000,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1
8690,False,26.0,False,0.0,0.0,1872.0,1.0,0.0,1873.0,True,True,9279.0,False,True,False,1872.0,7.535830,0.000000,0.693147,0.000000,0.000000,7.535297,4.0,3.0,0.0,0.0,3506256.0,0.000000,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1
8691,False,32.0,False,0.0,1049.0,0.0,353.0,3235.0,4637.0,True,True,9280.0,True,True,False,1049.0,8.442039,0.000000,5.869297,8.082093,6.956545,0.000000,5.0,4.0,0.0,0.0,0.0,8855.698405,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,1


In [31]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV

gbc = GradientBoostingClassifier()

parameters = {
    "n_estimators":[5,50,100],
    "max_depth":[1,3,5],
    "learning_rate":[0.01,0.1,1]
}

cv = RandomizedSearchCV(gbc, parameters, n_iter=25, scoring='accuracy', n_jobs=-1, cv=5, random_state=1)
cv.fit(X, y)
cv.best_params_



{'n_estimators': 5, 'max_depth': 5, 'learning_rate': 0.01}

In [32]:
gbc1 = GradientBoostingClassifier(n_estimators=100,
                                  min_samples_split=10,
                                  min_samples_leaf=4,
                                  max_features='sqrt', 
                                  max_depth= 7, 
                                  learning_rate=0.1) #best params from gscv

model_fit = gbc1.fit(X,y)

# PREDICTING on the response variable
model_train_pred = model_fit.predict(X_train)
model_valid_pred = model_fit.predict(X_test)

# SCORING the results (accuracy)
model_train_score = gbc1.score(X_train, y_train).round(4) # training accuracy
model_valid_score = gbc1.score(X_test, y_test).round(4) # validation accuracy

# SCORING the results (auc)
model_train_auc = roc_auc_score(y_true  = y_train,
                                y_score = model_train_pred).round(decimals = 4)

model_valid_auc = roc_auc_score(y_true  = y_test,
                                y_score = model_valid_pred).round(decimals = 4)

# displaying results
print('Training Accuracy:  ', model_train_score)
print('Validation Accuracy:', model_valid_score)
print('Training AUC:       ', model_train_auc)
print('Validation AUC:     ', model_valid_auc)


Training Accuracy:   0.8985
Validation Accuracy: 0.9011
Training AUC:        0.8984
Validation AUC:      0.9005


In [33]:
pred_y_gbr2 = gbc1.predict(pd.get_dummies((titanic_test[features])))

In [34]:
gbc_out = pd.DataFrame({'PassengerId':titanic_test.PassengerId, 'Transported':pred_y_gbr2})
gbc_out.to_csv('SubmissionV1.csv',index=False)