# Kaggle Titanic Machine Learning
- source of competition: https://www.kaggle.com/c/titanic
- Data Dictionary: https://www.kaggle.com/c/titanic/data
- useful link for saving to GitHub: https://www.kaggle.com/questions-and-answers/72234

In [1]:
# Importing libraries
%matplotlib inline
import numpy as np 
import pandas as pd 
import pandas_profiling

# Setting Random Seed For Reproducibility
import random
random.seed(123)

# Displaying Max rows
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 100)

# Listing Files
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [2]:
df_train = pd.read_csv('/kaggle/input/titanic/train.csv')
df_test = pd.read_csv('/kaggle/input/titanic/test.csv') # for final evaluation/submission only

In [3]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


# Data Wrangling/Cleaning

In [5]:
# Creating train/val/test split prior to transformations (avoid data leakage)

X = df_train.drop(['Survived'],axis = 1)
y = df_train.Survived

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.15, random_state = 3) # test set 15% train
X_train, X_val, y_train, y_val   = train_test_split(X_train, y_train, test_size = 0.15, random_state = 3) #validation set 15% train

In [6]:
for i in [X_train,X_test,X_val]:
    print(i.shape)

(643, 11)
(134, 11)
(114, 11)


# Exploratory Data Analysis

General thoughts based on the profile below
- PassengerId - removing due to ID variable
- Missing values: Age, Cabin, Fare, Embarked
- Correlations in Fare-Class-Age

In [7]:
#combining the train feature/target data for EDA/Data Wrangling

df_train_split = pd.concat([X_train, y_train], axis = 1)
df_train_split.head(2)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
387,388,2,"Buss, Miss. Kate",female,36.0,0,0,27849,13.0,,S,1
531,532,3,"Toufik, Mr. Nakli",male,,0,0,2641,7.2292,,C,0


In [8]:
#making use of the profile package for EDA plots/stats/...

profile = pandas_profiling.ProfileReport(df_train_split, title = "EDA Profile Train Data Report")

HBox(children=(FloatProgress(value=0.0, description='variables', max=13.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='correlations', max=6.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='interactions [continuous]', max=36.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=0.0, description='table', max=1.0, style=ProgressStyle(description_width='i…




HBox(children=(FloatProgress(value=0.0, description='missing', max=4.0, style=ProgressStyle(description_width=…









HBox(children=(FloatProgress(value=0.0, description='package', max=1.0, style=ProgressStyle(description_width=…




HBox(children=(FloatProgress(value=0.0, description='build report structure', max=1.0, style=ProgressStyle(des…




In [9]:
profile.to_widgets()

Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(value='Number of va…

# Data Wrangling and Feature Engineering
- only on training dataset, will use a pipeline for val/test and final submission test set

In [10]:
# Missing Values Handling
print(df_train_split.Embarked.value_counts())

#Embarked only 1 missing, fill with most common of S, C, Q (will be S)
df_train_split.Embarked = df_train_split.Embarked.fillna(df_train_split.Embarked.value_counts().index[0]) #using value_counts top record

S    475
C    118
Q     49
Name: Embarked, dtype: int64


In [11]:
# dropping passenger id (is an id)
df_train_split.drop(['PassengerId'], axis = 1, inplace = True)

In [12]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(categories='auto')
feature_array = ohe.fit_transform(df_train_split[['Parch','Pclass','Sex','SibSp','Embarked']]).toarray()
#feature_labels = ohe.categories_

In [13]:
features = pd.DataFrame(feature_array, columns=ohe.get_feature_names())
print(features.shape)
features.head()

(643, 22)


Unnamed: 0,x0_0,x0_1,x0_2,x0_3,x0_4,x0_5,x0_6,x1_1,x1_2,x1_3,x2_female,x2_male,x3_0,x3_1,x3_2,x3_3,x3_4,x3_5,x3_8,x4_C,x4_Q,x4_S
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [14]:
df_train_split = df_train_split.drop(['Parch','Pclass','Sex','SibSp','Embarked'], axis = 1)


In [15]:
print(df_train_split.shape)
df_train_split.head()

(643, 6)


Unnamed: 0,Name,Age,Ticket,Fare,Cabin,Survived
387,"Buss, Miss. Kate",36.0,27849,13.0,,1
531,"Toufik, Mr. Nakli",,2641,7.2292,,0
480,"Goodwin, Master. Harold Victor",9.0,CA 2144,46.9,,0
217,"Jacobsohn, Mr. Sidney Samuel",42.0,243847,27.0,,0
799,"Van Impe, Mrs. Jean Baptiste (Rosalie Paula Go...",30.0,345773,24.15,,0


In [16]:
df_train_split = pd.concat([df_train_split.reset_index(drop=True),features.reset_index(drop=True)], axis = 1)

In [17]:
df_train_split.shape
df_train_split.head()

Unnamed: 0,Name,Age,Ticket,Fare,Cabin,Survived,x0_0,x0_1,x0_2,x0_3,x0_4,x0_5,x0_6,x1_1,x1_2,x1_3,x2_female,x2_male,x3_0,x3_1,x3_2,x3_3,x3_4,x3_5,x3_8,x4_C,x4_Q,x4_S
0,"Buss, Miss. Kate",36.0,27849,13.0,,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,"Toufik, Mr. Nakli",,2641,7.2292,,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,"Goodwin, Master. Harold Victor",9.0,CA 2144,46.9,,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,"Jacobsohn, Mr. Sidney Samuel",42.0,243847,27.0,,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,"Van Impe, Mrs. Jean Baptiste (Rosalie Paula Go...",30.0,345773,24.15,,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [18]:
df_train_split.head()

Unnamed: 0,Name,Age,Ticket,Fare,Cabin,Survived,x0_0,x0_1,x0_2,x0_3,x0_4,x0_5,x0_6,x1_1,x1_2,x1_3,x2_female,x2_male,x3_0,x3_1,x3_2,x3_3,x3_4,x3_5,x3_8,x4_C,x4_Q,x4_S
0,"Buss, Miss. Kate",36.0,27849,13.0,,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,"Toufik, Mr. Nakli",,2641,7.2292,,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,"Goodwin, Master. Harold Victor",9.0,CA 2144,46.9,,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,"Jacobsohn, Mr. Sidney Samuel",42.0,243847,27.0,,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,"Van Impe, Mrs. Jean Baptiste (Rosalie Paula Go...",30.0,345773,24.15,,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [19]:
df_train_split.shape

(643, 28)

In [20]:
# Encoding the Age Missing Values that are 'S' with the training data median Age 
median_age_train = df_train_split.Age.median()
df_train_split['Age'] = df_train_split['Age'].apply(lambda x : median_age_train if pd.isnull(x) else x)
df_train_split.Age.isna().sum()

0

### Cabin Missing Values

In [21]:
#INPROGRESS #Missing Values Cabin - taking the initial value
df_train_split.Cabin = df_train_split[['Cabin']].fillna(value= 'Z')
df_train_split['Cabin_augment'] = df_train_split.Cabin.apply(lambda x : x[0]) # augmenting dataset, only want the first letter (numbers not matter)
df_train_split.Cabin_augment.value_counts()

Z    491
C     44
B     33
D     26
E     24
A     12
F     11
T      1
G      1
Name: Cabin_augment, dtype: int64

In [22]:
df_train_split[['Cabin_augment','Fare']].groupby(['Cabin_augment']).mean().round(2)

Unnamed: 0_level_0,Fare
Cabin_augment,Unnamed: 1_level_1
A,39.35
B,118.79
C,91.22
D,54.26
E,44.0
F,20.22
G,16.7
T,35.5
Z,18.96


In [23]:
from sklearn.preprocessing import OneHotEncoder

ohe_Cabin_augment = OneHotEncoder(categories='auto')
feature_array_Cabin_augment = ohe_Cabin_augment.fit_transform(df_train_split[['Cabin_augment']]).toarray()
#feature_labels = ohe.categories_
features_Cabin_augment = pd.DataFrame(feature_array_Cabin_augment, columns=ohe_Cabin_augment.get_feature_names())


In [24]:
df_train_split.drop(['Cabin','Cabin_augment'], axis =1, inplace = True)
df_train_split = pd.concat([df_train_split,features_Cabin_augment], axis = 1)

In [25]:
df_train_split.head()

Unnamed: 0,Name,Age,Ticket,Fare,Survived,x0_0,x0_1,x0_2,x0_3,x0_4,x0_5,x0_6,x1_1,x1_2,x1_3,x2_female,x2_male,x3_0,x3_1,x3_2,x3_3,x3_4,x3_5,x3_8,x4_C,x4_Q,x4_S,x0_A,x0_B,x0_C,x0_D,x0_E,x0_F,x0_G,x0_T,x0_Z
0,"Buss, Miss. Kate",36.0,27849,13.0,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,"Toufik, Mr. Nakli",29.0,2641,7.2292,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,"Goodwin, Master. Harold Victor",9.0,CA 2144,46.9,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,"Jacobsohn, Mr. Sidney Samuel",42.0,243847,27.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,"Van Impe, Mrs. Jean Baptiste (Rosalie Paula Go...",30.0,345773,24.15,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [26]:
df_train_split.drop(['Name','Ticket'], axis = 1, inplace = True)

In [27]:
df_train_split.head()

Unnamed: 0,Age,Fare,Survived,x0_0,x0_1,x0_2,x0_3,x0_4,x0_5,x0_6,x1_1,x1_2,x1_3,x2_female,x2_male,x3_0,x3_1,x3_2,x3_3,x3_4,x3_5,x3_8,x4_C,x4_Q,x4_S,x0_A,x0_B,x0_C,x0_D,x0_E,x0_F,x0_G,x0_T,x0_Z
0,36.0,13.0,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,29.0,7.2292,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,9.0,46.9,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,42.0,27.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,30.0,24.15,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [28]:
# checking that all missing values are taken care of
print(df_train_split.isna().sum().sum())
df_train_split.shape

0


(643, 34)

In [29]:
df_train_split_X = df_train_split.drop(['Survived'],axis = 1)
df_train_split_y = df_train_split[['Survived']]

In [30]:
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, ADASYN
import collections

print('Prior', collections.Counter(np.squeeze(df_train_split_y)))

Prior Counter({0: 393, 1: 250})


In [31]:
# Smote Operation

smote_instance = ADASYN(random_state=0)
X_train_resampled, y_train_resampled = smote_instance.fit_sample(df_train_split_X, df_train_split_y)

print('Post', collections.Counter(np.squeeze(y_train_resampled)))

Post Counter({1: 414, 0: 393})


In [32]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train_resampled = sc.fit_transform(X_train_resampled)
X_train_resampled.shape

(807, 33)

In [33]:
# Performing Save Data Wrangling Steps on the Val/Test Data

#combining the train feature/target data for EDA/Data Wrangling
df_val_split = pd.concat([X_val, y_val], axis = 1)
df_val_split.Embarked = df_val_split.Embarked.fillna('S')
df_val_split.drop(['PassengerId'], axis = 1, inplace = True)

feature_array = ohe.transform(df_val_split[['Parch','Pclass','Sex','SibSp','Embarked']]).toarray()
features = pd.DataFrame(feature_array, columns=ohe.get_feature_names())
df_val_split = df_val_split.drop(['Parch','Pclass','Sex','SibSp','Embarked'], axis = 1)
df_val_split = pd.concat([df_val_split.reset_index(drop=True),features.reset_index(drop=True)], axis = 1)

df_val_split['Age'] = df_val_split['Age'].apply(lambda x : median_age_train if pd.isnull(x) else x)

df_val_split.Cabin = df_val_split[['Cabin']].fillna(value= 'Z')
df_val_split['Cabin_augment'] = df_val_split.Cabin.apply(lambda x : x[0])

feature_array_Cabin_augment = ohe_Cabin_augment.transform(df_val_split[['Cabin_augment']]).toarray()
features_Cabin_augment = pd.DataFrame(feature_array_Cabin_augment, columns=ohe_Cabin_augment.get_feature_names())

df_val_split.drop(['Cabin','Cabin_augment'], axis =1, inplace = True)
df_val_split = pd.concat([df_val_split,features_Cabin_augment], axis = 1)

df_val_split.drop(['Name','Ticket'], axis = 1, inplace = True)

print(df_val_split.shape)
df_val_split.head()

(114, 34)


Unnamed: 0,Age,Fare,Survived,x0_0,x0_1,x0_2,x0_3,x0_4,x0_5,x0_6,x1_1,x1_2,x1_3,x2_female,x2_male,x3_0,x3_1,x3_2,x3_3,x3_4,x3_5,x3_8,x4_C,x4_Q,x4_S,x0_A,x0_B,x0_C,x0_D,x0_E,x0_F,x0_G,x0_T,x0_Z
0,14.0,46.9,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,36.0,10.5,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,9.0,31.3875,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,19.0,7.8792,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,45.0,8.05,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [34]:
df_val_split_X = df_val_split.drop(['Survived'],axis = 1)
df_val_split_y = df_val_split[['Survived']]

In [35]:
df_val_split_X = sc.transform (df_val_split_X)
df_val_split_X.shape

(114, 33)

# Model Developement 

In [36]:
# Baseline Model 
import xgboost as xgb
from sklearn.metrics import accuracy_score

eval_set = [(df_val_split_X,df_val_split_y.values.ravel())]

model_xgb = xgb.XGBClassifier(learning_rate = 0.01)
model_xgb.fit(X_train_resampled, y_train_resampled.values.ravel(), early_stopping_rounds=10, eval_metric="error", eval_set= eval_set,verbose = 0)

print("Training Accuracy:", accuracy_score(model_xgb.predict(X_train_resampled),y_train_resampled))
print("Validation Accuracy:", accuracy_score(model_xgb.predict(df_val_split_X),df_val_split_y))

Training Accuracy: 0.8897149938042132
Validation Accuracy: 0.7631578947368421


In [37]:
# Adding Parameter Tuning
from sklearn.model_selection import GridSearchCV

eval_set = [(df_val_split_X,df_val_split_y.values.ravel())]

param_grid = {
    "learning_rate": [0.1,0.05],
    'max_depth': [2,3,4,5,6],
    'min_child_weight': [1, 2,4,6,8,10],
    'subsample': [0.5, 0.7, 0.9],
    'n_estimators': [5, 30, 100, 250, 500],
}

grid_clf = GridSearchCV(xgb.XGBClassifier() , param_grid, scoring='accuracy', cv=None)
grid_clf.fit(X_train_resampled, y_train_resampled.values.ravel() , early_stopping_rounds=10 , eval_metric="error", eval_set= eval_set,verbose = False)

best_parameters = grid_clf.best_params_

print('Grid Search found the following optimal parameters: ')
for param_name in sorted(best_parameters.keys()):
    print('%s: %r' % (param_name, best_parameters[param_name]))
    
print("Training Accuracy:", accuracy_score(grid_clf.predict(X_train_resampled),y_train_resampled.values.ravel()))
print("Validation Accuracy:", accuracy_score(grid_clf.predict(df_val_split_X),df_val_split_y.values.ravel()))


Grid Search found the following optimal parameters: 
learning_rate: 0.1
max_depth: 6
min_child_weight: 2
n_estimators: 30
subsample: 0.9
Training Accuracy: 0.8971499380421314
Validation Accuracy: 0.7982456140350878


In [38]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()

gnb.fit(X_train_resampled, y_train_resampled.values.ravel())

print("Training Accuracy:", accuracy_score(gnb.predict(X_train_resampled),y_train_resampled))
print("Validation Accuracy:", accuracy_score(gnb.predict(df_val_split_X),df_val_split_y))

Training Accuracy: 0.5390334572490706
Validation Accuracy: 0.43859649122807015


In [39]:
from sklearn.linear_model import LogisticRegression

clf_log = LogisticRegression(random_state=0, max_iter = 1000).fit(X_train_resampled, y_train_resampled.values.ravel())

print("Training Accuracy:", accuracy_score(clf_log.predict(X_train_resampled),y_train_resampled))
print("Validation Accuracy:", accuracy_score(clf_log.predict(df_val_split_X),df_val_split_y))

Training Accuracy: 0.815365551425031
Validation Accuracy: 0.7543859649122807


# Model Stacking

In [100]:
from warnings import filterwarnings
filterwarnings('ignore')

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

from sklearn.pipeline import make_pipeline
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix

estimators = [
    ('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
    ('knn', KNeighborsClassifier()),
    ('cart',DecisionTreeClassifier()),
    ('svr', make_pipeline(LinearSVC(random_state=42))),
    ('svc', SVC(gamma='auto'))]

clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())

clf_stack = clf.fit(X_train_resampled, y_train_resampled.values.ravel())

print("Training Accuracy:", accuracy_score(clf_stack.predict(X_train_resampled),y_train_resampled.values.ravel()))
print("Validation Accuracy:", accuracy_score(clf_stack.predict(df_val_split_X),df_val_split_y.values.ravel()))

print("Confusion Matrix:\n",confusion_matrix(clf_stack.predict(df_val_split_X),df_val_split_y.values.ravel()))

Training Accuracy: 0.9628252788104089
Validation Accuracy: 0.7807017543859649
Confusion Matrix:
 [[56 11]
 [14 33]]


In [102]:
#gridsearch (next steps)

# estimators = [
#     ('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
#     ('knn', KNeighborsClassifier()),
#     ('cart',DecisionTreeClassifier()),
#     ('svr', make_pipeline(LinearSVC(random_state=42)))]

# sclf = StackingClassifier(estimators= estimators , final_estimator= LogisticRegression()) # =DecisionTreeClassifier())

# # params = {'rf__n_estimators': [5,10,20],
# #           'rf__max_features': [5,10,20],
# #           'rf__max_depth': [1,3,5,7],
# #           'rf__min_samples_leaf': [10,25,50],
# #           'knn__n_neighbors': [3,5,7],
# #           'knn__algorithm':['ball_tree','kd_tree']}

# params = {'rf__n_estimators': [5,10,20],
#           'rf__max_features': [5,10],
#           'rf__max_depth': [3,5,7],
#           'knn__n_neighbors': [3,5],
#           'knn__algorithm':['ball_tree','kd_tree']}

# grid = GridSearchCV(estimator=sclf, param_grid=params, cv=5)
# grid.fit(X_train_resampled, y_train_resampled.values.ravel())


# print("Training Accuracy:", accuracy_score(grid.predict(X_train_resampled),y_train_resampled.values.ravel()))
# print("Validation Accuracy:", accuracy_score(grid.predict(df_val_split_X),df_val_split_y.values.ravel()))

# print("Confusion Matrix:\n",confusion_matrix(grid.predict(df_val_split_X),df_val_split_y.values.ravel()))

# Predicting the test submission data

In [68]:
print(df_test.shape)
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
5,897,3,"Svensson, Mr. Johan Cervin",male,14.0,0,0,7538,9.225,,S
6,898,3,"Connolly, Miss. Kate",female,30.0,0,0,330972,7.6292,,Q
7,899,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,248738,29.0,,S
8,900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18.0,0,0,2657,7.2292,,C
9,901,3,"Davies, Mr. John Samuel",male,21.0,2,0,A/4 48871,24.15,,S


In [125]:
#transforming the test data like train (make a pipeline later...)

df_test = pd.read_csv('/kaggle/input/titanic/test.csv') # for final evaluation/submission only
df_test_ids = df_test[['PassengerId']] # for creating the csv

df_test.Embarked = df_test.Embarked.fillna('S')
df_test.drop(['PassengerId'], axis = 1, inplace = True)

df_test['Fare'].fillna((df_test['Fare'].mean()), inplace=True) # fare in test has 1 missing value, set to mean
df_test.loc[df_test['Parch'] == 9, 'Parch'] = 0 #9 is contained in Parch only in the test, so setting to most common, cause next ohe transfor to break

feature_array = ohe.transform(df_test[['Parch','Pclass','Sex','SibSp','Embarked']]).toarray()
features = pd.DataFrame(feature_array, columns=ohe.get_feature_names())
df_test = df_test.drop(['Parch','Pclass','Sex','SibSp','Embarked'], axis = 1)
df_test = pd.concat([df_test.reset_index(drop=True),features.reset_index(drop=True)], axis = 1)

df_test['Age'] = df_test['Age'].apply(lambda x : median_age_train if pd.isnull(x) else x)

df_test.Cabin = df_test[['Cabin']].fillna(value= 'Z')
df_test['Cabin_augment'] = df_test.Cabin.apply(lambda x : x[0])

feature_array_Cabin_augment = ohe_Cabin_augment.transform(df_test[['Cabin_augment']]).toarray()
features_Cabin_augment = pd.DataFrame(feature_array_Cabin_augment, columns=ohe_Cabin_augment.get_feature_names())

df_test.drop(['Cabin','Cabin_augment'], axis =1, inplace = True)
df_test = pd.concat([df_test,features_Cabin_augment], axis = 1)

df_test.drop(['Name','Ticket'], axis = 1, inplace = True)

df_test = sc.transform(df_test)

print(df_test.shape)

(418, 33)


In [127]:
# predicting using the clf_stack

predications = clf_stack.predict(df_test)

In [133]:
#Creating the sumission object CSV

df_submit = pd.DataFrame(data=np.column_stack((df_test_ids, predications)),columns=['PassengerId','Survived'])
df_submit.to_csv('rad_submission_1_20201209.csv',index=False)

# Potential Next Steps / Changes to Consider
- Potentially use K-Fold Cross validation due to small size 
- Feature engineering (Class x sex), (Class x Parch)
- Add more model types, more hyperparameters
- Add model stacking
- https://alexforrest.github.io/you-might-be-leaking-data-even-if-you-cross-validate.html
- https://machinelearningmastery.com/data-preparation-without-data-leakage/
- http://rasbt.github.io/mlxtend/user_guide/classifier/StackingClassifier/