# Kaggle Titanic Machine Learning
- source of competition: https://www.kaggle.com/c/titanic
- Data Dictionary: https://www.kaggle.com/c/titanic/data
- useful link for saving to GitHub: https://www.kaggle.com/questions-and-answers/72234

In [129]:
# Importing libraries
%matplotlib inline
import numpy as np 
import pandas as pd 
import pandas_profiling

# Setting Random Seed For Reproducibility
import random
random.seed(123)

# Displaying Max rows
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 100)

# Listing Files
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/gender_submission.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/train.csv


In [130]:
df_train = pd.read_csv('/kaggle/input/titanic/train.csv')
df_test = pd.read_csv('/kaggle/input/titanic/test.csv') # for final evaluation/submission only

In [131]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [132]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


# Data Wrangling/Cleaning

In [133]:
# Creating train/val/test split prior to transformations (avoid data leakage)

X = df_train.drop(['Survived'],axis = 1)
y = df_train.Survived

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.15, random_state = 3) # test set 15% train
X_train, X_val, y_train, y_val   = train_test_split(X_train, y_train, test_size = 0.15, random_state = 3) #validation set 15% train

In [134]:
for i in [X_train,X_test,X_val]:
    print(i.shape)

(643, 11)
(134, 11)
(114, 11)


# Exploratory Data Analysis

General thoughts based on the profile below
- PassengerId - removing due to ID variable
- Missing values: Age, Cabin, Fare, Embarked
- Correlations in Fare-Class-Age

In [135]:
#combining the train feature/target data for EDA/Data Wrangling

df_train_split = pd.concat([X_train, y_train], axis = 1)
df_train_split.head(2)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
387,388,2,"Buss, Miss. Kate",female,36.0,0,0,27849,13.0,,S,1
531,532,3,"Toufik, Mr. Nakli",male,,0,0,2641,7.2292,,C,0


In [108]:
#making use of the profile package for EDA plots/stats/...

profile = pandas_profiling.ProfileReport(df_train_split, title = "EDA Profile Train Data Report")

HBox(children=(FloatProgress(value=0.0, description='variables', max=13.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='correlations', max=6.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='interactions [continuous]', max=36.0, style=ProgressStyle…




HBox(children=(FloatProgress(value=0.0, description='table', max=1.0, style=ProgressStyle(description_width='i…




HBox(children=(FloatProgress(value=0.0, description='missing', max=4.0, style=ProgressStyle(description_width=…









HBox(children=(FloatProgress(value=0.0, description='package', max=1.0, style=ProgressStyle(description_width=…




HBox(children=(FloatProgress(value=0.0, description='build report structure', max=1.0, style=ProgressStyle(des…




In [109]:
profile.to_widgets()

Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(value='Number of va…

# Data Wrangling and Feature Engineering
- only on training dataset, will use a pipeline for val/test and final submission test set

In [136]:
# Missing Values Handling
print(df_train_split.Embarked.value_counts())

#Embarked only 1 missing, fill with most common of S, C, Q (will be S)
df_train_split.Embarked = df_train_split.Embarked.fillna(df_train_split.Embarked.value_counts().index[0]) #using value_counts top record

S    475
C    118
Q     49
Name: Embarked, dtype: int64


In [137]:
# dropping passenger id (is an id)
df_train_split.drop(['PassengerId'], axis = 1, inplace = True)

In [138]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(categories='auto')
feature_array = ohe.fit_transform(df_train_split[['Parch','Pclass','Sex','SibSp','Embarked']]).toarray()
#feature_labels = ohe.categories_

In [139]:
features = pd.DataFrame(feature_array, columns=ohe.get_feature_names())
features.shape

(643, 22)

In [140]:
df_train_split = df_train_split.drop(['Parch','Pclass','Sex','SibSp','Embarked'], axis = 1)


In [141]:
df_train_split = pd.concat([df_train_split,features], axis = 1)

In [143]:
df_train_split.head()

Unnamed: 0,Name,Age,Ticket,Fare,Cabin,Survived,x0_0,x0_1,x0_2,x0_3,x0_4,x0_5,x0_6,x1_1,x1_2,x1_3,x2_female,x2_male,x3_0,x3_1,x3_2,x3_3,x3_4,x3_5,x3_8,x4_C,x4_Q,x4_S
0,"Braund, Mr. Owen Harris",22.0,A/5 21171,7.25,,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,PC 17599,71.2833,C85,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,"Heikkinen, Miss. Laina",26.0,STON/O2. 3101282,7.925,,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,113803,53.1,C123,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,"Allen, Mr. William Henry",35.0,373450,8.05,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [144]:
# # Encoding categorical features (will be used to impute Age/Cabin missing values as possible)

# df_train_split = pd.get_dummies(df_train_split, columns=['Parch','Pclass','Sex','SibSp','Embarked'])
# df_train_split

In [145]:
# Encoding the Age Missing Values that are 'S' with the training data median Age 
median_age_train = df_train_split.Age.median()
df_train_split['Age'] = df_train_split['Age'].apply(lambda x : median_age_train if pd.isnull(x) else x)
df_train_split.Age.isna().sum()

0

### Cabin Missing Values

In [146]:
#INPROGRESS #Missing Values Cabin - taking the initial value
df_train_split.Cabin = df_train_split[['Cabin']].fillna(value= 'Z')
df_train_split['Cabin_augment'] = df_train_split.Cabin.apply(lambda x : x[0]) # augmenting dataset, only want the first letter (numbers not matter)
df_train_split.Cabin_augment.value_counts()

Z    681
C     44
B     33
D     26
E     24
A     12
F     11
G      1
T      1
Name: Cabin_augment, dtype: int64

In [147]:
df_train_split[['Cabin_augment','Fare']].groupby(['Cabin_augment']).mean().round(2)

Unnamed: 0_level_0,Fare
Cabin_augment,Unnamed: 1_level_1
A,39.35
B,118.79
C,91.22
D,54.26
E,44.0
F,20.22
G,16.7
T,35.5
Z,18.96


In [148]:
from sklearn.preprocessing import OneHotEncoder

ohe_Cabin_augment = OneHotEncoder(categories='auto')
feature_array_Cabin_augment = ohe_Cabin_augment.fit_transform(df_train_split[['Cabin_augment']]).toarray()
#feature_labels = ohe.categories_
features_Cabin_augment = pd.DataFrame(feature_array_Cabin_augment, columns=ohe_Cabin_augment.get_feature_names())


In [149]:
df_train_split.drop(['Cabin'], axis =1, inplace = True)
df_train_split = pd.concat([df_train_split,features_Cabin_augment], axis = 1)

In [150]:
df_train_split.head()

Unnamed: 0,Name,Age,Ticket,Fare,Survived,x0_0,x0_1,x0_2,x0_3,x0_4,x0_5,x0_6,x1_1,x1_2,x1_3,x2_female,x2_male,x3_0,x3_1,x3_2,x3_3,x3_4,x3_5,x3_8,x4_C,x4_Q,x4_S,Cabin_augment,x0_A,x0_B,x0_C,x0_D,x0_E,x0_F,x0_G,x0_T,x0_Z
0,"Braund, Mr. Owen Harris",22.0,A/5 21171,7.25,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,Z,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,PC 17599,71.2833,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,C,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Heikkinen, Miss. Laina",26.0,STON/O2. 3101282,7.925,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,Z,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,113803,53.1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,C,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Allen, Mr. William Henry",35.0,373450,8.05,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,Z,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [151]:
# df_train_split = pd.get_dummies(df_train_split, columns=['Cabin_augment'])

In [152]:
# df_train_split.head()

In [153]:
df_train_split.drop(['Name','Ticket'], axis = 1, inplace = True)

KeyError: "['Cabin'] not found in axis"

In [28]:
df_train_split.head()

Unnamed: 0,Age,Fare,Survived,Parch_0,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,SibSp_0,SibSp_1,SibSp_2,SibSp_3,SibSp_4,SibSp_5,SibSp_8,Embarked_C,Embarked_Q,Embarked_S,Cabin_augment_A,Cabin_augment_B,Cabin_augment_C,Cabin_augment_D,Cabin_augment_E,Cabin_augment_F,Cabin_augment_G,Cabin_augment_T,Cabin_augment_Z
387,36.0,13.0,1,1,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1
531,29.0,7.2292,0,1,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1
480,9.0,46.9,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1
217,42.0,27.0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1
799,30.0,24.15,0,0,1,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1


In [29]:
# checking that all missing values are taken care of
df_train_split.isna().sum().sum()

0

In [30]:
df_train_split_X = df_train_split.drop(['Survived'],axis = 1)
df_train_split_y = df_train_split[['Survived']]

In [57]:
# Performing Save Data Wrangling Steps on the Val/Test Data

#combining the train feature/target data for EDA/Data Wrangling

df_val_split = pd.concat([X_val, y_val], axis = 1)
df_val_split.Embarked = df_val_split.Embarked.fillna('S') #using value_counts top record df_val_split.Embarked.value_counts()
df_val_split.drop(['PassengerId'], axis = 1, inplace = True)
df_val_split = pd.get_dummies(df_val_split, columns=['Parch','Pclass','Sex','SibSp','Embarked'])
df_val_split['Age'] = df_val_split['Age'].apply(lambda x : median_age_train if pd.isnull(x) else x)

df_val_split.Cabin = df_val_split[['Cabin']].fillna(value= 'Z')
df_val_split['Cabin_augment'] = df_val_split.Cabin.apply(lambda x : x[0])

df_val_split = pd.get_dummies(df_val_split, columns=['Cabin_augment'])
df_val_split.drop(['Name','Ticket','Cabin'], axis = 1, inplace = True)

print(df_val_split.isna().sum().sum())
df_val_split.head()

0


Unnamed: 0,Age,Fare,Survived,Parch_0,Parch_1,Parch_2,Parch_4,Parch_5,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,SibSp_0,SibSp_1,SibSp_2,SibSp_3,SibSp_4,SibSp_5,SibSp_8,Embarked_C,Embarked_Q,Embarked_S,Cabin_augment_A,Cabin_augment_B,Cabin_augment_C,Cabin_augment_D,Cabin_augment_E,Cabin_augment_F,Cabin_augment_G,Cabin_augment_Z
683,14.0,46.9,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1
265,36.0,10.5,0,1,0,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1
182,9.0,31.3875,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1
44,19.0,7.8792,1,1,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1
338,45.0,8.05,1,1,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1


In [58]:
df_val_split_X = df_val_split.drop(['Survived'],axis = 1)
df_val_split_y = df_val_split[['Survived']]

# Model Developement 

In [60]:
# Baseline Model 
import xgboost as xgb

model_xgb = xgb.XGBClassifier(learning_rate = 0.01)
model_xgb.fit(df_train_split_X, df_train_split_y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.01, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [63]:
from sklearn.metrics import accuracy_score

print("Training Accuracy:", accuracy_score(model_xgb.predict(df_train_split_X),df_train_split_y))
print("Validation Accuracy:", accuracy_score(model_xgb.predict(df_val_split_X),df_val_split_y))

Training Accuracy: 0.8880248833592534


In [64]:
accuracy_score(model_xgb.predict(df_val_split_X),df_val_split_y)

ValueError: feature_names mismatch: ['Age', 'Fare', 'Parch_0', 'Parch_1', 'Parch_2', 'Parch_3', 'Parch_4', 'Parch_5', 'Parch_6', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Sex_female', 'Sex_male', 'SibSp_0', 'SibSp_1', 'SibSp_2', 'SibSp_3', 'SibSp_4', 'SibSp_5', 'SibSp_8', 'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Cabin_augment_A', 'Cabin_augment_B', 'Cabin_augment_C', 'Cabin_augment_D', 'Cabin_augment_E', 'Cabin_augment_F', 'Cabin_augment_G', 'Cabin_augment_T', 'Cabin_augment_Z'] ['Age', 'Fare', 'Parch_0', 'Parch_1', 'Parch_2', 'Parch_4', 'Parch_5', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Sex_female', 'Sex_male', 'SibSp_0', 'SibSp_1', 'SibSp_2', 'SibSp_3', 'SibSp_4', 'SibSp_5', 'SibSp_8', 'Embarked_C', 'Embarked_Q', 'Embarked_S', 'Cabin_augment_A', 'Cabin_augment_B', 'Cabin_augment_C', 'Cabin_augment_D', 'Cabin_augment_E', 'Cabin_augment_F', 'Cabin_augment_G', 'Cabin_augment_Z']
expected Cabin_augment_T, Parch_6, Parch_3 in input data

# Potential Next Steps / Changes to Consider
- Potentially use K-Fold Cross validation due to small size 
- https://alexforrest.github.io/you-might-be-leaking-data-even-if-you-cross-validate.html
- https://machinelearningmastery.com/data-preparation-without-data-leakage/

In [None]:
# # Based on the values of S (missing) compared to the distribution, should predict which are in cabins as well
# df_train_split[['Cabin_aug','Fare']][df_train_split['Cabin_aug'].str.contains("S")].hist(bins=50) 

# df_train_split['Cabin_aug'] = df_train_split['Cabin_aug'].apply(lambda x: None if x == 'S' else x) #setting to Nan

# df_train_split.head(5)
# test = df_train_split[['Cabin','Cabin_aug']]
# test

# test["Cabin_aug_code"] = test["Cabin_aug"].astype('category')
# test["Cabin_aug_code"] = test["Cabin_aug_code"].cat.codes
# test["Cabin_aug_code"] = test["Cabin_aug_code"].apply(lambda row: np.nan if row == -1 else row)
# test.head(10)

# df_train_split.drop(['Name','Ticket', 'Cabin'], axis =1).isna().sum()

# df_knn = pd.concat([df_train_split['Fare'],test['Cabin_aug_code']], axis = 1)
# df_knn.head(20)

# # in progress 
# from sklearn.impute import KNNImputer # applying https://chrisalbon.com/machine_learning/preprocessing_structured_data/imputing_missing_class_labels_using_k-nearest_neighbors/

# imputer = KNNImputer(n_neighbors= 3)
# df_filled = imputer.fit_transform(df_knn)
# pd.DataFrame(df_filled)