In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib
import matplotlib.pyplot as plt # plotting
%matplotlib inline 
print("matplotlib version: {}". format(matplotlib.__version__))

import seaborn as sns
print("seaborn version: {}". format(sns.__version__))

import sklearn # machine learning algorithms
print("scikit-learn version: {}". format(sklearn.__version__))
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
print("xgboost version: {}". format(xgb.__version__))
from sklearn.metrics import confusion_matrix # creates a confusion matrix
from sklearn.metrics import plot_confusion_matrix # draws a confusion matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV # cross validation

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file in it.
input_root = '/kaggle/input'
for folder, _subdirs, names in os.walk(input_root):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# About
This is the first notebook I publish on Kaggle. It's for the Tabular Playground Series - April 2021 "Synthanic". I tried four different models (Logistic Regression, Decision Tree, Random Forest and XGB) with different preprocessing and evolving features. The best public score I got was 0.79341 with a Logistic Regression. Code for analysing the wrong predictions on the validation set and for overwriting the predictions for all rows with a Cabin is also included. 

In [None]:
# Load the competition files (paths are relative to the Kaggle working directory).
df_train = pd.read_csv('../input/tabular-playground-series-apr-2021/train.csv')
df_test = pd.read_csv('../input/tabular-playground-series-apr-2021/test.csv')
# the sample submission predicts everybody survived
sample_submission = pd.read_csv('../input/tabular-playground-series-apr-2021/sample_submission.csv')
# Train rows followed by test rows with a fresh 0..n-1 index; created because sometimes it is
# convenient to work on train and test set together.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
df_all = pd.concat([df_train, df_test], ignore_index=True)

In [None]:
# Preview the first rows of the raw training data.
df_train.head()

In [None]:
# Preview the first rows of the raw test data.
df_test.head()

# 1 Quick baseline in order to know which score my model must beat 
# -> 0.78505
Note: this is based on previous knowledge of the original Titanic challenge. 

In [None]:
# Survival counts per sex in the training data; they motivate the baseline below.
female_passengers = df_train.loc[df_train["Sex"] == "female"]
male_passengers = df_train.loc[df_train["Sex"] == "male"]
print("Female passengers:\n", female_passengers["Survived"].value_counts())
print("Male passengers: \n", male_passengers["Survived"].value_counts())

# Baseline prediction for the test set: pretend that all women (and no men) survived.
survived = df_test["Sex"].eq("female").astype('uint8')

# create submission file 
#submission = pd.DataFrame({
#    "PassengerId": df_test.PassengerId, 
#    "Survived": survived
#})
#submission.to_csv('submission.csv', index=False)

# 2 EDA (Exploratory Data Analysis)

In [None]:
# Summary statistics of the numeric training columns.
df_train.describe()

In [None]:
# Column dtypes of the training data.
df_train.dtypes

In [None]:
# Number of missing values per column in the training data.
df_train.isna().sum()

In [None]:
# Number of missing values per column in the test data.
df_test.isna().sum()

Column "Cabin" has more than half of the values missing. But the position of the Cabin might still give valuable information on survival, i.e. passengers from higher decks are more likely to reach the lifeboats (-> create feature). Drop column "Ticket" as it has nearly as many unique values as there are rows (and I can't make any sense of it). 
We have to deal with the missing values in "Age", "Fare" and "Embarked". Let's fill them with a typical value: the mean for "Age" and "Fare", the mode for "Embarked". To decide at which level to compute it, let's explore how much these values vary between the Sex / Pclass groups. 

In [None]:
# Mean Age, mean Fare and most frequent Embarked value for every Sex/Pclass group of the combined data.
df_all.groupby(["Sex","Pclass"]).agg({"Age":"mean","Fare":"mean","Embarked":pd.Series.mode})

## 2.1 Deal with missing values 
As shown above, the mean varies between the Sex and Pclass groups. So use the mean (the mode for "Embarked") from the specific subgroup to replace missing values.

In [None]:
# For every Sex/Pclass combination replace the missing Age/Fare values with the group mean
# (rounded to 2 decimals) and the missing Embarked values with the group mode.
# groupby().transform() broadcasts each group's statistic back onto its rows, so neither the
# sex labels nor the Pclass values have to be hard-coded.
by_sex_pclass = df_all.groupby(["Sex", "Pclass"])
age_fill = by_sex_pclass["Age"].transform("mean").round(2)
fare_fill = by_sex_pclass["Fare"].transform("mean").round(2)
# Series.mode() returns every tied value; taking the first one keeps the fill value a scalar.
# A group without any known Embarked value has no mode and is left as NaN.
embarked_fill = by_sex_pclass["Embarked"].transform(
    lambda s: s.mode().iloc[0] if s.notna().any() else np.nan
)

# All fill values are computed before anything is written, so the statistics only reflect
# originally observed values.
df_all["Age"] = df_all["Age"].fillna(age_fill)
df_all["Fare"] = df_all["Fare"].fillna(fare_fill)
df_all["Embarked"] = df_all["Embarked"].fillna(embarked_fill)

In [None]:
# Re-count the missing values per column after the imputation above.
df_all.isna().sum()

In [None]:
# this is an experiment with target encoding... I'm not convinced of its usefulness, removed for now
#mean_sex = df_all[0:len(df_train)].groupby(["Sex"])["Survived"].mean()
#mean_deck = df_all[0:len(df_train)].groupby(["Deck"])["Survived"].mean()
#mean_embarked = df_all[0:len(df_train)].groupby(["Embarked"])["Survived"].mean()
#df_all["Sex"] = df_all["Sex"].map(mean_sex)
#df_all["Deck"] = df_all["Deck"].map(mean_deck)
#df_all["Embarked"] = df_all["Embarked"].map(mean_embarked)
#df_all.head()

In [None]:
# Independent copy of the combined data taken at this point (before any feature columns are added to df_all).
df_2 = df_all.copy() # make a copy for the separate deck experiment

## 2.2 Visualize Distribution

In [None]:
# Histograms of the target and of every plotted feature of the training data,
# one panel per column on a 2 x 4 grid.
fsize = (14,7)
plot_columns = ["Survived", "Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
fig, axes = plt.subplots(2, 4, figsize=fsize)
for ax, col in zip(axes.ravel(), plot_columns):
    df_train[col].hist(ax=ax)
    # the title is taken from the column name, which also fixes the misspelled "Plcass" label
    ax.set_title(col)
plt.suptitle("Distributions in training data (df_train)", fontsize=14)
plt.show()

In [None]:
# alternative, much shorter version to get the distributions. However this does not include the categorical variables. Could convert them before plotting...
#fsize = (10,12)
#plot_columns = ["Survived","Pclass","Sex", "Age","SibSp","Parch","Fare","Embarked" ]
#df_train[plot_columns].hist(figsize=fsize)
#plt.suptitle("Distributions in training data (df_train)", fontsize=14)
#plt.show()

In [None]:
# Histograms of every plotted feature of the test data on the same 2 x 4 grid as the
# training figure.
fsize = (14,7)
plot_columns = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
fig, axes = plt.subplots(2, 4, figsize=fsize)
panels = axes.ravel()
# "Survived" is not plotted for the test data: keep its slot so the panels line up with the
# training figure, but do not draw an empty frame there.
panels[0].axis("off")
for ax, col in zip(panels[1:], plot_columns):
    df_test[col].hist(ax=ax)
    # the title is taken from the column name, which also fixes the misspelled "Plcass" label
    ax.set_title(col)
plt.suptitle("Distributions in testing data (df_test)", fontsize=14)
plt.show()

Most notable difference: In the test set the age distribution is different. So are Pclass and Sex ratio.

In [None]:
# Histograms of every plotted feature of the combined data (train + test) on the same
# 2 x 4 grid as the training figure.
fsize = (14,7)
plot_columns = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]
fig, axes = plt.subplots(2, 4, figsize=fsize)
panels = axes.ravel()
# "Survived" is deliberately not plotted here: keep its slot so the panels line up with the
# training figure, but do not draw an empty frame there.
panels[0].axis("off")
for ax, col in zip(panels[1:], plot_columns):
    df_all[col].hist(ax=ax)
    # the title is taken from the column name, which also fixes the misspelled "Plcass" label
    ax.set_title(col)
plt.suptitle("Distributions in combined data (df_all)", fontsize=14)
plt.show()

In [None]:
# let's have a different look at the fare column: one wide box plot of the training fares
fsize = (20,7)
fig, ax0 = plt.subplots(figsize=fsize)  # single axes filling the whole figure
sns.boxplot(data=df_train, x='Fare', ax=ax0)
plt.show()

# 3 Feature Engineering

In [None]:
# Other views that were tried while looking for family relations:
#df_train[(df_train.Pclass == 1) & (df_train.Embarked == "S")].sort_values(by=["Name"]).head(25)
#df_train[(df_train.SibSp == 8)].sort_values(by=["Name"]).head(25)
# Show rows 5..22 of the combined data sorted by passenger name.
df_all.sort_values(by="Name")[5:23]

It seems that, unlike in the original dataset, the synthetic dataset has no "real" family relations. Let's see if it is completely useless to create features like in the original dataset.

Edit: it is not, the final score improved a bit after creating "is_alone" and "family_size".

In [None]:
# create new features:
#   is_alone    - 1 when the passenger has neither siblings/spouses nor parents/children aboard, else 0
#   family_size - SibSp + Parch plus the passenger themselves
has_relatives = (df_all["SibSp"] > 0) | (df_all["Parch"] > 0)
df_all["is_alone"] = (~has_relatives).astype("int64")
df_all["family_size"] = df_all["SibSp"] + df_all["Parch"] + 1

In [None]:
# The Deck is the first character of the Cabin number. Passengers without a Cabin get the
# separate category "N": maybe it means something for survival if the deck is unknown.
df_all["Deck"] = df_all["Cabin"].str[0].fillna("N")
# check count and survival rate per deck on the training rows (the first len(df_train) rows of df_all)
df_all.iloc[:len(df_train)].groupby("Deck").agg({"Deck": "count", "Survived": "mean"})

In [None]:
# Decks F, G and T have few values; based on their survival rates they are grouped:
# G is merged into F and T into N.
df_all["Deck"] = df_all["Deck"].replace({"G": "F", "T": "N"})

In [None]:
# Preview the combined data with the newly created feature columns.
df_all.head()

# 4 Data Preprocessing
Prepare data for modelling. 

For Logistic Regression: One hot encoding of categorical values ("Sex", "Embarked", "Deck"). Scale values to zero mean and same variance.

For Tree based models: One hot encoding of categorical values ("Sex", "Embarked", "Deck").

In [None]:
# drop unused columns; the shape is printed before and after as a quick check
unused_columns = ["Name", "Cabin", "Ticket"]
print(df_all.shape)
df_all = df_all.drop(columns=unused_columns)
print(df_all.shape)

In [None]:
# Encode the categorical features.
# Sex becomes a 0/1 column (male -> 1, female -> 0). The mapped column is assigned back to the
# frame: calling .replace(..., inplace=True) on the selected column is a chained in-place
# operation that does not update df_all under pandas Copy-on-Write.
# Any value other than "male"/"female" would become NaN here.
df_all["Sex"] = df_all["Sex"].map({"male": 1, "female": 0})
# One Hot Encoding of Embarked and Deck. The dtype is pinned so the dummy columns are numeric
# 0/1 on every pandas version (newer pandas versions default to bool).
df_all = pd.get_dummies(df_all, columns=["Embarked", "Deck"], dtype=np.uint8)
df_all.head()

In [None]:
# let's bin the features "Age" and "Fare" and add a new binned column for them;
# labels=False stores the integer bin number instead of the interval
no_bins = 6
df_all = df_all.assign(
    Age_bin=pd.cut(df_all["Age"], bins=no_bins, labels=False),
    Fare_bin=pd.cut(df_all["Fare"], bins=no_bins, labels=False),
)
df_all.head()

In [None]:
# List the columns available after encoding and binning.
df_all.columns

In [None]:
# Mean normalization. I use the complete dataset here, but I am not sure yet if this is the correct place to do normalization
# is it better to do it for training, validation and test set separately?
#columns = ["Pclass","Sex","Age","SibSp","Parch","Fare","Embarked", "Deck", "is_alone", "family_size"]
#columns = ["Pclass","Sex","Age","SibSp","Parch","Fare","is_alone", "family_size","C","Q","S"]
columns = ['Pclass', 'Sex', 'SibSp','Parch', 'is_alone', 'family_size', 'Embarked_C','Embarked_Q', 'Embarked_S', 'Deck_A', 'Deck_B', 'Deck_C', 'Deck_D', 'Deck_E', 'Deck_F', 'Deck_N', 'Age_bin', 'Fare_bin']
print("columns: ", columns)
# names of the normalized columns: the raw feature name with an "_n" suffix
columns_n = [name + "_n" for name in columns]
print("columns_n: ", columns_n)

scaler = StandardScaler() # all features are centered around 0 and have variance in the same order
scaled = scaler.fit_transform(df_all[columns])
# DataFrame.join aligns on index labels: giving the scaled frame df_all's own index keeps every
# scaled row attached to the row it was computed from, whatever index df_all carries.
scaled_frame = pd.DataFrame(data=scaled, columns=columns_n, index=df_all.index)
df_all = df_all.join(scaled_frame) # add normalized columns to df_all
df_all.head()

# 5 Start Modelling
## 5.1 Setup Validation Scheme
I use holdout validation as the number of training examples is much larger than the number of features.

In [None]:
# split df_all again in training and testing part.
# A row belongs to the training part when its PassengerId occurs in df_train; this avoids
# hard-coding the id at which the test set starts.
# I dont overwrite df_train, df_train has still the values without encoding/normalization
is_train_row = df_all["PassengerId"].isin(df_train["PassengerId"])
training_data = df_all[is_train_row]
testing_data = df_all[~is_train_row]
training_data.head(2)

In [None]:
# separate the target variable from the features (the id column is not a feature)
y = training_data["Survived"]
X = training_data.drop(["PassengerId", "Survived"], axis=1)

# hold out 20% of the rows as validation part: X_train gets 80% of the rows, X_val 20%
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
# now I have X_train with 80% of the rows and X_val with 20% of the rows

In [None]:
# quick check that the share of survivors is comparable in the train and validation set,
# which stratify=y should have taken care of
print(y_train.mean())
print(y_val.mean())

In [None]:
# Columns available to the models (raw and normalized features).
X_train.columns

## 5.2.1 Train the first model: Logistic Regression

In [None]:
# initially without k-fold cross validation; only the normalized columns are used for logistic regression
clf = LogisticRegression().fit(X_train[columns_n], y_train)
#clf.fit(X_train[limited_features], y_train)
# display the used parameters, regularization is on by default
clf.get_params()

In [None]:
# make a prediction on the validation part
X_val_scaled = X_val[columns_n]
y_val_hat = clf.predict(X_val_scaled)  # predicted classes, 1dim np array
y_val_hat_pr = clf.predict_proba(X_val_scaled)  # probabilities of the predictions, 2dim np array

In [None]:
clf.score(X_val[columns_n], y_val) # this is the accuracy: number of correct predictions / number of predictions
#(sum((y_val_hat == 1) & (y_val ==1)) + sum((y_val_hat == 0) & (y_val ==0))) / len(y_val)
# Validation accuracies noted from earlier runs:
# previous: 0.76895
# 0.7687 with one hot encoding
# 0.76655 with Age and Fare as bins
# 0.77245 with Deck
# 0.77 with target encoded and normalized variables(v23)
# 0.77185 with limited features

In [None]:
# alternative method to get the accuracy score: compare the validation labels with the predictions directly
accuracy_score(y_val, y_val_hat)

In [None]:
# quick check if cross validation would be useful. As the resulting accuracy is not much different, I decide not.

#clf3 = LogisticRegression()
#scores = cross_val_score(clf3, X[columns_n], y, cv=5)
#scores

In [None]:
# find the best parameter for regularization: fit one model per candidate C on the training
# part and record its accuracy on the validation part
reg = [0.0001, 0.001, 0.01, 0.1, 1, 10]
result = []
for r in reg:
    print(r)
    clf = LogisticRegression(C=r).fit(X_train[columns_n], y_train)
    accuracy = clf.score(X_val[columns_n], y_val)
    print(accuracy)
    result.append(accuracy)

# the first candidate reaching the highest accuracy wins
best_accuracy = max(result)
index_of_best_score = result.index(best_accuracy)
print("\n The best accuracy score is: ", best_accuracy)
print(" with C: ", reg[index_of_best_score])

In [None]:
# make a prediction that can be submitted: retrain on the whole labelled dataset, then predict for the test set
clf = LogisticRegression(C=0.01).fit(X[columns_n], y)
# clf.predict outputs float, which will give 0 score in submission, so cast to int
y_hat = clf.predict(testing_data[columns_n]).astype(int)

In [None]:
# check the thetas for each feature: one row per normalized feature with its rounded weight
pd.DataFrame({"Variable": list(columns_n), "Weights": clf.coef_.round(2).ravel()})

In [None]:
# create final submission file: the test ids paired with the logistic regression predictions
submission = df_test[["PassengerId"]].assign(Survived=y_hat)
submission.to_csv('submission_logReg.csv', index=False)

# private score on Leaderboard
# 0.79337
# 0.79341 with binned Age and Fare
# 0.79183 using "Deck" and Target Encoding
# 0.79296 using "Deck"
# 0.78307 with limited features
# limited_features = ['Pclass_n', 'Sex_n','Embarked_C_n', 'Embarked_Q_n', 'Embarked_S_n','Deck_A_n', 'Deck_B_n', 'Deck_C_n', 'Deck_D_n', 'Deck_E_n', 'Deck_F_n','Deck_N_n', 'Age_bin_n', 'Fare_bin_n']

### 5.2.2 Try to predict survival of persons with known Cabin separately

In [None]:
# let's try something: make a 2nd model and train it only for the rows having a "Deck"
mask = df_2.Cabin.isna()
# .copy() makes df_2 an independent frame instead of a filtered slice, so that columns can be
# added to it afterwards without a SettingWithCopyWarning.
df_2 = df_2[~mask].copy()
df_2

In [None]:
# feature creation and preprocessing for the Cabin subset.
# df_2 was produced by boolean filtering, so the new columns are added with assign(), which
# returns a new frame, instead of being written into the filtered slice.
has_relatives = (df_2["SibSp"] > 0) | (df_2["Parch"] > 0)
df_2 = df_2.assign(
    is_alone=(~has_relatives).astype("int64"),
    family_size=df_2["SibSp"] + df_2["Parch"] + 1,
    # Deck = first character of the Cabin number; a missing value would become the separate category "N"
    Deck=df_2["Cabin"].str[0].fillna("N"),
).drop(columns=["Name", "Cabin", "Ticket"])
#df_2[0:len(df_train)].groupby("Deck").agg({"Deck":"count",'Survived': 'mean'}) # check survival rates per deck
#df_2.loc[df_2.Deck == "G", "Deck"] = "F" # put small groups together
#df_2.loc[df_2.Deck == "T", "Deck"] = "F"
df_2

In [None]:
# preprocessing of the Cabin subset
# Sex becomes a 0/1 column (male -> 1, female -> 0). The mapped column is assigned back to the
# frame: calling .replace(..., inplace=True) on the selected column is a chained in-place
# operation that does not update df_2 under pandas Copy-on-Write.
# Any value other than "male"/"female" would become NaN here.
df_2["Sex"] = df_2["Sex"].map({"male": 1, "female": 0})
# One hot encoding; the dtype is pinned so the dummy columns are numeric 0/1 on every pandas
# version (newer pandas versions default to bool).
df_2 = pd.get_dummies(df_2, columns=["Embarked", "Deck"], dtype=np.uint8)
# integer bin numbers for Age and Fare, using the same number of bins as for df_all
df_2["Age_bin"] = pd.cut(df_2.Age, no_bins, labels=False)
df_2["Fare_bin"] = pd.cut(df_2.Fare, no_bins, labels=False)
df_2

In [None]:
# Raw feature list of the Cabin subset (no 'Deck_N': every row of the subset has a Cabin).
# It gets its own name so that the global `columns` list, which the tree based models further
# down read, is not overwritten with a list that lacks 'Deck_N'.
columns_2 = ['Pclass', 'Sex', 'SibSp','Parch', 'is_alone', 'family_size', 'Embarked_C','Embarked_Q', 'Embarked_S', 'Deck_A', 'Deck_B', 'Deck_C', 'Deck_D', 'Deck_E', 'Deck_F', 'Age_bin', 'Fare_bin']
print("columns: ", columns_2)
# `columns_n` is rebound on purpose: the following cells of this experiment select the
# normalized subset features through it.
columns_n = [name + "_n" for name in columns_2]
print("columns_n: ", columns_n)

scaler = StandardScaler() # all features are centered around 0 and have variance in the same order
scaled = scaler.fit_transform(df_2[columns_2])
df_2 = df_2.reset_index(drop=True) # reset index before join so the rows line up with the scaled array
df_2 = df_2.join(pd.DataFrame(data=scaled, columns=columns_n)) # add normalized columns to df_2

#df_2

In [None]:
# split data before modelling.
# A row belongs to the training part when its PassengerId occurs in df_train; this avoids
# hard-coding the id at which the test set starts.
is_train_row_2 = df_2["PassengerId"].isin(df_train["PassengerId"])
training_data_2 = df_2[is_train_row_2]
testing_data_2 = df_2[~is_train_row_2]
# target and features of the subset, then an 80/20 stratified holdout split
y_2 = training_data_2.Survived
X_2 = training_data_2.drop(columns=["PassengerId","Survived"])
X_train_2, X_val_2, y_train_2, y_val_2 = train_test_split(X_2, y_2, test_size=0.2, random_state=42, stratify=y_2)

In [None]:
# Logistic regression on the Cabin subset; use only the normalized columns
X_val_2_scaled = X_val_2[columns_n]
clf_2 = LogisticRegression().fit(X_train_2[columns_n], y_train_2)
y_val_hat_2 = clf_2.predict(X_val_2_scaled)
clf_2.score(X_val_2_scaled, y_val_2) # accuracy score: 0.7560292515948343 - worse than I hoped for

In [None]:
# find the best parameter for regularization on the Cabin subset: fit one model per candidate C
# on the training part and record its accuracy on the validation part
reg = [0.0001, 0.001, 0.01, 0.1, 1, 10]
result = []
for r in reg:
    print(r)
    clf_2 = LogisticRegression(C=r).fit(X_train_2[columns_n], y_train_2)
    accuracy = clf_2.score(X_val_2[columns_n], y_val_2)
    print(accuracy)
    result.append(accuracy)

# the first candidate reaching the highest accuracy wins
best_accuracy = max(result)
index_of_best_score = result.index(best_accuracy)
print("\n The best accuracy score is: ", best_accuracy)
print(" with C: ", reg[index_of_best_score])

In [None]:
# make a prediction that can be submitted: retrain on the whole labelled subset, then predict for its test rows
clf_2 = LogisticRegression(C=0.01).fit(X_2[columns_n], y_2)
# clf_2.predict outputs float, which will give 0 score in submission, so cast to int
y_hat_2 = clf_2.predict(testing_data_2[columns_n]).astype(int)

In [None]:
# ids of the test rows that have a Cabin, paired with the subset model's predictions
df_subset = testing_data_2[["PassengerId"]].assign(Survived=y_hat_2)
df_subset

In [None]:
# create a dataframe with the original predictions from Log Reg (y_hat) and the new ones on the
# "Cabin subset" (y_hat_2); the left merge keeps every test row, rows without Cabin get NaN
df_both = submission.merge(df_subset, how="left", on="PassengerId").rename(
    columns={"Survived_x": "y_hat", "Survived_y": "y_hat_2"}
)

df_both

In [None]:
# when there is NaN in y_hat_2 because this row did not have a Cabin, take the value from the
# original prediction y_hat instead, then store the column as integers
df_both["y_hat_2"] = df_both["y_hat_2"].fillna(df_both["y_hat"]).astype(int)
n_changed = (df_both["y_hat"] != df_both["y_hat_2"]).sum()
print("Number of predictions that differ: ", n_changed)

In [None]:
# create submission file from the combined predictions
submission = df_both[["PassengerId", "y_hat_2"]].rename(columns={"y_hat_2": "Survived"})
submission.to_csv('submission_logReg_2.csv', index=False)

# unfortunatelly it did not help, in contrary it made the private score worse
# 0.79062 vs 0.79296

## 5.3 Second Model: Decision Tree

In [None]:
# Decision tree with default depth; use the not normalized columns for tree models
X_train_raw, X_val_raw = X_train[columns], X_val[columns]
tree = DecisionTreeClassifier(criterion="entropy", random_state=0).fit(X_train_raw, y_train)
# NOTE: this rebinds y_val_hat, which until here held the logistic regression predictions
y_val_hat = tree.predict(X_val_raw)
print("Training accuracy: ", tree.score(X_train_raw, y_train))
print("Validation accuracy: ", tree.score(X_val_raw, y_val))
print("---")
print("with cross validation: ", cross_val_score(tree, X[columns], y, cv=5))

In [None]:
# Show the (not normalized) feature columns the tree was trained on.
X_train[columns]

A tree with default parameters performs badly. Overfits to the training set. 

> columns = ["Pclass","Sex","Age","SibSp","Parch","Fare","is_alone", "family_size","C","Q","S"]

> Training accuracy:  0.994225
> Validation accuracy:  0.6847
> 
> with cross validation:  [0.68588 0.68956 0.68536 0.68344]

Overfitting is reduced when Age and Fare bins are used!

> columns = ["Pclass","Sex","Age_bin","SibSp","Parch","Fare_bin","is_alone", "family_size","C","Q","S"]

> Training accuracy:  0.7806
> Validation accuracy:  0.76045
> 
> with cross validation:  [0.76216 0.76216 0.76132 0.76056]

when "Deck" is used

> columns = ['Pclass', 'Sex', 'SibSp','Parch', 'is_alone', 'family_size', 'Embarked_C','Embarked_Q', 'Embarked_S', 'Deck_A', 'Deck_B', 'Deck_C', 'Deck_D', 'Deck_E', 'Deck_F', 'Deck_N', 'Age_bin', 'Fare_bin']
> 
> Training accuracy:  0.7980125
> Validation accuracy:  0.765
> 
> with cross validation:  [0.7638  0.76475 0.7655  0.76455 0.76765]

## 5.4 Third model: Random Forest (first ensemble model using bagging)

In [None]:
# Random forest with limited depth, evaluated on the holdout split
forest = RandomForestClassifier(
    criterion="gini", max_depth=5, min_samples_leaf=2, random_state=0
).fit(X_train[columns], y_train)
print("Training accuracy: ", forest.score(X_train[columns], y_train))
print("Validation accuracy: ", forest.score(X_val[columns], y_val))

# with default parameters:
# Training accuracy:  0.9938875
# Validation accuracy:  0.72675

# with max_depth=5
# Training accuracy:  0.770725
# Validation accuracy:  0.76895

# with max_depth=5 and min_samples_leaf = 2
# Training accuracy:  0.7707
# Validation accuracy:  0.76885

# using bins
# Training accuracy:  0.7689875
# Validation accuracy:  0.7675

# adding "Deck"
# Training accuracy:  0.7737375
# Validation accuracy:  0.774

In [None]:
# This is comparable to what I got from Logistic Regression. Let's retrain and make a submission.
# Retrained on the whole labelled dataset (min_samples_leaf is left at its default here).
forest = RandomForestClassifier(criterion="gini", max_depth=5, random_state=0).fit(X[columns], y)
y_hat = forest.predict(testing_data[columns]).astype(int)

In [None]:
# create final submission file: the test ids paired with the random forest predictions
submission = df_test[["PassengerId"]].assign(Survived=y_hat)
submission.to_csv('submission_forest.csv', index=False)

# private score on Leaderboard
# 0.78037
# 0.77823 with binned Age and Fare
# 0.79058 using "Deck" and Target Encoding
# 0.78380 using "Deck"

## 5.5 4th model: XGBoost (ensemble model using boosting)

In [None]:
# Preview the (not normalized) feature columns passed to XGBoost.
X_train[columns].head()

In [None]:
# XGBoost classifier with default hyperparameters, evaluated on the validation part while training
clf_xgb = xgb.XGBClassifier(objective='binary:logistic', missing=None, seed=42)
## these three arguments set up early stopping.
early_stopping = dict(
    early_stopping_rounds=10,
    eval_metric='error',
    eval_set=[(X_val[columns], y_val)],
)
clf_xgb.fit(X_train[columns], y_train, verbose=True, **early_stopping)

In [None]:
# Confusion matrix of the XGBoost model on the validation part, with integer cell counts
class_names = ["Drowned", "Survived"]
plot_confusion_matrix(clf_xgb, X_val[columns], y_val, display_labels=class_names, values_format='d')

In [None]:
# try to improve the classification accuracy with better hyperparameters
# Round 1
#param_grid = {
#     'max_depth': [3, 4, 5],
#     'learning_rate': [0.1, 0.01, 0.05],
#     'gamma': [0, 0.25, 1.0],
#     'reg_lambda': [0, 1.0, 10.0]
# }
# Output: {'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'reg_lambda': 0}
# Round 2
param_grid = {
    'max_depth': [4],
    'learning_rate': [0.1, 0.5, 1],
    'gamma': [0],
    'reg_lambda': [0, 0.5, 1],
}
# Output: {'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'reg_lambda': 0}
#To speed up cross validation, and to further prevent overfitting.
# We are only using a random subset of the data (90%) and are only
# using a random subset of the features (columns) (50%) per tree.
grid_estimator = xgb.XGBClassifier(
    objective='binary:logistic',
    seed=42,
    subsample=0.9,
    colsample_bytree=0.5,
)
# The search object is only configured here; it is fitted in the commented-out lines below.
optimal_params = GridSearchCV(
    estimator=grid_estimator,
    param_grid=param_grid,
    scoring='accuracy',  ## see https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
    verbose=0,  # NOTE: If you want to see what Grid Search is doing, set verbose=2
    n_jobs=10,
    cv=3,
)
# only uncomment if you want to run grid search again, time consuming!
#optimal_params.fit(X_train[columns], 
#        y_train,
#        verbose=False,
#        early_stopping_rounds=10,
#        eval_metric='error',
#        eval_set=[(X_val[columns], y_val)])
#print(optimal_params.best_params_)

In [None]:
# XGBoost classifier with the hyperparameters found by the grid search
tuned_params = dict(
    seed=42,
    objective='binary:logistic',
    gamma=0,
    learning_rate=0.1,
    max_depth=4,
    reg_lambda=0,
    subsample=0.9,
    colsample_bytree=0.5,
)
clf_xgb = xgb.XGBClassifier(**tuned_params)
# early stopping is evaluated on the validation part
clf_xgb.fit(
    X_train[columns],
    y_train,
    verbose=True,
    early_stopping_rounds=10,
    eval_metric='error',
    eval_set=[(X_val[columns], y_val)],
)

In [None]:
# Confusion matrix of the tuned XGBoost model on the validation part, with integer cell counts
class_names = ["Drowned", "Survived"]
plot_confusion_matrix(clf_xgb, X_val[columns], y_val, display_labels=class_names, values_format='d')

Looks like there is no improvement with better hyperparameters.

In [None]:
# NOTE: this rebinds y_val_hat to the XGBoost predictions on the validation part
y_val_hat = clf_xgb.predict(X_val[columns]).astype(int)
# get accuracy score on validation set like for the other models
accuracy_score(y_true=y_val, y_pred=y_val_hat)

In [None]:
# predict for testing data with the tuned XGBoost model; cast to int for the submission file
y_hat = clf_xgb.predict(testing_data[columns]).astype(int)

In [None]:
# create final submission file: the test ids paired with the XGBoost predictions
submission = df_test[["PassengerId"]].assign(Survived=y_hat)
submission.to_csv('submission_xgb.csv', index=False)

# private score on Leaderboard
# 0.7834
# 0.78529 tuned
# 0.77000 with binned Age and Fare
# 0.79090 using "Deck" and Target Encoding
# 0.78630 using "Deck"

In [None]:
# Display the submission frame that was written last.
submission

# 6 Analysis of wrong predictions

This has been done with the predictions from logistic regression. Code might have to be adapted when a tree model has been run.

In [None]:
# let's compare the ground truth with the predictions and prediction probabilities on the validation set
# y_val_hat has been rebound by the decision tree and XGBoost cells in the meantime, while
# y_val_hat_pr still holds the logistic regression probabilities. The class predictions are
# therefore derived from y_val_hat_pr, so that both belong to the same model:
# column 1 is the probability of class 1 (survived), predicted when it exceeds 0.5.
y_val_hat_logreg = (y_val_hat_pr[:, 1] > 0.5).astype(int)
# reset_index() keeps the original row labels of the validation rows in the column "index"
df_compare = pd.DataFrame({'y_val': y_val, 'y_val_hat': y_val_hat_logreg}).reset_index()
df_compare2 = pd.DataFrame(y_val_hat_pr)  # columns 0 and 1: probability of class 0 / class 1
df_compare = df_compare.join(df_compare2)
df_compare.head()

In [None]:
# get the rows that have been incorrectly classified (predicted) and look at them in the original unnormalized data
# The comparison uses the two columns of df_compare itself rather than the global y_val_hat,
# so the result does not depend on what that variable holds at the time this cell runs.
is_wrong = df_compare["y_val"] != df_compare["y_val_hat"]
index_of_wrong_class = df_compare.loc[is_wrong, "index"]
# the "index" column holds row labels, so the rows are selected by label
df_train.loc[index_of_wrong_class].head(50)

Sadly, I can't see anything obvious here.