# Intro

**Introduction: On September 27, 1994, the ferry Estonia set sail on a night voyage across the Baltic Sea from the port of Tallinn in Estonia to Stockholm. She departed at 19.00 carrying 989 passengers and crew, as well as vehicles, and was due to dock at 09.30 the following morning. Tragically, the Estonia never arrived.**



In [None]:
#importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
%matplotlib inline
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV


import os
# Show which files/folders are available under the input directory.
print(os.listdir(path="../input"))

In [None]:
# Load the Estonia passenger list into a DataFrame.
data = pd.read_csv(
    filepath_or_buffer="../input/passenger-list-for-the-estonia-ferry-disaster/estonia-passenger-list.csv"
)

In [None]:
# Peek at the first five rows of the passenger list.
data.head(n=5)

# Visualize the data

In [None]:
# Global plot styling: seaborn white grid first, then matplotlib overrides
# for font size, default figure size and a fully transparent figure background.
sns.set_style(style='whitegrid')
plt.rcParams.update({
    'font.size': 14,
    'figure.figsize': (9, 5),
    'figure.facecolor': '#00000000',
})


In [None]:
# Histogram of passenger ages with a kernel-density overlay.
f, axes = plt.subplots(nrows=1, ncols=1)
g1 = sns.histplot(data["Age"], kde=True, color="red", ax=axes)
axes.set_title("Distribution of age");

In [None]:
# Age distribution split by survival outcome.
sns.violinplot(data=data, x="Survived", y="Age");

As we can see from the plot, the median age for those who survived is lower, and there also seems to be smaller variation in these ages.

In [None]:
# Does the chance of survival differ by country of origin?
# Mean of Survived per country, drawn as a bar chart.
data.groupby("Country")["Survived"].mean().plot.bar();

In [None]:
# Number of passengers per outcome, shown as percentage shares in a pie chart.
plotp = data.groupby("Survived")["Survived"].count()
plotp.plot(kind="pie", autopct="%.1f%%");

The pie plot not only shows the magnitude of the disaster, it also hints that the data are not balanced, which may cause problems for our model.

# Pre Processing

In [None]:
# Keep only the modelling columns; the name columns hold no value for the model.
# .copy() makes `data` an independent frame, so the column re-assignments done
# later during label encoding do not trigger pandas' SettingWithCopyWarning.
data = data[['Sex', 'Age', 'Category', 'Survived', 'Country']].copy()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Turn the two string columns into integer codes. One encoder object is
# re-fitted for each column, Category first and then Sex.
labelencoder = LabelEncoder()
data["Category"] = labelencoder.fit_transform(data["Category"])
data["Sex"] = labelencoder.fit_transform(data["Sex"])
print(data)
# Mapping noted by the author: Female=0, Male=1; Crew=0, Passenger=1

In [None]:
# Country is not a binary variable, so expand it into dummy columns
# (the first level is dropped), then preview the result.
data = pd.get_dummies(data=data, drop_first=True)
data.head(n=5)

In [None]:
# Count missing values per column (the author reports that there are none).
data.isnull().sum()

# train_test_split

In [None]:
# Separate the features from the target, then hold out 20% of the rows
# for testing with a fixed seed.
X = data.drop('Survived', axis=1)
y = data['Survived']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=6, test_size=0.2
)

# Model

We chose a Decision Tree Classifier for our model



In [None]:
# Hyper-parameter grid; GridSearchCV tries every combination exhaustively.
param_dist = {"min_samples_leaf": range(1, 9),
              "criterion": ["gini", "entropy"]}

# Decision tree with a fixed seed: without random_state the tree's
# tie-breaking between equally good splits is random, so the tuned parameters
# and the scores discussed in the text would change from run to run.
tree = DecisionTreeClassifier(random_state=6)

# 5-fold cross-validated grid search over param_dist.
tree_cv = GridSearchCV(tree, param_dist, cv=5)

# Fit on the training split only; the test split is kept for evaluation.
tree_cv.fit(X_train, y_train)

# Report the winning parameter combination and its mean cross-validated score.
print(f"Tuned Decision Tree Parameters: {tree_cv.best_params_}")
print(f"Best score is {tree_cv.best_score_}")

In [None]:
# Score the tuned tree on the held-out test set.
y_pred = tree_cv.predict(X_test)

# Baseline confusion matrix (kept as cm0 for the later comparisons)
# followed by the per-class precision/recall report.
cm0 = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Draw the baseline confusion matrix as an annotated heatmap:
# rows are the true labels, columns the predicted labels.
f, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(cm0, ax=ax, annot=True, fmt='g', cmap="BuPu", linewidth=0.7, linecolor='cyan')
ax.set_title('Confusion Matrix')
ax.set_xlabel('Y predict')
ax.set_ylabel('Y test')
plt.show()

**As we saw in the visualization stage, our data suffer from imbalance. As a result, our model cannot work properly: even though we have a high score, recall for the class Survived=1 is only 0.07. To solve this, we use oversampling.**

# Oversampling and re-fit

In [None]:
# Import SMOTE-NC (SMOTE for a mix of numerical and categorical features).
from imblearn.over_sampling import SMOTENC

# SMOTE-NC needs the positions of the categorical columns. Age is the only
# continuous feature, so derive the positions from the column names instead of
# hard-coding them; this stays correct if the number of country dummies changes.
categorical_idx = [i for i, col in enumerate(X_train.columns) if col != "Age"]
smotenc = SMOTENC(categorical_features=categorical_idx, random_state=101)

# Oversample the training split only; the test split stays untouched.
X_oversample, y_oversample = smotenc.fit_resample(X_train, y_train)

In [None]:
# Run the same grid search again, this time on the oversampled training data.
tree_cv.fit(X_oversample, y_oversample)

# NOTE(review): the CV folds now contain synthetic rows, so this score is
# probably optimistic compared with the test-set report -- confirm.
print(f"Tuned Decision Tree Parameters: {tree_cv.best_params_}")
print(f"Best score is {tree_cv.best_score_}")

In [None]:
# Keep a separate handle on the best tree found by the grid search at this
# point; tree_cv itself is fitted again further down.
model = tree_cv.best_estimator_


In [None]:
# Score the tree trained on oversampled data against the untouched test set.
y_pred = model.predict(X_test)

# Confusion matrix for the oversampled model (kept as cm for the comparisons)
# followed by the per-class precision/recall report.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Annotated heatmap of the oversampled model's confusion matrix:
# rows are the true labels, columns the predicted labels.
f, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(cm, ax=ax, annot=True, fmt='g', cmap="BuPu", linewidth=0.7, linecolor='cyan')
ax.set_title('Confusion Matrix')
ax.set_xlabel('Y predict')
ax.set_ylabel('Y test')
plt.show()

As we can see, the oversampling worked. Our model is now better at predicting the class Survived=1. Unfortunately, recall for the first class dropped slightly.

# Metrics

In [None]:
from sklearn.metrics import roc_curve

# Predicted probability of class 1 (second column of predict_proba) per test row.
y_pred_prob = model.predict_proba(X_test)[:, 1]

# False/true positive rates over all decision thresholds.
fpr, tpr, thresholds = roc_curve(y_true=y_test, y_score=y_pred_prob)

# Chance diagonal first, then the model's ROC curve on top of it.
plt.plot((0, 1), (0, 1), 'k--')
plt.plot(fpr, tpr)
plt.title('ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()

Comparing the models with and without oversampling



In [None]:
# Side-by-side confusion matrices: baseline tree (left) versus the tree
# trained on oversampled data (right), in the top row of a 2x2 grid.
fig = plt.figure(figsize=(15, 15))

ax1 = fig.add_subplot(2, 2, 1)
ax2 = fig.add_subplot(2, 2, 2)
ax1.set_title('Decision tree no oversampling')
ax2.set_title('Decision tree with oversampling')

sns.heatmap(cm0, ax=ax1, annot=True, fmt='g', cmap="BuPu", linewidth=0.7, linecolor='red')
sns.heatmap(cm, ax=ax2, annot=True, fmt='g', cmap="BuPu", linewidth=0.7, linecolor='red')
plt.show()

In [None]:
# Display the raw feature-importance array of the fitted tree.
model.feature_importances_

In [None]:
# Feature-only view of the data (target column removed), used for its column names.
dataf = data.drop(columns=["Survived"])

In [None]:
def plot_feature_importance(importance,names,model_type):
    """Plot feature importances as a bar chart, largest value first.

    importance -- sequence of importance scores, one per feature
    names      -- sequence of feature names, aligned with `importance`
    model_type -- text prepended to the chart title

    Importances go on the x-axis and feature names on the y-axis of a new
    10x8 figure. Nothing is returned.
    """
    scores = np.array(importance)
    labels = np.array(names)

    # Pair names with scores and rank them in descending order of importance.
    ranking = pd.DataFrame(
        {'feature_names': labels, 'feature_importance': scores}
    ).sort_values(by=['feature_importance'], ascending=False)

    plt.figure(figsize=(10,8))
    chart = sns.barplot(x=ranking['feature_importance'], y=ranking['feature_names'])

    chart.set_title(model_type + 'FEATURE IMPORTANCE')
    chart.set_xlabel('FEATURE IMPORTANCE')
    chart.set_ylabel('FEATURE NAMES')


    
# Chart the importances of the tree trained on oversampled data,
# labelled with the feature column names.
plot_feature_importance(
    importance=model.feature_importances_,
    names=dataf.columns,
    model_type='Decision Tree ',
)

As we can see, for most countries, the origin of the passenger plays no role in the prediction.



We will remove the unnecessary features and refit the model.



# Final re-fit


In [None]:
# Rebuild the feature matrix from the seven retained columns and repeat the
# 80/20 split with the same seed as before.
X = data[[
    'Age',
    "Sex",
    "Category",
    "Country_Sweden",
    "Country_Latvia",
    "Country_Russia",
    "Country_Estonia",
]]
y = data['Survived']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=6, test_size=0.2
)

In [None]:
# In this reduced X the column order is Age, Sex, Category, Country_*, so Age
# (position 0) is the only continuous feature. The hard-coded list
# [0, 2, 3, 4, 5, 6] flagged Age as categorical and left Sex (position 1) as
# continuous; deriving the positions from the column names avoids that mix-up.
categorical_idx = [i for i, col in enumerate(X_train.columns) if col != "Age"]
smotenc = SMOTENC(categorical_features=categorical_idx, random_state=101)

# Oversample the training split only; the test split stays untouched.
X_oversample, y_oversample = smotenc.fit_resample(X_train, y_train)

In [None]:
# Grid search once more, now on the oversampled reduced-feature training data.
tree_cv.fit(X_oversample, y_oversample)

# NOTE(review): the CV folds contain synthetic rows, so this score is
# probably optimistic compared with the test-set report -- confirm.
print(f"Tuned Decision Tree Parameters: {tree_cv.best_params_}")
print(f"Best score is {tree_cv.best_score_}")

In [None]:
# Score the reduced-feature model on the held-out test set.
y_pred = tree_cv.predict(X_test)

# Confusion matrix for the final model (kept as cm3 for the comparison)
# followed by the per-class precision/recall report.
cm3 = confusion_matrix(y_true=y_test, y_pred=y_pred)

print(classification_report(y_true=y_test, y_pred=y_pred))

# Final comparison of the 3 models

In [None]:
# Confusion matrices of all three models side by side, in the top row of a
# 3x3 grid: baseline, oversampled, and oversampled with reduced features.
fig = plt.figure(figsize=(15, 15))

ax1 = fig.add_subplot(3, 3, 1)
ax2 = fig.add_subplot(3, 3, 2)
ax3 = fig.add_subplot(3, 3, 3)
ax1.set_title('Decision tree no oversampling')
ax2.set_title('Decision tree with oversampling')
ax3.set_title('Decision tree final')

sns.heatmap(cm0, ax=ax1, annot=True, fmt='g', cmap="BuPu", linewidth=0.7, linecolor='red')
sns.heatmap(cm, ax=ax2, annot=True, fmt='g', cmap="BuPu", linewidth=0.7, linecolor='red')
sns.heatmap(cm3, ax=ax3, annot=True, fmt='g', cmap="BuPu", linewidth=0.7, linecolor='red')
plt.show()

* In the end, the model that we will choose depends on its future usage and the cost of the false positives for each class.
* **All in all, I would say that the last model is the best of the three, since it has the better average recall and f1-score.**