In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error, confusion_matrix

In [2]:
data = pd.read_csv("data/titanic.csv")
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
#Dropping the cabin column(axis = 1) from the dataset
data_alt = data.drop(['Cabin'], axis = 1)

In [4]:
# Filling the empty age rows with the mean of Age. 
# This can be done better by using the mean of each gender in a given class.
# And also using their title. (Mr, Miss, ...)
data_alt['Age'] = data_alt['Age'].fillna(value = int(data_alt['Age'].mean()))

In [5]:
# Removing rows where the person has not Embarked the boat(or not registerd).
data_alt = data_alt[data_alt['Embarked'].notna()]

In [6]:
# Checking for row duplications: None found
data_alt.duplicated().values.sum()

0

In [7]:
# Loading the second .csv file and checking the structure of the data
ports = pd.read_csv("data/port_abbrevations.csv")
ports.head()

Unnamed: 0,Abbrevation,Port_name
0,C,Cherbourg
1,Q,Queenstown
2,S,Southampton


In [8]:
# Merging the data by the columns Embarked and Abbrecation
data_ext = pd.merge(data_alt, ports[['Abbrevation', 'Port_name']], left_on=['Embarked'], right_on=['Abbrevation'])


In [9]:
# Dropping the columns containing the same information as Port_name.
# Embarked and Abbrevation
data_clean = data_ext.drop(['Embarked', 'Abbrevation'], axis=1)

In [10]:
#It might be interesiting to use the titles in the name for something. But Im looking away from it for now

In [11]:
# Since the name and passanger id is irrelevant for the model 
# it seems fitting to remove these columns as well
data_cleaner = data_clean.drop(['PassengerId', 'Name', 'Ticket'], axis=1)

In [12]:
# Writing cleaned data to a .csv file
data_cleaner.to_csv('data/cleaned_titanic_data.csv')

Cleaning done

## Data prepping

In [13]:
# Loading cleaned data 
clean_titanic = pd.read_csv('data/cleaned_titanic_data.csv')

In [14]:
# Encoding gender using pandas dataframe logic
clean_titanic.loc[clean_titanic['Sex'] == 'male', 'Sex'] = 0
clean_titanic.loc[clean_titanic['Sex'] == 'female', 'Sex'] = 1
clean_titanic

Unnamed: 0.1,Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Port_name
0,0,0,3,0,22.0,1,0,7.2500,Southampton
1,1,1,3,1,26.0,0,0,7.9250,Southampton
2,2,1,1,1,35.0,1,0,53.1000,Southampton
3,3,0,3,0,35.0,0,0,8.0500,Southampton
4,4,0,1,0,54.0,0,0,51.8625,Southampton
...,...,...,...,...,...,...,...,...,...
884,884,0,3,0,29.0,0,0,7.7500,Queenstown
885,885,0,3,0,29.0,0,0,6.9500,Queenstown
886,886,1,3,0,29.0,0,0,7.7500,Queenstown
887,887,0,3,1,39.0,0,5,29.1250,Queenstown


In [15]:
# Encoding gender using my own probably not as efficient logic

# o_h_sex =[]
# for i in range(len(clean_titanic['Sex'])):
#     if clean_titanic['Sex'].iloc[i] == 'male':
#         o_h_sex.append(0)
#     else:
#         o_h_sex.append(1)

# clean_titanic['sex_one_hot'] = o_h_sex

In [16]:
# Encoding Port_name using pandas dataframe logic,
# For convenience there is a underscore before the name of the city

clean_titanic.loc[clean_titanic['Port_name'] == ' Cherbourg', 'Port_name'] = 0
clean_titanic.loc[clean_titanic['Port_name'] == ' Queenstown', 'Port_name'] = 1
clean_titanic.loc[clean_titanic.Port_name == ' Southampton', 'Port_name'] = 2

In [17]:
# One hot encoding of ports and class

# port_dummy = pd.get_dummies(clean_titanic.Port_name, prefix='port')
# class_dummy = pd.get_dummies(clean_titanic.Pclass, prefix='class')
# titanic_port = pd.concat([clean_titanic, port_dummy], axis=1)
# titanic_dummy = pd.concat([titanic_port, class_dummy], axis=1)
# titanic_dummy

In [18]:
clean_titanic

Unnamed: 0.1,Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Port_name
0,0,0,3,0,22.0,1,0,7.2500,2
1,1,1,3,1,26.0,0,0,7.9250,2
2,2,1,1,1,35.0,1,0,53.1000,2
3,3,0,3,0,35.0,0,0,8.0500,2
4,4,0,1,0,54.0,0,0,51.8625,2
...,...,...,...,...,...,...,...,...,...
884,884,0,3,0,29.0,0,0,7.7500,1
885,885,0,3,0,29.0,0,0,6.9500,1
886,886,1,3,0,29.0,0,0,7.7500,1
887,887,0,3,1,39.0,0,5,29.1250,1


## Analysis

In [19]:
# Separating the label from the other features in the dataset
y = clean_titanic.Survived
X = clean_titanic.drop(['Survived'], axis=1)

train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1234)

In [20]:
# Function to calculate mean absolut error on the predictionresults
# This MAE will be used to choose a number of how many leafs is neccesary.

def calc_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    model = DecisionTreeClassifier(max_leaf_nodes=max_leaf_nodes, random_state=1234)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    mae = mean_absolute_error(val_y, preds_val)
    return(mae)

In [21]:
list_of_candidates = list(np.arange(2, 250, 5))

# Write loop to find the ideal tree size from list_of_candidates
initial_error = 100000
for leaf_dept in list_of_candidates:
    test_mae = calc_mae(leaf_dept, train_X, val_X, train_y, val_y)
    # keeping the leaf arrangment where mae is smallest.
    if initial_error > test_mae:
        initial_error = test_mae
        best_tree_size = leaf_dept
        
# Store the best size of tree structure
print(best_tree_size)

37


In [22]:
DT_model = DecisionTreeClassifier(max_leaf_nodes=best_tree_size, random_state=1234)

In [23]:
DT_model.fit(train_X, train_y)

DecisionTreeClassifier(max_leaf_nodes=37, random_state=1234)

In [24]:
pred_y = DT_model.predict(val_X)

In [25]:
val_y = list(val_y)

In [26]:
tn, fp, fn, tp = confusion_matrix(val_y, pred_y, labels=None, sample_weight=None, normalize=None).ravel()
print(tn, fp, fn, tp )

119 18 20 66


In [27]:
true_pos, true_neg, false_pos, false_neg = (0, 0, 0, 0)

for i in range(len(val_y)):
    if val_y[i] == pred_y[i]:
        if val_y[i] == 1:
            true_pos += 1
        else:
            true_neg += 1
            
    if val_y[i] != pred_y[i]:
        if val_y[i] == 1:
            false_neg += 1
        else:
            false_pos += 1
            
print(true_neg, false_pos, false_neg, true_pos)

119 18 20 66


In [28]:
# recall = True positive / true positive + false negative
# precision = True positive / true posistive + false positive
recall = true_pos/(true_pos + false_neg)
precision = true_pos/(true_pos + false_pos)
print('Recall: ', recall, ', Precision: ', precision)

Recall:  0.7674418604651163 , Precision:  0.7857142857142857


In [29]:
rf_model = RandomForestClassifier(max_leaf_nodes=best_tree_size, random_state=1234)
rf_model.fit(train_X, train_y)
rf_pred_y = rf_model.predict(val_X)

val_y = list(val_y)

true_pos, true_neg, false_pos, false_neg = (0, 0, 0, 0)

for i in range(len(val_y)):
    if val_y[i] == rf_pred_y[i]:
        if val_y[i] == 1:
            true_pos += 1
        else:
            true_neg += 1
            
    if val_y[i] != rf_pred_y[i]:
        if val_y[i] == 1:
            false_neg += 1
        else:
            false_pos += 1
            
print(true_neg, false_pos, false_neg, true_pos)

# recall = True positive / true positive + false negative
# precision = True positive / true posistive + false positive
recall = true_pos/(true_pos + false_neg)
precision = true_pos/(true_pos + false_pos)
print('Recall: ', recall, ', Precision: ', precision)

123 14 23 63
Recall:  0.7325581395348837 , Precision:  0.8181818181818182


In [30]:
from sklearn.ensemble import AdaBoostClassifier
rf_model = AdaBoostClassifier(random_state=1234)
rf_model.fit(train_X, train_y)
rf_pred_y = rf_model.predict(val_X)

val_y = list(val_y)

true_pos, true_neg, false_pos, false_neg = (0, 0, 0, 0)

for i in range(len(val_y)):
    if val_y[i] == rf_pred_y[i]:
        if val_y[i] == 1:
            true_pos += 1
        else:
            true_neg += 1
            
    if val_y[i] != rf_pred_y[i]:
        if val_y[i] == 1:
            false_neg += 1
        else:
            false_pos += 1
            
print(true_neg, false_pos, false_neg, true_pos)

# recall = True positive / true positive + false negative
# precision = True positive / true posistive + false positive
recall = true_pos/(true_pos + false_neg)
precision = true_pos/(true_pos + false_pos)
print('Recall: ', recall, ', Precision: ', precision)

119 18 18 68
Recall:  0.7906976744186046 , Precision:  0.7906976744186046


In [31]:
from sklearn.ensemble import GradientBoostingClassifier
rf_model = GradientBoostingClassifier(random_state=1234)
rf_model.fit(train_X, train_y)
rf_pred_y = rf_model.predict(val_X)

val_y = list(val_y)

true_pos, true_neg, false_pos, false_neg = (0, 0, 0, 0)

for i in range(len(val_y)):
    if val_y[i] == rf_pred_y[i]:
        if val_y[i] == 1:
            true_pos += 1
        else:
            true_neg += 1
            
    if val_y[i] != rf_pred_y[i]:
        if val_y[i] == 1:
            false_neg += 1
        else:
            false_pos += 1
            
print(true_neg, false_pos, false_neg, true_pos)

# recall = True positive / true positive + false negative
# precision = True positive / true posistive + false positive
recall = true_pos/(true_pos + false_neg)
precision = true_pos/(true_pos + false_pos)
print('Recall: ', recall, ', Precision: ', precision)

121 16 22 64
Recall:  0.7441860465116279 , Precision:  0.8


In [32]:
from sklearn.experimental import enable_hist_gradient_boosting  
from sklearn.ensemble import HistGradientBoostingClassifier
rf_model = HistGradientBoostingClassifier(random_state=1234)
rf_model.fit(train_X, train_y)
rf_pred_y = rf_model.predict(val_X)

val_y = list(val_y)

true_pos, true_neg, false_pos, false_neg = (0, 0, 0, 0)

for i in range(len(val_y)):
    if val_y[i] == rf_pred_y[i]:
        if val_y[i] == 1:
            true_pos += 1
        else:
            true_neg += 1
            
    if val_y[i] != rf_pred_y[i]:
        if val_y[i] == 1:
            false_neg += 1
        else:
            false_pos += 1
            
print(true_neg, false_pos, false_neg, true_pos)

# recall = True positive / true positive + false negative
# precision = True positive / true posistive + false positive
recall = true_pos/(true_pos + false_neg)
precision = true_pos/(true_pos + false_pos)
print('Recall: ', recall, ', Precision: ', precision)

120 17 19 67
Recall:  0.7790697674418605 , Precision:  0.7976190476190477
