In [26]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# Import necessary packages
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
import tensorflow as tf
tf.autograph.set_verbosity(0)
import logging
logging.getLogger("tensorflow").setLevel(logging.ERROR)

In [27]:
# Read Titanic training dataset
df_train_all = pd.read_csv("./data/titanic_train.csv")

# Preview the data
df_train_all.info()
df_train_all.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 850 entries, 0 to 849
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   passenger_id  850 non-null    int64  
 1   pclass        850 non-null    int64  
 2   name          850 non-null    object 
 3   sex           850 non-null    object 
 4   age           676 non-null    float64
 5   sibsp         850 non-null    int64  
 6   parch         850 non-null    int64  
 7   ticket        850 non-null    object 
 8   fare          849 non-null    float64
 9   cabin         191 non-null    object 
 10  embarked      849 non-null    object 
 11  boat          308 non-null    object 
 12  body          73 non-null     float64
 13  home.dest     464 non-null    object 
 14  survived      850 non-null    int64  
dtypes: float64(3), int64(5), object(7)
memory usage: 99.7+ KB


Unnamed: 0,passenger_id,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest,survived
0,1216,3,"Smyth, Miss. Julia",female,,0,0,335432,7.7333,,Q,13.0,,,1
1,699,3,"Cacic, Mr. Luka",male,38.0,0,0,315089,8.6625,,S,,,Croatia,0
2,1267,3,"Van Impe, Mrs. Jean Baptiste (Rosalie Paula Go...",female,30.0,1,1,345773,24.15,,S,,,,0
3,449,2,"Hocking, Mrs. Elizabeth (Eliza Needs)",female,54.0,1,3,29105,23.0,,S,4.0,,"Cornwall / Akron, OH",1
4,576,2,"Veal, Mr. James",male,40.0,0,0,28221,13.0,,S,,,"Barre, Co Washington, VT",0


In [28]:
# Show some info about the data

df_train_all["survived"].value_counts()

df_train_all[["sex", "survived"]].groupby(["sex"], as_index=False).mean().sort_values(by="survived", ascending=False)

df_train_all[["pclass", "survived"]].groupby(["pclass"], as_index=False).mean().sort_values(by="survived", ascending=False)

0    537
1    313
Name: survived, dtype: int64

Unnamed: 0,sex,survived
0,female,0.702341
1,male,0.186933


Unnamed: 0,pclass,survived
0,1,0.606796
1,2,0.457831
2,3,0.23431


In [29]:
def preprocess_data(df):
    # Drop some columns
    columns = [
        "passenger_id",
        "name",
        "ticket",
        "fare",
        "cabin",
        "embarked", # Port of Embarkation (Cherbourg, Queenstown, Southampton)
        "boat", # Lifeboat
        "body", # Body Identification Number
        "home.dest"] # Home / Destination
    df = df.drop(columns, axis=1)

    # Transform categorical data
    df["sex"] = LabelEncoder().fit_transform(df["sex"])

    # We could just drop the rows with NaN values, but we'd lose almost 200 examples
    # df = df.dropna()

    # Instead, we can impute the values, but fist we have to normalize the data
    df = pd.DataFrame(MinMaxScaler().fit_transform(df), columns=df.columns)
    df = pd.DataFrame(KNNImputer().fit_transform(df), columns=df.columns)

    return df

In [30]:
# Pre-process data
df_train = preprocess_data(df_train_all)

# Preview the data again
df_train.info()
df_train.head()

# Separate our Xs and Ys
X = df_train.drop(["survived"], axis=1).to_numpy()
Y = np.squeeze(df_train[["survived"]].to_numpy())

# Split data into training and validation examples
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.25, random_state=42, stratify=Y)

# Show our training and validation sizes
print("Training examples : X->", X_train.shape, ", Y->", Y_train.shape)
print("Validation examples : X->", X_val.shape, ", Y->", Y_val.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 850 entries, 0 to 849
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   pclass    850 non-null    float64
 1   sex       850 non-null    float64
 2   age       850 non-null    float64
 3   sibsp     850 non-null    float64
 4   parch     850 non-null    float64
 5   survived  850 non-null    float64
dtypes: float64(6)
memory usage: 40.0 KB


Unnamed: 0,pclass,sex,age,sibsp,parch,survived
0,1.0,0.0,0.331106,0.0,0.0,1.0
1,1.0,1.0,0.473904,0.0,0.0,0.0
2,1.0,0.0,0.373695,0.125,0.111111,0.0
3,0.5,0.0,0.674321,0.125,0.333333,1.0
4,0.5,1.0,0.498956,0.0,0.0,0.0


Training examples : X-> (637, 5) , Y-> (637,)
Validation examples : X-> (213, 5) , Y-> (213,)


In [31]:
# Decision Tree
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train, Y_train)
print("Training accuracy: ", round(decision_tree.score(X_train, Y_train) * 100, 2))
print("Validation accuracy: ", round(decision_tree.score(X_val, Y_val) * 100, 2))

DecisionTreeClassifier(random_state=42)

Training accuracy:  95.13
Validation accuracy:  81.22


In [32]:
# Random forest
random_forest = RandomForestClassifier(random_state=42)
random_forest.fit(X_train, Y_train)
print("Training accuracy: ", round(random_forest.score(X_train, Y_train) * 100, 2))
print("Validation accuracy: ", round(random_forest.score(X_val, Y_val) * 100, 2))

RandomForestClassifier(random_state=42)

Training accuracy:  95.13
Validation accuracy:  85.92


In [33]:
# Perceptron
perceptron = Perceptron(random_state=42)
perceptron.fit(X_train, Y_train)
print("Training accuracy: ", round(perceptron.score(X_train, Y_train) * 100, 2))
print("Validation accuracy: ", round(perceptron.score(X_val, Y_val) * 100, 2))

Perceptron(random_state=42)

Training accuracy:  77.86
Validation accuracy:  76.53


In [34]:
# NN
tf.random.set_seed(42)
neural_network = tf.keras.models.Sequential()
neural_network.add(tf.keras.layers.Dense(64, activation='relu', input_dim=X_train.shape[1]))
neural_network.add(tf.keras.layers.Dense(64, activation='relu'))
neural_network.add(tf.keras.layers.Dense(64, activation='relu'))
neural_network.add(tf.keras.layers.Dense(1, activation='sigmoid'))
neural_network.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
print(neural_network.summary())
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=2, verbose=0, mode='auto')
neural_network.fit(X_train, Y_train, epochs=20, batch_size=64, verbose=1, validation_data=(X_val, Y_val), callbacks=[early_stopping])
print("Training accuracy: ", round(neural_network.evaluate(X_train, Y_train, batch_size=64)[1] * 100, 2))
print("Validation accuracy: ", round(neural_network.evaluate(X_val, Y_val, batch_size=64)[1] * 100, 2))

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_8 (Dense)              (None, 64)                384       
_________________________________________________________________
dense_9 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_10 (Dense)             (None, 64)                4160      
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 65        
Total params: 8,769
Trainable params: 8,769
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x25cc7b7e460>

Training accuracy:  80.85
Validation accuracy:  81.69


In [35]:
# Read test_data
df_test_all = pd.read_csv("./data/titanic_test.csv")

# Preprocess data
df_test = preprocess_data(df_test_all)

# Preview the test data
df_test.info()
df_test.head()

# Convert to numpy array
X_test = df_test.to_numpy()

# Predict on test dataset and save results for Kaggle
predictions = decision_tree.predict(X_test)
df_predictions = pd.DataFrame({'passenger_id': df_test_all.passenger_id, "survived": predictions })
df_predictions['survived'] = df_predictions['survived'].round(0).astype(int)
df_predictions.to_csv("./outputs/decision_tree.csv", index=False)

predictions = random_forest.predict(X_test)
df_predictions = pd.DataFrame({'passenger_id': df_test_all.passenger_id, "survived": predictions })
df_predictions['survived'] = df_predictions['survived'].round(0).astype(int)
df_predictions.to_csv("./outputs/random_forest.csv", index=False)

predictions = perceptron.predict(X_test)
df_predictions = pd.DataFrame({'passenger_id': df_test_all.passenger_id, "survived": predictions })
df_predictions['survived'] = df_predictions['survived'].round(0).astype(int)
df_predictions.to_csv("./outputs/perceptron.csv", index=False)

predictions = np.squeeze(neural_network.predict(X_test))
df_predictions = pd.DataFrame({'passenger_id': df_test_all.passenger_id, "survived": predictions })
df_predictions['survived'] = df_predictions['survived'].round(0).astype(int)
df_predictions.to_csv("./outputs/neural_network.csv", index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 459 entries, 0 to 458
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   pclass  459 non-null    float64
 1   sex     459 non-null    float64
 2   age     459 non-null    float64
 3   sibsp   459 non-null    float64
 4   parch   459 non-null    float64
dtypes: float64(5)
memory usage: 18.1 KB


Unnamed: 0,pclass,sex,age,sibsp,parch
0,0.0,1.0,0.235849,0.0,0.333333
1,1.0,1.0,0.383019,0.0,0.0
2,0.0,1.0,0.433963,0.125,0.0
3,1.0,1.0,0.575472,0.0,0.0
4,1.0,1.0,0.292453,0.0,0.0


<u><h2>Accuracy results on test data according to Kaggle</h2></u>
<hr/>
<li>Decision Tree : <b>60.1%</b></li>
<li>Random Forest : <b>82.6%</b></li>
<li>Perceptron : <b>54.3%</b></li>
<li>Neural Network : <b>90.5%</b></li>

Simple example, can definitely improve by better selecting features, like in: https://anelmusic13.medium.com/how-to-score-top-3-in-kaggles-titanic-machine-learning-from-disaster-competition-13d056e262b1
(split fare into categories, family size, cabin & ticket)

Also, could definitely improve with parameters search and cross-validation.
Decision Tree parameters : (criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, class_weight=None, ccp_alpha=0.0)
Random Forest parameters : (n_estimators=100, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None, ccp_alpha=0.0, max_samples=None)
Perceptron parameters : (penalty=None, alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=1000, tol=0.001, shuffle=True, verbose=0, eta0=1.0, n_jobs=None, random_state=0, early_stopping=False, validation_fraction=0.1, n_iter_no_change=5, class_weight=None, warm_start=False)
Neural Network parameter : () and hyper-parameters ()