<u><h1>Testing some machine learning models on Kaggle's Titanic dataset</u></h1>

In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# Import necessary packages
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
import tensorflow as tf
tf.autograph.set_verbosity(0)
import logging
logging.getLogger("tensorflow").setLevel(logging.ERROR)

In [2]:
# Read Titanic training dataset
df_train_all = pd.read_csv("./data/titanic_train.csv")

# Preview the data
df_train_all.info()
df_train_all.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Show some info about the data

df_train_all["Survived"].value_counts()

df_train_all[["Sex", "Survived"]].groupby(["Sex"], as_index=False).mean().sort_values(by="Survived", ascending=False)

df_train_all[["Pclass", "Survived"]].groupby(["Pclass"], as_index=False).mean().sort_values(by="Survived", ascending=False)

0    549
1    342
Name: Survived, dtype: int64

Unnamed: 0,Sex,Survived
0,female,0.742038
1,male,0.188908


Unnamed: 0,Pclass,Survived
0,1,0.62963
1,2,0.472826
2,3,0.242363


In [4]:
def preprocess_data(df):
    # Drop some columns
    columns = [
        "PassengerId",
        "Name",
        "Ticket",
        "Cabin",
        "Embarked", # Port of Embarkation (Cherbourg, Queenstown, Southampton)
        ]
    df = df.drop(columns, axis=1)

    # Transform categorical data
    df["Sex"] = LabelEncoder().fit_transform(df["Sex"])

    # We could just drop the rows with NaN values, but we'd lose almost 200 examples
    # df = df.dropna()

    # Instead, we can impute the values, but fist we have to normalize the data
    df = pd.DataFrame(MinMaxScaler().fit_transform(df), columns=df.columns)
    df = pd.DataFrame(KNNImputer().fit_transform(df), columns=df.columns)

    return df

In [5]:
# Pre-process data
df_train = preprocess_data(df_train_all)

# Preview the data again
df_train.info()
df_train.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    float64
 1   Pclass    891 non-null    float64
 2   Sex       891 non-null    float64
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    float64
 5   Parch     891 non-null    float64
 6   Fare      891 non-null    float64
dtypes: float64(7)
memory usage: 48.9 KB


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0.0,1.0,1.0,0.271174,0.125,0.0,0.014151
1,1.0,0.0,0.0,0.472229,0.125,0.0,0.139136
2,1.0,1.0,0.0,0.321438,0.0,0.0,0.015469
3,1.0,0.0,0.0,0.434531,0.125,0.0,0.103644
4,0.0,1.0,1.0,0.434531,0.0,0.0,0.015713


In [6]:
# Separate our Xs and Ys
X = df_train.drop(["Survived"], axis=1).to_numpy()
Y = np.squeeze(df_train[["Survived"]].to_numpy())

# Split data into training and validation examples
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.25, random_state=42, stratify=Y)

# Show our training and validation sizes
print("Training examples : X->", X_train.shape, ", Y->", Y_train.shape)
print("Validation examples : X->", X_val.shape, ", Y->", Y_val.shape)

Training examples : X-> (668, 6) , Y-> (668,)
Validation examples : X-> (223, 6) , Y-> (223,)


In [7]:
# Decision Tree
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train, Y_train)
print("Training accuracy: ", round(decision_tree.score(X_train, Y_train) * 100, 2))
print("Validation accuracy: ", round(decision_tree.score(X_val, Y_val) * 100, 2))

DecisionTreeClassifier(random_state=42)

Training accuracy:  98.95
Validation accuracy:  74.44


In [8]:
# Random forest
random_forest = RandomForestClassifier(random_state=42)
random_forest.fit(X_train, Y_train)
print("Training accuracy: ", round(random_forest.score(X_train, Y_train) * 100, 2))
print("Validation accuracy: ", round(random_forest.score(X_val, Y_val) * 100, 2))

RandomForestClassifier(random_state=42)

Training accuracy:  98.95
Validation accuracy:  80.72


In [9]:
# Perceptron
perceptron = Perceptron(penalty='l2', random_state=42)
perceptron.fit(X_train, Y_train)
print("Training accuracy: ", round(perceptron.score(X_train, Y_train) * 100, 2))
print("Validation accuracy: ", round(perceptron.score(X_val, Y_val) * 100, 2))

Perceptron(penalty='l2', random_state=42)

Training accuracy:  79.04
Validation accuracy:  77.13


In [10]:
# NN
tf.random.set_seed(42)
neural_network = tf.keras.models.Sequential()
neural_network.add(tf.keras.layers.Dense(64, activation='relu', input_dim=X_train.shape[1]))
neural_network.add(tf.keras.layers.Dense(64, activation='relu'))
neural_network.add(tf.keras.layers.Dense(64, activation='relu'))
neural_network.add(tf.keras.layers.Dense(1, activation='sigmoid'))
neural_network.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
print(neural_network.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 64)                448       
_________________________________________________________________
dense_1 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_2 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 65        
Total params: 8,833
Trainable params: 8,833
Non-trainable params: 0
_________________________________________________________________
None


In [11]:
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=2, verbose=0, mode='auto')
neural_network.fit(X_train, Y_train, epochs=20, batch_size=64, verbose=1, validation_data=(X_val, Y_val), callbacks=[early_stopping])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x2ea1637b1c0>

In [12]:
print("Training accuracy: ", round(neural_network.evaluate(X_train, Y_train, batch_size=64)[1] * 100, 2))
print("Validation accuracy: ", round(neural_network.evaluate(X_val, Y_val, batch_size=64)[1] * 100, 2))

Training accuracy:  83.83
Validation accuracy:  82.06


In [13]:
# Read test_data
df_test_all = pd.read_csv("./data/titanic_test.csv")
df_test_all.info()
df_test_all.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [14]:
# Preprocess data
df_test = preprocess_data(df_test_all)

# Preview the test data
df_test.info()
df_test.head()

# Convert to numpy array
X_test = df_test.to_numpy()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Pclass  418 non-null    float64
 1   Sex     418 non-null    float64
 2   Age     418 non-null    float64
 3   SibSp   418 non-null    float64
 4   Parch   418 non-null    float64
 5   Fare    418 non-null    float64
dtypes: float64(6)
memory usage: 19.7 KB


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
0,1.0,1.0,0.452723,0.0,0.0,0.015282
1,1.0,0.0,0.617566,0.125,0.0,0.013663
2,0.5,1.0,0.815377,0.0,0.0,0.018909
3,1.0,1.0,0.353818,0.0,0.0,0.016908
4,1.0,0.0,0.287881,0.125,0.111111,0.023984


In [15]:
# Predict on test dataset and save results for Kaggle
predictions = decision_tree.predict(X_test)
df_predictions = pd.DataFrame({'PassengerId': df_test_all.PassengerId, "Survived": predictions })
df_predictions['Survived'] = df_predictions['Survived'].round(0).astype(int)
df_predictions.to_csv("./outputs/decision_tree.csv", index=False)

predictions = random_forest.predict(X_test)
df_predictions = pd.DataFrame({'PassengerId': df_test_all.PassengerId, "Survived": predictions })
df_predictions['Survived'] = df_predictions['Survived'].round(0).astype(int)
df_predictions.to_csv("./outputs/random_forest.csv", index=False)

predictions = perceptron.predict(X_test)
df_predictions = pd.DataFrame({'PassengerId': df_test_all.PassengerId, "Survived": predictions })
df_predictions['Survived'] = df_predictions['Survived'].round(0).astype(int)
df_predictions.to_csv("./outputs/perceptron.csv", index=False)

predictions = np.squeeze(neural_network.predict(X_test))
df_predictions = pd.DataFrame({'PassengerId': df_test_all.PassengerId, "Survived": predictions })
df_predictions['Survived'] = df_predictions['Survived'].round(0).astype(int)
df_predictions.to_csv("./outputs/neural_network.csv", index=False)

# Preview results
df_predictions.head()

print("All done")

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


All done


<u><h2>Accuracy results on test data according to Kaggle</h2></u>
<hr/>
<li>Decision Tree : <b>72.248%</b></li>
<li>Random Forest : <b>75.598%</b></li>
<li>Perceptron : <b>71.770%</b></li>
<li>Neural Network : <b>76.976%</b></li>
<hr/>
These simple examples can definitely be improved, first by better selecting features, like in: https://anelmusic13.medium.com/how-to-score-top-3-in-kaggles-titanic-machine-learning-from-disaster-competition-13d056e262b1

Also, we could improve the models with (hyper)parameters search and cross-validation.
