## In this notebook

- Train a Titanic survival model (random forest classifier) and export it to a pickle file.

In [1]:
import os

import pickle
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
# --- file locations ---------------------------------------------------------
DATA_PATH = "data.csv"            # input CSV read by the data-preparation step
MODEL_FILE_PATH = "model.pickle"  # where the trained model gets pickled

# --- column names (after the snake_case rename) -----------------------------
Y_COLUMN = "survived"  # target column
# Names of the feature columns, i.e. every prepared column except the target.
X_COLUMNS = ["class", "age", "sib_sp_cnt", "par_child_cnt", "fare", "is_male"]

## Prepare data

In [3]:
# Read the CSV, rename its columns to snake_case and discard the columns that
# are listed in the drop step, all in one chain.
df = (
    pd.read_csv(DATA_PATH)
    .rename(
        columns={
            "PassengerId": "passenger_id",
            "Survived": "survived",
            "Pclass": "class",
            "Name": "name",
            "Sex": "sex",
            "Age": "age",
            "SibSp": "sib_sp_cnt",
            "Parch": "par_child_cnt",
            "Ticket": "ticket",
            "Fare": "fare",
            "Cabin": "cabin",
            "Embarked": "embarked",
        }
    )
    .drop(columns=["passenger_id", "name", "ticket", "cabin", "embarked"])
)

# show the last rows as a quick sanity check
df.tail()

Unnamed: 0,survived,class,sex,age,sib_sp_cnt,par_child_cnt,fare
886,0,2,male,27.0,0,0,13.0
887,1,1,female,19.0,0,0,30.0
888,0,3,female,,1,2,23.45
889,1,1,male,26.0,0,0,30.0
890,0,3,male,32.0,0,0,7.75


In [4]:
# Remove every row that has a missing value in any column, then renumber the
# index from 0 so it is contiguous again.
df = df.dropna(axis="index", how="any").reset_index(drop=True)

df.tail()

Unnamed: 0,survived,class,sex,age,sib_sp_cnt,par_child_cnt,fare
709,0,3,female,39.0,0,5,29.125
710,0,2,male,27.0,0,0,13.0
711,1,1,female,19.0,0,0,30.0
712,1,1,male,26.0,0,0,30.0
713,0,3,male,32.0,0,0,7.75


In [5]:
# Encode sex as a 0/1 integer column: is_male is 1 for "male", 0 otherwise.
#
# The guard keeps the cell re-runnable: once "sex" has been replaced by
# "is_male", running the cell again is a no-op instead of failing on the
# missing "sex" column.
if "sex" in df.columns:
    df = (
        df
        .assign(is_male=lambda frame: (frame["sex"] == "male").astype(int))
        .drop(columns=["sex"])
    )
elif "is_male" not in df.columns:
    # Neither the raw nor the encoded column is present: fail loudly rather
    # than silently continuing without the feature.
    raise KeyError("expected a 'sex' column to encode or an existing 'is_male' column")

df.tail()

Unnamed: 0,survived,class,age,sib_sp_cnt,par_child_cnt,fare,is_male
709,0,3,39.0,0,5,29.125,0
710,0,2,27.0,0,0,13.0,1
711,1,1,19.0,0,0,30.0,0
712,1,1,26.0,0,0,30.0,1
713,0,3,32.0,0,0,7.75,1


In [6]:
# split to train and test datasets
#
# Features are selected through X_COLUMNS so that the feature set and its
# order are explicit, rather than implicitly taking whatever non-target
# columns the frame happens to contain at this point.
X_train, X_test, y_train, y_test = train_test_split(
    df[X_COLUMNS],
    df[Y_COLUMN],
    test_size=0.15,   # hold out 15% of the rows for evaluation
    random_state=42,  # fixed seed so the split is reproducible
)

print(f"{X_train.shape=}")
print(f"{y_train.shape=}")

print(f"{X_test.shape=}")
print(f"{y_test.shape=}")

X_train.shape=(606, 6)
y_train.shape=(606,)
X_test.shape=(108, 6)
y_test.shape=(108,)


## Train model

In [7]:
# Build the random forest (tree depth capped at 3, fixed seed) and fit it on
# the training split in one expression.
clf = RandomForestClassifier(
    max_depth=3,
    random_state=0,
).fit(X_train, y_train)

# end the cell on the fitted estimator so it is displayed
clf

In [8]:
# Score the fitted model on the held-out test split.
y_test_pred = clf.predict(X_test)

test_accuracy = accuracy_score(y_test, y_test_pred)
# Per scikit-learn's documented convention, rows are the true labels and
# columns are the predicted labels.
test_confusion = confusion_matrix(y_test, y_test_pred)

print("Results:")
print(f"- accuracy: {test_accuracy:,.3f}")
print(f"- confusion matrix: \n{test_confusion}")

Results:
- accuracy: 0.741
- confusion matrix: 
[[49 16]
 [12 31]]


## Save model

In [9]:
# Serialise the fitted classifier to MODEL_FILE_PATH; the file is opened in
# binary mode because pickle writes bytes.
with open(MODEL_FILE_PATH, "wb") as model_out:
    pickle.dump(clf, model_out)

In [10]:
# load the model
#
# Example of how to read the pickled model back. It is left commented out, so
# it does not run as part of this notebook.
# Caution: pickle.load can execute arbitrary code, so only unpickle files
# that come from a trusted source.

# with open(MODEL_FILE_PATH, "rb") as input_file:
#     clf_loaded = pickle.load(input_file)

## Results

- The model is trained (test accuracy 0.741) and exported to a pickle file.