<a href="https://colab.research.google.com/github/robimalco/colab/blob/main/Titanic_Machine_Learning_from_Disaster.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install -q kaggle

In [3]:
from google.colab import files

In [None]:
files.upload()

In [None]:
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
!kaggle datasets list

In [12]:
!kaggle competitions download -c titanic

test.csv: Skipping, found more recently modified local copy (use --force to force download)
train.csv: Skipping, found more recently modified local copy (use --force to force download)
gender_submission.csv: Skipping, found more recently modified local copy (use --force to force download)


In [11]:
!mkdir train
!unzip train.zip -d train

mkdir: cannot create directory ‘train’: File exists
unzip:  cannot find or open train.zip, train.zip.zip or train.zip.ZIP.


# START

In [13]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data_utils
from sklearn.preprocessing import LabelEncoder

In [39]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [40]:
# Diplay null values of each column
train_df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [46]:
train_df = train_df[~train_df['Age'].isnull()]

In [66]:
train_df = train_df[~train_df['Embarked'].isnull()]

In [151]:
train_df['TicketCluster'] = train_df['Ticket'].str[0]
train_df['TicketCluster'] = np.where(train_df["TicketCluster"].str.isdigit(), "X", train_df["TicketCluster"])

In [None]:
# train_df['TicketCluster'].hist(bins=10)

In [152]:
train_df['AgeCluster'] = pd.cut(train_df['Age'], bins=[0, 5, 10, 20, 30, 40, 50, 60, 70, 81], include_lowest=True, labels=[0, 1, 2, 3, 4, 5, 6, 7, 8])
train_df['FareCluster'] = pd.cut(train_df['Fare'], bins=[0, 51, 101, 2000], include_lowest=True, labels=[0, 1, 2])
categorical_columns = ['Pclass', 'Sex', 'SibSp', 'Parch', 'AgeCluster', 'FareCluster', 'Embarked', 'TicketCluster']

In [153]:
for column in categorical_columns:
    train_df[column] = LabelEncoder().fit_transform(train_df[column])
for column in categorical_columns:
  train_df[column] = train_df[column].astype('category')

In [154]:
pclass = train_df['Pclass'].cat.codes.values
sex = train_df['Sex'].cat.codes.values
sibsp = train_df['SibSp'].cat.codes.values
parch = train_df['Parch'].cat.codes.values
ageCluster = train_df['AgeCluster'].cat.codes.values
fareCluster = train_df['FareCluster'].cat.codes.values
embarked = train_df['Embarked'].cat.codes.values
ticketCluster = train_df['TicketCluster'].cat.codes.values
categorical_data = np.stack([pclass, sex, sibsp, parch, ageCluster, fareCluster, embarked, ticketCluster], 1)

In [155]:
tensor_data = torch.tensor(categorical_data, dtype=torch.int64)
tensor_output = torch.tensor(train_df['Survived'].values).flatten()

In [156]:
categorical_columns_size = [len(train_df[column].cat.categories) for column in categorical_columns]
categorical_embedding_sizes = [(col_size, min(50, (col_size+1)//2)) for col_size in categorical_columns_size]
total_records = len(train_df)
test_records = int(total_records * 0.2)

In [157]:
tensor_train_data = tensor_data[:total_records-test_records]
tensor_test_data = tensor_data[total_records-test_records:total_records]
tensor_train_output = tensor_output[:total_records-test_records]
tensor_test_output = tensor_output[total_records-test_records:total_records]

In [158]:
class Model(nn.Module):
  def __init__(self, embedding_sizes):
    super().__init__()
    self.embeddings = nn.ModuleList([nn.Embedding(categories, size) for categories,size in embedding_sizes])
    n_emb = sum(e.embedding_dim for e in self.embeddings)
    self.lin1 = nn.Linear(n_emb, 200)
    self.lin2 = nn.Linear(200, 70)
    self.lin3 = nn.Linear(70, 2)
    self.bn1 = nn.BatchNorm1d(n_emb)
    self.bn2 = nn.BatchNorm1d(200)
    self.bn3 = nn.BatchNorm1d(70)
    self.emb_drop = nn.Dropout(0.6)
    self.drops = nn.Dropout(0.3)
  def forward(self, x_cat):
    x = [e(x_cat[:,i]) for i, e in enumerate(self.embeddings)]
    x = torch.cat(x, 1)
    x = self.emb_drop(x)
    x = self.bn1(x)
    x = self.lin1(x)
    x = self.drops(x)
    x = self.bn2(x)
    x = F.relu(self.lin2(x))
    x = self.drops(x)
    x = self.bn3(x)
    x = self.lin3(x)
    return x

In [166]:
model = Model(categorical_embedding_sizes)
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [167]:
epochs = 300
aggregated_losses = []

for i in range(epochs):
    i += 1
    y_pred = model(tensor_train_data)
    single_loss = loss_function(y_pred, tensor_train_output)

    aggregated_losses.append(single_loss)
    if i%100 == 1:
        print("epoch: " + str(i) + "\tloss: " + str(single_loss.item()))

    optimizer.zero_grad()
    single_loss.backward()
    optimizer.step()

print("epoch: " + str(i) + "\tloss: " + str(single_loss.item()))

epoch: 1	loss: 0.8375767469406128
epoch: 101	loss: 0.6243467926979065
epoch: 201	loss: 0.5678133368492126
epoch: 300	loss: 0.547076404094696


In [168]:
with torch.no_grad():
    y_val = model(tensor_test_data)
    loss = loss_function(y_val.squeeze(), tensor_test_output)
print("Loss: " + str(loss))

Loss: tensor(0.5211)


In [169]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

y_val_sklearn = np.argmax(y_val, axis=1)

print("\nconfusion_matrix")
print(confusion_matrix(tensor_test_output, y_val_sklearn))
print("\nclassification_report")
print(classification_report(tensor_test_output, y_val_sklearn))
print("\naccuracy_score")
print(accuracy_score(tensor_test_output, y_val_sklearn))


confusion_matrix
[[75 12]
 [21 34]]

classification_report
              precision    recall  f1-score   support

           0       0.78      0.86      0.82        87
           1       0.74      0.62      0.67        55

    accuracy                           0.77       142
   macro avg       0.76      0.74      0.75       142
weighted avg       0.76      0.77      0.76       142


accuracy_score
0.7676056338028169
