<a href="https://colab.research.google.com/github/robimalco/colab/blob/main/Titanic_Machine_Learning_from_Disaster.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q kaggle

In [None]:
from google.colab import files

In [None]:
files.upload()

In [None]:
# !mkdir ~/.kaggle
# !cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
!kaggle datasets list

In [None]:
!kaggle competitions download -c titanic

In [None]:
!mkdir train
!unzip train.zip -d train

# START

In [491]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data_utils
from sklearn.preprocessing import LabelEncoder

In [422]:
train_df = pd.read_csv('train.csv').dropna()
test_df = pd.read_csv('test.csv').dropna()

In [423]:
train_df['AgeCluster'] = pd.cut(train_df['Age'], bins=[0, 10, 20, 40, 60, 120], include_lowest=True, labels=[0, 1, 2, 3, 4])
train_df['FareCluster'] = pd.cut(train_df['Fare'], bins=[0, 51, 101, 2000], include_lowest=True, labels=[0, 1, 2])
categorical_columns = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked', 'AgeCluster', 'FareCluster']

In [425]:
for column in categorical_columns:
    train_df[column] = LabelEncoder().fit_transform(train_df[column])

for column in categorical_columns:
  train_df[column] = train_df[column].astype('category')

In [434]:
pclass = train_df['Pclass'].cat.codes.values
sex = train_df['Sex'].cat.codes.values
sibsp = train_df['SibSp'].cat.codes.values
parch = train_df['Parch'].cat.codes.values
embarked = train_df['Embarked'].cat.codes.values
ageCluster = train_df['AgeCluster'].cat.codes.values
fareCluster = train_df['FareCluster'].cat.codes.values
categorical_data = np.stack([pclass, sex, sibsp, parch, embarked, ageCluster, fareCluster], 1)

In [437]:
tensor_data = torch.tensor(categorical_data, dtype=torch.int64)
tensor_output = torch.tensor(train_df['Survived'].values).flatten()

In [439]:
categorical_columns_size = [len(train_df[column].cat.categories) for column in categorical_columns]
categorical_embedding_sizes = [(col_size, min(50, (col_size+1)//2)) for col_size in categorical_columns_size]
total_records = len(train_df)
test_records = int(total_records * 0.2)

In [440]:
tensor_train_data = tensor_data[:total_records-test_records]
tensor_test_data = tensor_data[total_records-test_records:total_records]
tensor_train_output = tensor_output[:total_records-test_records]
tensor_test_output = tensor_output[total_records-test_records:total_records]

In [553]:
class Model(nn.Module):
  def __init__(self, embedding_sizes):
    super().__init__()
    self.embeddings = nn.ModuleList([nn.Embedding(categories, size) for categories,size in embedding_sizes])
    n_emb = 14
    self.lin1 = nn.Linear(n_emb, 200)
    self.lin2 = nn.Linear(200, 70)
    self.lin3 = nn.Linear(70, 2)
    self.bn1 = nn.BatchNorm1d(14)
    self.bn2 = nn.BatchNorm1d(200)
    self.bn3 = nn.BatchNorm1d(70)
    self.emb_drop = nn.Dropout(0.6)
    self.drops = nn.Dropout(0.3)
  def forward(self, x_cat):
    x = [e(x_cat[:,i]) for i, e in enumerate(self.embeddings)]
    x = torch.cat(x, 1)
    x = self.emb_drop(x)
    x = self.bn1(x)
    x = self.lin1(x)
    x = self.drops(x)
    x = self.bn2(x)
    x = F.relu(self.lin2(x))
    x = self.drops(x)
    x = self.bn3(x)
    x = self.lin3(x)
    return x

In [561]:
model = Model(categorical_embedding_sizes)
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [562]:
epochs = 1000
aggregated_losses = []

for i in range(epochs):
    i += 1
    y_pred = model(tensor_train_data)
    single_loss = loss_function(y_pred, tensor_train_output)

    aggregated_losses.append(single_loss)
    if i%100 == 1:
        print("epoch: " + str(i) + "\tloss: " + str(single_loss.item()))

    optimizer.zero_grad()
    single_loss.backward()
    optimizer.step()

print("epoch: " + str(i) + "\tloss: " + str(single_loss.item()))

epoch: 1	loss: 0.7676478028297424
epoch: 101	loss: 0.5549042820930481
epoch: 201	loss: 0.5383870601654053
epoch: 301	loss: 0.5352759957313538
epoch: 401	loss: 0.5101067423820496
epoch: 501	loss: 0.5223804116249084
epoch: 601	loss: 0.5155031085014343
epoch: 701	loss: 0.4847456216812134
epoch: 801	loss: 0.49148643016815186
epoch: 901	loss: 0.522858202457428
epoch: 1000	loss: 0.49923235177993774


In [557]:
with torch.no_grad():
    y_val = model(tensor_test_data)
    loss = loss_function(y_val.squeeze(), tensor_test_output)
print("Loss: " + str(loss))

Loss: tensor(0.7064)


In [558]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

y_val_sklearn = np.argmax(y_val, axis=1)

print("\nconfusion_matrix")
print(confusion_matrix(tensor_test_output, y_val_sklearn))
print("\nclassification_report")
print(classification_report(tensor_test_output, y_val_sklearn))
print("\naccuracy_score")
print(accuracy_score(tensor_test_output, y_val_sklearn))


confusion_matrix
[[ 2  8]
 [11 15]]

classification_report
              precision    recall  f1-score   support

           0       0.15      0.20      0.17        10
           1       0.65      0.58      0.61        26

    accuracy                           0.47        36
   macro avg       0.40      0.39      0.39        36
weighted avg       0.51      0.47      0.49        36


accuracy_score
0.4722222222222222
