<a href="https://colab.research.google.com/github/robimalco/colab/blob/main/Titanic_Machine_Learning_from_Disaster.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q kaggle

In [2]:
from google.colab import files

In [None]:
files.upload()

In [4]:
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
!kaggle datasets list

In [None]:
!kaggle competitions download -c titanic

In [None]:
!mkdir train
!unzip train.zip -d train

# START

In [509]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data_utils
from sklearn.preprocessing import LabelEncoder

In [510]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [None]:
# Diplay null values of each column
train_df.isnull().sum()

In [512]:
# Manage titles
def get_title(name):
    if '.' in name:
        return name.split(',')[1].split('.')[0].strip()
    else:
        return 'Unknown'

def replace_titles(x): # Normalize the titles
    title = x['TitleCluster']
    if title in ['Capt', 'Col', 'Don', 'Jonkheer', 'Major', 'Rev', 'Sir']:
        return 'Mr'
    elif title in ['the Countess', 'Mme', 'Lady']:
        return 'Mrs'
    elif title in ['Mlle', 'Ms']:
        return 'Miss'
    elif title =='Dr':
        if x['Sex']=='male':
            return 'Mr'
        else:
            return 'Mrs'
    else:
        return title

train_df['TitleCluster'] = train_df['Name'].map(lambda x: get_title(x))
train_df['TitleCluster'] = train_df.apply(replace_titles, axis=1)

test_df['TitleCluster'] = test_df['Name'].map(lambda x: get_title(x))
test_df['TitleCluster'] = test_df.apply(replace_titles, axis=1)

In [513]:
# Manage tickets
train_df['TicketCluster'] = train_df['Ticket'].str[0]
train_df['TicketCluster'] = np.where(train_df["TicketCluster"].str.isdigit(), "X", train_df["TicketCluster"])

test_df['TicketCluster'] = test_df['Ticket'].str[0]
test_df['TicketCluster'] = np.where(test_df["TicketCluster"].str.isdigit(), "X", test_df["TicketCluster"])

In [514]:
# Manage missing age
def fill_missing_age(x):
    age = x['Age']
    title = x['TitleCluster']
    if np.isnan(age):
      if title == 'Dona':
          return 39
      elif title == 'Master':
          return 6
      elif title == 'Miss':
          return 21
      elif title == 'Mr':
        return 32
      elif title == 'Mrs':
        return 37
    else:
      return age

train_df['Age'] = train_df.apply(fill_missing_age, axis=1)
test_df['Age'] = test_df.apply(fill_missing_age, axis=1)

In [515]:
# Manage missing Embarked
def fill_missing_embarked(x):
    embarked = x['Embarked']
    if embarked != embarked:
      return "X"
    else:
      return embarked

train_df['Embarked'] = train_df.apply(fill_missing_embarked, axis=1)
test_df['Embarked'] = test_df.apply(fill_missing_embarked, axis=1)

In [None]:
# train_df['Embarked'].hist(bins=10)

In [516]:
train_df['AgeCluster'] = pd.cut(train_df['Age'], bins=[0, 5, 10, 20, 30, 40, 50, 60, 70, 81], include_lowest=True, labels=[0, 1, 2, 3, 4, 5, 6, 7, 8])
train_df['FareCluster'] = pd.cut(train_df['Fare'], bins=[0, 51, 101, 2000], include_lowest=True, labels=[0, 1, 2])

test_df['AgeCluster'] = pd.cut(test_df['Age'], bins=[0, 5, 10, 20, 30, 40, 50, 60, 70, 81], include_lowest=True, labels=[0, 1, 2, 3, 4, 5, 6, 7, 8])
test_df['FareCluster'] = pd.cut(test_df['Fare'], bins=[0, 51, 101, 2000], include_lowest=True, labels=[0, 1, 2])

categorical_columns = ['Pclass', 'Sex', 'SibSp', 'Parch', 'AgeCluster', 'FareCluster', 'Embarked', 'TicketCluster', 'TitleCluster']

In [517]:
for column in categorical_columns:
    train_df[column] = LabelEncoder().fit_transform(train_df[column])
    test_df[column] = LabelEncoder().fit_transform(test_df[column])
for column in categorical_columns:
  train_df[column] = train_df[column].astype('category')
  test_df[column] = test_df[column].astype('category')

In [518]:
pclass_train = train_df['Pclass'].cat.codes.values
sex_train = train_df['Sex'].cat.codes.values
sibsp_train = train_df['SibSp'].cat.codes.values
parch_train = train_df['Parch'].cat.codes.values
ageCluster_train = train_df['AgeCluster'].cat.codes.values
fareCluster_train = train_df['FareCluster'].cat.codes.values
embarked_train = train_df['Embarked'].cat.codes.values
ticketCluster_train = train_df['TicketCluster'].cat.codes.values
titleCluster_train = train_df['TitleCluster'].cat.codes.values
categorical_data_train = np.stack([pclass_train, sex_train, sibsp_train, parch_train, ageCluster_train, fareCluster_train, embarked_train, ticketCluster_train, titleCluster_train], 1)

In [519]:
tensor_data_train = torch.tensor(categorical_data_train, dtype=torch.int64)
tensor_output_train = torch.tensor(train_df['Survived'].values).flatten()

tensor_data_test = torch.tensor(categorical_data_test, dtype=torch.int64)

In [520]:
temp_concat = pd.concat([train_df, test_df])
categorical_columns_size = [len(temp_concat[column].astype('category').cat.categories) for column in categorical_columns]
categorical_embedding_sizes = [(col_size, min(50, (col_size+1)//2)) for col_size in categorical_columns_size]
total_records = len(train_df)
test_records = int(total_records * 0.2)

In [521]:
tensor_train_data = tensor_data_train[:total_records-test_records]
tensor_test_data = tensor_data_train[total_records-test_records:total_records]
tensor_train_output = tensor_output_train[:total_records-test_records]
tensor_test_output = tensor_output_train[total_records-test_records:total_records]

In [585]:
class Model(nn.Module):
  def __init__(self, embedding_sizes):
    super().__init__()
    self.embeddings = nn.ModuleList([nn.Embedding(categories, size) for categories,size in embedding_sizes])
    n_emb = sum(e.embedding_dim for e in self.embeddings)
    self.lin1 = nn.Linear(n_emb, 200)
    self.lin2 = nn.Linear(200, 70)
    self.lin3 = nn.Linear(70, 2)
    self.bn1 = nn.BatchNorm1d(n_emb)
    self.bn2 = nn.BatchNorm1d(200)
    self.bn3 = nn.BatchNorm1d(70)
    self.emb_drop = nn.Dropout(0.6)
    self.drops = nn.Dropout(0.3)
    self.sig = nn.Sigmoid()
  def forward(self, x_cat):
    x = [e(x_cat[:,i]) for i, e in enumerate(self.embeddings)]
    x = torch.cat(x, 1)
    x = self.emb_drop(x)
    x = self.bn1(x)
    x = self.lin1(x)
    x = self.drops(x)
    x = self.bn2(x)
    x = F.relu(self.lin2(x))
    x = self.drops(x)
    x = self.bn3(x)
    x = self.lin3(x)
    x = self.sig(x)
    return x

In [586]:
model = Model(categorical_embedding_sizes)
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [587]:
epochs = 600
aggregated_losses = []

for i in range(epochs):
    i += 1
    y_pred = model(tensor_train_data)
    single_loss = loss_function(y_pred, tensor_train_output)
    aggregated_losses.append(single_loss)
    if i%200 == 1:
        print("epoch: " + str(i) + "\tloss: " + str(single_loss.item()))

    optimizer.zero_grad()
    single_loss.backward()
    optimizer.step()

print("epoch: " + str(i) + "\tloss: " + str(single_loss.item()))

epoch: 1	loss: 0.6998015642166138
epoch: 201	loss: 0.5130061507225037
epoch: 401	loss: 0.49811744689941406
epoch: 600	loss: 0.501679003238678


In [567]:
with torch.no_grad():
    y_val = model(tensor_test_data)
    loss = loss_function(y_val, tensor_test_output)
print("Loss: " + str(loss))

Loss: tensor(0.4619)


In [568]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

y_val_sklearn = np.argmax(y_val, axis=1)

print("\nconfusion_matrix")
print(confusion_matrix(tensor_test_output, y_val_sklearn))
print("\nclassification_report")
print(classification_report(tensor_test_output, y_val_sklearn))
print("\naccuracy_score")
print(accuracy_score(tensor_test_output, y_val_sklearn))


confusion_matrix
[[103  12]
 [ 16  47]]

classification_report
              precision    recall  f1-score   support

           0       0.87      0.90      0.88       115
           1       0.80      0.75      0.77        63

    accuracy                           0.84       178
   macro avg       0.83      0.82      0.83       178
weighted avg       0.84      0.84      0.84       178


accuracy_score
0.8426966292134831


In [576]:
# Make predictions
with torch.no_grad():
    y_pred = model(tensor_data_test)
print("Loss: " + str(loss))

Loss: tensor(0.4619)


In [577]:
values, labels = torch.max(y_pred, 1)
survived = labels.data.numpy()

In [578]:
submission_df = pd.DataFrame({'PassengerId': test_df['PassengerId'], 'Survived': survived})
submission_df.head(20)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,1
3,895,0
4,896,0
5,897,0
6,898,0
7,899,0
8,900,0
9,901,0


In [538]:
from google.colab import files
submission_df.to_csv('submission.csv', index=False)
files.download('submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>