<a href="https://colab.research.google.com/github/robimalco/colab/blob/main/Titanic_Machine_Learning_from_Disaster.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q kaggle

In [2]:
from google.colab import files

In [None]:
files.upload()

In [4]:
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
!kaggle datasets list

In [None]:
!kaggle competitions download -c titanic

In [None]:
!mkdir train
!unzip train.zip -d train

# START

In [8]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data_utils
from sklearn.preprocessing import LabelEncoder

In [152]:
train_df = pd.read_csv('train.csv')
train_df['df_source'] = 'train.csv'

test_df = pd.read_csv('test.csv')
test_df['df_source'] = 'test.csv'

total_df = pd.concat([train_df, test_df])

In [None]:
# Diplay null values of each column
total_df.isnull().sum()

In [153]:
# Manage titles
def get_title(name):
    if '.' in name:
        return name.split(',')[1].split('.')[0].strip()
    else:
        return 'Unknown'

def replace_titles(x): # Normalize the titles
    title = x['TitleCluster']
    if title in ['Capt', 'Col', 'Don', 'Jonkheer', 'Major', 'Rev', 'Sir']:
        return 'Mr'
    elif title in ['the Countess', 'Mme', 'Lady']:
        return 'Mrs'
    elif title in ['Mlle', 'Ms']:
        return 'Miss'
    elif title =='Dr':
        if x['Sex']=='male':
            return 'Mr'
        else:
            return 'Mrs'
    else:
        return title

total_df['TitleCluster'] = total_df['Name'].map(lambda x: get_title(x))
total_df['TitleCluster'] = total_df.apply(replace_titles, axis=1)

In [154]:
# Manage tickets
total_df['TicketCluster'] = total_df['Ticket'].str[0]
total_df['TicketCluster'] = np.where(total_df["TicketCluster"].str.isdigit(), "X", total_df["TicketCluster"])

In [155]:
# Manage missing age
def fill_missing_age(x):
    age = x['Age']
    title = x['TitleCluster']
    if np.isnan(age):
      if title == 'Dona':
          return total_df[total_df['TitleCluster'] == 'Dona']['Age'].median()
      elif title == 'Master':
          return total_df[total_df['TitleCluster'] == 'Master']['Age'].median()
      elif title == 'Miss':
          return total_df[total_df['TitleCluster'] == 'Miss']['Age'].median()
      elif title == 'Mr':
        return total_df[total_df['TitleCluster'] == 'Mr']['Age'].median()
      elif title == 'Mrs':
        return total_df[total_df['TitleCluster'] == 'Mrs']['Age'].median()
    else:
      return age

total_df['Age'] = total_df.apply(fill_missing_age, axis=1)

In [156]:
# Manage missing Embarked
def fill_missing_embarked(x):
    embarked = x['Embarked']
    if embarked != embarked:
      return "X"
    else:
      return embarked

total_df['Embarked'] = total_df.apply(fill_missing_embarked, axis=1)

In [157]:
# Manage Pclass / SibSp
def family(x):
    pclass = x['Pclass']
    sibSp = x['SibSp']
    pcSib = sibSp + pclass
    if pcSib < 2:
        return 'Single'
    elif pcSib == 2:
        return 'Couple'
    elif pcSib <= 4:
        return 'InterM'
    else:
        return 'Large'
    
total_df['FamilyCluster'] = total_df.apply(family, axis=1)

In [158]:
total_df['AgeCluster'] = pd.cut(total_df['Age'], bins=[0, 5, 10, 20, 30, 40, 50, 60, 70, 81], include_lowest=True, labels=[0, 1, 2, 3, 4, 5, 6, 7, 8])
total_df['FareCluster'] = pd.cut(total_df['Fare'], bins=[0, 51, 101, 2000], include_lowest=True, labels=[0, 1, 2])

In [159]:
categorical_columns = ['df_source', 'Pclass', 'Sex', 'AgeCluster', 'FareCluster', 'Embarked', 'TicketCluster', 'TitleCluster', 'FamilyCluster']
cat_total_df = total_df[categorical_columns]

In [None]:
for column in categorical_columns:
  cat_total_df[column] = LabelEncoder().fit_transform(cat_total_df[column])
for column in categorical_columns:
  cat_total_df[column] = cat_total_df[column].astype('category')

In [161]:
train_total_df = cat_total_df[cat_total_df['df_source'] == 1]
test_total_df = cat_total_df[cat_total_df['df_source'] == 0]

In [162]:
def create_categorical_stack(input_df):
  stack = []
  for column in categorical_columns:
    temp_stack = input_df[column].cat.codes.values
    stack.append(temp_stack)
  return np.stack(stack, 1)

train_categorical_df = create_categorical_stack(train_total_df)
test_categorical_df = create_categorical_stack(test_total_df)

In [185]:
tensor_train = torch.tensor(train_categorical_df, dtype=torch.int64)
tensor_output = torch.tensor(train_df['Survived']).flatten()

tensor_test = torch.tensor(test_categorical_df, dtype=torch.int64)

In [193]:
total_records_train = len(train_total_df)
test_records_train = int(total_records_train * 0.2)

tensor_train_data = tensor_train[:total_records_train-test_records_train]
tensor_train_output = tensor_output[:total_records_train-test_records_train]

tensor_test_data = tensor_train[total_records_train-test_records_train:total_records_train]
tensor_test_output = tensor_output[total_records_train-test_records_train:total_records_train]

In [189]:
categorical_columns_size = [len(cat_total_df[column].astype('category').cat.categories) for column in categorical_columns]
categorical_embedding_sizes = [(col_size, min(50, (col_size+1)//2)) for col_size in categorical_columns_size]

In [190]:
class Model(nn.Module):
  def __init__(self, embedding_sizes):
    super().__init__()
    self.embeddings = nn.ModuleList([nn.Embedding(categories, size) for categories,size in embedding_sizes])
    n_emb = sum(e.embedding_dim for e in self.embeddings)
    self.lin1 = nn.Linear(n_emb, 200)
    self.lin2 = nn.Linear(200, 70)
    self.lin3 = nn.Linear(70, 2)
    self.bn1 = nn.BatchNorm1d(n_emb)
    self.bn2 = nn.BatchNorm1d(200)
    self.bn3 = nn.BatchNorm1d(70)
    self.emb_drop = nn.Dropout(0.6)
    self.drops = nn.Dropout(0.3)
    self.sig = nn.Sigmoid()
  def forward(self, x_cat):
    x = [e(x_cat[:,i]) for i, e in enumerate(self.embeddings)]
    x = torch.cat(x, 1)
    x = self.emb_drop(x)
    x = self.bn1(x)
    x = self.lin1(x)
    x = self.drops(x)
    x = self.bn2(x)
    x = F.relu(self.lin2(x))
    x = self.drops(x)
    x = self.bn3(x)
    x = self.lin3(x)
    x = self.sig(x)
    return x

In [191]:
model = Model(categorical_embedding_sizes)
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [192]:
epochs = 300
aggregated_losses = []

for i in range(epochs):
    i += 1
    y_pred = model(tensor_train_data)
    single_loss = loss_function(y_pred, tensor_train_output)
    aggregated_losses.append(single_loss)
    if i%200 == 1:
        print("epoch: " + str(i) + "\tloss: " + str(single_loss.item()))

    optimizer.zero_grad()
    single_loss.backward()
    optimizer.step()

print("epoch: " + str(i) + "\tloss: " + str(single_loss.item()))

epoch: 1	loss: 0.7078914642333984
epoch: 201	loss: 0.5226251482963562
epoch: 300	loss: 0.513256311416626


In [194]:
with torch.no_grad():
    y_val = model(tensor_test_data)
    loss = loss_function(y_val, tensor_test_output)
print("Loss: " + str(loss))

Loss: tensor(0.4745)


In [195]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

y_val_sklearn = np.argmax(y_val, axis=1)

print("\nconfusion_matrix")
print(confusion_matrix(tensor_test_output, y_val_sklearn))
print("\nclassification_report")
print(classification_report(tensor_test_output, y_val_sklearn))
print("\naccuracy_score")
print(accuracy_score(tensor_test_output, y_val_sklearn))


confusion_matrix
[[102  13]
 [ 17  46]]

classification_report
              precision    recall  f1-score   support

           0       0.86      0.89      0.87       115
           1       0.78      0.73      0.75        63

    accuracy                           0.83       178
   macro avg       0.82      0.81      0.81       178
weighted avg       0.83      0.83      0.83       178


accuracy_score
0.8314606741573034


In [197]:
# Make predictions
with torch.no_grad():
    y_pred = model(tensor_test)
print("Loss: " + str(loss))

Loss: tensor(0.4745)


In [None]:
values, labels = torch.max(y_pred, 1)
survived = labels.data.numpy()
submission_df = pd.DataFrame({'PassengerId': test_df['PassengerId'], 'Survived': survived})
submission_df.head(20)

In [538]:
from google.colab import files
submission_df.to_csv('submission.csv', index=False)
files.download('submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>