<a href="https://colab.research.google.com/github/robimalco/colab/blob/main/Titanic_Machine_Learning_from_Disaster.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q kaggle

In [2]:
from google.colab import files

In [None]:
files.upload()

In [4]:
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
!kaggle datasets list

In [None]:
!kaggle competitions download -c titanic

In [None]:
!mkdir train
!unzip train.zip -d train

# START

In [200]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data_utils
from sklearn.preprocessing import LabelEncoder

In [272]:
train_df = pd.read_csv('train.csv')
train_df['df_source'] = 'train.csv'

test_df = pd.read_csv('test.csv')
test_df['df_source'] = 'test.csv'

total_df = pd.concat([train_df, test_df])

In [None]:
# Diplay null values of each column
total_df.isnull().sum()

In [274]:
# Manage names creating TitleCluster

Title_Dictionary = {
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    "Jonkheer": "Royalty",
    "Don": "Royalty",
    "Sir" : "Royalty",
    "Dr": "Officer",
    "Rev": "Officer",
    "the Countess":"Royalty",
    "Mme": "Mrs",
    "Mlle": "Miss",
    "Ms": "Mrs",
    "Mr" : "Mr",
    "Mrs" : "Mrs",
    "Dona" : "Mrs",
    "Miss" : "Miss",
    "Master" : "Master",
    "Lady" : "Royalty"
}

def get_titles(x):
    title = x['Name'].split(',')[1].split('.')[0].strip()
    return Title_Dictionary[title]
total_df['TitleCluster'] = total_df.apply(get_titles, axis=1)

In [275]:
# Manage missing age and create AgeCluster

def fill_missing_age(x):
    age = x['Age']
    if np.isnan(age):
      return total_df[
        (total_df['TitleCluster'] == x['TitleCluster']) &
        (total_df['Sex'] == x['Sex']) &
        (total_df['Pclass'] == x['Pclass'])
      ]['Age'].median()
    else:
      return age

total_df['Age'] = total_df.apply(fill_missing_age, axis=1)

def cluster_age(x):
    age = x['Age']
    if age < 5:
      return 0
    elif age < 10:
      return 1
    elif age < 20:
      return 2
    elif age < 30:
      return 3
    elif age < 40:
      return 4
    elif age < 50:
      return 5
    elif age < 60:
      return 6
    else:
      return 7
total_df['AgeCluster'] = total_df.apply(cluster_age, axis=1)

In [276]:
# Manage tickets

total_df['TicketCluster'] = total_df['Ticket'].str[0]
total_df['TicketCluster'] = np.where(total_df["TicketCluster"].str.isdigit(), "X", total_df["TicketCluster"])

In [277]:
# Manage missing embarked

def fill_missing_embarked(x):
    embarked = x['Embarked']
    if embarked != embarked:
      return "X"
    else:
      return embarked

total_df['Embarked'] = total_df.apply(fill_missing_embarked, axis=1)

In [278]:
# Manage family

def cluster_family(x):
    pcSib = x['SibSp'] + x['Pclass'] + 1
    if pcSib < 2:
        return 'Single'
    elif pcSib == 2:
        return 'Couple'
    elif pcSib <= 4:
        return 'InterM'
    else:
        return 'Large'
    
total_df['FamilyCluster'] = total_df.apply(family, axis=1)

In [279]:
# Manage fare

def fill_missing_fare(x):
  fare = x['Fare']
  if fare == 0:
      return total_df[
        (total_df['TitleCluster'] == x['TitleCluster']) &
        (total_df['Sex'] == x['Sex']) &
        (total_df['Pclass'] == x['Pclass'])
      ]['Fare'].median()
  else:
    return fare

total_df['Fare'] = total_df.apply(fill_missing_fare, axis=1)

def cluster_fare(x):
    fare = x['Fare']
    if fare < 51:
        return 0
    elif fare < 101:
        return 1
    else:
        return 2

total_df['FareCluster'] = total_df.apply(cluster_fare, axis=1)

In [280]:
categorical_columns = ['df_source', 'Pclass', 'Sex', 'Embarked', 'AgeCluster', 'FareCluster', 'TicketCluster', 'TitleCluster', 'FamilyCluster']
cat_total_df = total_df[categorical_columns]

In [282]:
pd.options.mode.chained_assignment = None
for column in categorical_columns:
  cat_total_df[column] = LabelEncoder().fit_transform(cat_total_df[column])
for column in categorical_columns:
  cat_total_df[column] = cat_total_df[column].astype('category')

In [284]:
train_total_df = cat_total_df[cat_total_df['df_source'] == 1]
test_total_df = cat_total_df[cat_total_df['df_source'] == 0]

In [285]:
def create_categorical_stack(input_df):
  stack = []
  for column in categorical_columns:
    temp_stack = input_df[column].cat.codes.values
    stack.append(temp_stack)
  return np.stack(stack, 1)

train_categorical_df = create_categorical_stack(train_total_df)
test_categorical_df = create_categorical_stack(test_total_df)

In [286]:
tensor_train = torch.tensor(train_categorical_df, dtype=torch.int64)
tensor_output = torch.tensor(train_df['Survived']).flatten()

tensor_test = torch.tensor(test_categorical_df, dtype=torch.int64)

In [287]:
total_records_train = len(train_total_df)
test_records_train = int(total_records_train * 0.2)

tensor_train_data = tensor_train[:total_records_train-test_records_train]
tensor_train_output = tensor_output[:total_records_train-test_records_train]

tensor_test_data = tensor_train[total_records_train-test_records_train:total_records_train]
tensor_test_output = tensor_output[total_records_train-test_records_train:total_records_train]

In [288]:
categorical_columns_size = [len(cat_total_df[column].astype('category').cat.categories) for column in categorical_columns]
categorical_embedding_sizes = [(col_size, min(50, (col_size+1)//2)) for col_size in categorical_columns_size]

In [321]:
class Model(nn.Module):
  def __init__(self, embedding_sizes):
    super().__init__()
    self.embeddings = nn.ModuleList([nn.Embedding(categories, size) for categories,size in embedding_sizes])
    n_emb = sum(e.embedding_dim for e in self.embeddings)
    self.lin1 = nn.Linear(n_emb, 200)
    self.lin2 = nn.Linear(200, 100)
    self.lin3 = nn.Linear(100, 50)
    self.lin4 = nn.Linear(50, 2)
    self.emb_drop = nn.Dropout(0.6)
    self.drops = nn.Dropout(0.1)
  def forward(self, x_cat):
    x = [e(x_cat[:,i]) for i, e in enumerate(self.embeddings)]
    x = torch.cat(x, 1)
    x = self.emb_drop(x)
    x = self.lin1(x)
    x = self.drops(x)
    x = F.relu(self.lin2(x))
    x = self.drops(x)
    x = F.relu(self.lin3(x))
    x = self.drops(x)
    x = self.lin4(x)
    return x

In [322]:
model = Model(categorical_embedding_sizes)
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [323]:
epochs = 300
aggregated_losses = []

for i in range(epochs):
    i += 1
    y_pred = model(tensor_train_data)
    single_loss = loss_function(y_pred, tensor_train_output)
    aggregated_losses.append(single_loss)
    if i%200 == 1:
        print("epoch: " + str(i) + "\tloss: " + str(single_loss.item()))

    optimizer.zero_grad()
    single_loss.backward()
    optimizer.step()

print("epoch: " + str(i) + "\tloss: " + str(single_loss.item()))

epoch: 1	loss: 0.6844605803489685
epoch: 201	loss: 0.48192352056503296
epoch: 300	loss: 0.4595804810523987


In [324]:
with torch.no_grad():
    y_val = model(tensor_test_data)
    loss = loss_function(y_val, tensor_test_output)
print("Loss: " + str(loss))

Loss: tensor(0.4019)


In [325]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

y_val_sklearn = np.argmax(y_val, axis=1)

print("\nconfusion_matrix")
print(confusion_matrix(tensor_test_output, y_val_sklearn))
print("\nclassification_report")
print(classification_report(tensor_test_output, y_val_sklearn))
print("\naccuracy_score")
print(accuracy_score(tensor_test_output, y_val_sklearn))


confusion_matrix
[[106   9]
 [ 19  44]]

classification_report
              precision    recall  f1-score   support

           0       0.85      0.92      0.88       115
           1       0.83      0.70      0.76        63

    accuracy                           0.84       178
   macro avg       0.84      0.81      0.82       178
weighted avg       0.84      0.84      0.84       178


accuracy_score
0.8426966292134831


In [326]:
# Make predictions
with torch.no_grad():
    y_pred = model(tensor_test)
print("Loss: " + str(loss))

Loss: tensor(0.4019)


In [327]:
values, labels = torch.max(y_pred, 1)
survived = labels.data.numpy()
submission_df = pd.DataFrame({'PassengerId': test_df['PassengerId'], 'Survived': survived})
# submission_df.head(20)

In [328]:
t_not_survived = train_df[train_df['Survived']==0].count()[0]
t_survived = train_df[train_df['Survived']==1].count()[0]
s_not_survived = submission_df[submission_df['Survived']==0].count()[0]
s_survived = submission_df[submission_df['Survived']==1].count()[0]
print("Proportion survived train_df: ", str(round(t_survived/(t_survived + t_not_survived), 2)))
print("Proportion survived submission_df: ", str(round(s_survived/(s_survived + s_not_survived), 2)))

Proportion survived train_df:  0.38
Proportion survived submission_df:  0.35


In [329]:
from google.colab import files
submission_df.to_csv('submission.csv', index=False)
files.download('submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>