In [439]:
import pandas as pd
import torch

In [440]:
# Import train and test set
train = pd.read_csv('titanic-dataset/train.csv')
test = pd.read_csv('titanic-dataset/test.csv')

In [441]:
# Get target values
y = train['Survived']

# Drop the Survived column and remove it from further calculations
train.drop(columns=['Survived'], inplace=True)

In [442]:
display(train.shape)
display(test.shape)
display(y.shape)

(891, 11)

(418, 11)

(891,)

### NaN values

In [443]:
# Check for NaN values
train.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [444]:
# Use mode() function to get the most frequent value
modes = train.mode().iloc[0]
modes

PassengerId                      1
Pclass                         3.0
Name           Abbing, Mr. Anthony
Sex                           male
Age                           24.0
SibSp                          0.0
Parch                          0.0
Ticket                        1601
Fare                          8.05
Cabin                      B96 B98
Embarked                         S
Name: 0, dtype: object

In [445]:
# Use fillna with most frequent values
train.fillna(modes, inplace=True)

In [446]:
# Check for NaN values
train.isna().sum().sum()

0

Get rid of 'Name', 'Ticket' abd 'Cabin' variables
**NOTE**: We don't use those values because right now it is out of scope. However, information in those variables are crucial and very imortant! The best score in Kaggle on Titanic dataset is performed only on 'Name' variable! See this notebook for more information: https://www.kaggle.com/code/cdeotte/titanic-using-name-only-0-81818/notebook
**NOTE**: It's very common in tabular data to use *categorical embeddings*. This is on *TODO* list right after finishing this notebook.

In [447]:
train.drop(columns=['Name', 'Ticket', 'Cabin'], inplace=True)

### Label encoding

In [448]:
# Explore which variables are categorical
train

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,3,male,22.0,1,0,7.2500,S
1,2,1,female,38.0,1,0,71.2833,C
2,3,3,female,26.0,0,0,7.9250,S
3,4,1,female,35.0,1,0,53.1000,S
4,5,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
886,887,2,male,27.0,0,0,13.0000,S
887,888,1,female,19.0,0,0,30.0000,S
888,889,3,female,24.0,1,2,23.4500,S
889,890,1,male,26.0,0,0,30.0000,C


In [449]:
# Categorical varaibles are Pclass, Sex, Embarked - use one-hot encoding for gender and label for embarked
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

label_encoder = LabelEncoder()
one_hot_encoder = OneHotEncoder()

In [450]:
# Use one_hot_encode for gender
gender = one_hot_encoder.fit_transform(train[['Sex']])

# Merge train with encoded gender
train = train.merge(pd.DataFrame(gender.toarray(), columns=['Male', 'Female']), left_index=True, right_index=True)

# Drop unused column
train.drop(columns=['Sex'], inplace=True)

# Encode 'Embarked' with label encoder
train['Embarked'] = label_encoder.fit_transform(train['Embarked'])

# Print
train

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Embarked,Male,Female
0,1,3,22.0,1,0,7.2500,2,0.0,1.0
1,2,1,38.0,1,0,71.2833,0,1.0,0.0
2,3,3,26.0,0,0,7.9250,2,1.0,0.0
3,4,1,35.0,1,0,53.1000,2,1.0,0.0
4,5,3,35.0,0,0,8.0500,2,0.0,1.0
...,...,...,...,...,...,...,...,...,...
886,887,2,27.0,0,0,13.0000,2,0.0,1.0
887,888,1,19.0,0,0,30.0000,2,1.0,0.0
888,889,3,24.0,1,2,23.4500,2,1.0,0.0
889,890,1,26.0,0,0,30.0000,0,0.0,1.0


### Use the same preprocessing techniques for test set

In [451]:
test.drop(columns=['Name', 'Ticket', 'Cabin'], inplace=True)

In [452]:
# Use one_hot_encode for gender
gender = one_hot_encoder.fit_transform(test[['Sex']])

# Merge train with encoded gender
test = test.merge(pd.DataFrame(gender.toarray(), columns=['Male', 'Female']), left_index=True, right_index=True)

# Drop unused column
test.drop(columns=['Sex'], inplace=True)

# Encode 'Embarked' with label encoder
test['Embarked'] = label_encoder.fit_transform(test['Embarked'])

# Print
test

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Embarked,Male,Female
0,892,3,34.5,0,0,7.8292,1,0.0,1.0
1,893,3,47.0,1,0,7.0000,2,1.0,0.0
2,894,2,62.0,0,0,9.6875,1,0.0,1.0
3,895,3,27.0,0,0,8.6625,2,0.0,1.0
4,896,3,22.0,1,1,12.2875,2,1.0,0.0
...,...,...,...,...,...,...,...,...,...
413,1305,3,,0,0,8.0500,2,0.0,1.0
414,1306,1,39.0,0,0,108.9000,0,1.0,0.0
415,1307,3,38.5,0,0,7.2500,2,0.0,1.0
416,1308,3,,0,0,8.0500,2,0.0,1.0


In [453]:
# Check whether the dimensions are correct
display(train.shape)
display(test.shape)

(891, 9)

(418, 9)

## Model

In [454]:
from torch import nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [455]:
# Get cpu or gpu device for training.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

Using cpu device


In [456]:
n_features = train.shape[1]

# Define the class
class TitanicNetwork(nn.Module):
    def __init__(self):
        super(TitanicNetwork, self).__init__()

        # Number of input features is 9
        self.layer_in = nn.Linear(n_features, 63)
        self.layer_2 = nn.Linear(63, 63)
        self.layer_out = nn.Linear(63,1)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.1)
        self.batchnorm1 = nn.BatchNorm1d(63)
        self.batchnorm2 = nn.BatchNorm1d(63)

    def forward(self, inputs):
        x = self.relu(self.layer_in(inputs))
        x = self.batchnorm1(x)
        x = self.relu(self.layer_2(x))
        x = self.batchnorm2(x)
        x = self.dropout(x)
        x = self.layer_out(x)

        return x

**Note**: that we did not use the Sigmoid activation in our final layer during training. That’s because, we use the nn.BCEWithLogitsLoss() loss function which automatically applies the Sigmoid activation.

In [457]:
# Initialize the model
model = TitanicNetwork()

# Specify the device type responsible to load model into memory
model.to(device)

TitanicNetwork(
  (layer_in): Linear(in_features=9, out_features=63, bias=True)
  (layer_2): Linear(in_features=63, out_features=63, bias=True)
  (layer_out): Linear(in_features=63, out_features=1, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.1, inplace=False)
  (batchnorm1): BatchNorm1d(63, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm2): BatchNorm1d(63, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)

In [458]:
print(model)

TitanicNetwork(
  (layer_in): Linear(in_features=9, out_features=63, bias=True)
  (layer_2): Linear(in_features=63, out_features=63, bias=True)
  (layer_out): Linear(in_features=63, out_features=1, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.1, inplace=False)
  (batchnorm1): BatchNorm1d(63, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm2): BatchNorm1d(63, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


In [459]:
print(list(model.named_parameters()))

[('layer_in.weight', Parameter containing:
tensor([[ 0.1907, -0.1229, -0.2440,  0.1249,  0.2736, -0.2586, -0.1627,  0.0092,
          0.3040],
        [-0.1466,  0.2820,  0.1667,  0.0917,  0.2914,  0.0559,  0.2692, -0.1819,
          0.3111],
        [-0.0965, -0.1128,  0.2828, -0.1476, -0.2080,  0.2445,  0.2947,  0.1941,
         -0.1125],
        [-0.2062,  0.1308,  0.2769,  0.3117,  0.1527, -0.0103,  0.2860,  0.3032,
          0.0192],
        [ 0.3093,  0.0769, -0.1166,  0.1175,  0.2519,  0.2626, -0.0088,  0.2218,
          0.1520],
        [ 0.1173,  0.0526,  0.1217,  0.0748, -0.1409, -0.2385, -0.0616,  0.0581,
          0.0655],
        [ 0.0130, -0.0440,  0.1914, -0.2250,  0.1966, -0.0395, -0.2108,  0.1710,
         -0.1307],
        [ 0.1630,  0.0220,  0.0566, -0.1350,  0.0823, -0.1331,  0.0939,  0.2427,
          0.2522],
        [ 0.1909,  0.1631,  0.0541,  0.0383,  0.0873, -0.2937,  0.1262, -0.1916,
          0.1727],
        [-0.2571, -0.2144,  0.0708,  0.2895,  0.1530,  0.

In [460]:
# Init loss function (Binary Cross Entropy Loss) (we assume that target is equally distributed)
loss_function = nn.BCEWithLogitsLoss()

optimizer = optim.Adam(model.parameters(), lr=0.001)

### Train-test-split

In [461]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train, y, test_size=0.33, random_state=42)

### DataLoaders and Dataset

Before training, we should implement custom Dataset class

In [462]:
class TitanicTrainDataset(Dataset):

    def __init__(self, X_data, y_data):
        self.X_data = X_data
        self.y_data = y_data

    def __getitem__(self, index):
        """ Return the single observation, including both independent and dependent variable """
        # Convert idx from tensor to list due to pandas bug (that arises when using pytorch's random_split)
        # if isinstance(index, torch.Tensor):
        #     index = index.tolist()

        return self.X_data[index], self.y_data[index]
        # return [self.X_data.iloc[index].values, self.y_data[index]]

    def __len__(self):
        """ Return the number of rows from tabular data """
        return len(self.X_data)

In [463]:
class TitanicTestDataset(Dataset):

    def __init__(self, X_data):
        self.X_data = X_data

    def __getitem__(self, index):
        """ Return the single observation, including only independent variable """
        if isinstance(index, torch.Tensor):
            index = index.tolist()

        return self.X_data[index]

    def __len__(self):
        """ Return the number of rows from tabular data """
        return len(self.X_data)

In [464]:
train_data = TitanicTrainDataset(torch.Tensor(X_train.values), torch.Tensor(y_train.values))
test_data = TitanicTestDataset(torch.Tensor(X_test.values))

In [465]:
# Test len method
train_data.__len__()

596

In [466]:
y_train.value_counts()

0    374
1    222
Name: Survived, dtype: int64

In [467]:
# Test getitem method
train_data.__getitem__(1)

(tensor([719.0000,   3.0000,  24.0000,   0.0000,   0.0000,  15.5000,   1.0000,
           0.0000,   1.0000]),
 tensor(0.))

In [468]:
# Initialize dataloaders
BATCH_SIZE_TRAIN = 64
BATCH_SIZE_TEST = 1
train_dataloader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE_TRAIN, shuffle=True)
test_dataloader = DataLoader(dataset=test_data, batch_size=BATCH_SIZE_TEST, shuffle=False)

### Train

In [469]:
# Define binary accuracy function
def binary_acc(y_pred, y_test):
    y_pred_tag = torch.round(torch.sigmoid(y_pred))

    correct_results_sum = (y_pred_tag == y_test).sum().float()
    acc = correct_results_sum/y_test.shape[0]
    acc = torch.round(acc * 100)

    return acc

In [470]:
EPOCHS = 50

# Prepare model for training (default state)
model.train()

for e in range(1, EPOCHS+1):

    # Init loss and acc per epoch to zero
    epoch_loss = 0
    epoch_acc = 0

    # For loop to get our data in batches from dataloader
    for X_batch, y_batch in train_dataloader:

        # Load batches into memory (device)
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        # Set .grad attribute of all tensors to zero (otherwise we would accumulate it with .backwards())
        optimizer.zero_grad()

        # Forward pass (use input data to make prediction)
        y_pred = model(X_batch)

        # Calculate loss based on prediction and true value
        loss = loss_function(y_pred, y_batch.unsqueeze(1))
        acc = binary_acc(y_pred, y_batch.unsqueeze(1))

        # We backpropagate this error through the network
        # The gradient is calculated for tensors which requires_grad=True
        loss.backward()

        # Adjust each parameter (weight and bias) by its gradient stored in .grad
        optimizer.step()

        # Calculate loss and accuracy
        epoch_loss += loss.item()
        epoch_acc += acc.item()

    print(f'Epoch {e+0:03}: | Loss: {epoch_loss/len(train_dataloader):.5f} | Acc: {epoch_acc/len(train_dataloader):.3f}')

Epoch 001: | Loss: 0.67724 | Acc: 61.600
Epoch 002: | Loss: 0.65114 | Acc: 62.300
Epoch 003: | Loss: 0.62826 | Acc: 67.400
Epoch 004: | Loss: 0.61366 | Acc: 65.300
Epoch 005: | Loss: 0.60292 | Acc: 69.800
Epoch 006: | Loss: 0.60986 | Acc: 68.000
Epoch 007: | Loss: 0.59767 | Acc: 69.600
Epoch 008: | Loss: 0.60296 | Acc: 69.600
Epoch 009: | Loss: 0.57120 | Acc: 72.200
Epoch 010: | Loss: 0.61025 | Acc: 67.600
Epoch 011: | Loss: 0.59133 | Acc: 70.300
Epoch 012: | Loss: 0.58224 | Acc: 70.500
Epoch 013: | Loss: 0.59920 | Acc: 68.500
Epoch 014: | Loss: 0.58636 | Acc: 69.800
Epoch 015: | Loss: 0.58668 | Acc: 67.900
Epoch 016: | Loss: 0.57719 | Acc: 69.500
Epoch 017: | Loss: 0.57444 | Acc: 69.500
Epoch 018: | Loss: 0.56106 | Acc: 72.100
Epoch 019: | Loss: 0.54820 | Acc: 72.800
Epoch 020: | Loss: 0.54852 | Acc: 71.200
Epoch 021: | Loss: 0.55163 | Acc: 72.800
Epoch 022: | Loss: 0.51948 | Acc: 73.600
Epoch 023: | Loss: 0.51147 | Acc: 76.600
Epoch 024: | Loss: 0.50416 | Acc: 75.200
Epoch 025: | Los

### Test

In [471]:
y_pred_list = []

# Prepare model for testing
model.eval()

# We don't want to perform backpropagation
with torch.no_grad():

    # Go through batches
    for X_batch in test_dataloader:

        # Load batch into memory
        X_batch = X_batch.to(device)

        # Get the result
        y_test_pred = model(X_batch)

        y_test_pred = torch.sigmoid(y_test_pred)

        # Round probabilities to 0 or 1
        y_pred_tag = torch.round(y_test_pred)

        # Convert tensor to numpy object
        y_pred_list.append(y_pred_tag.cpu().numpy())

# Flatten the list to use confusion matrix and classification report
y_pred_list = [a.squeeze().tolist() for a in y_pred_list]

In [472]:
from sklearn.metrics import confusion_matrix, classification_report

# Plot confusion matrix
confusion_matrix(y_test, y_pred_list)

array([[158,  17],
       [ 41,  79]])

In [473]:
print(classification_report(y_test, y_pred_list))

              precision    recall  f1-score   support

           0       0.79      0.90      0.84       175
           1       0.82      0.66      0.73       120

    accuracy                           0.80       295
   macro avg       0.81      0.78      0.79       295
weighted avg       0.81      0.80      0.80       295

