In [51]:
import pandas as pd
import torch

In [52]:
# Import train and test set
train = pd.read_csv('titanic-dataset/train.csv')
test = pd.read_csv('titanic-dataset/test.csv')

In [53]:
# Get target values
y = train['Survived']

# Drop the Survived column and remove it from further calculations
train.drop(columns=['Survived'], inplace=True)

In [56]:
display(train.shape)
display(test.shape)
display(y.shape)

(891, 11)

(418, 11)

(891,)

### NaN values

In [57]:
# Check for NaN values
train.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [58]:
# Use mode() function to get the most frequent value
modes = train.mode().iloc[0]
modes

PassengerId                      1
Pclass                         3.0
Name           Abbing, Mr. Anthony
Sex                           male
Age                           24.0
SibSp                          0.0
Parch                          0.0
Ticket                        1601
Fare                          8.05
Cabin                      B96 B98
Embarked                         S
Name: 0, dtype: object

In [59]:
# Use fillna with most frequent values
train.fillna(modes, inplace=True)

In [60]:
# Check for NaN values
train.isna().sum().sum()

0

Get rid of 'Name', 'Ticket' abd 'Cabin' variables
**NOTE**: We don't use those values because right now it is out of scope. However, information in those variables are crucial and very imortant! The best score in Kaggle on Titanic dataset is performed only on 'Name' variable! See this notebook for more information: https://www.kaggle.com/code/cdeotte/titanic-using-name-only-0-81818/notebook
**NOTE**: It's very common in tabular data to use *categorical embeddings*. This is on *TODO* list right after finishing this notebook.

In [61]:
train.drop(columns=['Name', 'Ticket', 'Cabin'], inplace=True)

### Label encoding

In [62]:
# Explore which variables are categorical
train

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,3,male,22.0,1,0,7.2500,S
1,2,1,female,38.0,1,0,71.2833,C
2,3,3,female,26.0,0,0,7.9250,S
3,4,1,female,35.0,1,0,53.1000,S
4,5,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
886,887,2,male,27.0,0,0,13.0000,S
887,888,1,female,19.0,0,0,30.0000,S
888,889,3,female,24.0,1,2,23.4500,S
889,890,1,male,26.0,0,0,30.0000,C


In [63]:
# Categorical varaibles are Pclass, Sex, Embarked - use one-hot encoding for gender and label for embarked
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

label_encoder = LabelEncoder()
one_hot_encoder = OneHotEncoder()

In [64]:
# Use one_hot_encode for gender
gender = one_hot_encoder.fit_transform(train[['Sex']])

# Merge train with encoded gender
train = train.merge(pd.DataFrame(gender.toarray(), columns=['Male', 'Female']), left_index=True, right_index=True)

# Drop unused column
train.drop(columns=['Sex'], inplace=True)

# Encode 'Embarked' with label encoder
train['Embarked'] = label_encoder.fit_transform(train['Embarked'])

# Print
train

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Embarked,Male,Female
0,1,3,22.0,1,0,7.2500,2,0.0,1.0
1,2,1,38.0,1,0,71.2833,0,1.0,0.0
2,3,3,26.0,0,0,7.9250,2,1.0,0.0
3,4,1,35.0,1,0,53.1000,2,1.0,0.0
4,5,3,35.0,0,0,8.0500,2,0.0,1.0
...,...,...,...,...,...,...,...,...,...
886,887,2,27.0,0,0,13.0000,2,0.0,1.0
887,888,1,19.0,0,0,30.0000,2,1.0,0.0
888,889,3,24.0,1,2,23.4500,2,1.0,0.0
889,890,1,26.0,0,0,30.0000,0,0.0,1.0


### Use the same preprocessing techniques for test set

In [65]:
test.drop(columns=['Name', 'Ticket', 'Cabin'], inplace=True)

In [66]:
# Use one_hot_encode for gender
gender = one_hot_encoder.fit_transform(test[['Sex']])

# Merge train with encoded gender
test = test.merge(pd.DataFrame(gender.toarray(), columns=['Male', 'Female']), left_index=True, right_index=True)

# Drop unused column
test.drop(columns=['Sex'], inplace=True)

# Encode 'Embarked' with label encoder
test['Embarked'] = label_encoder.fit_transform(test['Embarked'])

# Print
test

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Embarked,Male,Female
0,892,3,34.5,0,0,7.8292,1,0.0,1.0
1,893,3,47.0,1,0,7.0000,2,1.0,0.0
2,894,2,62.0,0,0,9.6875,1,0.0,1.0
3,895,3,27.0,0,0,8.6625,2,0.0,1.0
4,896,3,22.0,1,1,12.2875,2,1.0,0.0
...,...,...,...,...,...,...,...,...,...
413,1305,3,,0,0,8.0500,2,0.0,1.0
414,1306,1,39.0,0,0,108.9000,0,1.0,0.0
415,1307,3,38.5,0,0,7.2500,2,0.0,1.0
416,1308,3,,0,0,8.0500,2,0.0,1.0


In [68]:
# Check whether the dimensions are correct
display(train.shape)
display(test.shape)

(891, 9)

(418, 9)

## Model

In [47]:
from torch import nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [28]:
# Get cpu or gpu device for training.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

Using cpu device


In [39]:
# Define the class
class TitanicNetwork(nn.Module):
    def __init__(self):
        super(TitanicNetwork, self).__init__()

        # Number of input features is 9
        self.layer_in = nn.Linear(9, 36)
        self.layer_2 = nn.Linear(36, 36)
        self.layer_out = nn.Linear(36,1)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.1)
        self.batchnorm1 = nn.BatchNorm1d(36)

    def forward(self, inputs):
        x = self.relu(self.layer_in(inputs))
        x = self.batchnorm1(x)
        x = self.relu(self.layer_2(x))
        x = self.dropout(x)
        x = self.layer_out(x)

        return x

**Note**: that we did not use the Sigmoid activation in our final layer during training. That’s because, we use the nn.BCEWithLogitsLoss() loss function which automatically applies the Sigmoid activation.

In [42]:
# Initialize the model
model = TitanicNetwork()

# Specify the device type responsible to load model into memory
model.to(device)

TitanicNetwork(
  (layer_in): Linear(in_features=9, out_features=36, bias=True)
  (layer_2): Linear(in_features=36, out_features=36, bias=True)
  (layer_out): Linear(in_features=36, out_features=1, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.1, inplace=False)
  (batchnorm1): BatchNorm1d(36, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)

In [43]:
print(model)

TitanicNetwork(
  (layer_in): Linear(in_features=9, out_features=36, bias=True)
  (layer_2): Linear(in_features=36, out_features=36, bias=True)
  (layer_out): Linear(in_features=36, out_features=1, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.1, inplace=False)
  (batchnorm1): BatchNorm1d(36, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


In [44]:
print(list(model.named_parameters()))

[('layer_in.weight', Parameter containing:
tensor([[ 0.0557, -0.2613,  0.0586,  0.1018, -0.1719,  0.3269,  0.2270,  0.1842,
          0.1527],
        [-0.0396, -0.0199,  0.0333,  0.2156,  0.1851,  0.0743,  0.1887, -0.1132,
          0.0048],
        [ 0.2750, -0.3151,  0.2268, -0.1052,  0.1912,  0.3188, -0.2516,  0.1852,
          0.0044],
        [-0.3325, -0.2183, -0.1954, -0.3271, -0.1158,  0.1129,  0.2384, -0.1345,
         -0.0847],
        [-0.2663, -0.0976,  0.0794, -0.0964,  0.2563, -0.2181, -0.0504, -0.0982,
          0.1203],
        [-0.2882,  0.1759, -0.2677,  0.2191,  0.0739, -0.2554, -0.1136,  0.2131,
         -0.3157],
        [-0.2592, -0.0279, -0.1885,  0.0361, -0.2181,  0.1029,  0.3166,  0.1213,
         -0.2273],
        [-0.3153, -0.2160, -0.2170, -0.1859,  0.0770,  0.2835,  0.2916, -0.1762,
         -0.0927],
        [-0.0537,  0.1690, -0.2783,  0.2494,  0.2160,  0.2826,  0.1458, -0.0816,
         -0.1394],
        [ 0.0402,  0.3094,  0.2060, -0.1390, -0.2516,  0.

In [41]:
# Init loss function (Binary Cross Entropy Loss) (we assume that target is equally distributed)
loss_function = nn.BCEWithLogitsLoss()

optimizer = optim.Adam(model.parameters(), lr=0.001)

### DataLoaders and Dataset

Before training, we should implement custom Dataset class

In [87]:
class TitanicTrainDataset(Dataset):

    def __init__(self, X_data, y_data):
        self.X_data = X_data
        self.y_data = y_data

    def __getitem__(self, index):
        """ Return the single observation, including both independent and dependent variable """
        # Convert idx from tensor to list due to pandas bug (that arises when using pytorch's random_split)
        if isinstance(index, torch.Tensor):
            index = index.tolist()

        return self.X_data[index], self.y_data[index]
        # return [self.X_data.iloc[index].values, self.y_data[index]]

    def __len__(self):
        """ Return the number of rows from tabular data """
        return len(self.X_data)

In [91]:
class TitanicTestDataset(Dataset):

    def __init__(self, X_data):
        self.X_data = X_data

    def __getitem__(self, index):
        """ Return the single observation, including only independent variable """
        if isinstance(index, torch.Tensor):
            index = index.tolist()

        return self.X_data[index]

    def __len__(self):
        """ Return the number of rows from tabular data """
        return len(self.X_data)

In [92]:
train_data = TitanicTrainDataset(torch.Tensor(train.values), torch.Tensor(y.values))
test_data = TitanicTestDataset(torch.Tensor(test.values))

In [93]:
# Test len method
train_data.__len__()

891

In [94]:
# Test getitem method
train_data.__getitem__(1)

(tensor([ 2.0000,  1.0000, 38.0000,  1.0000,  0.0000, 71.2833,  0.0000,  1.0000,
          0.0000]),
 tensor(1.))

In [95]:
# Initialize dataloaders
BATCH_SIZE_TRAIN = 64
BATCH_SIZE_TEST = 1
train_dataloader = DataLoader(dataset=train, batch_size=BATCH_SIZE_TRAIN, shuffle=True)
test_dataloader = DataLoader(dataset=test, batch_size=BATCH_SIZE_TEST, shuffle=False)

### Train

In [None]:
# Init dataloaders
train_loader = DataFrame