In [526]:
import pandas as pd
import torch

In [527]:
# Import train and test set
train = pd.read_csv('titanic-dataset/train.csv')
test = pd.read_csv('titanic-dataset/test.csv')

In [528]:
# Get target values
y = train['Survived']

# Drop the Survived column and remove it from further calculations
train.drop(columns=['Survived'], inplace=True)

### Train-test-split

In [529]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train, y, test_size=0.33, random_state=42)

### NaN values

In [530]:
# Check for NaN values
X_train.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age            118
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          462
Embarked         1
dtype: int64

In [531]:
# Use mode() function to get the most frequent value
modes = X_train.mode().iloc[0]
modes

PassengerId                              1
Pclass                                 3.0
Name           Abbott, Mr. Rossmore Edward
Sex                                   male
Age                                   24.0
SibSp                                  0.0
Parch                                  0.0
Ticket                            CA. 2343
Fare                                  8.05
Cabin                          C23 C25 C27
Embarked                                 S
Name: 0, dtype: object

In [532]:
# Use fillna with most frequent values
X_train.fillna(modes, inplace=True)

In [533]:
# Check for NaN values
X_train.isna().sum().sum()

0

Get rid of 'Name', 'Ticket' abd 'Cabin' variables
**NOTE**: We don't use those values because right now it is out of scope. However, information in those variables are crucial and very imortant! The best score in Kaggle on Titanic dataset is performed only on 'Name' variable! See this notebook for more information: https://www.kaggle.com/code/cdeotte/titanic-using-name-only-0-81818/notebook
**NOTE**: It's very common in tabular data to use *categorical embeddings*. This is on *TODO* list right after finishing this notebook.

In [534]:
X_train.drop(columns=['Name', 'Ticket', 'Cabin'], inplace=True)

### Label encoding

In [535]:
# Explore which variables are categorical
X_train.reset_index(inplace=True)
X_train

Unnamed: 0,index,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,6,7,1,male,54.0,0,0,51.8625,S
1,718,719,3,male,24.0,0,0,15.5000,Q
2,685,686,2,male,25.0,1,2,41.5792,C
3,73,74,3,male,26.0,1,0,14.4542,C
4,882,883,3,female,22.0,0,0,10.5167,S
...,...,...,...,...,...,...,...,...,...
591,106,107,3,female,21.0,0,0,7.6500,S
592,270,271,1,male,24.0,0,0,31.0000,S
593,860,861,3,male,41.0,2,0,14.1083,S
594,435,436,1,female,14.0,1,2,120.0000,S


In [536]:
# Categorical varaibles are Pclass, Sex, Embarked - use one-hot encoding for gender and label for embarked
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

label_encoder = LabelEncoder()
one_hot_encoder = OneHotEncoder()

In [537]:
# Use one_hot_encode for gender
gender = one_hot_encoder.fit_transform(X_train[['Sex']])
gender_df = pd.DataFrame(gender.toarray(), columns=['Male', 'Female'])

In [538]:
#Merge train with encoded gender
X_train = pd.concat([X_train, gender_df], axis=1)

# Drop unused column
X_train.drop(columns=['Sex'], inplace=True)

# Encode 'Embarked' with label encoder
X_train['Embarked'] = label_encoder.fit_transform(X_train['Embarked'])

# Set PassangerID as index
X_train.set_index('PassengerId', inplace=True)

# Print
X_train

Unnamed: 0_level_0,index,Pclass,Age,SibSp,Parch,Fare,Embarked,Male,Female
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
7,6,1,54.0,0,0,51.8625,2,0.0,1.0
719,718,3,24.0,0,0,15.5000,1,0.0,1.0
686,685,2,25.0,1,2,41.5792,0,0.0,1.0
74,73,3,26.0,1,0,14.4542,0,0.0,1.0
883,882,3,22.0,0,0,10.5167,2,1.0,0.0
...,...,...,...,...,...,...,...,...,...
107,106,3,21.0,0,0,7.6500,2,1.0,0.0
271,270,1,24.0,0,0,31.0000,2,0.0,1.0
861,860,3,41.0,2,0,14.1083,2,0.0,1.0
436,435,1,14.0,1,2,120.0000,2,1.0,0.0


### Use the same preprocessing techniques for test set

In [539]:
# Use fillna with most frequent values
X_test.fillna(modes, inplace=True)

In [540]:
X_test.reset_index(inplace=True)

X_test.drop(columns=['Name', 'Ticket', 'Cabin'], inplace=True)

In [541]:
# Use one_hot_encode for gender
gender_test = one_hot_encoder.transform(X_test[['Sex']])
gender_test_df = pd.DataFrame(gender_test.toarray(), columns=['Male', 'Female'])
gender_test_df

Unnamed: 0,Male,Female
0,0.0,1.0
1,0.0,1.0
2,0.0,1.0
3,1.0,0.0
4,1.0,0.0
...,...,...
290,0.0,1.0
291,0.0,1.0
292,1.0,0.0
293,1.0,0.0


In [542]:
# Merge train with encoded gender
X_test = pd.concat([X_test, gender_test_df], axis=1)

# Drop unused column
X_test.drop(columns=['Sex'], inplace=True)

# Encode 'Embarked' with label encoder
X_test['Embarked'] = label_encoder.transform(X_test['Embarked'])

# Set PassangerID as index
X_test.set_index('PassengerId', inplace=True)

# Print
X_test

Unnamed: 0_level_0,index,Pclass,Age,SibSp,Parch,Fare,Embarked,Male,Female
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
710,709,3,24.0,1,1,15.2458,0,0.0,1.0
440,439,2,31.0,0,0,10.5000,2,0.0,1.0
841,840,3,20.0,0,0,7.9250,2,0.0,1.0
721,720,2,6.0,0,1,33.0000,2,1.0,0.0
40,39,3,14.0,1,0,11.2417,0,1.0,0.0
...,...,...,...,...,...,...,...,...,...
716,715,3,19.0,0,0,7.6500,2,0.0,1.0
526,525,3,40.5,0,0,7.7500,1,0.0,1.0
382,381,3,1.0,0,2,15.7417,0,1.0,0.0
141,140,3,24.0,0,2,15.2458,0,1.0,0.0


In [543]:
# Check whether the dimensions are correct
display(X_train.shape)
display(X_test.shape)

(596, 9)

(295, 9)

## Model

In [544]:
from torch import nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [545]:
# Get cpu or gpu device for training.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

Using cpu device


In [546]:
n_features = X_train.shape[1]

# Define the class
class TitanicNetwork(nn.Module):
    def __init__(self):
        super(TitanicNetwork, self).__init__()

        # Number of input features is 9
        self.layer_in = nn.Linear(n_features, 63)
        self.layer_2 = nn.Linear(63, 63)
        self.layer_out = nn.Linear(63,1)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.1)
        self.batchnorm1 = nn.BatchNorm1d(63)
        self.batchnorm2 = nn.BatchNorm1d(63)

    def forward(self, inputs):
        x = self.relu(self.layer_in(inputs))
        x = self.batchnorm1(x)
        x = self.relu(self.layer_2(x))
        x = self.batchnorm2(x)
        x = self.dropout(x)
        x = self.layer_out(x)

        return x

titanic_network = TitanicNetwork()

**Note**: that we did not use the Sigmoid activation in our final layer during training. That’s because, we use the nn.BCEWithLogitsLoss() loss function which automatically applies the Sigmoid activation.

In [547]:
# Initialize the model
model = TitanicNetwork()

# Specify the device type responsible to load model into memory
model.to(device)

TitanicNetwork(
  (layer_in): Linear(in_features=9, out_features=63, bias=True)
  (layer_2): Linear(in_features=63, out_features=63, bias=True)
  (layer_out): Linear(in_features=63, out_features=1, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.1, inplace=False)
  (batchnorm1): BatchNorm1d(63, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm2): BatchNorm1d(63, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)

In [548]:
print(model)

TitanicNetwork(
  (layer_in): Linear(in_features=9, out_features=63, bias=True)
  (layer_2): Linear(in_features=63, out_features=63, bias=True)
  (layer_out): Linear(in_features=63, out_features=1, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.1, inplace=False)
  (batchnorm1): BatchNorm1d(63, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batchnorm2): BatchNorm1d(63, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


In [549]:
print(list(model.named_parameters()))

[('layer_in.weight', Parameter containing:
tensor([[ 0.2511, -0.2858, -0.2811,  0.0361, -0.0139,  0.0239,  0.2622, -0.2875,
          0.0937],
        [ 0.2804, -0.1831,  0.0699, -0.0208,  0.0314, -0.1375,  0.1234, -0.0567,
         -0.3042],
        [ 0.0486,  0.0487, -0.2226,  0.0495,  0.0342,  0.1533, -0.2531,  0.1877,
         -0.0346],
        [-0.1577,  0.1173,  0.1827,  0.3192, -0.0918, -0.2573, -0.2096, -0.0716,
          0.0962],
        [ 0.2163,  0.2198,  0.1493,  0.0567, -0.1689,  0.0474,  0.0701,  0.0451,
          0.2111],
        [-0.1510,  0.3316, -0.1474,  0.1275, -0.1062, -0.1795, -0.2702, -0.0518,
          0.1759],
        [-0.1815, -0.1764,  0.0286,  0.1765, -0.3160,  0.1961,  0.0789, -0.2222,
          0.1565],
        [ 0.3032,  0.0152,  0.0519, -0.0879,  0.2113, -0.1496, -0.2094, -0.0422,
         -0.1987],
        [-0.2176, -0.1315,  0.0909, -0.0202, -0.2675, -0.0486, -0.2967, -0.0855,
          0.2501],
        [ 0.1660, -0.2773, -0.0044, -0.0480,  0.2530, -0.

In [550]:
# Init loss function (Binary Cross Entropy Loss) (we assume that target is equally distributed)
loss_function = nn.BCEWithLogitsLoss()

optimizer = optim.Adam(model.parameters(), lr=0.001)

### DataLoaders and Dataset

Before training, we should implement custom Dataset class

In [551]:
class TitanicTrainDataset(Dataset):

    def __init__(self, X_data, y_data):
        self.X_data = X_data
        self.y_data = y_data

    def __getitem__(self, index):
        """ Return the single observation, including both independent and dependent variable """
        # Convert idx from tensor to list due to pandas bug (that arises when using pytorch's random_split)
        # if isinstance(index, torch.Tensor):
        #     index = index.tolist()

        return self.X_data[index], self.y_data[index]
        # return [self.X_data.iloc[index].values, self.y_data[index]]

    def __len__(self):
        """ Return the number of rows from tabular data """
        return len(self.X_data)

In [552]:
class TitanicTestDataset(Dataset):

    def __init__(self, X_data):
        self.X_data = X_data

    def __getitem__(self, index):
        """ Return the single observation, including only independent variable """
        if isinstance(index, torch.Tensor):
            index = index.tolist()

        return self.X_data[index]

    def __len__(self):
        """ Return the number of rows from tabular data """
        return len(self.X_data)

In [553]:
train_data = TitanicTrainDataset(torch.Tensor(X_train.values), torch.Tensor(y_train.values))
test_data = TitanicTestDataset(torch.Tensor(X_test.values))

In [554]:
# Test len method
train_data.__len__()

596

In [555]:
train.__len__()

891

In [556]:
y_train.value_counts()

0    374
1    222
Name: Survived, dtype: int64

In [557]:
# Test getitem method
train_data.__getitem__(1)

(tensor([718.0000,   3.0000,  24.0000,   0.0000,   0.0000,  15.5000,   1.0000,
           0.0000,   1.0000]),
 tensor(0.))

In [558]:
# Initialize dataloaders
BATCH_SIZE_TRAIN = 64
BATCH_SIZE_TEST = 1
train_dataloader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE_TRAIN, shuffle=True)
test_dataloader = DataLoader(dataset=test_data, batch_size=BATCH_SIZE_TEST, shuffle=False)

### Train

In [559]:
# Define binary accuracy function
def binary_acc(y_pred, y_test):
    y_pred_tag = torch.round(torch.sigmoid(y_pred))

    correct_results_sum = (y_pred_tag == y_test).sum().float()
    acc = correct_results_sum/y_test.shape[0]
    acc = torch.round(acc * 100)

    return acc

In [560]:
EPOCHS = 50

# Prepare model for training (default state)
model.train()

for e in range(1, EPOCHS+1):

    # Init loss and acc per epoch to zero
    epoch_loss = 0
    epoch_acc = 0

    # For loop to get our data in batches from dataloader
    for X_batch, y_batch in train_dataloader:

        # Load batches into memory (device)
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        # Set .grad attribute of all tensors to zero (otherwise we would accumulate it with .backwards())
        optimizer.zero_grad()

        # Forward pass (use input data to make prediction)
        y_pred = model(X_batch)

        # Calculate loss based on prediction and true value
        loss = loss_function(y_pred, y_batch.unsqueeze(1))
        acc = binary_acc(y_pred, y_batch.unsqueeze(1))

        # We backpropagate this error through the network
        # The gradient is calculated for tensors which requires_grad=True
        loss.backward()

        # Adjust each parameter (weight and bias) by its gradient stored in .grad
        optimizer.step()

        # Calculate loss and accuracy
        epoch_loss += loss.item()
        epoch_acc += acc.item()

    print(f'Epoch {e+0:03}: | Loss: {epoch_loss/len(train_dataloader):.5f} | Acc: {epoch_acc/len(train_dataloader):.3f}')

Epoch 001: | Loss: 0.72189 | Acc: 50.400
Epoch 002: | Loss: 0.65755 | Acc: 66.100
Epoch 003: | Loss: 0.64609 | Acc: 64.700
Epoch 004: | Loss: 0.63892 | Acc: 67.600
Epoch 005: | Loss: 0.62667 | Acc: 70.000
Epoch 006: | Loss: 0.63114 | Acc: 65.100
Epoch 007: | Loss: 0.62747 | Acc: 66.200
Epoch 008: | Loss: 0.61022 | Acc: 68.100
Epoch 009: | Loss: 0.60496 | Acc: 69.000
Epoch 010: | Loss: 0.59772 | Acc: 68.000
Epoch 011: | Loss: 0.59636 | Acc: 69.000
Epoch 012: | Loss: 0.58457 | Acc: 69.500
Epoch 013: | Loss: 0.59443 | Acc: 69.400
Epoch 014: | Loss: 0.59562 | Acc: 69.900
Epoch 015: | Loss: 0.61137 | Acc: 66.600
Epoch 016: | Loss: 0.57027 | Acc: 70.500
Epoch 017: | Loss: 0.58791 | Acc: 67.800
Epoch 018: | Loss: 0.57017 | Acc: 69.600
Epoch 019: | Loss: 0.55908 | Acc: 73.800
Epoch 020: | Loss: 0.51940 | Acc: 74.400
Epoch 021: | Loss: 0.47065 | Acc: 77.900
Epoch 022: | Loss: 0.48797 | Acc: 77.700
Epoch 023: | Loss: 0.47309 | Acc: 77.700
Epoch 024: | Loss: 0.46964 | Acc: 77.600
Epoch 025: | Los

### Test

In [561]:
y_pred_list = []

# Prepare model for testing
model.eval()

# We don't want to perform backpropagation
with torch.no_grad():

    # Go through batches
    for X_batch in test_dataloader:

        # Load batch into memory
        X_batch = X_batch.to(device)

        # Get the result
        y_test_pred = model(X_batch)

        y_test_pred = torch.sigmoid(y_test_pred)

        # Round probabilities to 0 or 1
        y_pred_tag = torch.round(y_test_pred)

        # Convert tensor to numpy object
        y_pred_list.append(y_pred_tag.cpu().numpy())

# Flatten the list to use confusion matrix and classification report
y_pred_list = [a.squeeze().tolist() for a in y_pred_list]

In [562]:
from sklearn.metrics import confusion_matrix, classification_report

# Plot confusion matrix
confusion_matrix(y_test, y_pred_list)

array([[139,  36],
       [ 24,  96]])

In [563]:
print(classification_report(y_test, y_pred_list))

              precision    recall  f1-score   support

           0       0.85      0.79      0.82       175
           1       0.73      0.80      0.76       120

    accuracy                           0.80       295
   macro avg       0.79      0.80      0.79       295
weighted avg       0.80      0.80      0.80       295



### Submit

### Preprocessing

In [564]:
# Use fillna with most frequent values
test.fillna(modes, inplace=True)
test.reset_index(inplace=True)

test.drop(columns=['Name', 'Ticket', 'Cabin'], inplace=True)

# Use one_hot_encode for gender
gender_test = one_hot_encoder.transform(test[['Sex']])
gender_test_df = pd.DataFrame(gender_test.toarray(), columns=['Male', 'Female'])

# Merge train with encoded gender
test = pd.concat([test, gender_test_df], axis=1)

# Drop unused column
test.drop(columns=['Sex'], inplace=True)

# Encode 'Embarked' with label encoder
test['Embarked'] = label_encoder.transform(test['Embarked'])

# Set PassangerID as index
test.set_index('PassengerId', inplace=True)

# Print
test

Unnamed: 0_level_0,index,Pclass,Age,SibSp,Parch,Fare,Embarked,Male,Female
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
892,0,3,34.5,0,0,7.8292,1,0.0,1.0
893,1,3,47.0,1,0,7.0000,2,1.0,0.0
894,2,2,62.0,0,0,9.6875,1,0.0,1.0
895,3,3,27.0,0,0,8.6625,2,0.0,1.0
896,4,3,22.0,1,1,12.2875,2,1.0,0.0
...,...,...,...,...,...,...,...,...,...
1305,413,3,24.0,0,0,8.0500,2,0.0,1.0
1306,414,1,39.0,0,0,108.9000,0,1.0,0.0
1307,415,3,38.5,0,0,7.2500,2,0.0,1.0
1308,416,3,24.0,0,0,8.0500,2,0.0,1.0


In [565]:
# Apply model on test data
sub_X = torch.tensor(test.values)

# Get the results
sub_y = titanic_network(sub_X.float())

# Get survived
survived = torch.heaviside(sub_y, values=torch.tensor([0.0]))

In [566]:
survived

tensor([[0.],
        [1.],
        [0.],
        [1.],
        [0.],
        [0.],
        [1.],
        [0.],
        [0.],
        [0.],
        [0.],
        [1.],
        [0.],
        [0.],
        [0.],
        [0.],
        [1.],
        [0.],
        [0.],
        [1.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [1.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [1.],
        [0.],
        [0.],
        [0.],
        [0.],
        [1.],
        [0.],
        [1.],
        [0.],
        [0.],
        [1.],
        [0.],
      

In [567]:
test.reset_index(inplace=True)

In [568]:
test['Survived'] = survived.detach().numpy().astype(int)
sub_df = test[['PassengerId', 'Survived']]
sub_df.to_csv('submission/sub-pytorch.csv', index=False)

In [569]:
!head 'submission/sub-fastai.csv'

PassengerId,Survived
892,0
893,0
894,0
895,0
896,0
897,0
898,1
899,0
900,1
