In [None]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

import copy

In [None]:
# Load data sets
# Both CSV files are read relative to the notebook's working directory and are
# indexed by their "PassengerId" column, so that column becomes the row index
# instead of a feature.
X_train_full = pd.read_csv("../input/spaceship-titanic/train.csv", index_col="PassengerId")
X_test_full = pd.read_csv("../input/spaceship-titanic/test.csv", index_col="PassengerId")

# Preview the first rows of the training frame
X_train_full.head()

In [None]:
# Assumption 1: When in cryo sleep, you will not use any facilities like ShoppingMall, RoomService etc.
# All missing spending values of passengers in cryo sleep will be replaced with 0.0.
# Assumption 2: If no facilities have been used, CryoSleep-NaN will be changed to True.
# Assumption 3: If any facility (e.g. food court or room service) has been used, CryoSleep-NaN will be changed to False.
def cryosleep(df):
    """Fill gaps in the spending columns and in "CryoSleep" from one another.

    The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain "CryoSleep" and the five spending columns "ShoppingMall",
        "RoomService", "FoodCourt", "Spa" and "VRDeck".

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.

    Notes
    -----
    A row whose "CryoSleep" is missing keeps its NaN when neither rule applies,
    i.e. when no spending value is positive and at least one is itself missing.
    """
    spend_cols = ['ShoppingMall', 'RoomService', 'FoodCourt', 'Spa', 'VRDeck']

    # Assumption 1. The explicit "== True" is deliberate: the column can hold
    # True/False/NaN side by side, and NaN must not count as "asleep".
    asleep = df['CryoSleep'] == True
    for col in spend_cols:
        df[col] = np.where(df[col].isnull() & asleep, 0.0, df[col])

    # Comparisons against NaN are False, so a missing spending value neither
    # proves "no spending" nor "some spending".
    spend = df[spend_cols]
    no_spend = (spend == 0.0).all(axis=1)
    any_spend = (spend > 0.0).any(axis=1)

    # Assumption 2, then Assumption 3 (the two conditions are mutually exclusive)
    df['CryoSleep'] = np.where(df['CryoSleep'].isnull() & no_spend, True, df['CryoSleep'])
    df['CryoSleep'] = np.where(df['CryoSleep'].isnull() & any_spend, False, df['CryoSleep'])
    return df

# Apply the cryo-sleep imputation to both frames. cryosleep() writes into the
# frame it is given, so the return values are not reassigned; the second call
# is the cell's last expression, so the notebook displays the returned test frame.
cryosleep(X_train_full)
cryosleep(X_test_full)

In [None]:
# Drop the "Name" column and recode the boolean flags as "Yes"/"No" strings
X_train_full = X_train_full.drop(columns="Name")
X_test_full = X_test_full.drop(columns="Name")

# Values that are neither True nor False (e.g. still-missing flags) become NaN under .map
yes_no = {False:"No", True:"Yes"}
for frame in (X_train_full, X_test_full):
    for flag_col in ("CryoSleep", "VIP"):
        frame[flag_col] = frame[flag_col].map(yes_no)

X_train_full.head()

In [None]:
# Cabin column is assumed to be decklevel/cabinno./side (Portside or Starboard). Cabin no. should not have any relation to the event, however the location might.
# Below function splits the "Cabin" up in 2 new columns --> Deck level & Portside or Starboard
# Missing sides are drawn at random with equal probability for every observed side (50/50 for P and S)
def deck_side(df):
    """Replace "Cabin" with a "Deck" and a "Side" column.

    "Deck" is the first character of "Cabin" and "Side" the last one. Rows with
    a missing cabin keep a missing "Deck"; their "Side" is drawn uniformly at
    random from the sides observed in the frame. The frame is modified in place
    and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column "Cabin".

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, without the "Cabin" column.

    Notes
    -----
    The global NumPy random state is re-seeded with 48 on every call, so the
    drawn sides are repeatable but the caller's random stream is reset.
    """
    np.random.seed(48)
    df["Deck"] = df["Cabin"].str[0]
    df["Side"] = df["Cabin"].str[-1]

    missing = df["Side"].isnull()
    sides = list(df["Side"].value_counts().index)
    # Nothing to draw from (no observed side) or nothing to fill: leave "Side" as is
    if sides and missing.any():
        # Equal weights for however many sides were observed; for the usual two
        # sides this is exactly [0.5, 0.5]
        weights = [1.0 / len(sides)] * len(sides)
        df.loc[missing, "Side"] = np.random.choice(a=sides, p=weights, size=int(missing.sum()))

    # In-place drop returns None, so its result must not be assigned back to df
    df.drop("Cabin", axis=1, inplace=True)
    return df

# Split "Cabin" into "Deck"/"Side" on both frames. The return values are not
# used here: deck_side() adds and drops the columns on the frame passed in.
deck_side(X_train_full)
deck_side(X_test_full)

# Display the training frame to check the new columns
X_train_full

In [None]:
# Show every training row that still has at least one missing value
X_train_full.loc[X_train_full.isna().any(axis="columns")]

In [None]:
# Keep only the rows whose label is known, then separate the features from the label
X_train_full = X_train_full.dropna(subset=["Transported"])
X = X_train_full.drop(columns="Transported")
y = X_train_full.Transported

In [None]:
# Create tensor from label: False -> 0, True -> 1, stored as int64 class indices
y = torch.tensor(X_train_full["Transported"].to_numpy(dtype=bool), dtype=torch.long)

In [None]:
# Identify numerical columns in features/data
numeric_dtypes = ['int64', 'float64']
col_numeric = [cname for cname, dtype in X.dtypes.items() if dtype in numeric_dtypes]

# Identify categorical columns: object dtype with fewer than 10 distinct values
categorical_cols = [
    cname
    for cname in X.columns
    if X[cname].dtype == "object" and X[cname].nunique() < 10
]

In [None]:
# Keep the numerical columns first, followed by the categorical ones,
# and apply the same selection to the training and the test features
my_cols = [*col_numeric, *categorical_cols]
X = X.loc[:, my_cols]
X_test = X_test_full.loc[:, my_cols]

In [None]:
# Numerical columns will be scaled and imputed.
# MinMaxScaler runs first and the mean is imputed afterwards, on the scaled values.
numerical_transformer = Pipeline(steps=[
    ("scaler", MinMaxScaler()),
    ("imputer", SimpleImputer(strategy="mean"))
])

# Categorical columns will be imputed and dummy encoded.
# Missing categories get their own "Unknown" level; categories that were not
# seen during fit are ignored (all-zero dummies) at transform time.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="Unknown")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Numerical and categorical preprocessing will be combined.
# sparse_threshold=0 forces a dense NumPy array as output. With the default
# threshold the result switches to a scipy sparse matrix whenever the one-hot
# block makes the stacked output sparse enough, and torch.tensor() cannot
# convert a sparse matrix.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, col_numeric),
        ('cat', categorical_transformer, categorical_cols)
    ],
    sparse_threshold=0,
)

In [None]:
# Prepare data
# The preprocessor is fitted on the training features only and then applied
# unchanged to the test features.
# NOTE(review): X and X_test are overwritten with the transformed output, so
# re-running this cell without re-running the cells above feeds already
# transformed data back into the preprocessor.
X = preprocessor.fit_transform(X)
X_test = preprocessor.transform(X_test)

In [None]:
# Split data
# Hold out 20% of the rows for validation; random_state fixes the shuffle so
# the split is the same on every run.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.20, random_state=48)

In [None]:
# Convert the three feature arrays to float32 tensors
X_train, X_valid, X_test = (
    torch.tensor(features).float() for features in (X_train, X_valid, X_test)
)

In [None]:
# Pair features with labels
train_data = TensorDataset(X_train, y_train)
valid_data = TensorDataset(X_valid, y_valid)

# Training batches are reshuffled every epoch and an incomplete last batch is dropped
batchsize = 64
train_loader = DataLoader(train_data, batch_size=batchsize, shuffle=True, drop_last=True)

# The validation set is served as one single batch
valid_loader = DataLoader(valid_data, batch_size=len(valid_data))

In [None]:
# Create class for the model
def createSpaceTitanicNet(n_features=None):
    """Build a fresh network together with its loss function and optimizer.

    Parameters
    ----------
    n_features : int, optional
        Width of the input layer. When omitted it is read from the notebook
        global ``X_train`` as ``X_train.shape[1]``.

    Returns
    -------
    net : nn.Module
        Untrained network mapping ``n_features`` inputs to 2 output logits.
    lossfun : nn.CrossEntropyLoss
    optimizer : torch.optim.RMSprop
        Bound to the parameters of ``net``, learning rate 0.001.
    """
    if n_features is None:
        n_features = X_train.shape[1]

    class titanicNet(nn.Module):
        def __init__(self):
            super().__init__()

            ## Input layer
            self.input = nn.Linear(n_features, 300)

            ## Hidden layers
            ## bnormK normalises the input of fcK, hence its width matches the
            ## output of the preceding layer
            self.fc1 = nn.Linear(300, 600)
            self.bnorm1 = nn.BatchNorm1d(300)
            self.fc2 = nn.Linear(600, 600)
            self.bnorm2 = nn.BatchNorm1d(600)
            self.fc3 = nn.Linear(600, 200)
            self.bnorm3 = nn.BatchNorm1d(600)

            ## Output layer
            self.output = nn.Linear(200, 2)

        ## Forward pass
        def forward(self, x):
            # F.dropout defaults to training=True, which would keep dropping
            # units after net.eval(); tie it to the module's mode instead.
            x = F.relu(self.input(x))
            x = F.dropout(x, p=0.6, training=self.training)
            x = self.bnorm1(x)
            x = F.relu(self.fc1(x))
            x = F.dropout(x, p=0.5, training=self.training)
            x = self.bnorm2(x)
            x = F.relu(self.fc2(x))
            x = F.dropout(x, p=0.4, training=self.training)
            x = self.bnorm3(x)
            x = F.relu(self.fc3(x))
            x = F.dropout(x, p=0.4, training=self.training)
            # Raw logits: CrossEntropyLoss applies the log-softmax itself
            return self.output(x)

    ## Create the model instance
    net = titanicNet()

    ## Loss function
    lossfun = nn.CrossEntropyLoss()

    ## Optimizer
    optimizer = torch.optim.RMSprop(net.parameters(), lr=0.001)

    return net, lossfun, optimizer

In [None]:
# Function to train the model
def function2trainTheModel(numepochs=400):
    """Train a new network and keep the weights of the best validation epoch.

    A fresh model is obtained from ``createSpaceTitanicNet()``; batches come
    from the notebook globals ``train_loader`` and ``valid_loader``.

    Parameters
    ----------
    numepochs : int, optional
        Number of passes over ``train_loader`` (default 400).

    Returns
    -------
    trainAcc : list of float
        Mean batch accuracy (%) per epoch, measured in training mode.
    validAcc : list of float
        Accuracy (%) per epoch on the first batch of ``valid_loader``.
    losses : torch.Tensor
        Mean training loss per epoch, shape ``(numepochs,)``.
    theBestModel : dict
        ``"Accuracy"``: highest validation accuracy seen; ``"net"``: deep copy
        of the state dict at the first epoch that reached it (``None`` if no
        epoch scored above 0).
    """
    # Create a dictionary for the best model
    theBestModel = {"Accuracy":0,"net":None}

    # Create a new model
    net, lossfun, optimizer = createSpaceTitanicNet()

    # Initialise losses
    losses = torch.zeros(numepochs)
    trainAcc = []
    validAcc = []

    # Loop over epochs
    for epochi in range(numepochs):

        ## Switch on training mode
        net.train()

        ## Loop over training batch data
        batchAcc = []
        batchLoss = []

        for xb, yb in train_loader:

            ### Forward pass
            yHat = net(xb)
            loss = lossfun(yHat, yb)

            ### Back propagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            ### Batch loss
            batchLoss.append(loss.item())

            ### Batch accuracy, stored as a plain float so np.mean below
            ### averages numbers rather than a list of 0-d tensors
            batchAcc.append(100 * torch.mean((torch.argmax(yHat, axis=1) == yb).float()).item())

        ## Training accuracy
        trainAcc.append(np.mean(batchAcc))

        ## Average losses across batches
        losses[epochi] = np.mean(batchLoss)

        ## Validation accuracy (evaluation mode, no gradient tracking)
        net.eval()
        xb, yb = next(iter(valid_loader))
        with torch.no_grad():
            yHat = net(xb)
        validAcc.append(100 * torch.mean((torch.argmax(yHat, axis=1) == yb).float()).item())

        ## Strict ">" keeps the earliest epoch when several tie for the best score
        if validAcc[-1] > theBestModel["Accuracy"]:
            theBestModel["Accuracy"] = validAcc[-1]
            theBestModel["net"] = copy.deepcopy(net.state_dict())

        ## End of epoch: overwrite the same output line with the progress
        print(f"Epoch [{epochi + 1}/{numepochs}]",end='\r')

    # Function output
    return trainAcc, validAcc, losses, theBestModel

In [None]:
# Train the model
# Seed torch's global generator first: weight initialisation, the shuffling of
# the training batches and the dropout masks all draw from it, so without a
# seed every "Restart & Run All" produces a different model.
torch.manual_seed(48)
trainAcc, validAcc, losses, theBestModel = function2trainTheModel()

In [None]:
# Visualize losses and accuracy side by side
fig, ax = plt.subplots(1,2,figsize=(16,5))

# Left panel: mean training loss per epoch
ax[0].plot(losses.detach())
ax[0].set(ylabel="Loss", xlabel="Epoch", title="Losses")
ax[0].grid()

# Right panel: training vs. validation accuracy; the title reports the best
# validation accuracy and the y-axis is clipped to the 72-85% band
ax[1].plot(trainAcc,label="Train")
ax[1].plot(validAcc,label="Validation")
ax[1].set(
    ylabel="Accuracy (%)",
    xlabel="Epoch",
    ylim=[72,85],
    title=f"Validation Accuracy: {theBestModel['Accuracy']:.2f}",
)
ax[1].legend()
ax[1].grid()

plt.show()

In [None]:
# Re-create the best performing model
# createSpaceTitanicNet() returns (net, lossfun, optimizer); only the network is needed here.
bestnet = createSpaceTitanicNet()[0]
# Restore the weights that were saved at the best validation epoch
bestnet.load_state_dict(theBestModel["net"])

In [None]:
# Create predictions on the test data
# A freshly constructed module starts in training mode, where BatchNorm
# normalises with the statistics of the batch it is given. Switch to
# evaluation mode so the stored running statistics are used, and skip
# gradient tracking since nothing is back-propagated here.
bestnet.eval()
with torch.no_grad():
    preds = bestnet(X_test)

# Predicted class = index of the larger of the two logits, as a list of ints
predictions = preds.argmax(dim=1).tolist()

In [None]:
# Create output submission file: one row per test passenger, class index
# translated back to the boolean label
label_to_bool = {0:False,1:True}
output = pd.DataFrame({'PassengerId': X_test_full.index,
                       'Transported': predictions})
output = output.assign(Transported=output["Transported"].map(label_to_bool))

In [None]:
# Preview the first rows of the submission frame
output.head()

In [None]:
# Write the submission to the working directory, leaving out the row index
output.to_csv('submission.csv', index=False)