In [None]:
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file in it.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the labelled training set and the unlabelled competition test set.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/spaceship-titanic/train.csv",
        "/kaggle/input/spaceship-titanic/test.csv",
    )
)
# Column dtypes and non-null counts of the training frame.
train.info()

## Expanding Features
Some features, such as Cabin and PassengerId, pack several pieces of information about the passenger into a single value. Splitting them into separate columns (deck / cabin number / side, and group / number within the group) exposes that information to the ML model during training.

In [None]:
# Split "Cabin" on "/" into its three components for both frames.
# Rows with a missing Cabin get NaN in all three new columns.
cabin_parts = ["Deck", "Cabin_Num", "Side"]
for frame in (train, test):
    frame[cabin_parts] = frame["Cabin"].str.split("/", expand=True)
train.head()

In [None]:
# Split "PassengerId" on "_" into the part before and the part after the underscore.
id_parts = ["Group", "Group_Num"]
for frame in (train, test):
    frame[id_parts] = frame["PassengerId"].str.split("_", expand=True)
train.head()

## Replacing NaN values

I will first replace missing values using simple methods such as the median or the mode. Later I might look into imputing them with unsupervised learning, such as clustering.

The test set's NaNs are filled with the same values computed from the training data, so that no information from the test set leaks into the preprocessing.

In [None]:
# Per-column count of missing values, listing only the columns that have any.
train_missing = train.isna().sum()
for col, n_missing in train_missing[train_missing > 0].items():
    print(f"{col}: {n_missing}")

print("\n", "test NaNs", "\n")
test_missing = test.isna().sum()
for col, n_missing in test_missing[test_missing > 0].items():
    print(f"{col}: {n_missing}")

In [None]:
# Number of distinct non-null values per training column.
for col, n_unique in train.nunique().items():
    print(f"{col}: {n_unique}")

In [None]:
# Low-cardinality columns whose missing values are imputed with the most frequent value.
mode_list = ["HomePlanet", "CryoSleep", "Destination", "VIP", "Deck", "Side"]

# Compute each mode once on the training data and reuse it for the test data,
# so the test set is imputed with training statistics only.
for col in mode_list:
    train_mode = train[col].mode()[0]
    train[col] = train[col].fillna(train_mode)
    test[col] = test[col].fillna(train_mode)

# Report the training columns that still contain missing values.
still_missing = train.isna().sum()
for col, n_missing in still_missing[still_missing > 0].items():
    print(f"{col}: {n_missing}")

In [None]:
# Numeric columns whose missing values are imputed with the training median.
median_list = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]

# The median is taken from the training data once and applied to both frames.
for col in median_list:
    train_median = train[col].median()
    train[col] = train[col].fillna(train_median)
    test[col] = test[col].fillna(train_median)

# Report the training columns that still contain missing values.
still_missing = train.isna().sum()
for col, n_missing in still_missing[still_missing > 0].items():
    print(f"{col}: {n_missing}")

In [None]:
# The last columns get an explicit "no value recorded" marker instead of a statistic.
#
# The placeholder cabin is built row by row as "<Deck>/-1/<Side>" from each frame's OWN
# Deck and Side columns. Formatting a whole Series inside an f-string would instead paste
# the Series' printed representation (index, values, dtype) into every missing row, and
# the test frame must not be filled from the training frame's columns.
for frame in (train, test):
    placeholder_cabin = frame["Deck"].astype(str) + "/-1/" + frame["Side"].astype(str)
    # fillna with a Series aligns on the index, so each row receives its own placeholder.
    frame["Cabin"] = frame["Cabin"].fillna(placeholder_cabin)
    frame["Name"] = frame["Name"].fillna("No name listed")
    frame["Cabin_Num"] = frame["Cabin_Num"].fillna("-1")

# Report any training columns that still contain missing values (none are expected now).
still_missing = train.isna().sum()
for col, n_missing in still_missing[still_missing > 0].items():
    print(f"{col}: {n_missing}")

## Simple Pipeline

Because this is a simple model, I will use a pipeline to encode the categorical features as numbers, scale the data, and make predictions.



In [None]:
from sklearn.pipeline import Pipeline
from category_encoders.ordinal import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Baseline model: ordinal-encode the categorical columns, standardise, then fit a
# logistic regression. The encoder and scaler instances keep their own names because
# a later cell reuses them.
oe, scaler, logit = OrdinalEncoder(), StandardScaler(), LogisticRegression()

pipeline_steps = [
    ("Encoder", oe),
    ("Scaler", scaler),
    ("Logistic Regression", logit),
]
pipe = Pipeline(steps=pipeline_steps)

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target from the features, then hold out 20% of the rows for evaluation.
y = train["Transported"]
X = train.drop(columns="Transported")

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,  # fixed seed so the split is reproducible
)

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Fit the baseline pipeline on the training split and score it on the held-out split.
y_pred = pipe.fit(X_train, y_train).predict(X_test)

baseline_report = classification_report(y_test, y_pred)
print(baseline_report)

## Pytorch model

Use a neural network for prediction.

In [None]:
# Show the (rows, columns) shape of `train` at this point in the notebook.
train.shape

In [None]:
# Preprocessing-only pipeline for the neural network. It reuses the same `oe` and
# `scaler` objects as `pipe`, and is fitted on the training split only.
transform_pipe = Pipeline(steps=[("Encoder", oe), ("Scaler", scaler)])
transform_pipe.fit(X_train)

X_train_transform, X_test_transform = (
    transform_pipe.transform(split) for split in (X_train, X_test)
)

print(X_train_transform.shape, X_test_transform.shape)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader


# Change the train and test data to type tensor. 
class trainData(Dataset):
    """Dataset pairing each feature row with its label.

    Args:
        x_data: indexable collection of feature rows.
        y_data: indexable collection of labels, parallel to ``x_data``.
    """

    def __init__(self, x_data, y_data):
        self.x_data, self.y_data = x_data, y_data

    def __len__(self):
        # The number of samples is defined by the feature collection.
        return len(self.x_data)

    def __getitem__(self, index):
        features = self.x_data[index]
        label = self.y_data[index]
        return features, label

# Wrap the scaled training features and the labels as float tensors.
# They are plain data, not learnable parameters, so they must NOT be created with
# requires_grad=True: that makes autograd track every batch back to the full dataset,
# accumulates an unused .grad on these tensors at each backward pass, and yields
# non-leaf batches that cannot be handed to DataLoader worker processes.
train_data = trainData(
    torch.tensor(X_train_transform, dtype=torch.float),
    torch.tensor(y_train.to_numpy(), dtype=torch.float),
)

class testData(Dataset):
    """Dataset of feature rows only, used for inference where no labels are supplied.

    Args:
        x_data: indexable collection of feature rows.
    """

    def __init__(self, x_data):
        self.x_data = x_data

    def __len__(self):
        return len(self.x_data)

    def __getitem__(self, index):
        row = self.x_data[index]
        return row

# Held-out features as a float32 tensor (torch.float is an alias of torch.float32).
X_test_tensor = torch.tensor(X_test_transform, dtype=torch.float32)
test_data = testData(X_test_tensor)

In [None]:
# Set hyperparameters for neural network.
Epochs = 50
batch_size = 64
learning_rate = 3e-4

# Load the data into a data loader
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_data, batch_size=1)

In [None]:
# Create neural network for binary classification
class BinaryClass(nn.Module):
    """Small feed-forward network for binary classification.

    Layer order: Linear -> ReLU -> BatchNorm -> Linear -> ReLU -> BatchNorm -> Dropout -> Linear.
    ``forward`` returns one raw logit per sample; no sigmoid is applied here.

    Args:
        in_features: number of input columns. Defaults to 18, the width that was
            previously hard-coded, so ``BinaryClass()`` builds the same network as before.
        hidden1: width of the first hidden layer (default 64).
        hidden2: width of the second hidden layer (default 32).
        dropout: dropout probability applied before the output layer (default 0.2).
    """

    def __init__(self, in_features=18, hidden1=64, hidden2=32, dropout=0.2):
        super().__init__()
        # Attribute names and creation order are kept so state_dict keys and the
        # random initialisation sequence match the original definition.
        self.fc1 = nn.Linear(in_features, hidden1)
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.result = nn.Linear(hidden2, 1)

        self.relu = nn.ReLU()
        self.drop = nn.Dropout(p=dropout)
        self.bn1 = nn.BatchNorm1d(hidden1)
        self.bn2 = nn.BatchNorm1d(hidden2)

    def forward(self, x):
        """Return logits of shape ``(batch, 1)`` for an input of shape ``(batch, in_features)``."""
        x = self.relu(self.fc1(x))
        x = self.bn1(x)
        x = self.relu(self.fc2(x))
        x = self.bn2(x)
        x = self.drop(x)
        x = self.result(x)

        return x


In [None]:
# Use the first CUDA GPU when one is available, otherwise run on the CPU.
use_gpu = torch.cuda.is_available()
device = torch.device("cuda:0" if use_gpu else "cpu")
print(device)

In [None]:
# Build the network on the selected device and print its layer summary.
model = BinaryClass().to(device)
print(model)

# The loss takes raw logits (it applies the sigmoid internally); Adam updates all model weights.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

In [None]:
def binary_acc(y_pred, y_test):
    """Return the accuracy of a batch as a whole-number percentage.

    Args:
        y_pred: raw logits from the model.
        y_test: ground-truth labels (0/1) with the same shape as ``y_pred``.

    Returns:
        A scalar tensor holding the percentage of correct predictions, rounded
        to the nearest integer.
    """
    # Logits -> probabilities -> hard 0/1 labels.
    predicted_labels = torch.sigmoid(y_pred).round()

    n_correct = predicted_labels.eq(y_test).sum().float()
    fraction_correct = n_correct / y_test.shape[0]
    return torch.round(fraction_correct * 100)

In [None]:
# Train the network. Loss and accuracy are accumulated per batch and reported as
# the mean over the batches of each epoch.
model.train()
for e in range(1, Epochs + 1):
    epoch_loss, epoch_acc = 0, 0

    for X_batch, y_batch in train_loader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)
        # The model emits shape (batch, 1), so the labels get a matching trailing axis.
        targets = y_batch.unsqueeze(1)

        optimizer.zero_grad()
        y_pred = model(X_batch)

        loss = criterion(y_pred, targets)
        acc = binary_acc(y_pred, targets)

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    print(f'Epoch {e+0:03}: | Loss: {epoch_loss/len(train_loader):.5f} | Acc: {epoch_acc/len(train_loader):.3f}')

In [None]:
# Predict a hard 0/1 label for every held-out sample.
# eval() and no_grad() switch off dropout / batch-norm updates and gradient tracking.
y_pred_list = []
model.eval()
with torch.no_grad():
    for X_batch in test_loader:
        probabilities = torch.sigmoid(model(X_batch.to(device)))
        y_pred_list.append(torch.round(probabilities).cpu().numpy())

# Each batch holds a single sample, so squeezing it yields one plain float per prediction.
y_pred_list = [a.squeeze().tolist() for a in y_pred_list]


In [None]:
# Confusion matrix of the neural network's held-out predictions against the true labels.
confusion_matrix(y_test, y_pred_list)

In [None]:
# Classification report for the neural network on the held-out split.
print(classification_report(y_test, y_pred_list))

## Submit Predictions

Use this simple model to submit predictions and see how well it does. 

In [None]:
# y_real_pred = pipe.predict(test)

# Apply the fitted preprocessing to the competition test set and wrap it for inference.
test_transform = transform_pipe.transform(test)
real_test_data = testData(torch.tensor(test_transform, dtype=torch.float32))
real_test_loader = DataLoader(dataset=real_test_data, batch_size=1)

# Predict a hard 0/1 label for every row, one sample per batch, in file order.
y_real_pred = []
model.eval()
with torch.no_grad():
    for X_batch in real_test_loader:
        probabilities = torch.sigmoid(model(X_batch.to(device)))
        y_real_pred.append(torch.round(probabilities).cpu().numpy())

# Squeezing each single-sample batch yields one plain float per prediction.
y_real_pred = [a.squeeze().tolist() for a in y_real_pred]


In [None]:
# Preview the first rows of the processed test frame.
test.head()

In [None]:
# Assemble the submission frame: one row per test passenger with a boolean prediction.
# The predictions are exactly 0.0 or 1.0, so astype(bool) converts them directly to a
# real boolean column. Mapping them through replace({1.0: True, 0.0: False}) relies on
# pandas implicitly downcasting the resulting object column, which recent pandas
# versions deprecate.
y_real_pred = pd.Series(y_real_pred, name="Transported").astype(bool)
# Both pieces carry the default 0..n-1 index, so concat pairs them row by row.
df_pred = pd.concat([test["PassengerId"], y_real_pred], axis=1)
df_pred.head()

In [None]:
# Write the submission file to the working directory; index=False omits the row index column.
df_pred.to_csv("submission.csv", index=False)