In [None]:
!pip install tab2img
!pip install gender_guesser

In [None]:
import numpy as np 
import pandas as pd
from PIL import Image
from dateutil.parser import parse
from typing import List
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from torch import optim
import torch.nn as nn
import sklearn.preprocessing as preprocessing

# Introduction

This is a fun competition, and I want to see how converting the data to an image format performs. I enjoy this approach, and I believe it could offer an advantage if I can develop it further. I generally find this to be a middle-of-the-pack prediction method, but perhaps there is a surprise in store for us.

# Feature Engineering

The aim here is to deal with the object columns and convert them to numerical types so that we can apply ML models to them. I'm going to rush through this; if you want a more detailed walkthrough, see my other notebook: https://www.kaggle.com/taranmarley/feature-engineering-eda-and-lightgbm

In [None]:
# Load the competition's train and test tables
df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/spaceship-titanic/train.csv",
        "../input/spaceship-titanic/test.csv",
    )
)

In [None]:
# Store the target as an integer
df = df.astype({"Transported": int})

In [None]:
def seperate_passenger_id(df_temp):
    """
    Add a "Passenger Class" column holding the number after the "_" in PassengerId.

    Ids of the form "gggg_pp" contribute int("pp"); ids without an underscore
    (including missing values) contribute 0. The frame is modified in place and
    also returned.
    """
    passenger_ids = df_temp["PassengerId"].astype(str)
    # .str[1] yields NaN when there is no second piece, i.e. no "_" in the id
    suffixes = passenger_ids.str.split("_").str[1]
    df_temp["Passenger Class"] = pd.to_numeric(suffixes).fillna(0).astype(int)
    return df_temp
df, test_df = (seperate_passenger_id(frame) for frame in (df, test_df))

**Cabin Details Separated**

In [None]:
def seperate_cabin(df_temp):
    """
    Split "Cabin" values of the form "a/b/c" into three new columns.

    Adds "letters" (first piece), "numbers" (second piece, kept as a string)
    and "final_letters" (third piece). Rows whose Cabin contains no "/"
    (including missing values) get None, -1 and None respectively. A cabin with
    only one "/" keeps its first two pieces and gets None for the third instead
    of raising IndexError. The frame is modified in place and also returned.
    """
    letters = []
    numbers = []
    final_letters = []
    for raw_cabin in df_temp["Cabin"]:
        cabin = str(raw_cabin)
        if "/" not in cabin:
            letters.append(None)
            numbers.append(-1)
            final_letters.append(None)
            continue
        # Split once per row instead of once per extracted field
        parts = cabin.split("/")
        letters.append(parts[0])
        numbers.append(parts[1])
        final_letters.append(parts[2] if len(parts) > 2 else None)
    df_temp["letters"] = letters
    df_temp["numbers"] = numbers
    df_temp["final_letters"] = final_letters
    return df_temp
# Split the cabin string, then discard the raw column
df, test_df = (
    seperate_cabin(frame).drop(columns="Cabin")
    for frame in (df, test_df)
)

In [None]:
# errors='ignore' is deprecated in pandas and would silently leave an object column behind.
# Coerce unparsable entries to NaN and map them to the same -1 sentinel used for "no cabin".
df["numbers"] = pd.to_numeric(df["numbers"], errors="coerce").fillna(-1).astype(int)
test_df["numbers"] = pd.to_numeric(test_df["numbers"], errors="coerce").fillna(-1).astype(int)

**Gender from Name**

In [None]:
import gender_guesser.detector as gender
def predict_gender(df):
    """
    Guess a gender from the first word of "Name" and one-hot encode it.

    Names without a space are labelled "unknown"; the detector's "mostly_male"
    and "mostly_female" answers are folded into "male" and "female". A "gender"
    column is written to the input frame, and the frame returned has that
    column replaced by its dummy columns.
    """
    detector = gender.Detector()
    collapse = {"mostly_male": "male", "mostly_female": "female"}
    guesses = []
    for raw_name in df["Name"]:
        full_name = str(raw_name)
        if " " not in full_name:
            guesses.append("unknown")
            continue
        guess = detector.get_gender(full_name.split(" ")[0])
        guesses.append(collapse.get(guess, guess))
    df["gender"] = guesses
    return pd.get_dummies(df, columns = ["gender"])

df, test_df = (predict_gender(frame) for frame in (df, test_df))

**Change the last names**

In [None]:
def last_names(df):
    """
    Replace each "Name" with its last space-separated word.

    Values without a space (including missing names) become None. The frame is
    modified in place and also returned.
    """
    def surname(raw_name):
        text = str(raw_name)
        return text.split(" ")[-1] if " " in text else None

    df["Name"] = [surname(raw_name) for raw_name in df["Name"]]
    return df
df, test_df = (last_names(frame) for frame in (df, test_df))

**Count Number of Family Members On Ship**

In [None]:
# Count distinct passengers per surname over train and test combined
df_temp = pd.concat([df, test_df], ignore_index=True)
df_temp['Num_Family_Members'] = df_temp.groupby(['Name'])['PassengerId'].transform('nunique')
# Split the combined column back using the real train length rather than a hard-coded row count
df['Num_Family_Members'] = df_temp['Num_Family_Members'][:len(df)].values
test_df['Num_Family_Members'] = df_temp['Num_Family_Members'][len(df):].values

**Remove the PassengerId Column**

In [None]:
df, test_df = (frame.drop(columns=["PassengerId"]) for frame in (df, test_df))

**Encode columns to one hot encoding**

In [None]:
# df.drop(columns="Name", inplace=True)
# test_df.drop(columns="Name", inplace=True)

In [None]:
def encode_columns(df, columns, test_df = None):
    """
    Encode categorical columns of a train frame and, optionally, a test frame.

    A column with fewer than 30 distinct string values in `df` is one-hot
    encoded with pd.get_dummies; any other column is replaced by integer codes
    from a LabelEncoder fitted on `df`.

    When `test_df` is given:
      * one-hot columns are aligned to the train frame: dummy columns for
        categories seen only in test are dropped, and categories seen only in
        train are added as all-zero columns, so both frames end up with the
        same dummy columns in the same order;
      * label-encoded values whose string form is not a class learned from
        `df` are mapped to the "None" class when it exists, otherwise to the
        extra code len(classes).

    Returns the tuple (df, test_df); test_df is None when none was passed.
    """
    for col in columns:
        le = preprocessing.LabelEncoder()
        le.fit(df[col].astype(str))
        if len(le.classes_) < 30:
            train_cols_before = set(df.columns)
            df = pd.get_dummies(df, columns = [col])
            dummy_cols = [c for c in df.columns if c not in train_cols_before]
            if test_df is not None:
                test_cols_before = set(test_df.columns)
                test_df = pd.get_dummies(test_df, columns = [col])
                test_dummy_cols = [c for c in test_df.columns if c not in test_cols_before]
                # Categories that only occur in test have no counterpart in train
                unseen = [c for c in test_dummy_cols if c not in dummy_cols]
                test_df = test_df.drop(columns=unseen)
                # Categories that only occur in train become all-zero columns in test
                for c in dummy_cols:
                    if c not in test_df.columns:
                        test_df[c] = pd.Series(0, index=test_df.index).astype(df[c].dtype)
                # Keep the dummy columns last and in the same order as in train
                passthrough = [c for c in test_df.columns if c not in dummy_cols]
                test_df = test_df[passthrough + dummy_cols]
        else:
            df[col] = le.transform(df[col].astype(str))
            if test_df is not None:
                # Build the lookup once instead of calling pd.unique for every test row
                known_labels = set(le.classes_)
                test_labels = test_df[col].astype(str)
                seen = test_labels.isin(known_labels)
                if "None" in known_labels:
                    test_df[col] = le.transform(test_labels.where(seen, "None"))
                else:
                    # No placeholder class was learned: give unseen labels their own code
                    codes = np.full(len(test_df), len(le.classes_), dtype=int)
                    if seen.any():
                        codes[seen.to_numpy()] = le.transform(test_labels[seen])
                    test_df[col] = codes
    return df, test_df

df, test_df = encode_columns(df, ["HomePlanet", "CryoSleep", "Destination", "VIP", "Name", "letters", "final_letters"], test_df)

**Fill in NaNs**

I will also record where there was a NaN in case that proves useful

In [None]:
Age_Recorded = []
def fillna_create_column(df_temp, columns, value = 0):
    """
    Fill missing values in the given columns and flag which rows had data.

    For every column in `columns`, adds "<col>_exists" (1 where a value was
    present, 0 where it was missing) and then replaces the missing entries with
    `value`. The frame is modified in place and also returned.
    """
    for col in columns:
        df_temp[col + "_exists"] = df_temp[col].notna().astype(int)
        # Honour the caller-supplied fill value (it used to be ignored in favour of 0)
        df_temp[col] = df_temp[col].fillna(value)
    return df_temp
df, test_df = (
    fillna_create_column(frame, ["Age","RoomService","FoodCourt","ShoppingMall","Spa","VRDeck","Num_Family_Members"])
    for frame in (df, test_df)
)

Check whether any NaNs remain in the data

In [None]:
def detect_NaNs(df_temp): 
    """Print the total number of NaNs and, if there are any, the count for every column."""
    nulls_per_column = df_temp.isnull().sum()
    count_nulls = nulls_per_column.sum()
    print('NaNs in data: ', count_nulls)
    if count_nulls > 0:
        print('******')
        for col, null_count in nulls_per_column.items():
            print('NaNs in', col + ": ", null_count)
        print('******')
    print('')
for frame_to_check in (df, test_df):
    detect_NaNs(frame_to_check)

**Create Interactions**

In [None]:
import itertools
def create_interactions(df_temp, column_list):
    """
    Add pairwise sums and running totals of the given columns.

    For every pair (x, y) of `column_list` a column "x+y" is added. Then a
    running total is kept over `column_list` in order; after the k-th column
    (k >= 1, zero-based) it is stored as "A<k>_iter_score". The frame is
    modified in place and also returned.
    """
    # Pairwise sums
    for left, right in itertools.combinations(column_list, 2):
        df_temp[left + "+" + right] = df_temp[left] + df_temp[right]
    # Running totals; the first column on its own is not stored
    running_total = 0
    for position, name in enumerate(column_list):
        running_total = running_total + df_temp[name]
        if position > 0:
            df_temp["A" + str(position) + "_iter_score"] = running_total
    return df_temp
df, test_df = (
    create_interactions(frame, ["RoomService","FoodCourt","ShoppingMall","Spa","VRDeck"])
    for frame in (df, test_df)
)

In [None]:
# The last running total covers all five spend columns: move it to a column named "TotalSpend"
df["TotalSpend"] = df.pop("A4_iter_score")
test_df["TotalSpend"] = test_df.pop("A4_iter_score")

In [None]:
def spend_by_age(df_temp):
    """Return a list with TotalSpend / Age for every row, using 0 where Age is 0."""
    return [
        (spend / age) if age != 0 else 0
        for spend, age in zip(df_temp["TotalSpend"], df_temp["Age"])
    ]
df = df.assign(spending_by_age=spend_by_age(df))
test_df = test_df.assign(spending_by_age=spend_by_age(test_df))

In [None]:
def create_interactions_based_on_total(df_temp, column_list, total_col_name):
    """
    Add "<col> per <total_col_name>" ratio columns.

    Each listed column is divided by `total_col_name`; infinite and NaN results
    (division by zero) are stored as 0. The frame is modified in place and also
    returned.
    """
    for name in column_list:
        ratio_col = name + " per " + total_col_name
        ratio = df_temp[name] / df_temp[total_col_name]
        # x/0 gives +/-inf and 0/0 gives NaN; both end up as 0
        df_temp[ratio_col] = ratio.replace([np.inf, -np.inf], np.nan).fillna(0)

    return df_temp
df, test_df = (
    create_interactions_based_on_total(frame, ["RoomService","FoodCourt","ShoppingMall","Spa","VRDeck"], "TotalSpend")
    for frame in (df, test_df)
)

In [None]:
df, test_df = (
    create_interactions_based_on_total(frame, ["RoomService","FoodCourt","ShoppingMall","Spa","VRDeck"], "Age")
    for frame in (df, test_df)
)

**Create PCA Features**

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

# Scale and project train and test together so both share one PCA basis
df_temp = pd.concat([df, test_df], ignore_index=True)
X = df_temp.drop(columns="Transported")
X_scaled = MinMaxScaler().fit_transform(X)
# Fixed seed so solvers that use randomness give the same components on every run
pca = PCA(n_components=10, random_state=0)
X_p = pca.fit(X_scaled).transform(X_scaled)
# Rows [0, len(df)) of the combined matrix are train, the rest are test
for i in range(X_p.shape[1]):
    df["PCA_" + str(i)] = X_p[:len(df), i]
    test_df["PCA_" + str(i)] = X_p[len(df):, i]

**Create Polynomial Features**

In [None]:
from sklearn.preprocessing import PolynomialFeatures

def PolynomialFeatures_labeled(input_df,power, target_col, interaction_only=False):
    '''Labeled wrapper around sklearn's PolynomialFeatures.

    PolynomialFeatures returns a bare array even when it is given a labeled
    dataframe; this wrapper rebuilds readable column names from the fitted
    transformer's powers_ matrix.

    Inputs:
    input_df = Your labeled pandas dataframe (list of x's not raised to any power)
    power = what order polynomial you want variables up to (passed straight to PolynomialFeatures)
    target_col = name of a column to leave out of the expansion and re-attach unchanged
                 to the output, or None when there is no such column
    interaction_only = forwarded to PolynomialFeatures

    Output:
    A dataframe with the same index as input_df. The first column is "Constant Term";
    every other column is labeled as its factors "<name>^<exponent>" joined by " x ".
    When target_col is given it is appended as the last column.

    Heavily modified from: https://stackoverflow.com/users/3633522/afflatus
    '''
    # Work on a copy so the caller's frame keeps its target column
    input_df = input_df.copy()
    target_col_saved = None
    if target_col is not None:
        target_col_saved = input_df.pop(target_col)

    poly = PolynomialFeatures(power, interaction_only=interaction_only)
    output_nparray = poly.fit_transform(input_df)

    input_feature_names = list(input_df.columns)
    target_feature_names = ["Constant Term"]
    # Row 0 of powers_ is the constant term; every other row lists one exponent per input feature
    for exponents in poly.powers_[1:]:
        factors = [
            "%s^%d" % (name, exponent)
            for name, exponent in zip(input_feature_names, exponents)
            if exponent != 0
        ]
        target_feature_names.append(" x ".join(factors))

    # Reuse the input index so re-attaching the target below aligns row for row
    output_df = pd.DataFrame(output_nparray, columns = target_feature_names, index = input_df.index)
    if target_col is not None:
        output_df[target_col] = target_col_saved
    return output_df

# Degree-2 expansion; only the train frame carries the target column
poly_df, poly_test_df = (
    PolynomialFeatures_labeled(frame, 2, target)
    for frame, target in ((df, "Transported"), (test_df, None))
)

**Split Train Test**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

train_ratio = 0.90

# Separate the target from the features
y = poly_df.pop("Transported")
X = poly_df

# Stratified hold-out split with a fixed seed
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=1 - train_ratio,
    stratify=y,
    random_state=0,
)

**Scale Train Test**

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Fit the scaler on the training split only, then apply it to every split
scaler = preprocessing.MinMaxScaler()
scaler.fit(X_train)
scaled_poly, scaled_poly_val, scaled_poly_test = (
    scaler.transform(split) for split in (X_train, X_val, poly_test_df)
)

# Create Image Dataset

**Convert tabular data to images**

In [None]:
from tab2img.converter import Tab2Img
# Fit the converter on the training split, then reuse it for validation and test
model = Tab2Img()
train_images = model.fit_transform(scaled_poly, y_train.values)
val_images, test_images = (
    model.transform(scaled) for scaled in (scaled_poly_val, scaled_poly_test)
)

**Visualize the Images**

In [None]:
def _show_image_grid(images, n_rows=2, n_cols=5):
    """Plot the first n_rows * n_cols entries of `images` in a grid (fewer if not enough exist)."""
    # squeeze=False keeps `ax` two-dimensional for any grid shape
    fig, ax = plt.subplots(n_rows, n_cols, squeeze=False)
    for i in range(min(len(images), n_rows * n_cols)):
        pixels = np.asarray(images[i])
        # Derive the side length from the data instead of hard-coding 55
        side = int(round(np.sqrt(pixels.size)))
        image = Image.fromarray(pixels.reshape(side, side) * 255)
        ax[i % n_rows][i // n_rows].imshow(image)
    # plt.show() also works on non-GUI notebook backends, where fig.show() only warns
    plt.show()

_show_image_grid(test_images)
_show_image_grid(train_images)

**Create the Custom Dataset Class**

We need this to be able to load the image and label into the model we will create. So we will create a custom dataset to handle this.

In [None]:
from torch.utils.data import Dataset

class CustomDataset(Dataset):
  """
  Dataset pairing each entry of X with the matching entry of y.

  X and y are indexed with the same integer; BatchSize is only stored (and used
  by num_of_batches); transform is applied to every item of X when it is fetched.
  """
  def __init__(self, X, y, BatchSize, transform):
    super().__init__()
    self.BatchSize = BatchSize
    self.y = y
    self.X = X
    self.transform = transform
    
  def num_of_batches(self):
    """
    Number of full batches in the dataset (a trailing partial batch is not counted)
    """
    # Previously referenced a non-existent `list_IDs` attribute and an unimported `math`
    return len(self.X) // self.BatchSize

  def __getitem__(self,idx):
    """Return (transformed item, label tensor) for position idx; NaNs in the item are replaced first."""
    class_id = self.y[idx]
    img = self.transform(np.nan_to_num(self.X[idx]))
    return img, torch.tensor(class_id)

  def __len__(self):
    return len(self.X)

**Instantiate the Datasets**

We will form them into torch dataloaders to make the data easier to work with. 

In [None]:
from torch.utils.data import DataLoader
from torchvision import transforms

# Convert each image to a tensor and normalise with mean 0.5 and std 0.5
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize([0.5], [0.5]),
])

dataset_stages = ['train', 'val', 'test']

batch_size = 32

# The test stage has no labels; row positions stand in for them
image_datasets = {
    'train': CustomDataset(train_images, y_train.values, batch_size, transform),
    'val': CustomDataset(val_images, y_val.values, batch_size, transform),
    'test': CustomDataset(test_images, range(0,len(test_df)), batch_size, transform),
}

# Only the test loader keeps its original order
dataloaders = {
    stage: DataLoader(
        image_datasets[stage],
        batch_size=image_datasets[stage].BatchSize,
        shuffle=(stage != 'test'),
        num_workers=0,
    )
    for stage in dataset_stages
}
dataset_sizes = {stage: len(image_datasets[stage]) for stage in dataset_stages}

Check an image from the dataset

In [None]:
# Second training sample, image part only
sample_tensor = image_datasets['train'][1][0]
image = transforms.ToPILImage()(sample_tensor.cpu()).convert("RGB")
display(image)

# Training Neural Network

**Create Training Function**

In [None]:
import time

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

def train_model(model, criterion, optimizer, scheduler, num_epochs=10, early_stop_value=0, categorical=True):
    """
    Train `model`, running one training pass and one validation pass per epoch.

    Reads the module-level `dataloaders` and `dataset_sizes` dicts (keys
    'train' and 'val') and the module-level `device`.

    Args:
        model: network to train; its mode is switched between train and eval per phase.
        criterion: callable `criterion(outputs, labels)` returning the loss;
            labels are passed as int64 tensors.
        optimizer: stepped once per training batch.
        scheduler: stepped once per epoch, after the training phase.
        num_epochs: maximum number of epochs.
        early_stop_value: when > 0, stop after the first epoch whose validation
            "Acc" exceeds this value.
        categorical: if True, "Acc" is the fraction of samples whose arg-max
            output equals the label; otherwise it is the per-sample average of
            | |output| - |label| |.

    Returns:
        The same `model` object after training.
    """
    since = time.time()

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)
        val_accuracy = None
        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            is_training = phase == 'train'
            if is_training:
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0
            samples_seen = 0
            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                # Cast to int64 directly on the target device (no CPU round trip)
                labels = labels.to(device).long()

                # Loading Bar
                if is_training:
                    samples_seen += inputs.size(0)
                    percentage_complete = min(samples_seen / dataset_sizes[phase] * 100, 100)
                    print("{:0.2f}".format(percentage_complete), "% complete", end='\r')

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(is_training):
                    outputs = model(inputs).float()
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if is_training:
                        loss.backward()
                        optimizer.step()

                # statistics
                # The loss is accumulated exactly once per batch in both modes
                # (the non-categorical branch used to add it a second time)
                running_loss += loss.item() * inputs.size(0)
                # Detach so the running statistics do not keep autograd graphs alive
                detached = outputs.detach()
                if categorical:
                    running_corrects += (detached.argmax(dim=1) == labels).sum()
                else:
                    running_corrects += (detached.abs() - labels.unsqueeze(1).float().abs()).abs().sum(dim=0)

            if is_training:
                scheduler.step()

            epoch_loss = running_loss / dataset_sizes[phase]
            # as_tensor keeps this working when the loader yielded no batches
            epoch_acc = torch.as_tensor(running_corrects).double() / dataset_sizes[phase]

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc.item()))
            if phase == 'val':
                val_accuracy = epoch_acc.item()
        # Early Stop
        if early_stop_value > 0 and val_accuracy is not None and val_accuracy > early_stop_value:
            print("*** EARLY STOP ***")
            break
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    return model

**Create Model**

In [None]:
from torchvision import models
from torch.optim import lr_scheduler

shufflenet = models.shufflenet_v2_x1_0()
# Swap in a stem convolution that takes one input channel and a head that emits two outputs
shufflenet.conv1[0] = nn.Conv2d(in_channels=1, out_channels=24, kernel_size=2, stride=1)
shufflenet.fc = nn.Linear(1024, 2)
model_ft = shufflenet

**Train Model**

In [None]:
criterion = nn.CrossEntropyLoss()

optimizer_ft = optim.Adam(params=model_ft.parameters(), lr=0.01)

# Multiply the learning rate by 0.01 every 7 epochs
exp_lr_scheduler = lr_scheduler.StepLR(optimizer=optimizer_ft, step_size=7, gamma=0.01)

model_ft = train_model(
    model=model_ft.to(device),
    criterion=criterion,
    optimizer=optimizer_ft,
    scheduler=exp_lr_scheduler,
    num_epochs=16,
)

# Run on Test Set

In [None]:
predictions = []

outputs = None

# Inference only: switch to eval mode once and skip autograd bookkeeping
model_ft.eval()
with torch.no_grad():
    # The second element of each batch is unused here
    for inputs, labels in dataloaders['test']:
        outputs = model_ft(inputs.to(device))
        # Index of the largest output per sample is the predicted class
        predictions.extend(outputs.argmax(dim=1).cpu().tolist())

Convert back to boolean as required for submission

In [None]:
# Class 0 becomes False, anything else True
new_predictions = [p != 0 for p in predictions]

# Create Submission

In [None]:
# Start from the sample file and overwrite its "Transported" column
submissiondf = pd.read_csv("../input/spaceship-titanic/sample_submission.csv").assign(
    Transported=new_predictions
)
submissiondf.head()

In [None]:
# Write the submission without the index column
submissiondf.to_csv(path_or_buf="submission.csv", index=False)

# Conclusion

These are much better results than I expected. A very good result from image recognition has been achieved here, which to me implies this could be a very strong method going forward for this dataset. As with all image recognition, the disadvantage can be interpretability. I do think this result is strongly driven by the polynomial features, which suggests to me that interactions can be a very strong path forward in this dataset and that power transforms should possibly be explored.