## Spaceship Titanic Prediction with DNN

In [None]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import StratifiedKFold
import numpy as np 
import tensorflow as tf
import pandas as pd
import math
from tensorflow import keras
import tensorflow_addons as tfa

## Import datasets

In [None]:
## Adapted, with some modifications, from the following notebooks:
## https://www.kaggle.com/max1mum/blasted-wormholes-detailed-eda-and-models
## https://www.kaggle.com/edwintyh/pycaret-spaceship-fe-catboost
def fill_missing(data, age_fill=None):
    """Fill missing values of the raw passenger table in place.

    Categorical columns receive placeholder labels, the boolean flags default
    to ``False``, ``Age`` is filled with a single value, and every spending
    column is filled with its own median computed from ``data``.

    Args:
        data: DataFrame containing the columns ``HomePlanet``, ``CryoSleep``,
            ``Cabin``, ``Destination``, ``Name``, ``Age``, ``VIP`` and the five
            spending columns. It is modified in place.
        age_fill: Value used for missing ages. When ``None`` (the default) the
            first (smallest) mode of ``data['Age']`` is used, truncated to an
            integer.

    Returns:
        None. The caller's frame is updated directly.
    """
    constant_fills = {
        'HomePlanet': 'None',
        'CryoSleep': False,
        'Cabin': 'Unknown/0/Unknown',
        'Destination': 'None',
        'Name': 'Unknown Unknown',
        'VIP': False,
    }
    # Assign each filled column back instead of calling fillna(inplace=True)
    # on the column: the in-place form operates on a temporary object and does
    # not update `data` when pandas copy-on-write is active.
    for key, value in constant_fills.items():
        data[key] = data[key].fillna(value)

    if age_fill is None:
        # mode() can return several values (ties) or none (all ages missing),
        # so pick the first one explicitly rather than converting the Series.
        age_modes = data['Age'].mode()
        if not age_modes.empty:
            age_fill = int(age_modes.iloc[0])
    if age_fill is not None:
        data['Age'] = data['Age'].fillna(age_fill)

    for key in ['RoomService', "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]:
        data[key] = data[key].fillna(data[key].median())
    
def feature_engineering(data):
    """Derive the model features from the raw columns, modifying ``data`` in place.

    Steps performed:
      * cast the ``VIP`` and ``CryoSleep`` flags to integers;
      * split ``Cabin`` ("deck/num/side") into ``Deck``, ``Num`` and ``Side``;
      * split ``Name`` on spaces into ``FirstName`` and ``LastName``;
      * add ``TotalSpend`` and the share of each spending column in it;
      * remove the ``Cabin``, ``PassengerId`` and ``Name`` columns.

    Args:
        data: DataFrame with the raw columns listed above and no missing
            values in ``Cabin`` or ``Name``.

    Returns:
        None. The caller's frame is updated directly.
    """
    spend_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

    def cabin_part(position):
        # Extractor for one '/'-separated field of a cabin label.
        return lambda cabin: str(cabin).split('/')[position]

    def name_part(position):
        # Extractor for one space-separated token of a passenger name.
        return lambda name: name.split(" ")[position]

    flags = ['VIP', 'CryoSleep']
    data[flags] = data[flags].astype(int)

    cabins = data["Cabin"]
    data["Deck"] = cabins.apply(cabin_part(0))
    data["Num"] = cabins.apply(lambda cabin: int(cabin_part(1)(cabin)))
    data["FirstName"] = data["Name"].apply(name_part(0))
    data["LastName"] = data["Name"].apply(name_part(1))
    data["Side"] = data["Cabin"].apply(cabin_part(2))

    # Accumulate left to right, then add a tiny epsilon so the ratios below
    # never divide by zero for passengers who spent nothing.
    total = data[spend_columns[0]]
    for column in spend_columns[1:]:
        total = total + data[column]
    data['TotalSpend'] = total + 1e-8

    for column in spend_columns:
        data[f'Pct{column}'] = data[column] / data['TotalSpend']

    for column in ("Cabin", "PassengerId", "Name"):
        data.pop(column)

In [None]:
# Load both competition splits; the label column is separated from the
# training features right away.
train = pd.read_csv("../input/spaceship-titanic/train.csv")
train_targets = train.pop("Transported").astype(int)
test = pd.read_csv("../input/spaceship-titanic/test.csv")
n_train_rows = len(train)

# Process train and test together so both receive identical treatment.
data = pd.concat([train, test])
fill_missing(data)
feature_engineering(data)

# Promote every integer-typed column to float.
integer_columns = [name for name in data.columns if "int" in str(data[name].dtype)]
for name in integer_columns:
    data[name] = data[name].astype(float)

# Split the processed table back into its two parts by row position.
train, test = data.iloc[:n_train_rows], data.iloc[n_train_rows:]
train.head()

## Create Tensorflow Dataset

In [None]:
def preprocess(x, y):
    """Split the categorical block of one element into four separate inputs.

    Args:
        x: Pair ``(categorical, numeric)``; ``categorical`` must be indexable
            at positions 0 through 3.
        y: Label, passed through unchanged.

    Returns:
        ``((c0, c1, c2, c3), numeric), y``.
    """
    categorical = x[0]
    numeric = x[1]
    separate_inputs = (categorical[0], categorical[1], categorical[2], categorical[3])
    return (separate_inputs, numeric), y
def make_dataset(category_df, numeric_df, target, batch_size=128, mode="train"):
    """Build a batched ``tf.data`` pipeline of ``((categories, numerics), label)``.

    Args:
        category_df: Categorical feature values, one row per sample.
        numeric_df: Numeric feature values, row-aligned with ``category_df``.
        target: Labels, row-aligned with the feature tables; must support
            ``len()``.
        batch_size: Number of samples per batch.
        mode: ``"train"`` enables shuffling; any other value keeps row order.

    Returns:
        A ``tf.data.Dataset`` yielding preprocessed, batched elements.
    """
    dataset = tf.data.Dataset.from_tensor_slices(((category_df, numeric_df), target))
    # Cache the individual samples *before* shuffling and batching. Caching
    # after them would store the first epoch's order and batch composition and
    # replay it unchanged on every later epoch.
    dataset = dataset.map(preprocess).cache()
    if mode == "train":
        # A buffer covering the whole set gives a uniform shuffle; a buffer of
        # one batch only reorders neighbouring samples.
        dataset = dataset.shuffle(buffer_size=max(len(target), 1))
    return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

## Model Development

In [None]:
# Columns fed to the model as strings (one lookup + embedding each).
categorical_features = [
    "HomePlanet",
    "Destination",
    "Deck",
    "Side",
]
# Columns fed to the model as a single numeric vector, in this order.
numerical_fetures = [
    "CryoSleep",
    "Age",
    "VIP",
    "RoomService",
    "Num",
    "FoodCourt",
    "ShoppingMall",
    "Spa",
    "VRDeck",
    "TotalSpend",
    "PctRoomService",
    "PctFoodCourt",
    "PctShoppingMall",
    "PctSpa",
    "PctVRDeck",
]

### Create Normalization Layer

In [None]:
# Shared normalization layer for the numeric inputs; every model built by
# get_model() reuses this one instance.
normalization_layer = keras.layers.Normalization()
# adapt() is run on the CPU device and sees only the training rows.
with tf.device("CPU"):
    normalization_layer.adapt(train[numerical_fetures])

### Create Lookup layers

In [None]:
%%time
# One StringLookup layer per categorical column, with its vocabulary taken
# from the distinct values present in the training rows.
lookupLayersMap = {
    column: tf.keras.layers.StringLookup(vocabulary=list(train[column].unique()))
    for column in categorical_features
}

### Model Building

In [None]:
def get_model():
    """Build and compile the two-branch binary classifier.

    One branch embeds each categorical feature (string input -> lookup ->
    embedding), concatenates the embeddings and passes them through three
    Dense/Dropout/BatchNormalization stages. The other branch normalizes the
    numeric feature vector and passes it through three stages of the same
    kind. Both branches are concatenated and reduced to a single sigmoid
    output.

    Returns:
        A compiled ``keras.Model`` whose inputs are the categorical inputs (in
        ``categorical_features`` order) followed by the numeric input.
    """
    def dense_stack(tensor):
        # Three identical Dense(128, relu) -> Dropout(0.4) -> BatchNorm stages.
        for _ in range(3):
            tensor = keras.layers.Dense(128, activation="relu")(tensor)
            tensor = keras.layers.Dropout(0.4)(tensor)
            tensor = keras.layers.BatchNormalization()(tensor)
        return tensor

    categorical_inputs = []
    embedded_features = []
    for column in categorical_features:
        string_input = keras.Input(shape=(1, ), name=f"{column}", dtype=tf.string)
        lookup = lookupLayersMap[column]
        vocab_size = len(lookup.get_vocabulary())
        # Embedding width grows with the square root of the vocabulary,
        # capped at 16.
        embed_dimension = min(math.ceil(np.sqrt(vocab_size)), 16)
        token_ids = lookup(string_input)
        embedding = keras.layers.Embedding(vocab_size, embed_dimension, input_length=1)(token_ids)
        flat_embedding = keras.layers.Reshape((-1,))(embedding)
        categorical_inputs.append(string_input)
        embedded_features.append(flat_embedding)

    categorical_branch = dense_stack(
        keras.layers.Concatenate(axis=-1)(embedded_features)
    )

    numeric_input = keras.Input(shape=(len(numerical_fetures), ))
    numeric_branch = dense_stack(normalization_layer(numeric_input))

    merged = keras.layers.Concatenate(axis=-1)([categorical_branch, numeric_branch])
    merged = keras.layers.Dense(32, activation="relu")(merged)
    output = keras.layers.Dense(1, activation="sigmoid")(merged)

    model = keras.Model(inputs=categorical_inputs + [numeric_input], outputs=output)
    optimizer = tfa.optimizers.AdamW(learning_rate=3e-4, weight_decay=0.0001)
    model.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=["accuracy"])
    return model

In [None]:
# Build one model instance to inspect the architecture; the training loop
# further down creates a fresh model for every fold.
model = get_model()
model.summary()

In [None]:
# Draw the layer graph of the model built above, including tensor shapes.
keras.utils.plot_model(model, show_shapes=True)

### Model Training

In [None]:
# Train one model per stratified fold and keep each fold's best checkpoint.
models = []
kfold = StratifiedKFold(7, shuffle=True, random_state=2022)
for fold, (train_indices, valid_indices) in enumerate(kfold.split(train, train_targets)):
    x_train = train.iloc[train_indices]
    x_val = train.iloc[valid_indices]
    y_train = train_targets.iloc[train_indices]
    y_val = train_targets.iloc[valid_indices]
    train_ds = make_dataset(x_train[categorical_features], x_train[numerical_fetures], y_train, mode="train")
    # Pass a non-"train" mode explicitly: make_dataset defaults to "train",
    # which would shuffle the validation rows for no benefit.
    valid_ds = make_dataset(x_val[categorical_features], x_val[numerical_fetures], y_val, mode="valid")
    checkpoint_path = f"model_{fold}.tf"
    cp = keras.callbacks.ModelCheckpoint(checkpoint_path, monitor="val_accuracy", save_best_only=True, save_weights_only=True)
    es = keras.callbacks.EarlyStopping(patience=10)
    reduce_lr = keras.callbacks.ReduceLROnPlateau(factor=0.5, patience=5)
    model = get_model()
    # reduce_lr is now registered with fit(); it used to be created but never
    # passed in, so the learning rate was never reduced.
    history = model.fit(train_ds, epochs=50, validation_data=valid_ds, callbacks=[cp, es, reduce_lr])
    pd.DataFrame(history.history).plot()
    # Restore the weights of the epoch with the best validation accuracy.
    model.load_weights(checkpoint_path)
    models.append(model)

## Submission

In [None]:
def preprocess_test(category, numeric):
    """Shape one unlabeled element like a training element.

    Args:
        category: Categorical values, indexable at positions 0 through 3.
        numeric: Numeric feature vector, passed through unchanged.

    Returns:
        ``((c0, c1, c2, c3), numeric), 0`` where ``0`` is a placeholder label.
    """
    separate_inputs = (category[0], category[1], category[2], category[3])
    placeholder_label = 0
    return (separate_inputs, numeric), placeholder_label
def make_test_dataset(category_df, numeric_df, batch_size=32):
    """Build the batched, unshuffled ``tf.data`` pipeline used for inference.

    Args:
        category_df: Categorical feature values, one row per sample.
        numeric_df: Numeric feature values, row-aligned with ``category_df``.
        batch_size: Number of samples per batch.

    Returns:
        A ``tf.data.Dataset`` of elements produced by ``preprocess_test``.
    """
    features = (category_df, numeric_df)
    return (
        tf.data.Dataset.from_tensor_slices(features)
        .map(preprocess_test)
        .batch(batch_size)
    )
def inference(ds, models):
    """Average the predictions of several models and threshold them at 0.5.

    Args:
        ds: Dataset handed to each model's ``predict`` method.
        models: Iterable of objects exposing ``predict(ds)``.

    Returns:
        Boolean NumPy array, ``True`` where the mean prediction exceeds 0.5.
    """
    per_model_predictions = []
    for model in models:
        per_model_predictions.append(model.predict(ds))
    mean_prediction = np.mean(per_model_predictions, axis=0)
    is_positive = mean_prediction > 0.5
    return np.array(is_positive, dtype=np.bool_)

In [None]:
# Batched inference dataset built from the processed test rows; the bare
# name on the last line makes the notebook display it.
test_ds =  make_test_dataset(test[categorical_features], test[numerical_fetures])
test_ds

In [None]:
# Start from the sample submission file and overwrite its "Transported"
# column with the ensemble's boolean predictions, then save it.
submission = pd.read_csv("../input/spaceship-titanic/sample_submission.csv")
submission["Transported"] = inference(test_ds, models)
submission.to_csv("submission.csv", index=False)
submission.head()