# 1. Load libraries and data

In [None]:
#Linear algebra and appearance
import pandas as pd
pd.options.display.float_format = '{:,.4f}'.format
#I want to see all features from the dataset given. But be careful, sometimes the output can be too large!
pd.options.display.max_rows = None 
pd.set_option('max_colwidth', 260)
import numpy as np

#Visualization setup
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib import ticker as tkr
from textwrap import wrap

#Chosen models
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import KFold

#Used metrics
from sklearn.metrics import accuracy_score #this one is not necessary
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error

#Data preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer

In [None]:
# Read the competition files. train/test use their first CSV column as the index;
# the sample submission keeps the default integer index.
train, test = (
    pd.read_csv(csv_path, index_col = 0)
    for csv_path in ('../input/spaceship-titanic/train.csv',
                     '../input/spaceship-titanic/test.csv')
)
submission = pd.read_csv('../input/spaceship-titanic/sample_submission.csv')

## 1.1. Constants

In [None]:
# Training / cross-validation settings shared by the modelling cells below.
random_state = 42   # seed handed to the KFold splitter
folds = 11          # number of cross-validation splits
batch_size = 1024   # batch size passed to model.fit
epochs = 250        # maximum epochs per fold (last layer's config required 60 epochs)

# 2. EDA

In [None]:
# Report row/column counts and in-memory size (KB, one decimal) for both frames.
for _frame, _template in (
    (train, 'The train data has {} rows and {} columns. Memory usage: {}KB.'),
    (test, 'The test data has {} rows and {} columns. Memory usage: {}KB.'),
):
    _n_rows, _n_cols = _frame.shape
    _size_kb = round(_frame.memory_usage().sum()/1024, 1)
    print(_template.format(_n_rows, _n_cols, _size_kb))

In [None]:
# Preview the first rows of the training frame as loaded from CSV.
train.head()

In [None]:
# Preview the first rows of the test frame as loaded from CSV.
test.head()

#### Cabin = deck/num/side

In [None]:
# Cabin is a '/'-separated string; expand it into three separate columns
# (Deck, Num, Side) on both frames.
_cabin_parts = ['Deck', 'Num', 'Side']
for _frame in (train, test):
    _frame[_cabin_parts] = _frame['Cabin'].str.split('/', expand = True)

In [None]:
# Per-column missing-value counts (sorted descending) side by side with each column's dtype.
_na_counts = train.isna().sum().sort_values(ascending = False)
missing_train = pd.concat([_na_counts, train.dtypes], axis = 1, keys = ['Total', 'Type'])

In [None]:
# Column overview: describe() statistics joined with the missing-value table and
# the number of unique values per column (the unnamed nunique Series arrives as column 0).
_overview = pd.concat([train.describe().T, missing_train, train.nunique()], axis = 1)
_overview = _overview.rename(columns = {'Total': 'Total missing', 0: 'Unique values'})

# The styled table is the cell's last expression, so it is what gets displayed.
(
    _overview.style
    .bar(subset = ['mean'], color = "#e9c46a")
    .background_gradient(subset = ['std', 'Total missing'], cmap = 'Reds')
    .background_gradient(subset = ['50%'], cmap = 'Pastel1')
)

## 2.1. Non-numeric data distribution

In [None]:
# One row per categorical feature: train counts in the left panel, test counts in the right.
# Where a bar order is used it always comes from the train frequencies, so both
# panels of a row list the categories in the same order.
nrows = 6
ncols = 2
fig, axes = plt.subplots(nrows, ncols, figsize = (10, 25))
axes = axes.flatten()

# (column name, sort bars by train frequency?)
_panels = [
    ('HomePlanet', True),
    ('CryoSleep', False),
    ('Destination', True),
    ('VIP', False),
    ('Deck', True),
    ('Side', True),
]
for _row, (_col, _sort_bars) in enumerate(_panels):
    _order = train[_col].value_counts().index if _sort_bars else None
    for _offset, _frame in enumerate((train, test)):
        sns.countplot(data = _frame, x = _frame[_col], fill = True,
                      ax = axes[ncols * _row + _offset], order = _order)
plt.show()

In [None]:
# Class balance of the target variable, drawn as a bar chart.
_target_counts = train['Transported'].value_counts()
_target_counts.plot.bar()

## 2.2. Feature engineering for DL

In [None]:
# Look at the training frame again before the categorical columns are label-encoded.
train.head()

In [None]:
# Integer-encode HomePlanet: the encoder is fitted on the train column and the
# same fitted mapping is then applied to the test column.
le_plan = LabelEncoder()
for _frame, _encode in ((train, le_plan.fit_transform), (test, le_plan.transform)):
    _frame['HomePlanet'] = _encode(_frame['HomePlanet'])

In [None]:
# Integer-encode CryoSleep: the encoder is fitted on the train column and the
# same fitted mapping is then applied to the test column.
le_cryo = LabelEncoder()
for _frame, _encode in ((train, le_cryo.fit_transform), (test, le_cryo.transform)):
    _frame['CryoSleep'] = _encode(_frame['CryoSleep'])

In [None]:
# Integer-encode Destination: the encoder is fitted on the train column and the
# same fitted mapping is then applied to the test column.
le_dest = LabelEncoder()
for _frame, _encode in ((train, le_dest.fit_transform), (test, le_dest.transform)):
    _frame['Destination'] = _encode(_frame['Destination'])

In [None]:
# Integer-encode Deck: the encoder is fitted on the train column and the
# same fitted mapping is then applied to the test column.
le_deck = LabelEncoder()
for _frame, _encode in ((train, le_deck.fit_transform), (test, le_deck.transform)):
    _frame['Deck'] = _encode(_frame['Deck'])

In [None]:
# Integer-encode Side: the encoder is fitted on the train column and the
# same fitted mapping is then applied to the test column.
le_side = LabelEncoder()
for _frame, _encode in ((train, le_side.fit_transform), (test, le_side.transform)):
    _frame['Side'] = _encode(_frame['Side'])

In [None]:
# Integer-encode VIP: the encoder is fitted on the train column and the
# same fitted mapping is then applied to the test column.
le_vip = LabelEncoder()
for _frame, _encode in ((train, le_vip.fit_transform), (test, le_vip.transform)):
    _frame['VIP'] = _encode(_frame['VIP'])

In [None]:
# Integer-encode the target column; only the train frame is touched here.
le_trans = LabelEncoder().fit(train['Transported'])
train['Transported'] = le_trans.transform(train['Transported'])

In [None]:
# Inspect the column dtypes of the training frame after label encoding.
train.dtypes

In [None]:
# Remove the columns that are not fed to the model, then replace every remaining NaN with 0.
_unused_columns = ['Name', 'Cabin', 'Num']
train = train.drop(columns = _unused_columns).fillna(0)
test = test.drop(columns = _unused_columns).fillna(0)

In [None]:
# Preview the training frame after the column drop and NaN fill.
train.head()

In [None]:
# Preview the test frame after the column drop and NaN fill.
test.head()

# 3. Models

In [None]:
# Convert the frames to NumPy arrays: Y is the target column, X is every other
# train column, and X_test is the full test frame.
_target = 'Transported'
Y = train[_target].values
X = train.drop(columns = [_target]).values
X_test = test.values

In [None]:
# Shape of the test feature array built from test.values.
X_test.shape

In [None]:
def get_model():
    """Build and compile a fresh fully connected Keras network.

    The input width is taken from the module-level ``test`` frame
    (``test.shape[1:]``), so ``test`` must already hold the final feature columns.

    Returns:
        A compiled ``keras.Sequential`` model with a single output unit, trained
        with Adam (learning rate 5e-6) on mean absolute error.
    """
    hidden_act = 'relu'
    # NOTE(review): the output unit is also ReLU although the target is a 0/1 label;
    # confirm this is intended rather than a sigmoid output.
    output_act = 'relu'

    # Wide entry layer. NOTE(review): this Normalization layer is never adapt()-ed
    # in this notebook, so it presumably keeps its default statistics -- confirm.
    stack = [
        layers.Dense(4277, activation = hidden_act, input_shape = test.shape[1:]),
        layers.Normalization(),
    ]

    # Four shrinking Dense -> Dropout -> BatchNormalization groups.
    for width in (2138, 1069, 534, 267):
        stack += [
            layers.Dense(width, activation = hidden_act),
            layers.Dropout(0.4),
            layers.BatchNormalization(),
        ]

    # Narrow head ending in the single prediction unit.
    stack += [
        layers.Dense(32, activation = hidden_act),
        layers.Dense(1, activation = output_act),
    ]

    model = keras.Sequential(stack)
    model.compile(
        optimizer = keras.optimizers.Adam(learning_rate = 5e-6),
        loss = 'MeanAbsoluteError',
        metrics = ['MeanAbsoluteError'],
    )
    return model

# Print the layer-by-layer summary of a throwaway instance.
get_model().summary()

In [None]:
# Disabled experiment, kept for reference only (it was an unused string literal before):
# scheduler = tf.keras.optimizers.schedules.ExponentialDecay(initial_learning_rate = 0.001,
#                                                            decay_steps = 10000,
#                                                            decay_rate = 0.00001)

# Halve the learning rate whenever the validation loss has not improved for 5 epochs.
cb_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor = 'val_loss',
                                             factor = 0.5,
                                             patience = 5)

# Stop a fold once the validation loss has not improved for 25 epochs and roll the
# model back to its best weights.
# The monitored key must be one that Keras actually logs: get_model() compiles the
# network with MeanAbsoluteError as its only metric, so 'val_accuracy' never appears
# in the logs and a callback watching it can never trigger. 'val_loss' is always present.
cb_es = tf.keras.callbacks.EarlyStopping(monitor = 'val_loss',
                                         mode = 'min',
                                         patience = 25,
                                         restore_best_weights = True)

In [None]:
# K-fold cross-validation: train a fresh model per fold, score it on the held-out
# part with mean absolute error, plot its loss curves, and keep its test predictions.
cv = KFold(n_splits = folds, shuffle = True, random_state = random_state)
test_preds = []   # one array of test-set predictions per fold
mean_score = 0    # running sum of fold scores; divided by `folds` when reported

for fold, (train_idx, val_idx) in enumerate(cv.split(X, Y)):
    # Slice this fold's train / validation partitions.
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = Y[train_idx], Y[val_idx]

    # Every fold starts from a newly initialised network.
    model = get_model()
    history = model.fit(
        X_train,
        y_train,
        validation_data = (X_val, y_val),
        epochs = epochs,
        batch_size = batch_size,
        callbacks = [cb_es, cb_lr],
    )

    # Held-out score for this fold.
    y_pred = model.predict(X_val)
    score = mean_absolute_error(y_val, y_pred)
    mean_score += score

    # Training vs. validation loss per epoch.
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    ep = range(1, len(loss) + 1)

    _loss_fig, _loss_ax = plt.subplots(figsize = (12, 6))
    _loss_ax.plot(ep, loss, 'r')
    _loss_ax.plot(ep, val_loss, 'b')
    _loss_ax.legend(['Training', 'Validation'])
    _loss_ax.set_xlabel('Epochs')
    _loss_ax.set_ylabel('Loss')
    _loss_ax.set_title('Training and validation loss for Fold # {}. Score is {}'.format(fold, score))
    plt.show()

    # Keep this fold's predictions on the test set for later averaging.
    test_preds.append(model.predict(X_test))

print('Mean score of all folds is {}'.format(mean_score / folds))

# 4. Submission

In [None]:
# Average the per-fold test predictions and flatten the (n, 1) model output to 1-D.
_mean_test_pred = np.mean(test_preds, axis = 0).ravel()

# Turn the averaged score into a boolean label with an explicit 0.5 cut-off.
# Casting the floats with astype(bool) would mark every non-zero score as True,
# no matter how close to 0 it is.
submission['Transported'] = _mean_test_pred > 0.5
submission.to_csv('submission.csv', index = False)

In [None]:
# Count how many rows of the submission fall into each predicted class.
submission['Transported'].value_counts()