# 1. Load libraries and data

In [None]:
# Data handling and display options
import pandas as pd
pd.options.display.float_format = '{:,.4f}'.format
# Show every row so that all features of the dataset are visible. Be careful: the output can get very large!
pd.options.display.max_rows = None 
pd.set_option('max_colwidth', 260)
import numpy as np

# Visualization setup
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib import ticker as tkr
from textwrap import wrap

# Hyperparameter search, model and data splitting
import optuna
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split

# Metrics
from sklearn.metrics import accuracy_score  # scores every classifier on the validation split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error

# Data preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import Normalizer

In [None]:
# Read the three competition files; the first CSV column becomes the index of each frame.
train, test, submission = [
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('../input/spaceship-titanic/train.csv',
                     '../input/spaceship-titanic/test.csv',
                     '../input/spaceship-titanic/sample_submission.csv')
]

## 1.1. Constants

In [None]:
# Disabled settings kept for reference (none of them is read by the cells below):
#epochs = 60 #last layer's config required 60 epochs
#batch_size = 2048
# Seed passed to train_test_split and to the CatBoost models so runs are repeatable.
random_state = 42
#ntrain = train.shape[0]
#ntest = test.shape[0]
# NOTE(review): `folds` is not referenced in the visible cells — confirm whether it is still needed.
folds = 5

# 2. EDA

In [None]:
# Report the dimensions and the in-memory footprint (in KB, one decimal) of both frames.
train_rows, train_cols = train.shape
train_kb = round(train.memory_usage().sum() / 1024, 1)
print('The train data has {} rows and {} columns. Memory usage: {}KB.'.format(train_rows, train_cols, train_kb))

test_rows, test_cols = test.shape
test_kb = round(test.memory_usage().sum() / 1024, 1)
print('The test data has {} rows and {} columns. Memory usage: {}KB.'.format(test_rows, test_cols, test_kb))

In [None]:
# Preview the first five training rows.
train.head(n=5)

In [None]:
# Preview the first five test rows.
test.head(n=5)

#### Cabin = deck/num/side

In [None]:
# A cabin string has the form deck/num/side; spread it over three new columns in both frames.
for frame in (train, test):
    cabin_parts = frame['Cabin'].str.split('/', expand=True)
    frame[['Deck', 'Num', 'Side']] = cabin_parts

In [None]:
# Missing-value table: NaN count per column (largest first) next to the column dtype.
missing_counts = train.isna().sum().sort_values(ascending=False)
missing_train = pd.concat([missing_counts, train.dtypes], axis=1, keys=['Total', 'Type'])

In [None]:
# Combine descriptive statistics, missing-value info and unique counts into one styled table.
summary = pd.concat([train.describe().T, missing_train, train.nunique()], axis=1)
# The unnamed nunique() Series arrives as column 0.
summary = summary.rename(columns={'Total': 'Total missing', 0: 'Unique values'})
(
    summary.style
    .bar(subset=['mean'], color="#e9c46a")
    .background_gradient(subset=['std', 'Total missing'], cmap='Reds')
    .background_gradient(subset=['50%'], cmap='Pastel1')
)

In [None]:
# Fill gaps in each numeric train column with that column's mean, truncated to an integer.
for column in ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']:
    train[column] = train[column].fillna(value=int(train[column].mean()))

In [None]:
# Fill gaps in the test columns with the (integer-truncated) mean of the matching train column,
# so no statistic is estimated from the test data.
for column in ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']:
    test[column] = test[column].fillna(value=int(train[column].mean()))

In [None]:
# Number of passengers sharing each cabin, counted separately within train and within test.
for frame in (train, test):
    cabin = frame['Cabin']
    frame['Passengers_in_cabin'] = cabin.groupby(cabin).transform('count')

## 2.1. Non-numeric data distribution

In [None]:
# One row per categorical column: train on the left axis, test on the right axis.
# The flag says whether bars are ordered by the column's frequency in train.
countplot_specs = [
    ('HomePlanet', True),
    ('CryoSleep', False),
    ('Destination', True),
    ('VIP', False),
    ('Deck', True),
    ('Side', True),
]
nrows = 6
ncols = 2
fig, axes = plt.subplots(nrows, ncols, figsize = (10, 25))
axes = axes.flatten()
for row, (column, ordered) in enumerate(countplot_specs):
    # Both panels share the train-based ordering so the bars are comparable.
    order_kwargs = {'order': train[column].value_counts().index} if ordered else {}
    sns.countplot(data = train, x = train[column], fill = True, ax = axes[2 * row], **order_kwargs)
    sns.countplot(data = test, x = test[column], fill = True, ax = axes[2 * row + 1], **order_kwargs)
plt.show()

In [None]:
# Class balance of the target variable.
train['Transported'].value_counts().plot.bar()

## 2.2. Numeric data distribution and engineering

In [None]:
# Numeric snapshots of both frames; later cells read train_num / test_num as well.
numeric_columns = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
train_num = train[numeric_columns]
test_num = test[numeric_columns]

nrows = 3
ncols = 2
fig, axes = plt.subplots(nrows, ncols, figsize = (16, 16))
axes = axes.flatten()
labels = ['Train', 'Test']
# One panel per numeric column with the train and test densities overlaid.
for ax, column in zip(axes, numeric_columns):
    for frame, colour, label in ((train_num, '#5047ff', labels[0]),
                                 (test_num, '#ffa647', labels[1])):
        sns.kdeplot(data = frame, x = frame[column], fill = True, ax = ax, color = colour, label = label)
    # Strip ticks, axis labels and side spines: only the shape of the curves matters here.
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_xlabel('')
    ax.set_ylabel('')
    for side in ('left', 'right'):
        ax.spines[side].set_visible(False)
    ax.set_title('\n'.join(wrap(column)),
                 loc = 'center', weight = 'bold', fontsize = 10, wrap = True)

fig.legend(loc = 'upper center', ncol = 2, borderaxespad = 0., labels = labels)
fig.tight_layout()
plt.show()

In [None]:
# Histogram of how many passengers share a cabin (train only).
fig, ax = plt.subplots(figsize = (12, 6))
ax = sns.histplot(data = train['Passengers_in_cabin'], ax = ax)
ax.set_title('Cabin population distribution')
plt.show()

### 2.2.1. Check outliers in train data

#### Age

In [None]:
# Horizontal box plot of passenger ages in train.
age_values = train_num['Age']
sns.boxplot(data = age_values, orient = 'h')

In [None]:
# Tukey fences for Age: Q3 + k*IQR above and Q1 - k*IQR below,
# with k = 1.5 for outliers and k = 3 for extreme outliers.
age_q1, age_q3 = train_num['Age'].quantile([.25, .75]).values
age_iqr = age_q3 - age_q1
age1 = age_q3 + 1.5 * age_iqr  # upper outlier fence
age2 = age_q1 - 1.5 * age_iqr  # lower outlier fence (measured from Q1, not from Q3)
age3 = age_q3 + 3 * age_iqr    # upper extreme-outlier fence
age4 = age_q1 - 3 * age_iqr    # lower extreme-outlier fence

fig, ax = plt.subplots(figsize=(12, 6))
ax = sns.histplot(data = train_num['Age'], ax = ax)
ax.set_title('Passengers age distribution')
# Black lines mark the outlier fences, red lines the extreme-outlier fences.
ax.axvline(age1, 0, 1, color = 'black')
ax.axvline(age2, 0, 1, color = 'black')
plt.figtext(1, .6, 'Outliers are outside {} and {}'.format(age1, age2))
ax.axvline(age3, 0, 1, color = 'red')
ax.axvline(age4, 0, 1, color = 'red')
plt.figtext(1, .5, 'Extreme outliers are outside {} and {}'.format(age3, age4), color = 'red')
plt.show()

#### I can't remove outliers below min, because there are kids onboard. And there are no extreme outliers, so this will be ok.

In [None]:
# Share of train passengers older than the upper outlier fence.
age_outlier_count = train_num[train_num['Age'] > age1].shape[0]
passenger_count = train_num.shape[0]
print('{} out of {} passengers are outliers by age, which is {}% of our data.'.format(
    age_outlier_count,
    passenger_count,
    round(100*(age_outlier_count / passenger_count),2)))

#### RoomService

In [None]:
# Horizontal box plot of room service spending in train.
room_service_values = train_num['RoomService']
sns.boxplot(data = room_service_values, orient = 'h')

In [None]:
# Tukey fences for RoomService: Q3 + k*IQR above and Q1 - k*IQR below,
# with k = 1.5 for outliers and k = 3 for extreme outliers.
room_q1, room_q3 = train_num['RoomService'].quantile([.25, .75]).values
room_iqr = room_q3 - room_q1
room1 = room_q3 + 1.5 * room_iqr  # upper outlier fence
room2 = room_q1 - 1.5 * room_iqr  # lower outlier fence (measured from Q1, not from Q3)
room3 = room_q3 + 3 * room_iqr    # upper extreme-outlier fence
room4 = room_q1 - 3 * room_iqr    # lower extreme-outlier fence

fig, ax = plt.subplots(figsize=(12, 6))
ax = sns.histplot(data = train_num['RoomService'], ax = ax)
ax.set_title('Passengers room service expense distribution')
# Black lines mark the outlier fences, red lines the extreme-outlier fences.
ax.axvline(room1, 0, 1, color = 'black')
ax.axvline(room2, 0, 1, color = 'black')
plt.figtext(1, .6, 'Outliers are outside {} and {}'.format(room1, room2))
ax.axvline(room3, 0, 1, color = 'red')
ax.axvline(room4, 0, 1, color = 'red')
plt.figtext(1, .5, 'Extreme outliers are outside {} and {}'.format(room3, room4), color = 'red')
plt.show()

In [None]:
# Share of train passengers above the upper outlier fence and above the upper extreme fence.
passenger_count = train_num.shape[0]
room_outlier_count = train_num[train_num['RoomService'] > room1].shape[0]
room_extreme_count = train_num[train_num['RoomService'] > room3].shape[0]
print('{} out of {} passengers are outliers by room service, which is {}% of our data.'.format(
    room_outlier_count,
    passenger_count,
    round(100*(room_outlier_count / passenger_count),2)))
print('{} out of {} passengers are extreme outliers by room service, which is {}% of our data.'.format(
    room_extreme_count,
    passenger_count,
    round(100*(room_extreme_count / passenger_count),2)))

#### Not good, but what if I turn these five spending columns into categorical features? The quantiles are chosen experimentally.

In [None]:
# Quantile table of the five spending columns, used to pick the bin edges below.
spend_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
quantile_levels = [.1, .2, .3, .4, .5, .6, .7, .8, .9, .95, .99, 1.]
train_num[spend_columns].quantile(quantile_levels)

In [None]:
# Bin RoomService into six ordinal buckets. The edges come from train quantiles only, so
# train and test share one encoding. The top edge is +inf instead of the train maximum:
# train results are unchanged, and a test value above the train maximum lands in bucket 6
# instead of becoming NaN.
room_service_bins = train_num['RoomService'].quantile([.6, .7, .8, .9, .95, .99]).tolist() + [np.inf]
train['RoomService_cat'] = pd.cut(x = train_num['RoomService'],
                                  bins = room_service_bins,
                                  include_lowest = True,
                                  precision = 3,
                                  labels = [1, 2, 3, 4, 5, 6])
test['RoomService_cat'] = pd.cut(x = test_num['RoomService'],
                                 bins = room_service_bins,
                                 include_lowest = True,
                                 precision = 3,
                                 labels = [1, 2, 3, 4, 5, 6])

In [None]:
# Bin FoodCourt into six ordinal buckets. The edges come from train quantiles only, so
# train and test share one encoding. The top edge is +inf instead of the train maximum:
# train results are unchanged, and a test value above the train maximum lands in bucket 6
# instead of becoming NaN.
food_court_bins = train_num['FoodCourt'].quantile([.6, .7, .8, .9, .95, .99]).tolist() + [np.inf]
train['FoodCourt_cat'] = pd.cut(x = train_num['FoodCourt'],
                                bins = food_court_bins,
                                include_lowest = True,
                                precision = 3,
                                labels = [1, 2, 3, 4, 5, 6])
test['FoodCourt_cat'] = pd.cut(x = test_num['FoodCourt'],
                               bins = food_court_bins,
                               include_lowest = True,
                               precision = 3,
                               labels = [1, 2, 3, 4, 5, 6])

In [None]:
# Bin ShoppingMall into six ordinal buckets. The edges come from train quantiles only, so
# train and test share one encoding. The top edge is +inf instead of the train maximum:
# train results are unchanged, and a test value above the train maximum lands in bucket 6
# instead of becoming NaN.
shopping_mall_bins = train_num['ShoppingMall'].quantile([.6, .7, .8, .9, .95, .99]).tolist() + [np.inf]
train['ShoppingMall_cat'] = pd.cut(x = train_num['ShoppingMall'],
                                   bins = shopping_mall_bins,
                                   include_lowest = True,
                                   precision = 3,
                                   labels = [1, 2, 3, 4, 5, 6])
test['ShoppingMall_cat'] = pd.cut(x = test_num['ShoppingMall'],
                                  bins = shopping_mall_bins,
                                  include_lowest = True,
                                  precision = 3,
                                  labels = [1, 2, 3, 4, 5, 6])

In [None]:
# Bin Spa into six ordinal buckets. The edges come from train quantiles only, so
# train and test share one encoding. The top edge is +inf instead of the train maximum:
# train results are unchanged, and a test value above the train maximum lands in bucket 6
# instead of becoming NaN.
spa_bins = train_num['Spa'].quantile([.6, .7, .8, .9, .95, .99]).tolist() + [np.inf]
train['Spa_cat'] = pd.cut(x = train_num['Spa'],
                          bins = spa_bins,
                          include_lowest = True,
                          precision = 3,
                          labels = [1, 2, 3, 4, 5, 6])
test['Spa_cat'] = pd.cut(x = test_num['Spa'],
                         bins = spa_bins,
                         include_lowest = True,
                         precision = 3,
                         labels = [1, 2, 3, 4, 5, 6])

In [None]:
# Bin VRDeck into six ordinal buckets. The edges come from train quantiles only, so
# train and test share one encoding. The top edge is +inf instead of the train maximum:
# train results are unchanged, and a test value above the train maximum lands in bucket 6
# instead of becoming NaN.
vr_deck_bins = train_num['VRDeck'].quantile([.6, .7, .8, .9, .95, .99]).tolist() + [np.inf]
train['VRDeck_cat'] = pd.cut(x = train_num['VRDeck'],
                             bins = vr_deck_bins,
                             include_lowest = True,
                             precision = 3,
                             labels = [1, 2, 3, 4, 5, 6])
test['VRDeck_cat'] = pd.cut(x = test_num['VRDeck'],
                            bins = vr_deck_bins,
                            include_lowest = True,
                            precision = 3,
                            labels = [1, 2, 3, 4, 5, 6])

### Fact: the age distribution looks like that of a human population, so I will categorize passengers accordingly - as toddlers, kids, teens, adults and elders.

In [None]:
# Age groups: 1 = toddler (0-5), 2 = kid (5-13), 3 = teen (13-19), 4 = adult (19-65), 5 = elder (65+).
# Both frames use the same edges with an open top, so the encoding does not depend on
# each frame's own maximum age and the edges stay increasing whatever that maximum is.
age_bins = [0, 5, 13, 19, 65, np.inf]
train['Age_cat'] = pd.cut(x = train_num['Age'],
                          bins = age_bins,
                          include_lowest = True,
                          precision = 1,
                          labels = [1, 2, 3, 4, 5])
test['Age_cat'] = pd.cut(x = test_num['Age'],
                         bins = age_bins,
                         include_lowest = True,
                         precision = 1,
                         labels = [1, 2, 3, 4, 5])

In [None]:
# How many train passengers fall into each age group.
age_group_sizes = train['Age_cat'].value_counts()
age_group_sizes

In [None]:
# One indicator column per age group, in the same order as the Age_cat labels 1..5.
age_flag_columns = ['Is_toddler', 'Is_kid', 'Is_teen', 'Is_adult', 'Is_elder']
for frame in (train, test):
    frame[age_flag_columns] = pd.get_dummies(frame['Age_cat'], prefix = 'Age_cat')

In [None]:
# Inspect the engineered columns on the first fifteen train rows.
train.head(n=15)

In [None]:
# For every cabin, count how many of its passengers fall into each age group.
# Result: one row per cabin (index 'Cabin'), one count column per Age_cat label.
age_group_column_names = {
    1: 'Toddlers_in_cabin',
    2: 'Kids_in_cabin',
    3: 'Teens_in_cabin',
    4: 'Adults_in_cabin',
    5: 'Elders_in_cabin',
}

# observed=False is spelled out because Age_cat is categorical: it keeps a column for every
# age group even if a group is absent from a frame, so train and test always end up with
# the same five columns regardless of the pandas default for `observed`.
train_age_groups_cabin = (
    train.groupby(by = ['Cabin', 'Age_cat'], observed = False)
    .Age_cat.count()
    .unstack(fill_value = 0)
    .rename(columns = age_group_column_names)
)

test_age_groups_cabin = (
    test.groupby(by = ['Cabin', 'Age_cat'], observed = False)
    .Age_cat.count()
    .unstack(fill_value = 0)
    .rename(columns = age_group_column_names)
)

In [None]:
# Attach the per-cabin age-group counts with a left join on the 'Cabin' column.
# join(on=...) keeps the caller's index (the passenger ids) and its row order. An outer
# merge on 'Cabin' would discard that index and re-sort the rows by cabin, so predictions
# later written positionally into `submission` would land on the wrong passengers.
# Rows without a cabin simply get NaN in the new columns.
train = train.join(train_age_groups_cabin, on = 'Cabin')
test = test.join(test_age_groups_cabin, on = 'Cabin')

# 3. Models

### To prevent future errors I will make a separate encoder for each column.

In [None]:
# Integer-encode HomePlanet: fit on train, then apply the same mapping to test.
le_plan = LabelEncoder()
home_planet_train_codes = le_plan.fit_transform(train['HomePlanet'])
home_planet_test_codes = le_plan.transform(test['HomePlanet'])
train['HomePlanet'] = home_planet_train_codes
test['HomePlanet'] = home_planet_test_codes

In [None]:
# Integer-encode CryoSleep: fit on train, then apply the same mapping to test.
le_cryo = LabelEncoder()
cryo_sleep_train_codes = le_cryo.fit_transform(train['CryoSleep'])
cryo_sleep_test_codes = le_cryo.transform(test['CryoSleep'])
train['CryoSleep'] = cryo_sleep_train_codes
test['CryoSleep'] = cryo_sleep_test_codes

In [None]:
# Integer-encode Destination: fit on train, then apply the same mapping to test.
le_dest = LabelEncoder()
destination_train_codes = le_dest.fit_transform(train['Destination'])
destination_test_codes = le_dest.transform(test['Destination'])
train['Destination'] = destination_train_codes
test['Destination'] = destination_test_codes

In [None]:
# Integer-encode Deck: fit on train, then apply the same mapping to test.
le_deck = LabelEncoder()
deck_train_codes = le_deck.fit_transform(train['Deck'])
deck_test_codes = le_deck.transform(test['Deck'])
train['Deck'] = deck_train_codes
test['Deck'] = deck_test_codes

In [None]:
# Integer-encode Side: fit on train, then apply the same mapping to test.
le_side = LabelEncoder()
side_train_codes = le_side.fit_transform(train['Side'])
side_test_codes = le_side.transform(test['Side'])
train['Side'] = side_train_codes
test['Side'] = side_test_codes

In [None]:
# Integer-encode the target; this encoder is fitted and applied on train only.
le_trans = LabelEncoder()
transported_codes = le_trans.fit_transform(train['Transported'])
train['Transported'] = transported_codes

In [None]:
# Columns left out of the feature matrix: identifiers/raw strings, the raw spending amounts
# (their *_cat versions are used instead) and the age-group helper columns.
excluded_features = ['Name', 'Cabin', 'Num', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Age_cat',
                     'Is_toddler', 'Is_kid', 'Is_teen', 'Is_adult', 'Is_elder']

X = train.drop(columns = excluded_features + ['Transported'])
Y = train['Transported']
# Hold out 15% of train for validation; the seed keeps the split repeatable.
X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, train_size = 0.85, random_state = random_state)
X_test = test.drop(columns = excluded_features)

## 3.1. CatBoostClassifier

### 3.1.1. Out-of-the-box model

In [None]:
# Baseline: CatBoost with library defaults; only the seed, the categorical columns and the
# logging level are set.
baseline_settings = {
    'random_seed': random_state,
    'cat_features': ['RoomService_cat', 'FoodCourt_cat', 'ShoppingMall_cat', 'Spa_cat', 'VRDeck_cat'],
    'logging_level': 'Silent',
}
model = CatBoostClassifier(**baseline_settings)

model.fit(X_train, Y_train, eval_set = (X_validation, Y_validation))

# Accuracy on the held-out validation split; the bare name displays it as the cell output.
predictions = model.predict(X_validation)
accuracy = accuracy_score(Y_validation, predictions)
accuracy

In [None]:
# Confusion matrix of the latest validation predictions (rows: true labels, columns: predicted labels).
conf = confusion_matrix(y_true = Y_validation, y_pred = predictions)

In [None]:
# Heatmap of the confusion matrix, each cell shown as a share of all validation samples.
fig, ax = plt.subplots(figsize = (15, 10))
sns.heatmap(conf / np.sum(conf),
            annot = True,
            fmt = '.2%',
            cmap = 'Blues',
            ax = ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
# confusion_matrix orders its rows and columns by sorted label value, while unique() returns
# labels in order of first appearance, so sort them to keep the ticks on the right cells.
class_labels = sorted(train['Transported'].unique().tolist())
ax.xaxis.set_ticklabels(class_labels)
ax.yaxis.set_ticklabels(class_labels)
plt.xticks(rotation = 0, ha = 'right')
plt.yticks(rotation = 0)
plt.show()

### 3.1.2. Welcome, Optuna!

In [None]:
def objective(trial):
    """Optuna objective: train one CatBoost classifier and return its validation accuracy.

    The feature matrix and the 85/15 split are rebuilt from the global ``train`` frame with
    the shared ``random_state``. ``trial`` supplies l2_leaf_reg, learning_rate, n_estimators
    and loss_function; max_bin, max_depth and the RAM limit are fixed.

    Returns:
        Accuracy of the fitted model on the validation split (the study maximizes it).
    """
    feature_frame = train.drop(columns = ['Name', 'Cabin', 'Num', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Transported', 'Age_cat',
                                          'Is_toddler', 'Is_kid', 'Is_teen', 'Is_adult', 'Is_elder'])
    target = train['Transported']
    fit_features, val_features, fit_target, val_target = train_test_split(
        feature_frame, target, train_size = 0.85, random_state = random_state)

    # Sampled hyperparameters (suggested in a fixed order) plus the fixed RAM limit.
    sampled_params = {
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 0.433, 0.4378),
        'learning_rate': trial.suggest_float('learning_rate', 0.7596, 0.75988),
        'n_estimators': trial.suggest_int('n_estimators', 100, 15000),
        'loss_function': trial.suggest_categorical('loss_function', ['Logloss', 'CrossEntropy']),
        'used_ram_limit': '16gb',
    }

    classifier = CatBoostClassifier(**sampled_params,
                                    random_seed = random_state,
                                    cat_features = ['RoomService_cat', 'FoodCourt_cat', 'ShoppingMall_cat', 'Spa_cat', 'VRDeck_cat'],
                                    logging_level = 'Silent',
                                    max_bin = 16,
                                    max_depth = 5,
                                    eval_metric = 'Accuracy')

    # Stop once validation accuracy has not improved for 400 rounds.
    classifier.fit(fit_features, fit_target, eval_set = [(val_features, val_target)], early_stopping_rounds = 400)

    return accuracy_score(val_target, classifier.predict(val_features))
        
if __name__ == '__main__':
    # Maximize validation accuracy; the search is capped at 300 seconds of wall-clock time.
    study = optuna.create_study(direction = 'maximize')
    study.optimize(objective, timeout = 300)

    print('Number of finished trials: {}'.format(len(study.trials)))

    # Report the best trial: its score, then every sampled hyperparameter.
    trial = study.best_trial
    print('Best trial:')
    print('  Value: {}'.format(trial.value))
    print('  Params: ')
    for name in trial.params:
        print('    {}: {}'.format(name, trial.params[name]))

In [None]:
# Slice plot of the study: shows whether the search ranges need further refinement.
slice_figure = optuna.visualization.plot_slice(study)
slice_figure

In [None]:
# Hyperparameters hard-coded from an earlier Optuna run.
tuned_settings = {
    'random_seed': random_state,
    'cat_features': ['RoomService_cat', 'FoodCourt_cat', 'ShoppingMall_cat', 'Spa_cat', 'VRDeck_cat'],
    'logging_level': 'Silent',
    'max_bin': 16,
    'max_depth': 5,
    'l2_leaf_reg': 0.43383301883210423,
    'learning_rate': 0.7598728553132276,
    'n_estimators': 4801,
    'loss_function': 'CrossEntropy',
}
catbcl_tuned = CatBoostClassifier(**tuned_settings)

catbcl_tuned.fit(X_train, Y_train, eval_set = (X_validation, Y_validation))

# Accuracy of the tuned model on the same validation split; displayed as the cell output.
predictions = catbcl_tuned.predict(X_validation)
accuracy_tuned = accuracy_score(Y_validation, predictions)
accuracy_tuned

In [None]:
# Confusion matrix of the latest validation predictions (rows: true labels, columns: predicted labels).
conf = confusion_matrix(y_true = Y_validation, y_pred = predictions)

In [None]:
# Heatmap of the confusion matrix, each cell shown as a share of all validation samples.
fig, ax = plt.subplots(figsize = (15, 10))
sns.heatmap(conf / np.sum(conf),
            annot = True,
            fmt = '.2%',
            cmap = 'Blues',
            ax = ax)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
# confusion_matrix orders its rows and columns by sorted label value, while unique() returns
# labels in order of first appearance, so sort them to keep the ticks on the right cells.
class_labels = sorted(train['Transported'].unique().tolist())
ax.xaxis.set_ticklabels(class_labels)
ax.yaxis.set_ticklabels(class_labels)
plt.xticks(rotation = 0, ha = 'right')
plt.yticks(rotation = 0)
plt.show()

# 4. Submission

In [None]:
# Predict the test set and store the labels as booleans. The array is assigned by position,
# so the rows of X_test must be in the same order as the rows of `submission`.
# NOTE(review): this uses the baseline `model`, not `catbcl_tuned` — confirm that is intended.
test_predictions = model.predict(X_test)
submission['Transported'] = test_predictions.astype(bool)

In [None]:
# Write the submission file; the frame's index is written as the first column.
submission.to_csv(path_or_buf = 'catbclassifier_long_version.csv')