## Importing Modules

In [None]:
import numpy as np
import pandas as pd
import missingno

import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style('darkgrid')
%matplotlib inline

from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.preprocessing import PowerTransformer

!pip install -q pycaret
from pycaret.classification import *

import warnings
warnings.filterwarnings('ignore')

## Reading the data

In [None]:
# Load the competition's train and test splits from the Kaggle input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/spaceship-titanic/train.csv',
        '../input/spaceship-titanic/test.csv',
    )
)

In [None]:
# Display the raw training DataFrame as the cell output.
train

## EDA

In [None]:
# Total number of missing cells and of fully duplicated rows in the training data.
train_nan_total = train.isna().sum().sum()
train_duplicate_rows = train.duplicated().sum()

print('Nan values in the training dataset : ', train_nan_total)
print('Duplicate values in the training dataset : ', train_duplicate_rows)

In [None]:
# Total number of missing cells and of fully duplicated rows in the test data.
test_nan_total = test.isna().sum().sum()
test_duplicate_rows = test.duplicated().sum()

print('Nan values in the testing dataset : ', test_nan_total)
print('Duplicate values in the testing dataset : ', test_duplicate_rows)

In [None]:
# Column dtypes and non-null counts of the training data.
train.info()

In [None]:
# Summary statistics of the training data.
train.describe()

In [None]:
# Nullity matrix of the training data: shows where values are missing in each column.
missingno.matrix(train, figsize = (10,5))
plt.show()

In [None]:
# Nullity matrix of the test data: shows where values are missing in each column.
missingno.matrix(test, figsize = (10,5))
plt.show()

Features extracted from the `Cabin` column:
- Cabin deck
- Cabin number
- Cabin side

In [None]:
# Encode the boolean target as 'Yes'/'No' strings.
train['Transported'] = train['Transported'].replace({True: 'Yes', False: 'No'})

# `Cabin` is parsed as "<deck>/<num>/<side>". The pattern is a raw string so that
# `\w` and `\d` reach the regex engine untouched; in a plain string literal they
# are invalid escape sequences that Python warns about.
CABIN_PATTERN = r'(?P<Cabin_deck>\w+)/(?P<Cabin_num_id>\d+)/(?P<Cabin_side>\w+)'


def add_cabin_features(df):
    """Add `Cabin_deck`, `Cabin_num_id` and `Cabin_side`, parsed from `df['Cabin']`.

    Rows whose `Cabin` is missing or does not match the pattern get missing
    values in all three new columns. `Cabin_num_id` is stored with the nullable
    `Int64` dtype. The frame is modified in place and also returned.
    """
    cabin_parts = df['Cabin'].str.extract(CABIN_PATTERN)
    df['Cabin_deck'] = cabin_parts['Cabin_deck']
    df['Cabin_num_id'] = pd.to_numeric(
        cabin_parts['Cabin_num_id'], errors='coerce'
    ).astype(pd.Int64Dtype())
    df['Cabin_side'] = cabin_parts['Cabin_side']
    return df


# Same parsing for both splits, so train and test cannot drift apart.
train = add_cabin_features(train)
test = add_cabin_features(test)

In [None]:
# Class balance of the target: share of each class (pie) and absolute counts (bars).
fig, (ax_share, ax_count) = plt.subplots(1, 2, figsize=(10, 5))
train['Transported'].value_counts().plot(kind='pie', autopct='%.2f%%', ax=ax_share)
sns.countplot(x='Transported', data=train, ax=ax_count)
plt.show()

In [None]:
# Density of every numeric feature, one panel per column on a 4x2 grid.
numeric_columns = train.select_dtypes('number').columns

plt.figure(figsize=(15, 15))
for panel, column in enumerate(numeric_columns, start=1):
    plt.subplot(4, 2, panel)
    sns.kdeplot(x=column, data=train)
plt.show()

In [None]:
# Density of every numeric feature split by the target, one panel per column
# on a 4x2 grid.
# `fill=True` is the current spelling of the filled-area option; the older
# `shade=True` is deprecated in seaborn.
plt.figure(figsize=(15, 15))
for i, col in enumerate(train.select_dtypes('number').columns):
    plt.subplot(4, 2, i + 1)
    sns.kdeplot(x=col, hue='Transported', data=train, fill=True)
plt.show()

In [None]:
# Value counts of every object-typed feature, leaving out the identifier-like columns.
categorical_columns = (
    train.drop(['PassengerId', 'Cabin', 'Name'], axis=1)
    .select_dtypes('object')
    .columns
)

plt.figure(figsize=(20, 10))
for panel, column in enumerate(categorical_columns, start=1):
    plt.subplot(3, 3, panel)
    sns.countplot(x=column, data=train)
plt.show()

In [None]:
# Value counts of every object-typed feature split by the target, leaving out
# the identifier-like columns.
categorical_columns = (
    train.drop(['PassengerId', 'Cabin', 'Name'], axis=1)
    .select_dtypes('object')
    .columns
)

plt.figure(figsize=(20, 10))
for panel, column in enumerate(categorical_columns, start=1):
    plt.subplot(3, 3, panel)
    sns.countplot(x=column, hue='Transported', data=train)
plt.show()

In [None]:
# Correlation matrix of the numeric features only. The frame also holds string
# columns, which `DataFrame.corr()` refuses in recent pandas releases, so they
# are filtered out explicitly.
numeric_corr = train.select_dtypes('number').corr()

# `heatmap` takes the matrix as its first (`data`) argument. The original call
# additionally passed `data=train`, i.e. supplied the same parameter twice.
plt.figure(figsize=(12, 10))
sns.heatmap(numeric_corr, vmin=-1.0, vmax=1.0, center=0, cmap='RdBu_r', annot=True)
plt.show()

In [None]:
# Pairwise plots of age and the spending features, coloured by the target.
pair_features = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

sns.pairplot(data=train, x_vars=pair_features, y_vars=pair_features, hue='Transported')
plt.show()

In [None]:
# Keep the test passenger ids for the submission file, then remove the
# identifier-like columns from both splits.
dropped_columns = ['PassengerId', 'Cabin', 'Name']

test_ids = test['PassengerId']
train = train.drop(columns=dropped_columns)
test = test.drop(columns=dropped_columns)

In [None]:
# Display the training DataFrame after feature extraction and column removal.
train

## Handling Missing Values

Here, there is a possibility of data leakage, because the imputers are fitted on the combined train and test data. However, this is a competition notebook, so having more data for imputation was preferred over following a strict train-only pipeline.

Combining train and test data

In [None]:
# Stack the training features (target removed) on top of the test features and
# give the result a fresh 0..n-1 index.
data = pd.concat([train.drop(columns='Transported'), test], ignore_index=True)

In [None]:
# Numeric features: imputed with KNN, then power-transformed and scaled.
num_cols = [
    'Age',
    'RoomService',
    'FoodCourt',
    'ShoppingMall',
    'Spa',
    'VRDeck',
    'Cabin_num_id',
]

# Categorical features: imputed with the most frequent value.
cat_cols = [
    'HomePlanet',
    'CryoSleep',
    'Destination',
    'VIP',
    'Cabin_deck',
    'Cabin_side',
]

Using KNN imputer for numerical columns.

In [None]:
# Fit the 7-neighbour KNN imputer on the numeric columns of the combined data.
# `fit` returns the estimator itself, so it can be chained onto the constructor.
knn_impute = KNNImputer(n_neighbors=7).fit(data[num_cols])
knn_impute

In [None]:
# Fill the missing numeric values of each split with the fitted KNN imputer.
# `index=` is passed explicitly so the imputed rows are aligned with the rows
# they were computed from, instead of relying on both frames carrying a default
# 0..n-1 index.
train[num_cols] = pd.DataFrame(
    knn_impute.transform(train[num_cols]), columns=num_cols, index=train.index
)
test[num_cols] = pd.DataFrame(
    knn_impute.transform(test[num_cols]), columns=num_cols, index=test.index
)

Simple imputer for categorical columns using Mode.

In [None]:
# Fit the most-frequent-value imputer on the categorical columns of the combined data.
# `fit` returns the estimator itself, so it can be chained onto the constructor.
si = SimpleImputer(strategy='most_frequent').fit(data[cat_cols])
si

In [None]:
# Fill the missing categorical values of each split with the fitted mode imputer.
# `index=` is passed explicitly so the imputed rows are aligned with the rows
# they were computed from, instead of relying on both frames carrying a default
# 0..n-1 index.
train[cat_cols] = pd.DataFrame(
    si.transform(train[cat_cols]), columns=cat_cols, index=train.index
)
test[cat_cols] = pd.DataFrame(
    si.transform(test[cat_cols]), columns=cat_cols, index=test.index
)

In [None]:
# Separate the target column from the feature matrix.
y = train['Transported']
X = train.drop(columns='Transported')

## Preprocessing

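I can't find any relationship between the values in the categorical features. Therefore, I opted for one-hot encoding. (Note: the cells below one-hot encode only `VIP` and `Cabin_deck`; `HomePlanet`, `CryoSleep`, `Cabin_side` and `Destination` are ordinal-encoded.)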

In [None]:
# Columns that are one-hot encoded.
nominal_cols = [
    'VIP',
    'Cabin_deck',
]

# Columns that are integer-encoded with an explicit level order.
ordinal_cols = [
    'HomePlanet',
    'CryoSleep',
    'Cabin_side',
    'Destination',
]

In [None]:
# One-hot encode the `nominal_cols` columns, dropping the first level of each.
# The dense-output flag is spelled `sparse_output` in newer scikit-learn
# releases and `sparse` in older ones; try the new name and fall back.
try:
    ohe = OneHotEncoder(sparse_output=False, drop='first')
except TypeError:
    ohe = OneHotEncoder(sparse=False, drop='first')

# Fit exactly once; the original fitted the encoder a second time via `fit_transform`.
ohe.fit(X[nominal_cols])

# Use the legacy name getter wherever it still exists, so the generated column
# names match any other cell that calls it; newer scikit-learn releases only
# provide `get_feature_names_out`.
if hasattr(ohe, 'get_feature_names'):
    ohe_columns = ohe.get_feature_names()
else:
    ohe_columns = ohe.get_feature_names_out()

X = pd.concat(
    [X, pd.DataFrame(ohe.transform(X[nominal_cols]), columns=ohe_columns, index=X.index)],
    axis=1,
)

# The raw columns are no longer needed once their indicator columns exist.
X = X.drop(nominal_cols, axis=1)

In [None]:
# Level order handed to the ordinal encoder: one inner list per column of
# `ordinal_cols`, in the same order as that list.
categories = [
    ['Earth', 'Mars', 'Europa'],                       # HomePlanet
    [False, True],                                     # CryoSleep
    ['P', 'S'],                                        # Cabin_side
    ['TRAPPIST-1e', 'PSO J318.5-22', '55 Cancri e'],   # Destination
]

In [None]:
# Integer-encode the `ordinal_cols` columns using the explicit level order in
# `categories`. `OrdinalEncoder` is imported with the other scikit-learn
# imports at the top of the notebook rather than in the middle of the analysis.
oe = OrdinalEncoder(categories=categories)
oe.fit(X[ordinal_cols])

X[ordinal_cols] = pd.DataFrame(oe.transform(X[ordinal_cols]), columns=ordinal_cols, index=X.index)

In [None]:
# Fit the power transform on the training features only, then replace the
# numeric columns with their transformed values.
pt = PowerTransformer().fit(X[num_cols])
X[num_cols] = pt.transform(X[num_cols])

## Scaling

Using StandardScaler for numerical columns.

In [None]:
# Fit the scaler on the training features only, then replace the numeric
# columns with their standardised values.
sc = StandardScaler().fit(X[num_cols])
X[num_cols] = sc.transform(X[num_cols])

## Model Selection and Training

In [None]:
# Initialise the PyCaret experiment on the preprocessed features plus the target.
# `session_id` seeds PyCaret's random state so repeated runs of the notebook
# produce the same experiment instead of a different one each time.
_ = setup(pd.concat([X, y], axis=1), target='Transported', silent=True, session_id=42)

In [None]:
# Compare PyCaret's candidate classifiers and keep the five selected by `n_select`.
top5 = compare_models(n_select = 5)

In [None]:
# Build the final model from the first entry of the comparison result.
model = create_model(top5[0])

## Model interpretations

SHAP Values for the top model.

In [None]:
# Interpretation plot for the selected model.
interpret_model(model)

In [None]:
# PyCaret's default diagnostic plot for the selected model.
plot_model(model)

## Preparing for Submissions

In [None]:
# Display the imputed test DataFrame before it is encoded and scaled.
test

In [None]:
# Apply to the test set, in the same order, the transformers that were fitted
# on the training features.

# Use the legacy name getter wherever it still exists, so the generated column
# names match any other cell that calls it; newer scikit-learn releases only
# provide `get_feature_names_out`.
if hasattr(ohe, 'get_feature_names'):
    ohe_columns = ohe.get_feature_names()
else:
    ohe_columns = ohe.get_feature_names_out()

# 1. One-hot encode, then drop the raw columns.
test = pd.concat(
    [test, pd.DataFrame(ohe.transform(test[nominal_cols]), columns=ohe_columns, index=test.index)],
    axis=1,
)
test = test.drop(nominal_cols, axis=1)

# 2. Ordinal-encode.
test[ordinal_cols] = pd.DataFrame(oe.transform(test[ordinal_cols]), columns=ordinal_cols, index=test.index)

# 3. Power-transform, then standardise the numeric columns.
test[num_cols] = pd.DataFrame(pt.transform(test[num_cols]), columns=num_cols, index=test.index)
test[num_cols] = pd.DataFrame(sc.transform(test[num_cols]), columns=num_cols, index=test.index)

In [None]:
# Predict on the preprocessed test set and keep the predicted-label column.
test_pred = predict_model(model, data=test)['Label']

# Pair every prediction with its passenger id, name the prediction column
# 'Transported', and convert the 'Yes'/'No' labels back to booleans.
sub = pd.concat([test_ids, test_pred.rename('Transported')], axis=1)
sub['Transported'] = sub['Transported'].map({'Yes': True, 'No': False})

In [None]:
# Display the submission DataFrame as the cell output.
sub

In [None]:
# Write the submission file without the DataFrame index column.
sub.to_csv('submission.csv', index = False)