## Importing modules

In [None]:
import numpy as np
import pandas as pd
import missingno

import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style('darkgrid')
%matplotlib inline
from sklearn import set_config
set_config(display="diagram")

from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import GradientBoostingClassifier

import warnings
warnings.filterwarnings('ignore')

## Reading the data

In [None]:
# Load the train and test CSVs. The paths are relative, so the notebook must be run
# from a directory where ../input/spaceship-titanic/ exists (presumably the Kaggle layout).
train = pd.read_csv('../input/spaceship-titanic/train.csv')
test = pd.read_csv('../input/spaceship-titanic/test.csv')
# Bare last expression: rendered as this cell's output.
train

## EDA

In [None]:
# Explore on a copy so the EDA-only columns added to `viz` never end up in `train`.
viz = train.copy()

In [None]:
# Total number of missing cells and of fully duplicated rows in the training set.
train_nan_total = train.isna().sum().sum()
train_dup_total = train.duplicated().sum()
print('Nan values in the training dataset : ', train_nan_total)
print('Duplicate values in the training dataset : ', train_dup_total)

In [None]:
# Total number of missing cells and of fully duplicated rows in the test set.
test_nan_total = test.isna().sum().sum()
test_dup_total = test.duplicated().sum()
print('Nan values in the testing dataset : ', test_nan_total)
print('Duplicate values in the testing dataset : ', test_dup_total)

In [None]:
# Column dtypes and non-null counts of the exploration copy.
viz.info()

In [None]:
# Summary statistics of the exploration copy, rendered as this cell's output.
viz.describe()

Missing values appear to be random: the matrices below show no specific pattern or reason behind where they occur.

In [None]:
# One nullity matrix per dataset: the training set first, then the test set.
for frame in (train, test):
    missingno.matrix(frame, figsize = (10,5))
plt.show()

In [None]:
# Derive EDA-only helper columns on the exploration copy.
# The patterns are raw strings: in a normal string literal `\d` and `\w` are invalid
# escape sequences, which Python reports as a warning and will eventually reject.

# Digits after the underscore in PassengerId, i.e. the passenger's number within the group.
viz['PassengerGroup'] = viz['PassengerId'].str.extract(r'\d+_(\d+)', expand = False).astype('int32')
# Readable labels for plot legends instead of True/False.
viz['Transported'] = viz['Transported'].replace({True:'Yes',False:'No'})

# Cabin has the form deck/num/side; split it in a single pass instead of three.
# Rows with a missing Cabin yield NaN in all three parts.
cabin_parts = viz['Cabin'].str.extract(r'(\w+)/(\d+)/(\w+)')
viz['Cabin_deck'] = cabin_parts[0]
# Nullable Int64 keeps the cabin number integral while still allowing missing values.
viz['Cabin_num_id'] = pd.to_numeric(cabin_parts[1], errors = 'coerce').astype(pd.Int64Dtype())
viz['Cabin_side'] = cabin_parts[2]

Here we can see that the target is not skewed.<br>
Therefore, no handling of class imbalance is needed.

In [None]:
# Target balance, shown as shares (pie) and as raw counts (bars) side by side.
fig, (pie_ax, bar_ax) = plt.subplots(1, 2, figsize = (10,5))
viz['Transported'].value_counts().plot(kind = 'pie', autopct = '%.2f%%', ax = pie_ax)
sns.countplot(x = 'Transported', data = viz, ax = bar_ax)
plt.show()

In [None]:
# Marginal density of every numeric column, laid out on a 4x2 grid.
numeric_features = viz.select_dtypes('number').columns
plt.figure(figsize = (15,15))
for position, col in enumerate(numeric_features, start = 1):
    plt.subplot(4, 2, position)
    sns.kdeplot(x = col, data = viz)
plt.show()

We observe that younger passengers have a higher chance of being transported than older passengers.

In [None]:
# Density of every numeric column, split by the target, on a 4x2 grid.
plt.figure(figsize = (15,15))
for i,col in enumerate(viz.select_dtypes('number').columns):
    plt.subplot(4,2,i+1)
    # `fill` is the supported spelling; the older `shade` keyword is deprecated in seaborn.
    sns.kdeplot(x = col, hue = 'Transported', data = viz, fill = True)
plt.show()

In [None]:
# Category frequencies for each object-typed column except PassengerId, Cabin and Name.
categorical_features = viz.drop(columns = ['PassengerId','Cabin','Name']).select_dtypes('object').columns
plt.figure(figsize = (20,10))
for position, col in enumerate(categorical_features, start = 1):
    plt.subplot(3, 3, position)
    sns.countplot(x = col, data = viz)
plt.show()

Here we can see that some features, such as CryoSleep, VIP and Cabin_side, are significant factors for splitting.

In [None]:
# Same object-typed columns as horizontal bars, each category split by the target.
categorical_features = viz.drop(columns = ['PassengerId','Cabin','Name']).select_dtypes('object').columns
plt.figure(figsize = (20,15))
for position, col in enumerate(categorical_features, start = 1):
    plt.subplot(3, 3, position)
    sns.countplot(y = col, hue = 'Transported', data = viz)
plt.show()

In [None]:
# Pairwise correlation of the numeric columns of the exploration copy.
plt.figure(figsize = (12,10))
# Restrict to numeric columns first: `corr` cannot handle the string columns in `viz`.
numeric_corr = viz.select_dtypes('number').corr()
# The matrix itself is heatmap's `data` argument, so it must not be passed again by
# keyword (doing so raises "got multiple values for argument 'data'").
sns.heatmap(numeric_corr, vmin = -1.0, vmax = 1.0, center = 0, cmap = 'RdBu_r', annot = True)
plt.show()

In [None]:
# Pairwise scatter grid over the six listed numeric columns, coloured by the target.
pair_features = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
sns.pairplot(data = viz, x_vars = pair_features, y_vars = pair_features, hue = 'Transported')
plt.show()

## Pipeline

Splitting the data into inputs (X) and output (y).

In [None]:
# Target column on one side, every other column on the other.
y = train['Transported']
X = train.drop(columns = ['Transported'])

Creating a custom transformer to extract some useful features.

Features:<br>
- PassengerGroup = the passenger's number within their group (the digits after the underscore in PassengerId).
- Cabin (Cabin_deck, Cabin_num_id, Cabin_side) as given in data description.

In [None]:
class FeatureTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that derives features from PassengerId and Cabin.

    - ``PassengerId`` is replaced by ``PassengerGroup``: the digits after the
      underscore, cast to int32.
    - ``Cabin`` (deck/num/side) is replaced by ``Cabin_deck``, ``Cabin_num_id``
      (nullable Int64) and ``Cabin_side``.

    Columns that are absent from the input are simply skipped.
    """

    def fit(self, X, y = None):
        """Learn nothing and return ``self``.

        ``y`` is optional so that ``fit(X)`` and ``fit_transform(X)`` also work
        outside a supervised pipeline.
        """
        return self

    def transform(self, X):
        """Return a new DataFrame with the derived columns.

        Parameters
        ----------
        X : pandas.DataFrame
            Input frame; it is not modified.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X`` in which ``PassengerId`` and ``Cabin`` (when present)
            are replaced by the columns derived from them.
        """
        # Work on a copy: assigning new columns directly would otherwise leak
        # them into the caller's frame (e.g. the test set passed to predict).
        X = X.copy()

        if 'PassengerId' in X.columns:
            # Raw string: `\d` is not a valid escape in a normal string literal.
            X['PassengerGroup'] = X['PassengerId'].str.extract(r'\d+_(\d+)', expand = False).astype('int32')
            X = X.drop('PassengerId', axis = 1)

        if 'Cabin' in X.columns:
            # One pass over Cabin; rows with a missing Cabin yield NaN in every part.
            cabin_parts = X['Cabin'].str.extract(r'(\w+)/(\d+)/(\w+)')
            X['Cabin_deck'] = cabin_parts[0]
            X['Cabin_num_id'] = pd.to_numeric(cabin_parts[1], errors = 'coerce').astype(pd.Int64Dtype())
            X['Cabin_side'] = cabin_parts[2]
            X = X.drop('Cabin', axis = 1)

        return X

Segregating the features w.r.t their types.

In [None]:
# Feature columns before feature extraction.
cols = [
    'PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age', 'VIP',
    'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Name',
]

# Column groups after feature extraction; each group gets its own preprocessing pipeline.
num_cols = [
    'PassengerGroup', 'Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
    'Cabin_num_id',
]
nom_cols = ['VIP', 'Cabin_deck']

# Explicit category order for every ordinal-encoded column. Keeping each column next to
# its categories guarantees `ord_cols` and `categories_arr` line up position by position.
ordinal_categories = {
    'HomePlanet': ['Earth','Mars','Europa'],
    'CryoSleep': [False,True],
    'Cabin_side': ['P','S'],
    'Destination': ['TRAPPIST-1e','PSO J318.5-22','55 Cancri e'],
}
ord_cols = list(ordinal_categories)
categories_arr = list(ordinal_categories.values())

## Creating transformers

Created a separate pipeline for each independent preprocessing technique.

In [None]:
# Feature extraction: derives PassengerGroup and the Cabin_* columns.
feature_transformer = Pipeline(steps=[
    ('feature', FeatureTransformer())
])

# Numeric columns: fill gaps from the 5 nearest neighbours, then standardise.
number_transformer = Pipeline(steps=[
    ('impute_num', KNNImputer(n_neighbors = 5)),
    ('scaling', StandardScaler())
])

# Ordinal columns: fill gaps with the most frequent value, then encode using the
# explicit category order given in `categories_arr`.
ordinal_transformer = Pipeline(steps=[
    ('impute_ordinal', SimpleImputer(strategy = 'most_frequent')),
    ('ordinal', OrdinalEncoder(categories=categories_arr))
])

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` keyword to `sparse_output` and
# 1.4 removed the old name. Pick whichever keyword the installed version accepts so
# the request for a dense array works on both sides of the rename.
dense_keyword = 'sparse_output' if 'sparse_output' in OneHotEncoder().get_params() else 'sparse'

# Nominal columns: fill gaps with the most frequent value, then one-hot encode;
# categories unseen during fit are encoded as all zeros.
nominal_transformer = Pipeline(steps=[
    ('impute_nominal', SimpleImputer(strategy = 'most_frequent')),
    ('nominal', OneHotEncoder(handle_unknown = 'ignore', **{dense_keyword: False}))
])

`remainder = 'drop'` drops the remaining columns that are not transformed. <br>
This removes columns that are not useful, such as Name.

In [None]:
# Route each column group to its pipeline; columns not listed in any group are dropped.
column_routes = [
    ('numerical_preprocess', number_transformer, num_cols),
    ('ordinal_preprocess', ordinal_transformer, ord_cols),
    ('nominal_preprocess', nominal_transformer, nom_cols),
]
preprocessor = ColumnTransformer(transformers = column_routes, remainder = 'drop')

## Combining transformers

Combining:<br>

1. Feature Extraction.
2. Preprocessing.
3. Model Building.

In [None]:
# End-to-end pipeline: feature extraction -> preprocessing -> classifier.
model = Pipeline(steps=[
    ('feature extract', feature_transformer),
    ('preprocessor', preprocessor),
    # Fixed seed so the cross-validation score and the submission are reproducible.
    ('classifier', GradientBoostingClassifier(random_state = 42))
])

In [None]:
# Bare expression: renders the pipeline (display="diagram" is set in the imports cell).
model

## Model training

Using a Gradient Boosting classifier (gdb).

In [None]:
# 10-fold cross-validation of the whole pipeline; report the mean of the fold scores.
fold_scores = cross_val_score(model, X, y, cv = 10)
print("Baseline Accuracy : ", fold_scores.mean())

In [None]:
# Fit the pipeline on the full training data (X, y) before predicting on the test set.
model.fit(X,y)

## Preparing for submission

In [None]:
# Pair every test PassengerId with its predicted label, column by column.
test_ids = test['PassengerId']
predicted = pd.Series(model.predict(test), name = 'Transported')
sub = pd.concat([test_ids, predicted], axis = 1)

In [None]:
# Bare expression: renders the submission frame for a visual check before it is written.
sub

In [None]:
# Write the submission to the working directory; index = False leaves the row index out of the file.
sub.to_csv('submission.csv', index = False)