___

# Introduction

<div>
<img src="https://storage.googleapis.com/kaggle-media/competitions/Spaceship%20Titanic/joel-filipe-QwoNAhbmLLo-unsplash.jpg" width="500"/>
</div>


## Overview

In this competition your task is to predict whether a passenger was transported to an alternate dimension during the Spaceship Titanic's collision with the spacetime anomaly. To help you make these predictions, you're given a set of personal records recovered from the ship's damaged computer system.

## File descriptions

* `train.csv` - Personal records for about two-thirds (~8700) of the passengers, to be used as training data.
* `test.csv` - Personal records for the remaining one-third (~4300) of the passengers, to be used as test data. 
* `sample_submission.csv` - A submission file in the correct format.

## Field descriptions

* `PassengerId` - A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.

* `HomePlanet` - The planet the passenger departed from, typically their planet of permanent residence.

* `CryoSleep` - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.

* `Cabin` - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.
       
* `Destination` - The planet the passenger will be debarking to.
        
* `Age` - The age of the passenger.

* `VIP` - Whether the passenger has paid for special VIP service during the voyage.

* `RoomService`, `FoodCourt`, `ShoppingMall`, `Spa`, `VRDeck` - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.

* `Name` - The first and last names of the passenger.

* `Transported` - Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.

## Objective

The task is to predict the value of `Transported` for each passenger in the test set (`test.csv`).

## Setup

In [None]:
import warnings
warnings.simplefilter("ignore", UserWarning)

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import lightgbm as lgb
from xgboost import XGBClassifier
from pandas.api.types import is_numeric_dtype, is_categorical_dtype
from scipy.stats import chi2_contingency
from catboost import CatBoostClassifier

from sklearn import set_config
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.compose import make_column_selector 
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, RocCurveDisplay
from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier, RidgeClassifier
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.ensemble import StackingClassifier

from IPython.display import display, Markdown, Latex

In [None]:
# matplotlib: default font size, axes-title size and tick-label sizes
plt.rc('font', size=15)
plt.rc('axes', titlesize=18)  
plt.rc('xtick', labelsize=10)  
plt.rc('ytick', labelsize=10)

# seaborn: font scaling, white-grid style and an 8-colour "rocket" palette
# (third argument of set_palette is the desaturation factor)
sns.set(font_scale = 1.2)
sns.set_style("whitegrid")
sns.set_palette("rocket", 8, .75)

# sklearn: display estimators as diagrams
set_config(display="diagram")

In [None]:
# Input files of the competition and name of the submission file.
TRAIN_PATH = '../input/spaceship-titanic/train.csv'
TEST_PATH = '../input/spaceship-titanic/test.csv'
SUBMISSION_FILE = 'submission.csv'

# Experiment settings. RANDOM_STATE, SAMPLE_SIZE and TEST_SIZE are not used
# in the cells visible here -- presumably consumed by later modelling cells
# (TODO confirm).
RANDOM_STATE = 2022
SAMPLE_SIZE = 1.0
TEST_SIZE = 0.3
# p-value threshold used by the chi-square independence check.
# NOTE(review): the name is misspelled ("SIGNIFICANT") but is referenced
# under this spelling by other cells, so it must be renamed everywhere at once.
SIGNICIFANT_LEVEL = 0.05

INDEX = 'PassengerId'  # column used as the DataFrame index when reading the CSVs
TARGET = 'Transported'  # name of the label column
# NUM_FOLDS / N_CLUSTERS are not used in the visible cells (TODO confirm usage).
NUM_FOLDS = 3
N_CLUSTERS = 5

## Helper functions

In [None]:
def bar_percent(ax, N, size=10, offset=85):
    """Label every bar of ``ax`` with its height as a percentage of ``N``.

    Parameters
    ----------
    ax : axes-like object exposing ``patches`` (the bars) and ``text``.
    N : value that corresponds to 100 % (e.g. the number of plotted rows).
    size : font size of the labels.
    offset : vertical distance, in data units, between the top of a bar
        and its label. Defaults to the previously hard-coded value of 85.
    """
    for patch in ax.patches:
        height = patch.get_height()
        # Skip patches whose height is not finite: they would produce a
        # 'nan%' label placed at an invalid position.
        if not np.isfinite(height):
            continue

        ax.text(
            patch.get_x() + patch.get_width() / 2,
            height + offset,
            f'{height / N * 100:2.1f}%',
            va='center',
            ha='center', size=size)

In [None]:
def get_num_features(data):
    """Return the names of all numeric columns of ``data`` as a list."""
    numeric_part = data.select_dtypes(include=[np.number])
    return numeric_part.columns.tolist()

def get_cat_features(data):
    """Return the categorical and boolean column names of ``data``.

    The target column (``TARGET``) is excluded when present. Frames without
    the target column (such as test data) are handled as well; previously
    ``list.remove`` raised ``ValueError`` in that case.
    """
    selected = data.select_dtypes(include=['category', 'bool']).columns
    return [name for name in selected if name != TARGET]

def get_obj_features(data):
    """Return the names of all ``object``-dtype columns of ``data`` as a list."""
    object_part = data.select_dtypes(include=[object])
    return object_part.columns.tolist()

___

# Explore data

## Read data

In [None]:
def read_data(train_path=TRAIN_PATH, test_path=TEST_PATH):
    """Load the train and test CSV files, indexed by ``INDEX``.

    Parameters
    ----------
    train_path, test_path : locations of the CSV files. They default to the
        module-level ``TRAIN_PATH`` / ``TEST_PATH``.

    Returns
    -------
    (train_data, test_data) : tuple of DataFrames.
    """
    # Use the arguments, not the module constants, so that callers can
    # actually point the loader at other files.
    train_data = pd.read_csv(train_path, index_col=INDEX)
    test_data = pd.read_csv(test_path, index_col=INDEX)

    return train_data, test_data

train_data, test_data = read_data()

In [None]:
train_data.head()

In [None]:
test_data.head()

In [None]:
train_data.describe().T

In [None]:
test_data.describe().T

## Missing values

In [None]:
def total_missing_values(train_df, test_df):
    """Summarise the missing cells of the train and test frames.

    Returns a DataFrame indexed by ``data_set`` ('train', 'test') with

    * ``missing``   - number of NaN cells in the data set,
    * ``missing_%`` - NaN cells as a percentage of all cells of that data set.
    """
    frames = {'train': train_df, 'test': test_df}

    df = pd.DataFrame({
        'data_set': list(frames),
        'missing': [frame.isna().sum().sum() for frame in frames.values()],
    }).set_index('data_set')

    # Normalise each data set by its own size (previously both rows were
    # divided by the number of train rows) and scale to percent.
    total_cells = [frame.size for frame in frames.values()]
    df['missing_%'] = df['missing'] / total_cells * 100.0
    return df
    
total_missing_values(train_data, test_data)    

In [None]:
# Per-column missing-value counts of both data sets, side by side.
missing_values = pd.DataFrame({
    'missing_train': train_data.isna().sum(),
    'missing_test': test_data.isna().sum()
})

# Express the counts as a percentage of the rows of the respective data set.
missing_values['missing_train_%'] = (missing_values['missing_train'] / len(train_data)) * 100.0 
missing_values['missing_test_%'] = (missing_values['missing_test'] / len(test_data)) * 100.0 

missing_values

In [None]:
# Visualise where values are missing: one heatmap per data set, with the
# frame transposed so that each feature is a row of the heatmap.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 5))

sns.heatmap(train_data.isna().T, cmap='Reds', cbar=False, ax=ax1);
ax1.set_title('Missing values (train data set)')

sns.heatmap(test_data.isna().T, cmap='Reds', cbar=False, ax=ax2);
ax2.set_title('Missing values (test data set)')

plt.tight_layout()
plt.show()

In [None]:
def fill_missing_values(data):
    """Impute missing values in place and add ``Has<Feature>`` indicator columns.

    For every handled feature a boolean ``Has<Feature>`` column is added
    *before* the gaps are filled:

    * spending features: ``True`` when the amount is present and above 0.01;
      missing amounts become 0,
    * ``Age``: missing values become 28,
    * ``HomePlanet``, ``CryoSleep``, ``Cabin``, ``Destination``, ``Name``,
      ``VIP``: missing values become a fixed default.

    Parameters
    ----------
    data : DataFrame containing all of the columns listed above.

    Returns
    -------
    The same DataFrame object, modified in place.
    """
    spend_features = [
        'RoomService',
        'FoodCourt',
        'ShoppingMall',
        'Spa',
        'VRDeck']

    for feature in spend_features:
        data[f'Has{feature}'] = (data[feature] > 0.01) & (data[feature].notna())
        # Assign the filled column back instead of calling
        # `data[feature].fillna(..., inplace=True)`: the in-place call on a
        # column selection does not update `data` under pandas Copy-on-Write.
        data[feature] = data[feature].fillna(0)

    data['HasAge'] = data['Age'].notna()
    data['Age'] = data['Age'].fillna(28)  # original note: 28 == mean

    # Fill value per feature. The order of the entries determines the order
    # of the generated Has* columns.
    fill_values = {
        'HomePlanet': 'Earth',         # filled with the most frequent value
        'CryoSleep': False,            # filled with the most frequent value
        'Cabin': 'X/0/X',              # placeholder that still follows deck/num/side
        'Destination': 'TRAPPIST-1e',  # filled with the most frequent value
        'Name': '<known> <known>',     # placeholder first/last name
        'VIP': False,
    }

    for feature, value in fill_values.items():
        data[f'Has{feature}'] = data[feature].notna()
        data[feature] = data[feature].fillna(value)

    return data

In [None]:
train_data = fill_missing_values(train_data)
train_data.head()

In [None]:
imputer_transformer = FunctionTransformer(fill_missing_values)

In [None]:
# Impute both data sets with the same transformer.
# NOTE: `train_data` was already passed through `fill_missing_values` in an
# earlier cell, so the Has* indicators recomputed here are all True for it.
train_data = imputer_transformer.fit_transform(train_data)
test_data = imputer_transformer.transform(test_data)

# Sanity check: remaining missing values per column in each data set
# (the 'test' column previously reported the train counts again).
pd.DataFrame({
    'train': train_data.isna().sum(),
    'test': test_data.isna().sum()
})

## Data cleaning

### Split `Cabin`
deck/num/side, where side can be either `P` for Port or `S` for Starboard.

In [None]:
def split_cabin(data):
    """Add ``Deck``, ``Num`` and ``Side`` columns derived from ``Cabin``.

    ``Cabin`` is expected to look like ``deck/num/side``: the first character
    is taken as the deck, the last one as the side and everything between the
    two separators as the number. ``data`` is modified in place and returned.
    """
    cabin = data['Cabin']
    extractors = {
        'Deck': lambda code: code[0],
        'Num': lambda code: code[2:-2],
        'Side': lambda code: code[-1],
    }

    for column, extract in extractors.items():
        data[column] = cabin.map(extract)

    return data

cabin_transformer = FunctionTransformer(split_cabin)

In [None]:
train_data = cabin_transformer.fit_transform(train_data)
train_data[['Cabin', 'Deck', 'Num', 'Side']].head()

### Split `Name`

In [None]:
def split_name(data):
    """Add ``FirstName`` and ``LastName`` columns derived from ``Name``.

    The name is split on whitespace; the first token becomes the first name
    and the second token the last name. ``data`` is modified in place and
    returned.
    """
    def token_at(position):
        # Tokens produced by str.split() carry no surrounding whitespace;
        # the strip() is kept to mirror the original expression exactly.
        return lambda full_name: full_name.split()[position].strip()

    names = data['Name']
    data['FirstName'] = names.map(token_at(0))
    data['LastName'] = names.map(token_at(1))

    return data

name_transformer = FunctionTransformer(split_name)

In [None]:
train_data = name_transformer.fit_transform(train_data)
train_data[['Name', 'FirstName', 'LastName']].head()

### Log-transformation

In [None]:
# Raw (untransformed) distribution of each spending feature, split by target.
num_features = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
fig, axis = plt.subplots(1, 5, figsize=(20, 3))

# One histogram per feature, each drawn on its own axis.
for f, ax in zip(num_features, axis.flatten()):
    sns.histplot(
        data=train_data, 
        x=f, 
        hue=TARGET, 
        bins=35, 
        legend=True, 
        palette='rocket',
        alpha=0.85,
        kde=False, ax=ax)

plt.tight_layout()
plt.show()

In [None]:
def log1p(feature):
    """Build a function that adds a ``Log<feature>`` column to a DataFrame.

    The returned callable takes a DataFrame, stores ``np.log1p`` of the
    ``feature`` column under ``Log<feature>`` (modifying the frame in place)
    and returns the same frame.
    """
    log_column = f'Log{feature}'

    def add_log_column(data):
        data[log_column] = np.log1p(data[feature])
        return data

    return add_log_column

In [None]:
# Pipeline of one FunctionTransformer per spending feature; each step adds
# the matching Log<feature> column.
log_transformer = make_pipeline(*(
    FunctionTransformer(log1p(feature=spend_feature))
    for spend_feature in ('RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck')
))

### Convert data types

In [None]:
def convert_dtypes(data, inplace=True):
    """Cast the known columns to their working dtypes.

    * numeric features     -> ``np.float32``
    * categorical features -> unordered ``pd.Categorical``
    * text features        -> ``str``

    Parameters
    ----------
    data : DataFrame containing all of the columns listed below.
    inplace : when True (default) ``data`` itself is modified, otherwise a
        copy is converted and ``data`` is left untouched.

    Returns
    -------
    The converted DataFrame (``data`` itself when ``inplace`` is True).
    """
    df = data if inplace else data.copy()

    num_features = [
        'Age',
        'RoomService',
        'FoodCourt',
        'ShoppingMall',
        'Spa',
        'VRDeck']

    cat_features = [
        'HomePlanet',
        'Destination',
        'VIP',
        'CryoSleep',
        'Deck',
        'Side']

    # 'Name' used to be listed twice, which only repeated the same cast.
    str_features = [
        'Cabin',
        'Name',
        'FirstName',
        'LastName',
        'Num']

    df[num_features] = df[num_features].astype(np.float32)

    for f in cat_features:
        df[f] = pd.Categorical(df[f], ordered=False)

    for f in str_features:
        df[f] = df[f].astype(str)

    return df

In [None]:
convert_dtypes_transformer = FunctionTransformer(convert_dtypes)
train_data = convert_dtypes_transformer.fit_transform(train_data)

pd.DataFrame({'dtype': train_data.dtypes})

## Data cleaning pipeline

In [None]:
# Full cleaning pipeline. The order matters: missing values are filled first
# so that the string operations of the Cabin/Name splitters see no NaN, the
# log columns are added next, and the dtype conversion runs last because it
# casts columns created by the earlier steps (Deck, Side, Num, FirstName,
# LastName).
data_cleaning = make_pipeline(
    imputer_transformer,
    cabin_transformer,
    name_transformer,
    log_transformer,
    convert_dtypes_transformer
)

___

# Exploratory data analysis (EDA)

In [None]:
# reload data
train_data, test_data = read_data()

train_data = data_cleaning.fit_transform(train_data)
train_data.head(3)

In [None]:
def display_info(data, feature):
    """Render a Markdown statistical summary of ``data[feature]``.

    Shows mean, std, min, quartiles and max from ``describe()`` rounded to
    three decimals. The ``count`` line reports ``len(data)``, i.e. the number
    of rows, rather than ``describe()``'s own count.
    """
    info = np.round(data[feature].describe(), 3)

    entries = [('count', len(data))]
    entries += [
        (key, info[key])
        for key in ('mean', 'std', 'min', '25%', '50%', '75%', 'max')
    ]
    summary = '\n'.join(f'* {label}: {value}' for label, value in entries)

    display(Markdown(f'#### Statistical summary `{feature}`'))
    display(Markdown(summary))

In [None]:
def plot_hist(data, feature, ax=None, target=TARGET):
    """Draw a 35-bin histogram with KDE overlay of ``feature``, coloured by ``target``.

    Parameters
    ----------
    data : DataFrame holding ``feature`` and ``target``.
    feature : name of the column to plot on the x axis.
    ax : axes to draw on; the current axes are used when omitted.
    target : column used as ``hue``.
    """
    if ax is None:
        ax = plt.gca()

    sns.histplot(
        data=data,
        x=feature,
        hue=target,
        bins=35,
        legend=True,
        palette='rocket',
        alpha=0.75,
        kde=True, ax=ax)

    # The title previously read 'Histtogram'.
    ax.set_title(f'Histogram {feature}')

In [None]:
def plot_stackbar(data, feature1, feature2, ax=None):
    """Plot the counts of ``feature1`` as a histogram coloured by ``feature2``.

    Draws on ``ax`` (or on the current axes when ``ax`` is None) and titles
    the plot ``'<feature1> vs <feature2>'``.
    """
    canvas = plt.gca() if ax is None else ax

    bar_style = dict(shrink=.8, palette='rocket', alpha=0.75)
    sns.histplot(data=data, x=feature1, hue=feature2, ax=canvas, **bar_style)

    canvas.set_title(f'{feature1} vs {feature2}')

In [None]:
def plot_violin(data, feature, ax=None, target=TARGET):
    """Draw a violin plot of ``feature`` for each value of ``target``.

    ``target`` is used both as the x axis and as ``hue``. Draws on ``ax`` or,
    when it is None, on the current axes.
    """
    canvas = plt.gca() if ax is None else ax

    violin_style = dict(alpha=0.75, palette='rocket')
    sns.violinplot(data=data, x=target, y=feature, hue=target, ax=canvas, **violin_style)

    canvas.set_title(f'Violinplot {feature}')

In [None]:
def plot_boxplot(data, x=None, y=None, ax=None, target=TARGET):
    """Draw a box plot of ``y`` grouped by ``x`` and coloured by ``target``.

    Draws on ``ax`` or, when it is None, on the current axes, and titles the
    plot ``'Boxplot <x>'``.
    """
    canvas = plt.gca() if ax is None else ax

    semi_transparent = {'alpha': .75}
    sns.boxplot(
        data=data, x=x, y=y, hue=target,
        boxprops=semi_transparent, palette='rocket', ax=canvas)

    canvas.set_title(f'Boxplot {x}')

In [None]:
def plot_count(data, feature, target=TARGET, ax=None, percent=True):
    """Draw a count plot of ``feature`` coloured by ``target``.

    When ``percent`` is True every bar is additionally labelled with its
    share of ``len(data)`` via ``bar_percent``. Draws on ``ax`` or, when it
    is None, on the current axes.
    """
    canvas = plt.gca() if ax is None else ax

    sns.countplot(
        data=data, x=feature, hue=target,
        palette='rocket', alpha=0.75, ax=canvas)

    if percent:
        bar_percent(canvas, N=len(data))

    canvas.set_title(f'Count {feature}')

In [None]:
def plot_kde(data, num_feature, cat_feature, target=TARGET, ax=None, legend=False):
    """Draw filled KDE curves of ``num_feature``, one per value of ``cat_feature``.

    Parameters
    ----------
    data : DataFrame holding both features.
    num_feature : numeric column plotted on the x axis.
    cat_feature : categorical column used as ``hue``.
    target : unused; kept so that existing calls remain valid.
    ax : axes to draw on; the current axes are used when omitted.
    legend : whether to show the legend.
    """
    if ax is None:
        ax = plt.gca()

    sns.kdeplot(
        data=data,
        x=num_feature,
        hue=cat_feature,
        fill=True,
        legend=legend,
        palette='rocket',
        alpha=0.4, ax=ax)

    # The title used the name `feature`, which is not a parameter of this
    # function: it either raised NameError or picked up an unrelated global.
    ax.set_title(f'KDE {num_feature}')

## Target `Transported`

In [None]:
# Class balance of the target; no hue because the target itself is plotted.
fig, ax = plt.subplots(1, 1, figsize=(5, 5))

plot_count(train_data, TARGET, target=None, ax=ax)
ax.set_title('Transported (Target)')  # title previously read 'Transpored'

plt.tight_layout()
plt.show()

## Numerical features

In [None]:
num_features = ['Age', 'LogRoomService', 'LogFoodCourt', 'LogShoppingMall', 'LogSpa', 'LogVRDeck']
train_data[num_features].describe().round(3).T

In [None]:
# Log-scaled spending distributions (first five axes) plus Age (sixth axis).
num_features = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
fig, axis = plt.subplots(1, 6, figsize=(25, 4))

for f, ax in zip(num_features, axis.flatten()):
    # Restrict to rows flagged by Has<feature> so that the zero amounts do
    # not dominate the histogram.
    df = train_data[train_data[f'Has{f}']]
    plot_hist(df, f'Log{f}', ax=ax)
    
plot_hist(train_data, 'Age', ax=axis[5])    

plt.tight_layout()
plt.show()

### Outlier detection

In [None]:
fig, axis = plt.subplots(2, 3, figsize=(18, 4))

num_features = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
for feature, ax in zip(num_features, axis.flatten()):
    # Filter on the indicator of the feature being plotted. The filter used
    # the stale variable `f` left over from an earlier cell, so every panel
    # was restricted by the same, unrelated Has* column.
    df = train_data[train_data[f'Has{feature}']]
    sns.boxplot(
        data=df,
        x=f'Log{feature}',
        ax=ax,
        boxprops=dict(alpha=.75),
        palette='rocket')

plt.tight_layout()
plt.show()

### Feature `Age`

In [None]:
fig, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, figsize=(20, 5))

feature='Age'
data = train_data[train_data['HasAge']]

plot_hist(data, feature, ax=ax1)
plot_violin(data, feature, ax=ax2)
plot_boxplot(data, x=TARGET, y=feature, ax=ax3)
plot_count(data, 'HasAge', ax=ax4)

plt.tight_layout()
plt.show()

display_info(data, 'Age')

### Feature `RoomService`

In [None]:
fig, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, figsize=(20, 5))

feature='LogRoomService'
data = train_data[train_data['HasRoomService']]

plot_hist(data, feature, ax=ax1)
plot_violin(data, feature, ax=ax2)
plot_boxplot(data, x=TARGET, y=feature, ax=ax3)
plot_count(data, 'HasRoomService', ax=ax4)
 
plt.tight_layout()
plt.show()

display_info(data, 'LogRoomService')

### Feature `FoodCourt`

In [None]:
fig, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, figsize=(20, 5))

feature='LogFoodCourt'
data = train_data[train_data['HasFoodCourt']]

plot_hist(data, feature, ax=ax1)
plot_violin(data, feature, ax=ax2)
plot_boxplot(data, x=TARGET, y=feature, ax=ax3)
plot_count(data, 'HasFoodCourt', ax=ax4) 
    
plt.tight_layout()
plt.show()

display_info(data, 'LogFoodCourt')

### Feature `ShoppingMall`

In [None]:
fig, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, figsize=(20, 5))

feature='LogShoppingMall'
# Only passengers flagged as having spent at the shopping mall.
data = train_data[train_data['HasShoppingMall']]

plot_hist(data, feature, ax=ax1)
plot_violin(data, feature, ax=ax2)
plot_boxplot(data, x=TARGET, y=feature, ax=ax3)
plot_count(data, 'HasShoppingMall', ax=ax4)

plt.tight_layout()
plt.show()

# Summarise the same filtered subset that is plotted above (this used
# `train_data`, unlike the Age / RoomService / FoodCourt / Spa cells).
display_info(data, 'LogShoppingMall')

### Feature `Spa`

In [None]:
fig, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, figsize=(20, 5))

feature='LogSpa'
data = train_data[train_data['HasSpa']]

plot_hist(data, feature, ax=ax1)
plot_violin(data, feature, ax=ax2)
plot_boxplot(data, x=TARGET, y=feature, ax=ax3)
plot_count(data, 'HasSpa', ax=ax4) 

plt.tight_layout()
plt.show()

display_info(data, 'LogSpa')

### Feature `VRDeck`

In [None]:
fig, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, figsize=(20, 5))

feature='LogVRDeck'
# Only passengers flagged as having spent on the VR deck.
data = train_data[train_data['HasVRDeck']]

plot_hist(data, feature, ax=ax1)
plot_violin(data, feature, ax=ax2)
plot_boxplot(data, x=TARGET, y=feature, ax=ax3)
plot_count(data, 'HasVRDeck', ax=ax4)

plt.tight_layout()
plt.show()

# Summarise the same filtered subset that is plotted above (this used
# `train_data`, unlike the Age / RoomService / FoodCourt / Spa cells).
display_info(data, 'LogVRDeck')

## Categorical features

### Feature `HomePlanet`

In [None]:
fig,(ax1, ax2) = plt.subplots(1, 2, figsize=(16, 5))
plot_count(train_data, 'HomePlanet', ax=ax1)
plot_count(train_data, 'HasHomePlanet', ax=ax2)

plt.tight_layout()
plt.show()

### Feature `CryoSleep`

In [None]:
fig,(ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
plot_count(train_data, 'CryoSleep', ax=ax1)
plot_count(train_data, 'HasCryoSleep', ax=ax2)

plt.tight_layout()
plt.show()

### Feature `Deck`

In [None]:
fig,(ax1, ax2) = plt.subplots(1, 2, figsize=(20, 5))
plot_count(train_data, 'Deck', ax=ax1)
plot_count(train_data, 'HasCabin', ax=ax2)

plt.tight_layout()
plt.show()

### Feature `Side`

In [None]:
fig,(ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
plot_count(train_data, 'Side', ax=ax1)
plot_count(train_data, 'HasCabin', ax=ax2)

plt.tight_layout()
plt.show()

### Feature `Destination`

In [None]:
fig,(ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
plot_count(train_data, 'Destination', ax=ax1)
plot_count(train_data, 'HasDestination', ax=ax2)

plt.tight_layout()
plt.show()

## Unique values

In [None]:
# Number of distinct values per categorical feature. 'Destination' was listed
# twice, which duplicated its row in the resulting table.
cat_features = ['HomePlanet', 'Destination', 'VIP', 'Deck', 'Side', 'CryoSleep']
pd.DataFrame(train_data[cat_features].nunique(), columns=['count'])

## Correlation

### Numerical features

In [None]:
# Correlation matrix of the numeric features and the target.
num_features = ['Age', 'LogRoomService', 'LogFoodCourt', 'LogShoppingMall', 'LogSpa', 'LogVRDeck']
num_corr = train_data[num_features + [TARGET]].corr()

fig, ax = plt.subplots(1, 1, figsize=(9, 9))
# Mask the upper triangle (diagonal included) so each pair is shown once.
mask = np.triu(np.ones_like(num_corr, dtype=bool))
sns.heatmap(
    num_corr, 
    mask=mask, 
    cmap='rocket', 
    vmax=.3, 
    center=0,
    square=True, 
    linewidths=.1,
    ax=ax,
    alpha=0.85,
    annot = True,
    fmt='.1g',
    cbar_kws={"shrink": .5})

plt.tight_layout()
plt.show()

In [None]:
# Pairwise scatter plots / distributions of the numeric features by target.
grid = sns.pairplot(
    data=train_data[num_features + [TARGET]],
    hue=TARGET,
    palette='rocket',
    corner=True)

grid.fig.set_size_inches(15, 15)

# Lay out the pairplot's own figure; `fig` referred to the figure created by
# an earlier cell.
grid.fig.tight_layout()
plt.show()

### Numerical and Categorical features

In [None]:
num_features = ['Age', 'LogRoomService', 'LogFoodCourt', 'LogShoppingMall', 'LogSpa', 'LogVRDeck']
cat_features = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck', 'Side']

def plot_relation_cat_num(data, cat_feature, num_features, other_cat_features=None):
    """Plot how ``cat_feature`` relates to numeric and categorical features.

    Three rows of plots are shown, one figure per row:

    1. KDE of every numeric feature, split by ``cat_feature``,
    2. box plots of every numeric feature grouped by ``cat_feature``,
    3. histograms of ``cat_feature`` coloured by every other categorical feature.

    Parameters
    ----------
    data : DataFrame holding all referenced columns.
    cat_feature : categorical column under study.
    num_features : numeric columns for rows 1 and 2.
    other_cat_features : categorical columns for row 3. Defaults to the
        module-level ``cat_features`` list as it is at call time.

    The number of panels follows the length of the feature lists (it was
    hard-coded to six, silently dropping any additional feature).
    """
    if other_cat_features is None:
        other_cat_features = cat_features

    n_num = len(num_features)
    last = n_num - 1

    def new_row(n_panels):
        # squeeze=False keeps a 2-D array of axes even for a single panel.
        _, axes = plt.subplots(1, n_panels, figsize=(5 * n_panels, 4), squeeze=False)
        return axes.flatten()

    # Row 1: KDE plots; only the last panel carries the legend.
    for i, (feature, ax) in enumerate(zip(num_features, new_row(n_num))):
        plot_kde(data, feature, cat_feature, target=TARGET, ax=ax, legend=(i == last))
        ax.set_title(f'{cat_feature} vs {feature}')

    plt.tight_layout()
    plt.show()

    # Row 2: box plots; the legend is removed from all but the last panel.
    for i, (feature, ax) in enumerate(zip(num_features, new_row(n_num))):
        plot_boxplot(data, x=cat_feature, y=feature, ax=ax)
        ax.set_title(f'{cat_feature} vs {feature}')

        if i != last:
            legend = ax.get_legend()
            if legend is not None:  # a panel may have been drawn without a legend
                legend.remove()

    plt.tight_layout()
    plt.show()

    # Row 3: relation to the other categorical features.
    for feature, ax in zip(other_cat_features, new_row(len(other_cat_features))):
        plot_stackbar(data, cat_feature, feature, ax=ax)
        ax.set_title(f'{cat_feature} vs {feature}')

    plt.tight_layout()
    plt.show()

In [None]:
cat_features = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck', 'Side', 'Transported']
def chi2_check(data, feature, features=cat_features, significance_level=None):
    """Run chi-square tests of independence between ``feature`` and ``features``.

    Parameters
    ----------
    data : DataFrame holding all referenced columns.
    feature : categorical column tested against every entry of ``features``.
    features : categorical columns to test against.
    significance_level : p-value threshold; defaults to the module-level
        ``SIGNICIFANT_LEVEL``.

    Returns
    -------
    DataFrame indexed by ``Feature`` with the columns ``p-Value`` and
    ``Hypothesis`` ('Reject' when the p-value is below the threshold, i.e.
    H0 "the variables are independent" is rejected, otherwise 'Accept').
    """
    if significance_level is None:
        significance_level = SIGNICIFANT_LEVEL

    p_values = []
    verdicts = []

    # Iterate over the `features` argument; the loop used the global
    # `cat_features`, so a custom list only changed the row labels.
    for other in features:
        contingency = pd.crosstab(
            index=data[feature],
            columns=data[other])

        p_value = chi2_contingency(contingency)[1]
        p_values.append(p_value)
        verdicts.append('Reject' if p_value < significance_level else 'Accept')

    result = pd.DataFrame({
        'Feature': list(features),
        'p-Value': p_values,
        'Hypothesis': verdicts,
    })

    return result.set_index('Feature')

* Chi-Square Test of Independence

    * Null Hypothesis ($H_0$): There is no significant relationship between the variables.

    * Alternative Hypothesis ($H_1$): There is a significant relationship between variables.

#### Feature `HomePlanet` vs other features

In [None]:
plot_relation_cat_num(train_data, 'HomePlanet', num_features)
chi2_check(train_data, 'HomePlanet')

#### Feature `CryoSleep` vs other features

In [None]:
plot_relation_cat_num(train_data, 'CryoSleep', num_features)
chi2_check(train_data, 'CryoSleep')

#### Feature `Destination` vs other features

In [None]:
plot_relation_cat_num(train_data, 'Destination', num_features)
chi2_check(train_data, 'Destination')

#### Feature `VIP` vs other features

In [None]:
plot_relation_cat_num(train_data, 'VIP', num_features)
chi2_check(train_data, 'VIP')

#### Feature `Deck` vs other features

In [None]:
plot_relation_cat_num(train_data, 'Deck', num_features)
chi2_check(train_data, 'Deck')

#### Feature `Side` vs other features

In [None]:
plot_relation_cat_num(train_data, 'Side', num_features)
chi2_check(train_data, 'Side')

___

# Feature engineering (FE)

In [None]:
# reload data
train_data, test_data = read_data()

train_data = data_cleaning.fit_transform(train_data)
test_data = data_cleaning.transform(test_data)

## Add `IsUpperDeck`

In [None]:
def add_is_upper_deck(data):
    """Add the categorical flag ``IsUpperDeck``: True when ``Deck`` contains F or G.

    ``data`` is modified in place and returned.
    """
    on_f_or_g = data['Deck'].str.contains('F|G')
    data['IsUpperDeck'] = pd.Categorical(on_f_or_g, ordered=False)

    return data

In [None]:
train_data = add_is_upper_deck(train_data)
train_data[['Deck', 'IsUpperDeck']].head()

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(7, 5))
plot_count(train_data, 'IsUpperDeck', ax=ax)

plt.tight_layout()
plt.show()

## Relation between `Deck` and `CryoSleep`

In [None]:
df = train_data[['Deck', 'CryoSleep', TARGET]][train_data['CryoSleep'] == True]

fig, ax = plt.subplots(1, 1, figsize=(12, 5))
plot_count(df, 'Deck', ax=ax)

plt.tight_layout()
plt.show()

In [None]:
def add_deck_cryo_sleep(data):
    """Add the categorical flag ``DeckCryoSleep``.

    The flag is True for passengers in cryo sleep whose ``Deck`` contains one
    of A, B, C, F, E or X. ``data`` is modified in place and returned.
    """
    asleep = data['CryoSleep'].astype(bool)
    on_listed_deck = data['Deck'].str.contains('A|B|C|F|E|X')

    data['DeckCryoSleep'] = pd.Categorical(asleep & on_listed_deck, ordered=False)
    return data

In [None]:
train_data = add_deck_cryo_sleep(train_data)
train_data[['CryoSleep', 'Deck', 'DeckCryoSleep']].head()

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(7, 5))
plot_count(train_data, 'DeckCryoSleep', ax=ax)

plt.tight_layout()
plt.show()

## Relation between `Deck` and `VIP`

In [None]:
df = train_data[train_data['VIP'] == True]

fig, ax = plt.subplots(1, 1, figsize=(12, 5))
plot_count(df, 'Deck', ax=ax, percent=False)

plt.tight_layout()
plt.show()

In [None]:
def add_deck_vip(data):
    """Add the categorical flag ``DeckVIP``.

    The flag is True for VIP passengers whose ``Deck`` contains one of A, B,
    C or F. ``data`` is modified in place and returned.
    """
    is_vip = data['VIP'].astype(bool)
    on_listed_deck = data['Deck'].str.contains('A|B|C|F')

    data['DeckVIP'] = pd.Categorical(is_vip & on_listed_deck, ordered=False)
    return data

In [None]:
train_data = add_deck_vip(train_data)
train_data[['VIP', 'Deck', 'DeckVIP']].head()

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(7, 5))

df = train_data[train_data['VIP'] == True]
plot_count(df, 'DeckVIP', ax=ax, percent=False)

plt.tight_layout()
plt.show()

## Discretization feature `Age`

In [None]:
def discretization_age(data):
    """Add the categorical column ``AgeClass`` by binning ``Age``.

    Bins: [0, 2] Infant, (2, 13] Kid, (13, 21] Teen, (21, 60] Adult and
    (60, inf) Senior. ``data`` is modified in place and returned.
    """
    bins = [0, 2, 13, 21, 60, np.inf]
    labels = ['Infant', 'Kid', 'Teen', 'Adult', 'Senior']

    # include_lowest=True closes the first interval on the left; without it
    # an age of exactly 0 falls outside (0, 2] and ends up as NaN.
    data['AgeClass'] = pd.cut(
        data['Age'], bins=bins, labels=labels, include_lowest=True)
    return data

In [None]:
train_data = discretization_age(train_data)
train_data[['Age', 'AgeClass']].head()

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10, 5))

plot_count(train_data, 'AgeClass', ax=ax)
ax.set_title('Passenger Age')

plt.tight_layout()
plt.show()

## Discretization feature `RoomService`

In [None]:
def discretization_room_service(data):
    """Add ``RoomServiceClass`` by quantile-binning ``LogRoomService``.

    Values up to the 70 % quantile are 'Low', up to the 90 % quantile
    'Medium' and above that 'High'. The quantiles are computed from ``data``
    itself. ``data`` is modified in place and returned.
    """
    quantile_edges = [0, .7, .9, 1.]
    class_labels = ['Low', 'Medium', 'High']

    log_spend = data['LogRoomService']
    data['RoomServiceClass'] = pd.qcut(
        log_spend, q=quantile_edges, labels=class_labels, duplicates='drop')

    return data

train_data = discretization_room_service(train_data)
train_data[['LogRoomService', 'RoomServiceClass']].head()

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(8, 5))
plot_count(train_data, 'RoomServiceClass', ax=ax)

plt.tight_layout()
plt.show()

## Discretization feature `FoodCourt`

In [None]:
def discretization_food_court(data):
    """Add ``FoodCourtClass`` by quantile-binning ``LogFoodCourt``.

    Values up to the 70 % quantile are 'Low', up to the 90 % quantile
    'Medium' and above that 'High'. The quantiles are computed from ``data``
    itself. ``data`` is modified in place and returned.
    """
    quantile_edges = [0, .7, .9, 1.]
    class_labels = ['Low', 'Medium', 'High']

    log_spend = data['LogFoodCourt']
    data['FoodCourtClass'] = pd.qcut(
        log_spend, q=quantile_edges, labels=class_labels, duplicates='drop')

    return data

train_data = discretization_food_court(train_data)
train_data[['LogFoodCourt', 'FoodCourtClass']].head()

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(8, 5))
plot_count(train_data, 'FoodCourtClass', ax=ax)

plt.tight_layout()
plt.show()

## Discretization feature `ShoppingMall`

In [None]:
def discretization_shopping_mall(data):
    """Add ``ShoppingMallClass`` by quantile-binning ``LogShoppingMall``.

    Values up to the 70 % quantile are 'Low', up to the 90 % quantile
    'Medium' and above that 'High'. The quantiles are computed from ``data``
    itself. ``data`` is modified in place and returned.
    """
    quantile_edges = [0, .7, .9, 1.]
    class_labels = ['Low', 'Medium', 'High']

    log_spend = data['LogShoppingMall']
    data['ShoppingMallClass'] = pd.qcut(
        log_spend, q=quantile_edges, labels=class_labels, duplicates='drop')

    return data

train_data = discretization_shopping_mall(train_data)
train_data[['LogShoppingMall', 'ShoppingMallClass']].head()

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(8, 5))
plot_count(train_data, 'ShoppingMallClass', ax=ax)
            
plt.tight_layout()
plt.show()

## Discretization feature `Spa`

In [None]:
def discretization_spa(data):
    """Add ``SpaClass`` by quantile-binning ``LogSpa``.

    Values up to the 70 % quantile are 'Low', up to the 90 % quantile
    'Medium' and above that 'High'. The quantiles are computed from ``data``
    itself. ``data`` is modified in place and returned.
    """
    quantile_edges = [0, .7, .9, 1.]
    class_labels = ['Low', 'Medium', 'High']

    log_spend = data['LogSpa']
    data['SpaClass'] = pd.qcut(
        log_spend, q=quantile_edges, labels=class_labels, duplicates='drop')

    return data

train_data = discretization_spa(train_data)
train_data[['LogSpa', 'SpaClass']].head()

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(8, 5))
plot_count(train_data, 'SpaClass', ax=ax)

plt.tight_layout()
plt.show()

## Discretization feature `VRDeck`

In [None]:
def discretization_vr_deck(data):
    """Add ``VRDeckClass`` by quantile-binning ``LogVRDeck``.

    Values up to the 70 % quantile are 'Low', up to the 90 % quantile
    'Medium' and above that 'High'. The quantiles are computed from ``data``
    itself. ``data`` is modified in place and returned.
    """
    quantile_edges = [0, .7, .9, 1.]
    class_labels = ['Low', 'Medium', 'High']

    log_spend = data['LogVRDeck']
    data['VRDeckClass'] = pd.qcut(
        log_spend, q=quantile_edges, labels=class_labels, duplicates='drop')

    return data

train_data = discretization_vr_deck(train_data)
train_data[['LogVRDeck', 'VRDeckClass']].head()

In [None]:
# Distribution of the VRDeck spending classes by target.
# (A stray `7` expression statement was removed from this cell.)
fig, ax = plt.subplots(1, 1, figsize=(8, 5))
plot_count(train_data, 'VRDeckClass', ax=ax)

plt.tight_layout()
plt.show()

## Add `LogTotalSpend` feature

In [None]:
def add_log_total_spend(data):
    """Add ``LogTotalSpend`` and the relative log-spend of every amenity.

    * ``LogTotalSpend`` - ``log1p`` of the summed spending on all amenities,
    * ``LogRel<Feature>`` - ``Log<Feature> / LogTotalSpend``; 0 where the
      ratio is undefined (e.g. nothing was spent at all).

    Requires the raw spending columns and their ``Log<Feature>`` columns.
    ``data`` is modified in place and returned.
    """
    data['LogTotalSpend'] = np.log1p(
        data['RoomService'] + data['FoodCourt'] + data['ShoppingMall'] + data['Spa'] + data['VRDeck'])

    features = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    for f in features:
        # 0 / 0 yields NaN for passengers without any spending. The filled
        # column is assigned back because `fillna(..., inplace=True)` on a
        # column selection does not update `data` under pandas Copy-on-Write.
        data[f'LogRel{f}'] = (data[f'Log{f}'] / data['LogTotalSpend']).fillna(0)

    data['LogTotalSpend'] = data['LogTotalSpend'].fillna(0)

    return data

train_data = add_log_total_spend(train_data)

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(4, 4))
plot_boxplot(train_data, x=TARGET, y='LogTotalSpend', ax=ax, target=None)
ax.set_title('Boxplot LogTotalSpend')

plt.tight_layout()
plt.show()

In [None]:
# Relative log-spend per amenity by target, restricted to passengers with a
# positive total spend.
features = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
df = train_data[train_data['LogTotalSpend'] > 0]

fig, axis = plt.subplots(1, 5, figsize=(20, 4))

for f, ax in zip(features, axis.flatten()):
    # target=None: the target is already the x axis, so no extra hue.
    plot_boxplot(df, x=TARGET, y=f'LogRel{f}', target=None, ax=ax)
    ax.set_title(f'Boxplot LogRel{f}')

plt.tight_layout()
plt.show()

## Add features `PassengerGroup` and `PassengerNumber`

In [None]:
def split_passenger(data, inplace=True):
    """Derive ``PassengerGroup`` and ``PassengerNumber`` from the index.

    The index values are expected to look like ``gggg_pp``: the first four
    characters become ``PassengerGroup`` (string) and characters 5-6 become
    ``PassengerNumber`` (integer).

    With ``inplace=True`` (default) ``data`` itself is modified, otherwise a
    copy is modified. The modified frame is returned.
    """
    frame = data.copy() if not inplace else data

    passenger_ids = list(frame.index)
    frame['PassengerGroup'] = [pid[0:4] for pid in passenger_ids]
    frame['PassengerNumber'] = [int(pid[5:7]) for pid in passenger_ids]

    return frame

In [None]:
split_passenger(train_data, inplace=True)[['PassengerGroup', 'PassengerNumber']].head()

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(12, 5))

plot_count(train_data, 'PassengerNumber', ax=ax)
ax.set_title('Passenger group frequency')

plt.tight_layout()
plt.show()

## Add feature `GroupSize`

In [None]:
def add_group_size(data):
    """Return ``data`` with an added ``GroupSize`` column.

    ``GroupSize`` is the largest ``PassengerNumber`` found within the
    passenger's ``PassengerGroup``. An existing ``GroupSize`` column is
    dropped from ``data`` (in place) first, so the function can be re-run.
    The result is a new frame indexed by ``INDEX``.
    """
    data.drop(columns=['GroupSize'], inplace=True, errors='ignore')

    # Largest passenger number per group, as a one-column lookup table.
    group_sizes = data[['PassengerGroup', 'PassengerNumber']].groupby(by='PassengerGroup').max()
    group_sizes = group_sizes.rename(columns={'PassengerNumber': 'GroupSize'})

    merged = data.reset_index().merge(
        group_sizes,
        how='left',
        left_on='PassengerGroup',
        right_on='PassengerGroup')

    return merged.set_index(INDEX)

train_data = add_group_size(train_data)
train_data[['PassengerGroup', 'GroupSize']].head()

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(12, 5))
sns.countplot(
    data=train_data, 
    x='GroupSize',
    hue=TARGET,
    palette='rocket',
    alpha=0.85,
    ax=ax)

bar_percent(ax, N=len(train_data), size=12)
ax.set_title('Passengers group size')

plt.tight_layout()
plt.show()

## Add feature `TravelAlone`

In [None]:
def add_travel_alone(data, inplace=True):
    """Add the boolean column ``TravelAlone`` (``GroupSize`` equals 1).

    With ``inplace=True`` (default) ``data`` itself is modified, otherwise a
    copy is modified. The modified frame is returned.
    """
    frame = data.copy() if not inplace else data
    frame['TravelAlone'] = frame['GroupSize'].eq(1)

    return frame

train_data = add_travel_alone(train_data)

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(7, 5))

plot_count(train_data, 'TravelAlone', ax=ax)
ax.set_title('Number of passengers who travels alone')

plt.tight_layout()
plt.show()

## Add feature `LastNameFreq`

In [None]:
def add_lastname_freq(data):
    """Return ``data`` with an added ``LastNameFreq`` column.

    ``LastNameFreq`` is the number of rows of ``data`` that share the
    passenger's ``LastName``. An existing ``LastNameFreq`` column is dropped
    from ``data`` (in place) first, so the function can be re-run.
    """
    data.drop(['LastNameFreq'], axis=1, inplace=True, errors='ignore')
    
    # Count the rows per last name (count of non-null `Name` values).
    # NOTE(review): rows whose name was imputed with the '<known> <known>'
    # placeholder share the last name '<known>' and are counted as one
    # large "family" -- confirm this is intended.
    df = data[['Name', 'LastName']].groupby(by='LastName').count()
    df.columns = ['LastNameFreq']

    # Attach the counts to every passenger. The right-hand key is passed as
    # an array (the index of the count table) rather than as a column name.
    # NOTE(review): check the merged frame for an extra key column produced
    # by the array-like `right_on` -- TODO confirm.
    df = data.reset_index().merge(df, 
        how='left', 
        left_on='LastName', 
        right_on=df.index).set_index(INDEX)

    return df 

# Add the feature to the training data and preview it.
train_data = add_lastname_freq(train_data)
train_data[['LastName', 'LastNameFreq']].head()

## Add feature `IsFamilyMember`

In [None]:
def add_is_family_member(data):
    """Return a copy of ``data`` with a boolean ``IsFamilyMember`` column.

    A row is flagged when more than two rows (with a non-null
    ``PassengerNumber``) share its ``PassengerGroup``, ``LastName`` and
    ``Cabin``. Rows whose ``LastName`` equals ``'<known>'`` and rows with a
    missing value in one of the three key columns are never flagged.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``PassengerGroup``, ``PassengerNumber``, ``LastName``
        and ``Cabin``. An existing ``IsFamilyMember`` column is recomputed.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and row order as ``data``; the flag is
        always of ``bool`` dtype. The input frame is left untouched.
    """
    keys = ['PassengerGroup', 'LastName', 'Cabin']

    # Non-in-place drop so the caller's frame is not modified.
    result = data.drop(columns=['IsFamilyMember'], errors='ignore')

    # NOTE(review): '<known>' looks like the placeholder for a missing
    # surname; confirm it matches the value written by the cleaning step.
    has_name = (result['LastName'] != '<known>').to_numpy()

    # Rows per key combination, broadcast back onto the rows. Rows with a
    # missing key get NaN and therefore fail the comparison below.
    party_size = (
        result.loc[has_name]
        .groupby(keys)['PassengerNumber']
        .transform('count'))

    # Direct assignment instead of a chained `fillna(..., inplace=True)`,
    # which does not update the frame under pandas Copy-on-Write and could
    # leave an object/NaN column that cannot be used as a boolean mask.
    result['IsFamilyMember'] = False
    result.loc[has_name, 'IsFamilyMember'] = (party_size > 2).to_numpy()

    return result

# Rebind `train_data` to the frame returned by `add_is_family_member`, then
# list the first flagged rows (the flag column is used as a boolean mask).
train_data = add_is_family_member(train_data)
train_data[train_data['IsFamilyMember']][[
    'LastName', 'FirstName', 'Age', 'PassengerGroup', 'Cabin']].head(10)

In [None]:
# Draw `plot_count` for the `IsFamilyMember` flag of the training set.
fig, ax = plt.subplots(figsize=(7, 5))
plot_count(train_data, 'IsFamilyMember', ax=ax)
plt.tight_layout()
plt.show()

## Add feature `IsCouple`

In [None]:
def add_is_couple(data):
    """Return a copy of ``data`` with a boolean ``IsCouple`` column.

    A row is flagged when exactly two rows (with a non-null
    ``PassengerNumber``) share its ``PassengerGroup``, ``LastName`` and
    ``Cabin``. Rows whose ``LastName`` equals ``'<known>'`` and rows with a
    missing value in one of the three key columns are never flagged.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``PassengerGroup``, ``PassengerNumber``, ``LastName``
        and ``Cabin``. An existing ``IsCouple`` column is recomputed.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and row order as ``data``; the flag is
        always of ``bool`` dtype. The input frame is left untouched.
    """
    keys = ['PassengerGroup', 'LastName', 'Cabin']

    # Non-in-place drop so the caller's frame is not modified.
    result = data.drop(columns=['IsCouple'], errors='ignore')

    # NOTE(review): '<known>' looks like the placeholder for a missing
    # surname; confirm it matches the value written by the cleaning step.
    has_name = (result['LastName'] != '<known>').to_numpy()

    # Rows per key combination, broadcast back onto the rows. Rows with a
    # missing key get NaN and therefore fail the comparison below.
    party_size = (
        result.loc[has_name]
        .groupby(keys)['PassengerNumber']
        .transform('count'))

    # Direct assignment instead of a chained `fillna(..., inplace=True)`,
    # which does not update the frame under pandas Copy-on-Write and could
    # leave an object/NaN column that cannot be used as a boolean mask.
    result['IsCouple'] = False
    result.loc[has_name, 'IsCouple'] = (party_size == 2).to_numpy()

    return result

# Rebind `train_data` to the frame returned by `add_is_couple`, then list the
# first flagged rows (the flag column is used as a boolean mask).
train_data = add_is_couple(train_data)
train_data[train_data['IsCouple']][['LastName', 'FirstName', 'PassengerGroup', 'Cabin']].head(10)

In [None]:
# Draw `plot_count` for the `IsCouple` flag of the training set.
fig, ax = plt.subplots(figsize=(7, 5))
plot_count(train_data, 'IsCouple', ax=ax)
plt.tight_layout()
plt.show()

## Drop features

In [None]:
def drop_unused_features(data):
    """Drop the listed columns from ``data`` in place and return ``data``.

    Columns from the list that are not present are silently skipped.
    """
    obsolete = [
        'Cabin', 'Name', 'Num', 'FirstName', 'LastName',
        'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
        'PassengerGroup',
    ]
    data.drop(columns=obsolete, errors='ignore', inplace=True)
    return data

## Feature pipeline

In [None]:
# Feature-engineering pipeline: every function below is wrapped in a
# FunctionTransformer and applied in the listed order.
feature_engineering = make_pipeline(*(
    FunctionTransformer(step)
    for step in (
        add_is_upper_deck,
        add_deck_cryo_sleep,
        discretization_age,
        discretization_room_service,
        discretization_food_court,
        discretization_shopping_mall,
        discretization_spa,
        discretization_vr_deck,
        split_passenger,
        add_group_size,
        add_travel_alone,
        add_lastname_freq,
        add_log_total_spend,
        add_is_family_member,
        add_is_couple,
        drop_unused_features,
    )
))

In [None]:
# Apply the feature-engineering steps to the test set and preview the result.
# NOTE(review): `transform` is called on a pipeline that has not been fitted in
# this cell; confirm the installed scikit-learn accepts that for these steps.
test_data = feature_engineering.transform(test_data)
test_data.head()

___

# Feature selection (FS)

In [None]:
# Start again from freshly read data so earlier exploratory cells do not leak in.
train_data, test_data = read_data()

# Cleaning followed by feature engineering, fitted on the training set and
# then applied to the test set.
pipe = make_pipeline(data_cleaning, feature_engineering)
train_data = pipe.fit_transform(train_data)
test_data = pipe.transform(test_data)

# One row per resulting column with its dtype.
train_data.dtypes.to_frame(name='dtype')

Coming soon ...

___

# Model building

In [None]:
# Reload the data; the model pipelines below include `data_cleaning` and
# `feature_engineering` themselves, so they are fed these freshly read frames.
train_data, test_data = read_data()
# Every column except the target (`Index.difference` returns them sorted).
features = list(train_data.columns.difference([TARGET]))

## Splitting data

In [None]:
# Draw a `SAMPLE_SIZE` fraction of the training rows. The draw is seeded with
# `RANDOM_STATE`, like the split below, so the train/validation rows are the
# same on every run. `sample` already returns a new frame.
data = train_data.sample(frac=SAMPLE_SIZE, random_state=RANDOM_STATE)

# Hold out `TEST_SIZE` of the sampled rows for validation; the target is cast
# to int for the classifiers.
X_train, X_val, y_train, y_val = train_test_split(
    data[features],
    data[TARGET].astype(int),
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE)

In [None]:
# Row counts of the two splits, indexed by split name.
pd.DataFrame(
    {'Rows': [len(X_train), len(X_val)]},
    index=pd.Index(['Train', 'Validation'], name='Dataset'))

## Model pipeline

In [None]:
# Numeric columns are standardised.
num_transform = make_pipeline(
    StandardScaler()
)

# Category and boolean columns are one-hot encoded; categories not seen during
# fit are ignored at transform time.
# NOTE(review): `sparse=` was renamed `sparse_output=` in newer scikit-learn
# releases; confirm against the version this notebook is pinned to.
cat_transform = make_pipeline(
    OneHotEncoder(handle_unknown="ignore", sparse=False)
)

# Full preprocessing: cleaning, feature engineering, then per-dtype encoding.
# The column selectors run on the engineered frame, so each column is routed by
# the dtype it has after feature engineering.
preprocessor = make_pipeline(
    data_cleaning,
    feature_engineering,
    make_column_transformer(
        (num_transform, make_column_selector(dtype_include='number')),
        (cat_transform, make_column_selector(dtype_include=['category', 'bool']))
    )
)

In [None]:
def plot_model_result(y_pred, y_true):
    """Plot a confusion matrix and a ROC curve, then print a classification report.

    Note the argument order: predictions first, ground truth second. Both
    panels and the report are computed from the same ``y_pred`` values.
    """
    fig, axes = plt.subplots(1, 2, figsize=(8, 4))
    matrix_ax, roc_ax = axes

    # Left panel: confusion matrix normalised over the true labels.
    ConfusionMatrixDisplay.from_predictions(
        y_true, y_pred,
        normalize='true', cmap='Reds', colorbar=False, ax=matrix_ax)
    matrix_ax.set_title('Confusion matrix')

    # Right panel: ROC curve built from the values in `y_pred`.
    RocCurveDisplay.from_predictions(y_true, y_pred, ax=roc_ax)
    roc_ax.set_title('ROC')

    plt.tight_layout()
    plt.show()

    print(classification_report(y_true, y_pred))

In [None]:
# Candidate models, keyed by the short name used in the score table and plots.
# NOTE(review): `loss='log'` and `use_label_encoder` are spelled as in older
# scikit-learn / XGBoost releases; confirm against the pinned versions.
classifiers = {
    'lr': LogisticRegression(C=0.2, solver='liblinear'),
    'sgd': SGDClassifier(loss='log'),
    'ride': RidgeClassifier(),
    'dt': DecisionTreeClassifier(
        max_depth=7),
    'rf': RandomForestClassifier(
        max_depth=7,
        n_estimators=100),
    'ada': AdaBoostClassifier(
        n_estimators=100),
    'cat': CatBoostClassifier(
        iterations=10,
        learning_rate=0.1, verbose=False),
    'lgb': lgb.LGBMClassifier(
        learning_rate=0.05,
        n_estimators=500,
        reg_lambda=1),
    'xgb': XGBClassifier(
        n_estimators=500,
        use_label_encoder=False,
        # Classification metric; 'rmse' is a regression metric and is not a
        # meaningful evaluation metric for a binary classifier.
        eval_metric='logloss')
}

# Stack every base model above under a logistic-regression meta-learner. The
# (name, estimator) pairs are taken before 'stack' itself is added to the dict.
classifiers['stack'] = StackingClassifier(
    list(classifiers.items()),
    final_estimator=LogisticRegression())

In [None]:
# One row per model, one column per CV fold (numbered from 1), zero-filled.
scores = pd.DataFrame(
    0.0,
    index=list(classifiers),
    columns=range(1, NUM_FOLDS + 1))

# Fitted pipelines, keyed by model name.
models = {}

In [None]:
# The preprocessed training matrix does not depend on the classifier, so it is
# built once here instead of once per model.
# NOTE(review): the preprocessor is fitted on all of X_train before the folds
# are cut, so the cross-validation scores below may be slightly optimistic.
X_trans = preprocessor.fit_transform(X_train.copy())

for name, clf in classifiers.items():
    display(Markdown('### Model `{}`'.format(name)))

    # Full pipeline (shared preprocessor + this classifier) fitted on the
    # training split; copies are passed because the preprocessing functions
    # may modify their input.
    models[name] = model = make_pipeline(
        preprocessor,
        clf
    ).fit(X_train.copy(), y_train)

    # Hold-out evaluation on the validation split.
    y_pred = model.predict(X_val.copy())
    plot_model_result(y_pred, y_val)

    # Per-fold scores of the bare classifier on the preprocessed matrix.
    scores.loc[name] = cross_val_score(
        clf,
        X_trans.copy(), y_train, cv=NUM_FOLDS)

### Model Results

In [None]:
# Box plot of the per-fold scores, one box per model (hence the transpose).
fig, ax = plt.subplots(figsize=(10, 4))
sns.boxplot(data=scores.T, palette='rocket', boxprops={'alpha': .75}, ax=ax)
ax.set(title='Model Results', xlabel='Model', ylabel='Accuracy')

plt.tight_layout()
plt.show()

___

# Model selection

In [None]:
# Mean and standard deviation of the fold scores per model, best model first.
(
    pd.DataFrame({
        'Score': scores.mean(axis=1),
        'Std': scores.std(axis=1),
    })
    .rename_axis('Model')
    .sort_values(by='Score', ascending=False)
)

In [None]:
# Pick the pipeline with the highest mean CV score and refit it on the whole
# training set. The target is cast to int, as in the train/validation split,
# so the model is trained on the same label encoding it was evaluated with.
best_model = models[scores.mean(axis=1).idxmax()]
best_model.fit(train_data[features].copy(), train_data[TARGET].astype(int));

___

# Submission

In [None]:
# Predict on the same feature columns the model was fitted on. A copy is
# passed, as in the training loop, so `test_data` (whose index is reused for
# the submission frame) is not modified by the preprocessing functions.
y_pred_submission = best_model.predict(test_data[features].copy()).astype(bool)

In [None]:
# Submission frame: one prediction per test row, indexed by the test-set index
# under the `INDEX` name.
submission_data = pd.DataFrame(
    {TARGET: y_pred_submission},
    index=pd.Index(test_data.index, name=INDEX))

submission_data

In [None]:
# Write the submission frame, including its index column, to `SUBMISSION_FILE` as CSV.
submission_data.to_csv(SUBMISSION_FILE)

Thank you for reading.