In [None]:
!pip install deep_tabular_augmentation

In [None]:
#Models
#Logistic Regression
from catboost import CatBoostClassifier
from scipy.stats import t
import seaborn as sns
import pandas as pd
import numpy as np

#Dicision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV
#Ensenble
from sklearn.ensemble import VotingClassifier,RandomForestClassifier
from  sklearn import preprocessing

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score,roc_auc_score,f1_score,precision_score,recall_score
from sklearn.model_selection import KFold,StratifiedKFold
import copy 
from sklearn.preprocessing import PowerTransformer, QuantileTransformer
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import uniform
import os
import shap 
sns.set_style("darkgrid", {"grid.color": ".6", "grid.linestyle": ":"})
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
import torch

import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)


# from https://github.com/lschmiddey/deep_tabular_augmentation/tree/main/deep_tabular_augmentation
import deep_tabular_augmentation as dta 

from functools import partial
from sklearn.preprocessing import StandardScaler

# SpaceShip Classification Task

The main objective of this notebook is a classification task on the SpaceShip database. 
To achieve this goal, the following steps are implemented:
- Loading Data: Prepare the training and validation datasets
- EDA: Generate insights using visualization tools to understand the variables
- Feature Engineering: Apply Transformation on data
- Data Augmentation: Apply a tabular augmentation with deep_tabular_augmentation model: (https://github.com/lschmiddey/deep_tabular_augmentation/tree/main/deep_tabular_augmentation)
- Model: Test different models: CatBoost, Decision Trees, Random Forests
- Run the Model: Run the models in different combinations of variables with cross validation
- Analysis of results: Check the results of all models and SHAP
- Submission: Use the best trained model to predict the test set

In [None]:
# The objective for this class is to allow easy configuration of hyperparameters in this notebook
class Configuration:
    """Single place holding the tunable parameters of this notebook."""

    # Random seed handed to the splits, searches and models that accept one
    random_state: int = 666
    # Fraction of the rows held out for validation
    val_ratio: float = 0.2

    # Strategy used to fill null values
    imputer: str = 'median'

    # Number of variables most correlated (Pearson) with the target to keep as a feature set
    top_vars: int = 15

    # Data augmentation: training epochs and total number of synthetic rows
    # (half generated for class 1, half for class 0)
    epochs: int = 1000
    increase: int = 1000

    # Number of cross-validation folds
    folds: int = 10

    # Aggregated metric column used to select the model for the submission
    selection: str = 'accuracy_mean'
    
    
CFG = Configuration()

# Load Data
- This part is simple. Just to load the data set and take a first glance at the data and columns

In [None]:
# Loading the competions data
train_df = pd.read_csv('../input/spaceship-titanic/train.csv')
test_df = pd.read_csv('../input/spaceship-titanic/test.csv')

In [None]:
# First take a look in some samples of the dataset
train_df.head()

In [None]:
# Overview of the dataset: its dimensions, then the distinct values of every column
n_samples, n_variables = train_df.shape
print(f'Spaceshipe dataset:\nTotal Samples: {n_samples} - Total Variables: {n_variables} \n')
for column, values in train_df.items():
    print(f'Variable: {column} - Unique values {values.nunique()} - Sample:')
    print(values.unique(), '\n')

# EDA
- In this section we take a deeper look at the variables

In [None]:
# Is the target balanced? Draw the class counts as a donut chart
target_counts = train_df['Transported'].value_counts()
size_of_groups = target_counts.values
# Every wedge is labelled with a (class, count) pair
names = list(zip(target_counts.index[:2], target_counts.values[:2]))

plt.figure(figsize = (9,5))
plt.pie(size_of_groups, labels = names)

# A white disc drawn over the centre turns the pie into a donut
my_circle = plt.Circle((0,0), 0.7, color='white')
p = plt.gcf()
p.gca().add_artist(my_circle)
plt.title('Target proportion')

In [None]:
# Checking graphicaly the null values in dataset 
plt.figure(figsize = (12,8))
sns.heatmap(train_df.isnull())
plt.title('Null Values in the dataset')

In [None]:
# Checking the percentage of null values for each variable
plt.figure(figsize = (12,8))
(train_df.isnull().mean().sort_values(ascending = False) *100).plot(kind = 'barh')
plt.title('Percentage of Null Values in the dataset')

If we chose to drop all rows with any nulls in the dataset, the total size of the dataset will be
changed to: 6606

In [None]:
print(f'Removed percentage: {(1 - train_df.dropna().shape[0]/train_df.shape[0] ) * 100}%\n{train_df.dropna().shape[0]}')

In [None]:
#Checking the variables and the target 
def customSummary(data, label, var):
    """Summarise the target `label` for every level of the column `var`.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding at least the columns `var` and `label`. It is not modified.
    label : str
        Name of the target column to summarise.
    var : str
        Name of the column whose levels define the groups.

    Returns
    -------
    pandas.DataFrame
        One row per level of `var`, with the columns `var`, 'len' (group size),
        'mean', 'std', 'interval' (half-width of the 95% Student-t confidence
        interval of the group mean) and 'proportion' (group size / total rows).
    """
    # Total number of rows, used to turn the group sizes into proportions
    N = len(data)

    def _ci_half_width(x):
        # Two-sided 95% interval: t quantile at 0.975 times the standard error
        return t.ppf(0.95/2.0 + .5, len(x)-1) * (np.std(x) / np.sqrt(len(x)))

    # Named aggregation fixes the output column names explicitly, instead of relying
    # on the names pandas generates for lambdas and on dropping a column level
    summary = (data.groupby(var)[label]
                   .agg(len=len, mean='mean', std='std', interval=_ci_half_width)
                   .reset_index())

    # Share of the whole dataset that falls into each level
    summary['proportion'] = summary['len']/N

    return summary


def plot_(data, label, variable, size = (8,5)):
    """Plot, for each level of `variable`, its share of the rows and the target mean.

    Bars show the proportion of rows in each level; the red line shows the mean of
    `label` per level with its confidence interval as error bars. A second y axis
    is added on the right, scaled from 0 to the largest group size.
    """
    summary = customSummary(data, label, variable)
    levels = summary[variable]
    bar_colors = list(sns.color_palette('Blues', len(levels)))

    fig, ax = plt.subplots(figsize = size)
    ax.bar(levels, summary['proportion'], color = bar_colors)
    ax.errorbar(levels, summary['mean'], yerr = summary['interval'],
                fmt = '-o', color = 'red', capsize = 6)
    ax.set_ylabel('Percentage')

    # Right-hand axis for the group sizes, without its own grid lines
    axes2 = ax.twinx()
    axes2.set_ylabel('Frequence')
    axes2.set_ylim(0, summary['len'].max())
    axes2.grid(visible=False)

    fig.suptitle(variable)

In [None]:
# for categorical values, lets check the number of classes and the the correspondent target values
var = ['HomePlanet','CryoSleep','Destination','VIP']

for i in var:
    plot_(train_df, 'Transported', i)

- The numeric variables have a possible difficult distribution. So lets try some different approaches to handle it
- The gauss transformation seems to have a better result

In [None]:
# A custon binarization transformation. Where, anything higer than = 0 goes to 1:
def binarization(sample):
    """Map a value to a 0/1 flag: 1 when it is strictly greater than 0, else 0."""
    return 1 if sample > 0 else 0

# Spending variables whose distributions are compared
var = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# One copy of the training data per transformation under comparison
train_df_t0 = train_df.copy()   # binarized
train_df_t1 = train_df.copy()   # power transform
train_df_t2 = train_df.copy()   # quantile transform

# Power ("gauss") and quantile transformers, both with their default settings
transform_gauss = PowerTransformer()
transform_qt = QuantileTransformer()

# Fit each transformer on the raw variables and store the transformed values
train_df_t1[var] = transform_gauss.fit_transform(train_df[var])
train_df_t2[var] = transform_qt.fit_transform(train_df[var])

# One figure per variable, with the three transformations side by side
for v in var:
    fig, ax = plt.subplots(ncols = 3, nrows = 1, figsize = (16,5))
    train_df_t0[v] = train_df_t0[v].apply(binarization)

    panels = [('Binary', train_df_t0), ('Gauss', train_df_t1), ('Quantile', train_df_t2)]
    for axis, (title, frame) in zip(ax, panels):
        sns.histplot(x = v, data = frame, ax = axis, hue = 'Transported', kde = True)
        axis.set_title(title)


- Now, let's categorize the age variable to try to find groups that are better separated by the target variable
- It is possible to see that the youngest passengers have a higher probability of being transported

In [None]:
# Categorizing age

# Work on a copy: a plain assignment would only alias `train_df`, so the new
# 'Age_d' column would silently be added to the raw training frame as well
train_df_t = train_df.copy()
train_df_t['Age_d'] = pd.cut(train_df_t['Age'],bins =[0,10,18,25,40,60,100],
                               labels=[ "child", "teen", "youg adult","adult", "mature", "elder"])


# Raw age histogram split by target, followed by the per-category summary plot
plt.figure(figsize = (12,8))
sns.histplot(x = 'Age', data =train_df, hue = 'Transported')
plot_(train_df_t, 'Transported', 'Age_d', (12,8))

- Now, it is time to check the correlation matrix between all variables
- Higher values mean positive correlation - if the variable increases, Transported tends to 1
- Lower values mean negative correlation - if the variable increases, Transported tends to 0
- 0 means no (Pearson) correlation

In [None]:
def corr_mat(data, annot = True):
    """Draw the lower triangle of the correlation matrix of `data` as a heatmap.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose numeric and boolean columns are correlated.
    annot : bool, default True
        Whether to write the correlation value inside every cell.
    """
    # Select the numeric/boolean columns explicitly: pandas >= 2.0 no longer drops
    # non-numeric columns silently in `corr()` and raises on object columns instead
    corr = data.select_dtypes(include=['number', 'bool']).corr()

    # Mask the upper triangle (diagonal included), which only mirrors the lower one
    mask = np.triu(np.ones_like(corr, dtype=bool))

    # Set up the matplotlib figure
    f, ax = plt.subplots(figsize=(15, 9))

    # Fixed [-1, 1] colour scale centred on 0, drawn on the axes created above
    sns.heatmap(corr, mask=mask, cmap='coolwarm',vmin=-1, vmax=1, center=0,
                square=True, linewidths=1, cbar_kws={"shrink": .7}, annot = annot, ax = ax)

corr_mat(train_df)

In [None]:
# sns.pairplot(train_df[var])

# Feature Engineering
- In this section, we will create different sets of variables by applying the transformations seen previously to the numeric variables. Afterwards, each set is going to be tested separately in the model.
- Also, categorical variables are going to be transformed into dummies (0 or 1), i.e. encoded with a one-hot encoding process (0,0,1,...)

# Imputing Missing Values

In [None]:
# Reload the raw training data so that feature engineering starts from the untouched
# CSV, independent of any column added to `train_df` during the EDA cells
train_df = pd.read_csv('../input/spaceship-titanic/train.csv')

def inputer(data, kind = 'median'):
    """Fill the null values of every column of `data`, in place.

    Numeric columns are filled with their median (kind='median') or their mean
    (kind='mean'). Any other `kind`, and every non-numeric column, falls back to
    the constant 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to fill; its columns are overwritten in place.
    kind : str, default 'median'
        Imputation strategy for numeric columns. The misspelling 'media', which
        earlier versions of this function matched, is still accepted as median.

    Returns
    -------
    pandas.DataFrame
        The same `data` object, without null values.
    """
    print(f'inputing null values with {kind}')
    for i in data.columns:
        column = data[i]

        # Nothing to fill in this column
        if not column.isna().any():
            continue

        value = 0

        # Median/mean are only defined for numeric columns; text columns keep the 0 fallback
        if pd.api.types.is_numeric_dtype(column):
            if kind in ('median', 'media'):
                value = column.median()
            elif kind == 'mean':
                value = column.mean()

        data[i] = column.fillna(value)

    return data



In [None]:
#All feature engineering 
def feat_eng(data, train =False):
    """Apply every feature-engineering step to a raw Spaceship frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame as read from the competition CSV. It is not modified.
    train : bool, default False
        When True the 'Transported' target column is converted to int.

    Returns
    -------
    data : pandas.DataFrame
        Transformed features (plus 'Transported' when it is present in the input).
    sub : pandas.DataFrame
        Independent copy of the 'PassengerId' and 'Name' columns, for the submission.
    tf : dict
        Maps a feature-set suffix ('_b', '_g', '_q', '_ori', '_All') to the list of
        spending-variable columns belonging to that set.
    """
    # Impute on a copy: inputer fills columns in place and new columns are added below,
    # neither of which should reach the frame owned by the caller
    data = inputer(data.copy(), kind =CFG.imputer )

    # Independent copy, so callers can add columns to it without writing into a slice
    sub = data[['PassengerId','Name']].copy()

    # Names of the transformed spending variables: original name + suffix
    var  = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    tf = {suffix: [v + suffix for v in var] for suffix in ('_b', '_g', '_q')}

    # NOTE(review): both transformers (and the label encoder below) are refit on every
    # call, so a test frame is transformed with its own statistics rather than with the
    # ones learned on the training frame
    transform_gauss = PowerTransformer()
    transform_qt = QuantileTransformer()

    data[tf['_b']] = np.vectorize(binarization)(data[var])
    data[tf['_g']] = transform_gauss.fit_transform(data[var])
    data[tf['_q']] = transform_qt.fit_transform(data[var])

    tf['_ori'] = var
    # '_All' is the concatenation of the binarized, gauss, quantile and original sets
    tf['_All'] = [ j for k, i in tf.items() for j in i]

    # Creating categorical age. pd.cut intervals are right-closed by default, so an
    # age of exactly 0 falls outside the first bin and gets no category
    data['Age_d'] = pd.cut(data['Age'],bins =[0,10,18,100],
                               labels=[ "child", "teen","adult"])
    # Transform the categorical variables into dummies
    data = pd.get_dummies(data,columns = ['HomePlanet','CryoSleep','Destination','VIP'],dummy_na=False, prefix_sep = '_D_', drop_first=True)
    data = pd.get_dummies(data,columns = ['Age_d'],dummy_na=False)

    # Keep the part of the cabin before the first '/' and label-encode it
    data['Cabin_enc'] = data['Cabin'].apply(lambda x: str(x).split('/')[0])
    le = preprocessing.LabelEncoder()
    data['Cabin_enc'] = le.fit_transform(data['Cabin_enc'])

    # Drop the columns replaced by engineered features and the identifier columns
    data = data.drop(['Age','Cabin','PassengerId','Name'], axis = 1)
    if train:
        data['Transported'] = (data['Transported']).apply(int)

    return data,sub, tf

train_df_a,_, tf = feat_eng(train_df,train = True)

In [None]:
train_df_a.head()

In [None]:
print(f'Total of variables:{ train_df_a.shape[1]}')

In [None]:
#Checking the new variables correlations
corr_mat(train_df_a,False)

In [None]:
# Zoom in: correlation of every variable with the target (Transported)
corr = train_df_a.corr()['Transported'].reset_index()
ranked = corr.sort_values('Transported', ascending = False)

plt.figure(figsize = (12,10))
# The top row of the ranking is skipped (the target correlates perfectly with itself)
sns.barplot(y = 'index', x = 'Transported', data = ranked[1:], palette = 'magma')
plt.title('Variables Correlation to the Transported Target')

In [None]:
# Feature set made of the variables with the largest absolute correlation to the target
corr['Transported'] = corr['Transported'].abs()
ranked_abs = corr.sort_values('Transported', ascending = False)
# Position 0 is skipped; the next CFG.top_vars names are kept
tf['_topVars'] = ranked_abs['index'].iloc[1:(CFG.top_vars+1)].tolist()

# Data Augmentation
- Experimenting with a  tabular data augmentation from the project found in Git: https://github.com/lschmiddey/deep_tabular_augmentation/tree/main/deep_tabular_augmentation

In [None]:
X = train_df_a.drop('Transported', axis = 1)
y = train_df_a['Transported']

# NOTE(review): the synthetic rows are generated from the whole training frame before
# the cross-validation of the model section, so the validation folds used there are
# not independent of the augmentation
train_df_aug = train_df_a.copy()
for i in [0,1]:
    # One generator is trained per target class. The split is redone on every pass
    # because X_train/X_val/y_train/y_val are overwritten with the class subset below
    X_train, X_val, y_train, y_val = train_test_split(X,y, 
                                                  test_size = CFG.val_ratio, 
                                                  random_state = CFG.random_state,
                                                  stratify =y)
    
    # from https://github.com/lschmiddey/deep_tabular_augmentation/tree/main/deep_tabular_augmentation
    target_class = i
    x_scaler = StandardScaler()

    # Scale with statistics of the whole training split, then keep only the target class
    X_train_scaled = x_scaler.fit_transform(X_train)

    X_val_scaled = x_scaler.transform(X_val)

    X_train = X_train_scaled[np.where(y_train==target_class)[0]]
    X_val = X_val_scaled[np.where(y_val==target_class)[0]]

    y_train = y_train.values[np.where(y_train==target_class)[0]]
    y_val = y_val.values[np.where(y_val==target_class)[0]]

    datasets = dta.create_datasets(X_train, y_train, X_val, y_val)
    data = dta.DataBunch(*dta.create_loaders(datasets, bs=1024))

    D_in = X_train.shape[1]
    VAE_arch = [27,20,20]
    target_name = 'Transported'

    df_cols = list(X.columns)
    device = 'cpu'
    model = dta.Autoencoder(D_in, VAE_arch, latent_dim =3)
    opt = torch.optim.Adam(model.parameters(), lr=0.01)
    loss_func = dta.customLoss()

    learn = dta.Learner(model, opt, loss_func, data, target_name, target_class, df_cols)

    # First run with the learning-rate finder callback
    run = dta.Runner(cb_funcs=[dta.LR_Find, dta.Recorder])

    run.fit(CFG.epochs, learn)

    # run.recorder.plot(skip_last=5)

    # Second run: a fresh model trained with a cosine learning-rate schedule
    sched = dta.combine_scheds([0.3, 0.7], [dta.sched_cos(0.01, 0.1), dta.sched_cos(0.1, 0.01)])

    cbfs = [partial(dta.LossTracker, show_every=50), dta.Recorder, partial(dta.ParamScheduler, 'lr', sched)]
    model = dta.Autoencoder(D_in, VAE_arch, latent_dim=20).to(device)
    opt = torch.optim.Adam(model.parameters(), lr=0.01)
    learn = dta.Learner(model, opt, loss_func, data, target_name, target_class, df_cols)
    run = dta.Runner(cb_funcs=cbfs)
    run.fit(CFG.epochs, learn)
    
    # Half of the requested synthetic rows are generated for each class
    difference_in_class_occurences = int((CFG.increase/2))
    # `df_fake` is not used afterwards; the call is kept so the sequence of sampling
    # calls, and therefore the generated rows, stays the same
    df_fake = run.predict_df(learn, no_samples=difference_in_class_occurences, scaler=x_scaler)
    # Noise scale: a tenth of the per-column std of the class being generated
    # (it was previously always taken from class 1, also when generating class 0)
    std_list = list(train_df_a[train_df_a['Transported']==target_class][df_cols].std()/10)
    df_fake_with_noise = run.predict_with_noise_df(learn, no_samples=difference_in_class_occurences, mu=0, sigma=std_list, scaler=x_scaler)
    
    # NOTE(review): `predict_with_noise_df` already receives `scaler=x_scaler`; if it
    # returns values in the original units, this second inverse_transform rescales
    # them twice - confirm against the deep_tabular_augmentation source
    x_aug = x_scaler.inverse_transform(df_fake_with_noise.drop(['Transported'],axis =1 ))
    x_aug = pd.DataFrame(x_aug, columns = df_cols)
    x_aug['Transported'] = df_fake_with_noise['Transported']
    
    train_df_aug = pd.concat([train_df_aug, x_aug])

In [None]:
train_df_aug = train_df_aug.reset_index().drop('index',axis =1)
train_df_aug.shape

In [None]:
# sns.pairplot(train_df_aug.reset_index().drop('index',axis =1)[['RoomService', 'FoodCourt']])

# Model
- For model training, we are going to use a stratified K-fold validation and a hyperparameter pre-tuning

In [None]:

def training_cv(model,X, y, split = CFG.folds):
    """Evaluate `model` with a stratified, shuffled K-fold cross validation.

    The same estimator object is refit on every fold; a deep copy of it is kept
    for the fold that reaches the highest validation accuracy.

    Parameters
    ----------
    model : estimator
        Object exposing `fit` and `predict` (and optionally `predict_proba`).
    X : pandas.DataFrame
        Feature matrix, indexed positionally with `.iloc`.
    y : pandas.Series
        Binary target aligned with `X`.
    split : int, default CFG.folds
        Number of folds.

    Returns
    -------
    best_model : estimator
        Copy of the model fitted on the fold with the best validation accuracy.
    results : dict
        Per-fold lists under the keys 'acc', 'precision', 'recall' and 'auc'.
    """
    kf = StratifiedKFold(n_splits=split, shuffle = True, random_state = CFG.random_state)
    # Start below any reachable accuracy so that a fitted model is always returned
    acc_best = float('-inf')
    best_model = None
    results = {'acc':[],'precision':[],'recall':[],'auc':[]}

    for train_, val_df in kf.split(X,y):
        X_train, X_val = X.iloc[train_], X.iloc[val_df]
        y_train, y_val = y.iloc[train_], y.iloc[val_df]

        model.fit(X_train,y_train)
        pred = model.predict(X_val)

        # ROC AUC measures how well scores rank the samples, so it is computed from the
        # positive-class probability when available rather than from the hard labels
        if hasattr(model, 'predict_proba'):
            score = model.predict_proba(X_val)[:, 1]
        else:
            score = pred

        acc = accuracy_score(y_val,pred)
        prec = precision_score(y_val,pred)
        rec = recall_score(y_val,pred)
        auc = roc_auc_score(y_val,score)

        # Train accuracy next to validation accuracy, as a quick overfitting check
        print(round(accuracy_score(y_train,model.predict(X_train)),3),round(acc,3) )

        results['acc'].append(acc)
        results['precision'].append(prec)
        results['recall'].append(rec)
        results['auc'].append(auc)

        if acc > acc_best:
            best_model = copy.deepcopy(model)
            acc_best = acc

    return best_model, results

In [None]:
#Tuning models
def tuning_models(models, parameters, X, y):
    """Run a randomized hyperparameter search for every model.

    `models` maps a name to an estimator and `parameters` maps the same name to
    its search space. Returns a dict mapping each name to
    {'best_model': <best estimator found by the search>}.
    """
    tuning_results = {}
    for name in models:
        print(f'>Tuning:{name}')
        searcher = RandomizedSearchCV(models[name], parameters[name], random_state=CFG.random_state)
        fitted = searcher.fit(X.values, y)
        tuning_results[name] = dict(best_model=fitted.best_estimator_)

    return tuning_results

In [None]:
# Shap model for model explainability
def SHAP_(model, X):
    """Build a SHAP tree explainer for `model` and compute the SHAP values of `X`.

    Returns the pair (explainer, shap_values).
    """
    tree_explainer = shap.TreeExplainer(model)
    return tree_explainer, tree_explainer.shap_values(X)

In [None]:
#Data
X = train_df_aug.drop('Transported', axis = 1)
y = train_df_aug['Transported']


# CatBoost search space
params = {'depth':[3,1,2,6,4,5,7,8,9,10,12,13],
          'iterations':[100,250,500, 700,1000],
          'learning_rate':[0.03,0.001,0.01,0.1,0.2,0.3,0.5], 
          'l2_leaf_reg':[3,1,5,10,100],
          'border_count':[32,5,10,20,50,100,200]}


# Search space of every model, keyed by model name
parameters = {'RandomForest':{'max_depth':[2,3,4,5,10,20,30,50],'min_samples_leaf' : np.linspace(0.1, 0.5, 5, endpoint=True),'n_estimators':[10,20,50,75,100,200,400]} ,
          'DecisionTree': {'max_depth':[3,5,10,20],'min_samples_leaf' : np.linspace(0.1, 0.5, 5, endpoint=True)} ,
          'SVC': {'kernel':['linear', 'rbf', 'poly']},
          'LogisticRegression':{'C': uniform(loc=0, scale=4)},
           'CatBoost':params } 

tf_vars = list(tf.values())
# Base variables: every column that is not one of the transformed spending variables.
# '_topVars' is kept out of the exclusion set: it is a selection that can contain base
# columns, and excluding it would remove those columns from every other feature set
transformed_vars = {v for key, names in tf.items() if key != '_topVars' for v in names}
base_vars = [c for c in train_df_a.columns if c not in transformed_vars]
base_vars.remove('Transported')

results    = {}
best_model = {}
explainer_l  = {}
shap_values_l = {}
# Testing the different sets of variables
for i in tf.keys():
    if i == '_topVars':
        var  = tf[i]
        X_ = X[var].copy()
    else:
        var = base_vars + tf[i]
        X_ = X[var].copy()
    
    # Candidate models, recreated for every feature set
    models = {'RandomForest': RandomForestClassifier(random_state = CFG.random_state,oob_score = True),
              'DecisionTree': DecisionTreeClassifier(random_state = CFG.random_state),
               'CatBoost':CatBoostClassifier(verbose = False)}

    # Tuning: each model is replaced by the best estimator of its randomized search
    tuning_results = tuning_models(models, parameters, X_, y) 
    models = {model:d['best_model'] for model, d in tuning_results.items()}
    
    # Training
    for name, model in models.items():
        print(f'>Evaluating Model: {(name+i)}')
        best_model[(name+i)], metrics = training_cv(model, X_, y)
        # Store the model of this entry only, not the whole shared `best_model` dict
        results[(name+i)] = {'best_model':best_model[(name+i)], 'accuracy': metrics['acc'],'recall': metrics['recall'],'precision': metrics['precision'], 'auc': metrics['auc']}
        explainer_l[(name+i)], shap_values_l[(name+i)] = SHAP_(best_model[(name+i)], X_)
        
# Results table: one row per (model, fold), then a long format with one row per metric value
data_results = pd.DataFrame.from_dict(results, orient='index',).reset_index().rename(columns={'index':'models'}).explode(['accuracy','recall','precision','auc'])
data_table = pd.melt(data_results,id_vars = ['models'], value_vars = ['accuracy','recall','precision','auc'],var_name='metric')
print('End')


# Results Analysis

In [None]:
# Per-model mean and median of every metric, with flat '<metric>_<stat>' column names
df = (data_results
      .drop(columns = 'best_model')
      .groupby('models')
      .agg(['mean', 'median']))
df.columns = df.columns.map('_'.join)

In [None]:
# Mean and median of every metric per model, flattened to '<metric>_<stat>' column names
mean_data = data_results.drop('best_model', axis = 1).groupby('models').agg(['mean', 'median'])
mean_data.columns = ['_'.join(col) for col in mean_data.columns]

# Report, for every aggregated metric, the first model that reaches its maximum
for i in mean_data.columns:
    metric = mean_data[i].max()
    model = mean_data[mean_data[i] == metric ].index[0]
    
    # The backslash is escaped explicitly; the printed text is unchanged
    print(f'best {i}: {model} \\ {metric}')

In [None]:
#Ploting mean and median of results
fig, ax = plt.subplots(figsize = (15,8))
mean_data.plot(kind = 'bar', ax = ax)

In [None]:
# Distribution of every metric across the folds, one group of boxes per model
label_font = {'size':16}
fig, ax = plt.subplots(figsize = (20,8))
sns.boxplot(x = 'models', y = 'value', hue = 'metric', data = data_table, ax = ax)
ax.set_ylabel('Metrics', fontdict = label_font)
ax.set_xlabel('Models', fontdict = label_font)
ax.set_title('Comparison', fontdict = label_font)
fig.tight_layout()

# Analysis - CatBoost

In [None]:
# Feature importance of the CatBoost model trained on every feature set.
# One row of axes per feature set, so the grid follows `tf` instead of a fixed count
fig, ax = plt.subplots(nrows = len(tf), figsize =(10,25), squeeze = False)
for f, i in enumerate(tf.keys()):
    
    if i == '_topVars':
        var  = tf[i]
    else:
        var = base_vars + tf[i]
    
    # squeeze=False keeps a 2-D grid of axes even for a single feature set
    axis = ax[f, 0]
    axis.set_title(('CatBoost'+i))
    sns.barplot(x = best_model[('CatBoost'+i)].feature_importances_, y = X[var].columns, ax = axis)

## SHAP Values Analysis - CatBoost

In [None]:
# SHAP summary plot of the CatBoost model trained on every feature set
for v in tf.keys():
    # Model names are built as '<model name><feature-set suffix>' during training,
    # so the key is derived from the suffix rather than listed by hand
    m = 'CatBoost' + v

    if v == '_topVars':
        var  = tf[v]
    else:
        var = base_vars + tf[v]
    

    X_ = X[var]
    plt.title(m)
    shap.summary_plot(shap_values_l[m], X_)

# Submission

In [None]:
# Get best model based on accuracy
sub_model = mean_data[mean_data[CFG.selection] == mean_data[CFG.selection].max() ].index[0]

In [None]:
(print(sub_model))

In [None]:
test_df = pd.read_csv('../input/spaceship-titanic/test.csv')

# The dictionary returned here is discarded on purpose: rebinding `tf` would drop the
# '_topVars' entry added after the training feature engineering, and selecting a
# '_topVars' model would then fail with a KeyError
test_df_a, id_df, _test_tf = feat_eng(test_df)

# Model names are '<model name><suffix>'; the suffix is everything from the first '_'
i = '_'+sub_model.split('_', 1)[1]

if i == '_topVars':
    var  = tf[i]
else:
    var = base_vars + tf[i]

# Keep the columns the selected model was trained on, in the same order
test_df_a = test_df_a[var]

pred = best_model[sub_model].predict(test_df_a)

# The competition expects booleans
pred = (pred ==1) 
# Independent copy, so the new column is not written into a slice of another frame
id_df = id_df.copy()
id_df['Transported'] = pred
id_df[['PassengerId','Transported']].to_csv('./submission.csv', index = False)

In [None]:
id_df.head(20)