### Problem Statement:-
In this competition your task is to predict whether a passenger was transported to an alternate dimension during the Spaceship Titanic's collision with the spacetime anomaly. To help you make these predictions, you're given a set of personal records recovered from the ship's damaged computer system.

###  Notebooks that inspired me:-
* https://www.kaggle.com/code/samuelcortinhas/spaceship-titanic-a-complete-guide
* https://www.kaggle.com/code/startupsci/titanic-data-science-solutions/comments

### Importing libraries

In [None]:
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# To suppress warnings
import warnings
warnings.filterwarnings('ignore')
import time

# Data Viz
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno # Visualizing missing values
%matplotlib inline

# Data Preprocessing  
from sklearn.impute import KNNImputer 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,GridSearchCV

# Model building 
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier,ExtraTreeClassifier
from sklearn.ensemble import AdaBoostClassifier,BaggingClassifier,GradientBoostingClassifier,RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import VotingClassifier
# Model validation
from sklearn.metrics import classification_report,accuracy_score,roc_auc_score

# Saving model
import pickle

### UDF created for EDA

In [None]:
# UDF to summary for continuous variables

def continuous_var_summary(x):
    """Profile a numeric pandas Series.

    Parameters
    ----------
    x : pandas.Series
        Numeric column to describe.

    Returns
    -------
    pandas.Series
        Labelled statistics, in this order: dtype, cardinality, row counts
        (total / non-null / missing / % missing), sum, mean, std, var, the
        1.5*IQR lower and upper fences, then min, selected percentiles
        (p1 ... p99) and max.
    """
    total_rows = x.shape[0]
    missing_rows = x.isna().sum()

    # Fences used for IQR-based outlier detection
    first_quartile = x.quantile(0.25)
    third_quartile = x.quantile(0.75)
    spread = third_quartile - first_quartile

    profile = {
        'dtype': x.dtype,
        'cardinality': x.nunique(),
        'n_tot': total_rows,
        'n': x.count(),
        'nmiss': missing_rows,
        'perc_miss': missing_rows * 100 / total_rows,
        'sum': x.sum(),
        'mean': x.mean(),
        'std': x.std(),
        'var': x.var(),
        'lc_iqr': first_quartile - 1.5 * spread,
        'uc_iqr': third_quartile + 1.5 * spread,
        'min': x.min(),
    }

    # Percentile labels paired with the quantile level they report
    percentile_levels = (
        ('p1', 0.01), ('p5', 0.05), ('p10', 0.10),
        ('p25', 0.25), ('p50', 0.5), ('p75', 0.75),
        ('p90', 0.90), ('p95', 0.95), ('p99', 0.99),
    )
    for label, level in percentile_levels:
        profile[label] = x.quantile(level)

    profile['max'] = x.max()

    # Dict insertion order defines the order of the resulting index
    return pd.Series(profile)

# Create Summary for categorical variables
def categorical_var_summary(x):
    """Summarise a categorical pandas Series.

    Parameters
    ----------
    x : pandas.Series
        Column to describe.

    Returns
    -------
    pandas.Series
        ``N`` (non-null count), ``NMISS`` (null count), ``Perc_MISS`` (null
        share of all rows, in %), ``MODE`` (most frequent value), ``FREQ``
        (its count) and ``PERCENT`` (its share of non-null rows, in %, rounded
        to 2 decimals). For a column with no non-null values ``MODE`` and
        ``PERCENT`` are NaN and ``FREQ`` is 0 instead of raising IndexError.
    """
    n_total = x.shape[0]
    n_valid = x.count()
    n_miss = x.isnull().sum()
    # Guard the zero-row case so the percentage is NaN rather than a 0/0 warning
    perc_miss = n_miss * 100 / n_total if n_total else np.nan

    # value_counts() already returns counts in descending order and skips NaN
    counts = x.value_counts()
    if counts.empty:
        mode_value, mode_freq, mode_perc = np.nan, 0, np.nan
    else:
        mode_value = counts.index[0]
        mode_freq = counts.iloc[0]
        mode_perc = round(mode_freq * 100 / n_valid, 2)

    return pd.Series([n_valid, n_miss, perc_miss, mode_value, mode_freq, mode_perc],
                     index=['N', 'NMISS', 'Perc_MISS', 'MODE', 'FREQ', 'PERCENT'])

# UDF for boxplot
def boxplot_fn(x):
    """Draw and display a seaborn boxplot of the Series ``x``, titled with its name."""
    heading = 'Boxplot of ' + x.name
    # seaborn draws on the current axes and hands them back, so the title can
    # be attached to the returned object
    axes = sns.boxplot(x)
    axes.set_title(heading)
    plt.show()
    
# UDF for Frequency plot
def fn_create_freq_plot(x):
    """Display a 7x4 bar chart of the value counts of the Series ``x``."""
    plt.figure(figsize=(7, 4))
    heading = 'Frequency plot for ' + x.name
    frequencies = x.value_counts()
    # The bars are drawn on the current figure's axes; title those same axes
    axes = frequencies.plot.bar(edgecolor='tomato', color='steelblue')
    axes.set_title(heading)
    plt.show()

# Workflow:-

### 1.Acquiring data 

### 2.Data understanding /Data Audit

   * **At Overall level:-**
        * Sample or population:-
            * If it is sample, do we have population metrics to compare the sample metrics
              with population metrics.
            * Size of the sample
            
    * Dimension - No of rows and columns
    * File size
    * Check for Presence of missing values
    * Check for Presence of duplicates
    
      
* **At Feature Level:-**
*      Feature wise EDA
        * Check for Outliers
        *  Improper feature name
        *  Features with multiple values - do we need to split/extract specific 
           information from the variable?
        * Feature containing special values-0's, @, ?, #NA, #N/A, #Error, Currencies, Null values, -inf, inf, 99999
*       Detailed EDA according to data type:-
        *            Numerical Features:-
                    * Overall summary - Mean,median,IQR etc.
                    * Boxplot 
                    * Histogram
                    * Correlation plot
                    * Individual feature exploration
      *                    
                     Categorical Features:-
                       * Cardinality - Number of unique values
                       * Frequency Plot 
                       * Individual feature exploration
                       * Relation with dependent variable
                       * Identify variables that require encoding (Label encoding/One hot encoding)
                    
### 3.Data Preprocessing
*  **Handling all the problems identified in the data audit report**
    * Converting data types into appropriate manner
    * Renaming variables as required (specially remove spaces, special characters from headers)
    * Imputation of Missing values:-
        * Numerical - Impute with mean/median/Regression/KNN
        * Categorical - Impute with mode/Regression/KNN
    * Handling Outliers
        * Capping & flooring with upper cap/lower cap
            * P1, P99 or P5, P95
            * mean +/- 3std
            * Q1-1.5*IQR, Q3+1.5*IQR
    *  Converting categorical variables into numerical variables
          * Ordinal variables (Label encoding)
          * Nominal Variables (One-Hot encoding)
         * Creating derived variables (using KPIs, date variables, splitting existing
           variables, etc.)

### 4.Data Preparation
   * Splitting the data into train & Test/Validation:-
        *  Split size can be any of following- (70:30),(80-20),(50-50)
   * Standardizing(**Must for distance based algorithm like KNN**)
       * Standardize data to make data consistent.
       * can use min-max scaler,standard scaler,robust scaler,etc. 

### 5.Building Model   
 * Machine Learning models:-
     - Find the right hyperparameters to tune using GridsearchCV
     - Perform Gridsearch
     - Identify best model with best parameters
     - Finalize the model
     
### 6. Model Validation
  * Score the data using finalized model on train data. 
  *  Predicting values for test data.
  *  Calculate all the metrics related to different types of techniques on validation data.
  *  Check the problems related to Overfitting/Underfitting.
  *  Finalize model based on low overfitting & low underfitting.

### Importing Data

In [None]:
# Train data: read the labelled competition file and print its (rows, columns) shape
df_train= pd.read_csv('../input/spaceship-titanic/train.csv')
print("Train Data",df_train.shape)

# Test data: read the unlabelled competition file and print its (rows, columns) shape
df_test = pd.read_csv('../input/spaceship-titanic/test.csv')
print("Test Data",df_test.shape)

# Defining Y variable for test data
#df_test['Transported'] = np.random.randint(1,3)  # Filling with random values


In [None]:
# Train data preview: first three and last three rows stacked into one frame
pd.concat([df_train.head(3),df_train.tail(3)])

In [None]:
# Test data preview: first three and last three rows stacked into one frame
pd.concat([df_test.head(3),df_test.tail(3)])

# Data Audit

In [None]:
# Features available (column labels of the training frame)
df_train.columns 
# 'Transported' is the dependent/target variable

In [None]:
# Basic info about the features (dtypes, non-null counts) for train, then test,
# separated by a 40-character rule
df_train.info()
print('_'*40)
df_test.info()

In [None]:
# Checking for presence of duplicates: number of fully duplicated rows per frame
print('Duplicates in Train data',df_train.duplicated().sum())
print('Duplicates in Test data',df_test.duplicated().sum())

In [None]:
# Checking for presence of missing values: per-column null counts for train, then test
print('Missing values in Train data','\n',df_train.isnull().sum())
print("-"*40)
print('Missing values in Test data','\n',df_test.isnull().sum())

##  Dividing data based on data types

#### There are 6 Numerical features and 8 Categorical features.
> Dividing data into numerical and categorical can help us understand data better.

In [None]:
# Columns handled as continuous variables in the EDA, imputation and clipping steps
Numerical_features = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# Columns handled as categorical; note this list also contains the identifier
# 'PassengerId' and the target 'Transported'
Categorical_features = ['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination','Name', 'Transported','VIP']

## Missing values
Features containing missing values: *HomePlanet, CryoSleep, Cabin, Destination, Age, VIP, RoomService, FoodCourt, ShoppingMall, Spa, VRDeck and Name.*
> Can Visualize missing values for better understanding.

In [None]:
# Train data: missingno bar chart of the columns, sorted ascending
msno.bar(df_train,sort='ascending',color='steelblue')
plt.show()

In [None]:
# Percentage missing for train data (categorical features only):
# profile each column, transpose so features are rows, and plot the Perc_MISS column
df_train[Categorical_features].apply(categorical_var_summary).T['Perc_MISS']\
.plot(kind='bar',figsize=(8,4))
plt.show()

In [None]:
# Test data: missingno bar chart of the columns, sorted ascending
msno.bar(df_test,sort='ascending',color='tomato')
plt.show()

In [None]:
# Percentage missing for test data (numerical features):
# profile each numeric column of the TEST frame, transpose so features are rows,
# and plot the perc_miss column. (Previously this profiled df_train, so the
# chart did not show what its heading said.)
df_test[Numerical_features].apply(continuous_var_summary).T['perc_miss']\
.plot(kind='bar',color='tomato',figsize=(8,4))
plt.show()

### Observations:-
* For both train and test data, percentage missing is nearly around **2%** in each feature.
* No missing values are present in the *PassengerId* and *Transported* features.

#  Feature wise EDA

### Numerical features
> Age, RoomService ,FoodCourt ,ShoppingMall ,Spa ,VRDeck
    

In [None]:
# Numerical variable summary: one column of statistics per numeric feature
df_train[Numerical_features].apply(continuous_var_summary)

In [None]:
# Boxplot for numerical features: boxplot_fn is called once per column and
# shows each figure itself
df_train[Numerical_features].apply(boxplot_fn)
plt.show()

In [None]:
# Distribution of numerical features: 20-bin histogram per column on one 20x20 figure
df_train[Numerical_features].hist(bins=20, figsize=(20,20),edgecolor='black',color='lightgreen',grid=False)
plt.show()

### Observations:-
* Boxplot shows presence of outliers in every numerical feature.
* With exception for Age feature,outliers are extreme in nature.
* Except for Age feature every other feature shows extreme right skewness.

In [None]:
# Correlation plot: annotated heatmap of the pairwise correlations between numeric features
plt.figure(figsize=(10,7))
sns.heatmap(df_train[Numerical_features].corr(),annot=True,cmap='Oranges')
plt.show()

###  ***Age***

In [None]:
# Distribution of Age in the training data
sns.histplot(df_train.Age)
plt.show()

In [None]:
# Age VS Transported: one 20-bin Age histogram per value of the target
temp = sns.FacetGrid(data=df_train,col='Transported')
temp.map(plt.hist,'Age',bins=20,edgecolor='black',color='tomato')
plt.show()

In [None]:
# Age category VS Transported

# Creating Age category feature.
# include_lowest=True makes the first bin closed on the left so passengers aged
# exactly 0 fall into it; with the default right-closed bins (0, 14] they would
# silently become NaN and drop out of the crosstab below.
df_train['Age_cat'] = pd.cut(df_train.Age,[0,14,25,45,60,df_train.Age.max()],include_lowest=True)

# Column-normalised crosstab: for each age band, the % transported / not transported
plt.figure(figsize=(8,5))
sns.heatmap(pd.crosstab(columns=df_train.Age_cat,index=df_train.Transported,normalize='columns')\
            .round(3).mul(100),annot=True,cmap='Blues')
plt.show()

#### Observations:-
*  For the younger ages, i.e. (0-14), there is a higher likelihood of being transported.

*  For the rest i.e.(>14),there exists near equal chances for being transported.

### Categorical features:-
>PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Name,Transported,VIP

In [None]:
# Cardinality of categorical features: number of distinct non-null values per column
df_train[Categorical_features].nunique()

### Observations:-
* Features like PassengerId, Cabin and Name have very high cardinality.
* Either we can derive some useful information from them or drop them, as identifier-like features with very high cardinality add little signal for model building.


In [None]:
# Frequency plot for the low-cardinality categorical features;
# fn_create_freq_plot is called once per column and shows each figure itself
df_train.loc[:,['HomePlanet','CryoSleep','Destination','Transported','VIP']].apply(fn_create_freq_plot)
plt.show()

In [None]:
# Categorical variable summary: one column of statistics per categorical feature
df_train[Categorical_features].apply(categorical_var_summary)

###  ***Transported***

In [None]:
# Percentage composition of the target: share of each class in %, rounded to 1 decimal

ax=df_train.Transported.value_counts(normalize=True).mul(100).round(1)\
    .plot(kind='bar',color=('steelblue','tomato'),figsize=(8,4),xlabel ='Transported',ylabel='%Comp')

# Print each bar's value at its top edge and leave head-room for the labels
ax.bar_label(ax.containers[0], label_type='edge')
ax.margins(y=0.1)
plt.show()

###  ***HomePlanet***

In [None]:
# Percentage composition of HomePlanet: share of each planet in %, rounded to 1 decimal

ax=df_train.HomePlanet.value_counts(normalize=True).mul(100).round(1)\
    .plot(kind='bar',color=('steelblue','tomato'),figsize=(8,4),xlabel ='HomePlanet',ylabel='% Comp')

# Print each bar's value at its top edge and leave head-room for the labels
ax.bar_label(ax.containers[0], label_type='edge')
ax.margins(y=0.1)
plt.show()

In [None]:
# HomePlanet VS Transported: mean of the target per home planet
df_train[['HomePlanet','Transported']].groupby('HomePlanet')[['Transported']].mean()

### Observations:-
Person belonging  to *Europa* has higher chances(~ 66%) of being transported,followed by
*Mars*(~ 52%) and at last *Earth* (~ 42%).

> Can perform **ordinal encoding** to give higher preference to Europa than others.

###  ***Destination***

In [None]:
# Percentage composition of Destination: share of each destination in %, rounded to 1 decimal

ax=df_train.Destination.value_counts(normalize=True).mul(100).round(1)\
    .plot(kind='bar',color=('steelblue','tomato'),figsize=(8,4),xlabel ='Destination',ylabel='% Comp')

# Print each bar's value at its top edge and leave head-room for the labels
ax.bar_label(ax.containers[0], label_type='edge')
ax.margins(y=0.1)
# Keep the destination names horizontal
plt.xticks(rotation=0)
plt.show()

In [None]:
# Destination VS Transported: mean of the target per destination
df_train[['Destination','Transported']].groupby('Destination')[['Transported']].mean()

### Observations:-
Person Travelling to  *55 Cancri e* has higher chances(~ 61%) of being transported,followed by
PSO J318.5-22(~ 50%) and at last TRAPPIST-1e (~ 47%).
> Can **group** people travelling to *PSO J318.5-22* and *TRAPPIST-1e* together, as they have nearly equal chances of being transported, and give **higher preference** to *55 Cancri e*.

###  ***VIP***

In [None]:
# Percentage composition of VIP: share of each value in %, rounded to 1 decimal

ax=df_train.VIP.value_counts(normalize=True).mul(100).round(1)\
    .plot(kind='bar',color=('steelblue','tomato'),figsize=(8,4),xlabel ='VIP',ylabel='% Comp')

# Print each bar's value at its top edge and leave head-room for the labels
ax.bar_label(ax.containers[0], label_type='edge')
ax.margins(y=0.1)
plt.show()
# Only a select few people opt for the VIP service; it could be very expensive for the traveller.

In [None]:
# VIP VS Transported: mean of the target per VIP value
df_train[['VIP','Transported']].groupby('VIP')[['Transported']].mean()

### Observations:-
People opting for VIP services have a lower chance of being transported than people not opting for VIP services.

> VIP service could be ensuring safer voyage by giving them better cabins.

###  ***CryoSleep***

In [None]:
# Percentage composition of CryoSleep: share of each value in %, rounded to 1 decimal

ax=df_train.CryoSleep.value_counts(normalize=True).mul(100).round(1)\
    .plot(kind='bar',color=('steelblue','tomato'),figsize=(8,4),xlabel ='CryoSleep',ylabel='% Comp')

# Print each bar's value at its top edge and leave head-room for the labels
ax.bar_label(ax.containers[0], label_type='edge')
ax.margins(y=0.1)
plt.show()

In [None]:
# CryoSleep VS Transported: mean of the target per CryoSleep value
df_train[['CryoSleep','Transported']].groupby('CryoSleep')[['Transported']].mean()

#### Observations
* Only about 36% of passengers opt to be in cryosleep.

* Being in Cryosleep increases the person's likelihood of being transported i.e.,nearly **82%** of people in cryosleep got transported.



In [None]:
# CryoSleep VS numerical features.
# Numerical_features[1:] skips the first entry ('Age'), leaving the five spend columns;
# each gets one histogram per CryoSleep value.
for i in Numerical_features[1:]:
    temp = sns.FacetGrid(data=df_train,col='CryoSleep')
    temp.map(plt.hist,i,bins=20,edgecolor='black',color='tomato')
    plt.show()

#### Observations
People who are in cryosleep do not use any of the paid services (RoomService, FoodCourt, ShoppingMall, Spa, VRDeck).

###  ***PassengerID***
   *In PassengerId (format `gggg_pp`), the **last 2 digits** are the passenger's number within their travel group; the notebook uses this suffix as a rough proxy for group size.*

In [None]:
# Creating Group Size feature

# Takes the text after the underscore in PassengerId and keeps it as a string
# (later cells compare it with '01').
# NOTE(review): per the competition's data description this suffix is presumably
# the passenger's number WITHIN the group, not the group's size; a true size would
# need a count per group prefix. Confirm before relying on the name.
df_train['group_size'] = df_train.PassengerId.apply(lambda x:x.split('_')[1])

# Impact of Group size on being transported: mean of the target per suffix value
df_train[['group_size','Transported']].groupby('group_size')[['Transported']].mean()

#### Observations:-

Only group sizes of 3 and 4 slightly increase the chances of being
transported; otherwise group size does not improve the chances of being
transported.

> Instead of comparing different group sizes,can see the impact of travelling alone vs travelling in a group



In [None]:
# Feature to check if the traveller is in a group.
# The flag is 1 when the PassengerId suffix is '01' and 0 otherwise.
# NOTE(review): the name suggests 1 = "travels in a group", but the value 1 is
# assigned to the '01' suffix, which the remark below treats as solo travellers.
# Confirm the intended polarity; the test frame is encoded the same way.
df_train['is_group'] = np.where(df_train.group_size=='01',1,0)

# Composition: share of each flag value
df_train['is_group'].value_counts(normalize=True)
# Majority(>70%) of travellers prefer to travel solo

#### Solo vs Group travellers

In [None]:
# Impact of being in a group vs solo on being transported: mean of the target per flag value
df_train[['is_group','Transported']].groupby('is_group')[['Transported']].mean()

#### Observations:-
Not being in a group or travelling solo, increases the likelihood of being transported.


###  ***Cabin***: 
Takes the form **deck/num/side**

    

In [None]:
# Cabin deck: first '/'-separated part of Cabin. Rows with a missing Cabin are
# dropped before the split, so they stay NaN in the new column after index alignment.
df_train['cabin_deck'] = df_train.Cabin.dropna().apply(lambda x:x.split('/')[0])


# Impact of cabin_deck on being transported: mean of the target per deck
df_train[['cabin_deck','Transported']].groupby('cabin_deck')[['Transported']].mean()


#### Observations:-
* Deck *T and E*- lowest % of people being transported.
* Deck *B and C* - highest % of people being transported.
* Deck *A,D,F and G* - nearly equal % of people being transported.
> Can give higher preference via **Ordinal encoding** to deck *B and C* as compared to rest.

In [None]:
# Cabin num: second '/'-separated part of Cabin (NaN where Cabin is missing)
df_train['cabin_num'] = df_train.Cabin.dropna().apply(lambda x:x.split('/')[1])

print('No of Unique Cabin Number Values:',df_train['cabin_num'].nunique()) 

# Large number of distinct cabin numbers; judged to have no significant impact
# on the target, so the column is only inspected here and then dropped again.
df_train.drop(columns=['cabin_num'],inplace=True)

In [None]:
# Cabin side (P for Port and S for Starboard): third '/'-separated part of Cabin
# (NaN where Cabin is missing)

df_train['cabin_side'] = df_train.Cabin.dropna().apply(lambda x:x.split('/')[2])

# Impact of cabin side on being transported: mean of the target per side
df_train[['cabin_side','Transported']].groupby('cabin_side')[['Transported']].mean()


#### Observations:-
*  Being in *Starboard* side **increases** the likelihood of getting transported
 
*  Around **55%** traveller in *Starboard* side got transported ,ie,**10% more** than *Port* side.
>  Not a significant difference ,but shows which side is better in the event of any disaster.

In [None]:
# Creating cabin and group size features for test data, mirroring the
# expressions used for the training frame.

# Creating group_size feature for test data (PassengerId suffix, kept as a string)
df_test['group_size'] = df_test.PassengerId.apply(lambda x:x.split('_')[1])

# Creating is_group for test data (1 when the suffix is '01', else 0)
df_test['is_group']=np.where(df_test.group_size=='01',1,0)

# Creating cabin deck for test data (NaN where Cabin is missing)
df_test['cabin_deck'] = df_test.Cabin.dropna().apply(lambda x:x.split('/')[0])

# Creating cabin side for test data (NaN where Cabin is missing)
df_test['cabin_side'] = df_test.Cabin.dropna().apply(lambda x:x.split('/')[2])


# Data Preprocessing

In [None]:
# Adding newly created features to the categorical list.
# This rebinds the name to a longer list, so re-running the cell appends the four names again.
Categorical_features = Categorical_features + ['is_group','cabin_deck', 'cabin_side','group_size']

In [None]:
# Combining train and test data

# Combining both frames saves the effort of re-applying to the test data every
# transformation that is performed on the train data.

# Marker column used later to split the combined frame back into train and test.
df_train['IS_TRAIN'] = True
df_test['IS_TRAIN'] = False

# ignore_index=True gives the combined frame a fresh 0..n-1 index; columns that
# exist only in the train frame (e.g. Transported, Age_cat) are NaN on test rows.
main_df = pd.concat([df_train,df_test],ignore_index=True)

In [None]:
# Missing Value treatment

# 1. Numerical features - KNN imputation
# Initialising the imputer with its default settings
imputer = KNNImputer()
# Fitting on the complete (train + test) data; the result is wrapped in a new
# DataFrame with a default 0..n-1 index, matching main_df's index.
# NOTE(review): fitting on train and test together lets test rows influence the
# values imputed for train rows - confirm this is acceptable.
main_df[Numerical_features]= pd.DataFrame(imputer.fit_transform(main_df[Numerical_features]),
                                          columns=imputer.feature_names_in_)


# 2. Categorical features - mode value imputation, column by column.
# NOTE(review): Categorical_features includes 'Transported', so the test rows'
# missing target is filled with the most frequent label here; the test target is
# discarded before prediction, but confirm nothing else reads it.
main_df[Categorical_features] = main_df[Categorical_features].apply(lambda x:x.fillna(x.mode()[0]))


In [None]:
# Outlier Treatment

# Clipping each numeric column at its own 1st and 99th percentile (computed on the
# combined train + test frame), i.e. taking care of extreme outliers
main_df[Numerical_features] = main_df[Numerical_features].apply(lambda x:x.clip(lower=x.quantile(0.01),upper=x.quantile(0.99)))

In [None]:
# Encoding Categorical features
# Series.map leaves NaN for any value that is not a key of the mapping.

# Homeplanet
# --Europa > Mars > Earth-- (ordinal codes 3 / 2 / 1)
main_df['HomePlanet'] = main_df['HomePlanet'].map({'Europa':3,'Mars':2,'Earth':1})

# Cryosleep: True -> 1, False -> 0
main_df['CryoSleep'] = main_df['CryoSleep'].map({True:1,False:0}) 

# VIP: True -> 1, False -> 0
main_df['VIP'] = main_df['VIP'].map({True:1,False:0}) 

# Transported (target): True -> 1, False -> 0
main_df['Transported'] = main_df['Transported'].map({True:1,False:0}) 

# Destination
# --55 Cancri e > TRAPPIST-1e = PSO J318.5-22 -- (the latter two share code 1)
main_df['Destination'] = main_df['Destination'].map({'TRAPPIST-1e':1,
                                                     '55 Cancri e':2,
                                                     'PSO J318.5-22':1 }) 

# Cabin Side: Starboard -> 2, Port -> 1
main_df['cabin_side'] = main_df['cabin_side'].map({'S':2,'P':1})

# Cabin deck: B/C -> 3, T/E -> 1, every other value -> 2
main_df.cabin_deck = pd.Series(np.where(main_df.cabin_deck.isin(['B','C']),3,
                                np.where(main_df.cabin_deck.isin(['T','E']),1,2)))

# Dropping features that are not used for modelling
main_df.drop(columns=['PassengerId','Cabin','Age_cat','group_size','Name'],inplace=True)


In [None]:
# Separating Train and test data
# Filter on the marker column and drop it in one chained, non-inplace step.
# Dropping in place on a boolean-filtered slice is the chained-assignment pattern
# pandas warns about (SettingWithCopyWarning); this form yields the same frames
# without mutating a slice.

# Train
df_train = main_df[main_df.IS_TRAIN==True].drop(columns=['IS_TRAIN'])

# Test (keeps its original row labels from main_df)
df_test = main_df[main_df.IS_TRAIN==False].drop(columns=['IS_TRAIN'])

In [None]:
# Train - Validation data split (70:30, not stratified)
# NOTE: a later cell repeats this split with stratify=y and test_size=0.2 and
# overwrites X, y and the four split variables.

# All columns except the target, in the sorted order returned by Index.difference
X = df_train[df_train.columns.difference(['Transported'])]
y =df_train['Transported']

train_x,val_x,train_y,val_y = train_test_split(X,y,test_size=0.3,random_state=123)

In [None]:
# Initialising std scaler
std = StandardScaler()
# Fitting on train data only, so validation statistics do not leak into the scaling
std_train = std.fit(train_x)
# Transforming train data (the new frame gets a default 0..n-1 index)
train_x_std = pd.DataFrame(std_train.transform(train_x),columns=train_x.columns)
# Transforming validation data with the train-fitted scaler
val_x_std = pd.DataFrame(std_train.transform(val_x),columns=val_x.columns)

In [None]:
# Train - Validation data split (80:20, stratified on the target).
# This cell rebinds X, y, the split variables and the scaler defined in the
# two preceding cells.

# All columns except the target, in the sorted order returned by Index.difference
X = df_train[df_train.columns.difference(['Transported'])]
y =df_train['Transported']

train_x,val_x,train_y,val_y = train_test_split(X,y,stratify=y,test_size=0.2,random_state=123)


# Initialising std scaler
std = StandardScaler()
# Fitting on train data only
std_train = std.fit(train_x)
# Transforming train data (the new frame gets a default 0..n-1 index, while
# train_y keeps its original labels)
train_x_std = pd.DataFrame(std_train.transform(train_x),columns=train_x.columns)
# Transforming validation data with the train-fitted scaler
val_x_std = pd.DataFrame(std_train.transform(val_x),columns=val_x.columns)

# Model building
**Algorithms used:**
* Logistic Regression
* Decision Tree Classifier
* Random Forest Classifier
* GBM Classifier
* XGBoost Classifier
* Light GBM classifier
* Catboost classifier
* KNN Classifier
* Support Vector Classifier (Linear SVM, Kernel SVM)
* Naive Bayes Classifier

In [None]:
# Candidate estimators keyed by display name. Every estimator that accepts a
# seed is given random_state=123; KNN and Gaussian Naive Bayes take none here.
classification_models={'Logistic Regression':LogisticRegression(random_state=123),
                       'Decision Tree Classifier':DecisionTreeClassifier(random_state=123),
                        'Random Forest Classifier':RandomForestClassifier(random_state=123),
                        'GBM Classifier':GradientBoostingClassifier(random_state=123),
                        'XGBoost Classifier':XGBClassifier(eval_metric='logloss',random_state=123),
                        'Light GBM classifier':LGBMClassifier(random_state=123),
                        'Catboost classifier':CatBoostClassifier(verbose=0,random_state=123),
                        'KNN Classifier':KNeighborsClassifier(),
                        'Support Vector Classifier':SVC(class_weight = 'balanced', gamma = 'auto',random_state=123 ),
                        'Naive Bayes Classifier':GaussianNB()}

# Pre-allocated results table: one row per entry of classification_models, filled
# in by execute_model. The two text columns are cast to object dtype up front;
# left as float64 zeros, writing a model name into them relies on silent
# upcasting, which pandas has deprecated.
model_summ = pd.DataFrame({'Model_Name':np.zeros(len(classification_models)),
                               'Validation_metric':np.zeros(len(classification_models)),
                               'Train_Accuracy':np.zeros(len(classification_models)),
                               'Test/Val_Accuracy':np.zeros(len(classification_models)),
                               'Training_time':np.zeros(len(classification_models))})\
    .astype({'Model_Name':object,'Validation_metric':object})
# Fitted estimators are appended here by execute_model, in dictionary order
voting = []
def execute_model(train_X,train_y,val_x,val_y,val_metric='accuracy'):
    """Fit every estimator in ``classification_models`` and record its scores.

    Each estimator is fitted on the training split and scored with accuracy on
    both splits. The result is written to row ``i`` of the module-level
    ``model_summ`` table and printed, and the fitted estimator is appended to
    the module-level ``voting`` list.

    Parameters
    ----------
    train_X, train_y
        Training features and labels passed to ``fit``.
    val_x, val_y
        Hold-out features and labels used for the validation accuracy.
    val_metric : str, default 'accuracy'
        Label stored in the ``Validation_metric`` column. It does not change
        which metric is computed; accuracy is always used.

    Returns
    -------
    None
    """
    for i, (keys, models) in enumerate(classification_models.items()):
        # Name of the model
        print(keys)

        # Only the fit is timed, not the scoring
        start = time.time()
        Model_fit = models.fit(train_X, train_y)
        end = time.time()

        # Predict once per split and reuse the scores for both the table and
        # the printout (previously each prediction was computed twice)
        train_acc = accuracy_score(train_y, Model_fit.predict(train_X))
        val_acc = accuracy_score(val_y, Model_fit.predict(val_x))
        minutes = np.round((end - start) / 60, 2)

        model_summ.loc[i, 'Model_Name'] = keys
        model_summ.loc[i, 'Validation_metric'] = val_metric
        model_summ.loc[i, 'Train_Accuracy'] = train_acc
        model_summ.loc[i, 'Test/Val_Accuracy'] = val_acc
        model_summ.loc[i, 'Training_time'] = minutes
        voting.append(Model_fit)

        print('Train Accuracy-', train_acc)
        print('Validation Accuracy-', val_acc)
        print('Execution Time-', minutes, 'minutes')
        print("-"*50)


In [None]:
execute_model(train_X=train_x_std,train_y=train_y,val_x=val_x_std,val_y=val_y)

In [None]:
#  Accuracy and training time for each model
model_summ.sort_values(by='Test/Val_Accuracy',ascending=False)

In [None]:
# For top 5 models, performing 5 fold cross-validation (cv=5 in the grid search
# below) and hyperparameter tuning.
# The XGBoost entry previously read `ascending=om_state=123`, which is a syntax
# error; it is restored to `random_state=123`, matching the XGBoost entry in
# classification_models.

top_5_class = { 'XGBoost Classifier':XGBClassifier(eval_metric='logloss',random_state=123) ,
                'GBM Classifier':GradientBoostingClassifier(),
                'Light GBM classifier':LGBMClassifier(),
                'Catboost classifier':CatBoostClassifier(verbose=0),
                'Support Vector Classifier':SVC(class_weight = 'balanced', gamma = 'auto',probability=True )}

# Hyperparameter tuning:-
# One search grid per model; GridSearchCV evaluates every combination in a grid.

# XGBoost: 3 x 3 combinations
param_XGB = {'n_estimators': [50, 100, 200],'learning_rate': [0.01,0.1,1]}

# LightGBM: n_estimators 100..200 in steps of 25
para_LGBM = {'n_estimators':range(100,201,25),'learning_rate':[0.01,0.1,1],
             'max_depth':[7,8,9]}

# CatBoost
para_CB = {'iterations':[500,600,700,800],'learning_rate':[0.01,0.1,1],'depth':[5,6,7,8]}

# SVC. NOTE(review): 'degree' is presumably only used by the 'poly' kernel, so the
# other kernels are re-fitted once per degree value with no effect - confirm and trim.
param_SVC = {'C': [1,5,10],'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],'degree':[1,2,3,4]}

# Gradient boosting
para_GBM = {'n_estimators':[100,125,150,175],'learning_rate':[0.01,0.1,1],'max_features':[4,5,6,7,8]}

# Mapping parameter grids to respective models (keys match those of top_5_class)
grid_to_model = {'XGBoost Classifier':param_XGB,
                  'GBM Classifier':para_GBM,
                  'Light GBM classifier':para_LGBM,
                  'Catboost classifier':para_CB,
                   'Support Vector Classifier':param_SVC}

# Pre-allocated results table: one row per entry of top_5_class. The two text
# columns are cast to object dtype up front; left as float64 zeros, writing a
# model name into them relies on silent upcasting, which pandas has deprecated.
model_summ_top5 = pd.DataFrame({'Model_Name':np.zeros(len(top_5_class)),
                               'Validation_metric':np.zeros(len(top_5_class)),
                               'Train_Accuracy':np.zeros(len(top_5_class)),
                               'Test/Val_Accuracy':np.zeros(len(top_5_class)),
                               'Training_time':np.zeros(len(top_5_class))})\
    .astype({'Model_Name':object,'Validation_metric':object})

# Fitting the model
top_models=[] # fitted GridSearchCV objects, in the order of top_5_class
def execute_model(train_X,train_y,val_x,val_y,val_metric='accuracy'):
    """Grid-search every estimator in ``top_5_class`` and record its scores.

    This definition replaces the earlier ``execute_model`` of the same name.
    For each model, a 5-fold ``GridSearchCV`` scored by ROC AUC is run over the
    grid in ``grid_to_model``. The fitted search object is appended to the
    module-level ``top_models`` list, and the accuracy of its refitted best
    estimator on both splits is written to row ``i`` of ``model_summ_top5``.

    Parameters
    ----------
    train_X, train_y
        Training features and labels used for the cross-validated search.
    val_x, val_y
        Hold-out features and labels used for the validation accuracy.
    val_metric : str, default 'accuracy'
        Label stored in the ``Validation_metric`` column. It describes the
        reported scores, not the search's ``scoring`` (which is 'roc_auc').

    Returns
    -------
    None
    """
    for i, (keys, models) in enumerate(top_5_class.items()):

        # Name of the model
        print(keys)
        start = time.time()
        # verbose must be a non-negative integer: recent scikit-learn versions
        # validate it and reject -1. 0 keeps the search silent.
        Model_fit = GridSearchCV(estimator=models, param_grid=grid_to_model[keys],
                                 verbose=0, n_jobs=-1,
                                 scoring='roc_auc', cv=5).fit(train_X, train_y)
        end = time.time()

        print('Best paramters:- ',Model_fit.best_params_)
        top_models.append(Model_fit)

        model_summ_top5.loc[i, 'Model_Name'] = keys
        model_summ_top5.loc[i, 'Validation_metric'] = val_metric
        model_summ_top5.loc[i, 'Train_Accuracy'] = accuracy_score(train_y, Model_fit.predict(train_X))
        model_summ_top5.loc[i, 'Test/Val_Accuracy'] = accuracy_score(val_y, Model_fit.predict(val_x))
        model_summ_top5.loc[i, 'Training_time'] = np.round((end - start) / 60, 2)


In [None]:
# Execute the grid searches on the standardized splits and print each model's best params
execute_model(train_X=train_x_std,train_y=train_y,val_x=val_x_std,val_y=val_y)

In [None]:
#  Model summary
# Diff = validation accuracy - train accuracy; values closer to zero (or positive)
# indicate less overfitting.
model_summ_top5['Diff'] = (model_summ_top5['Test/Val_Accuracy'] - model_summ_top5['Train_Accuracy'])
# The original call ended in a truncated `asce` token (a syntax error). Sorting
# descending lists the least-overfitted models first.
model_summ_top5.sort_values(by='Diff',ascending=False)

In [None]:
#  Voting classifier on Top 5 models

# Use each search's refitted best estimator rather than the GridSearchCV object
# itself. VotingClassifier clones and re-fits its estimators, so passing the
# search objects would re-run every full grid search a second time.
# NOTE: top_models is appended to on every run of the grid-search cell, so these
# positions assume that cell was executed exactly once.
clf_1 = top_models[0].best_estimator_ # XG-Boost
clf_2 = top_models[1].best_estimator_ # GBM
clf_3 = top_models[2].best_estimator_ # Light GBM
clf_4 = top_models[3].best_estimator_ # CatBoost
clf_5 = top_models[4].best_estimator_ # SVC

# Fitting voting classifier on train data
# Higher weightage for those which have less overfitting
v_clf = VotingClassifier(estimators=[('XGB',clf_1),('GBM',clf_2),('LGBM',clf_3),
                         ('CB',clf_4),('SVC',clf_5)],
                         weights=[1,2,1,2,2]).fit(train_x_std,train_y)

In [None]:
# Accuracy of the voting classifier on the standardized train and validation splits
print('Accuracy for TRAIN data',accuracy_score(train_y,v_clf.predict(train_x_std)))
print('-'*40)
print('Accuracy for VAL data',accuracy_score(val_y,v_clf.predict(val_x_std)))

In [None]:
# Predicting for test data

# Standardizing the test data with the scaler fitted on the training split.
# Index.difference returns the columns sorted, the same way X was built.
test_x_std = pd.DataFrame(std_train.transform(df_test[df_test.columns.difference(['Transported'])]),
                          columns=train_x.columns)


# Template that provides the PassengerId column and row order of the submission
submission = pd.read_csv('../input/spaceship-titanic/sample_submission.csv')

# Predicting for test data; the Series gets a default 0..n-1 index
y_predicted = pd.Series(v_clf.predict(test_x_std))

# Assignment aligns on index.
# NOTE(review): assumes sample_submission.csv lists passengers in the same order
# as test.csv - confirm.
submission['Transported'] = y_predicted
# Convert the 1/0 predictions back to booleans
submission.Transported = submission.Transported.map({1:True,0:False})

# Saving output as CSV file
submission.to_csv("submission_3.csv",index=False)