In [None]:
# Core
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(style='darkgrid', font_scale=1.4)
from imblearn.over_sampling import SMOTE
import itertools
import warnings
warnings.filterwarnings('ignore')
import plotly.express as px
import time

## data

In [None]:
# Load the competition's train and test splits
train = pd.read_csv('../input/spaceship-titanic/train.csv')
test = pd.read_csv('../input/spaceship-titanic/test.csv')

# Report (rows, columns) for both frames, then preview five random training rows
for label, frame in (('train shape : ', train), ('test shape : ', test)):
    print(label, frame.shape)
train.sample(5)

## feature_description
- PassengerId - A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.
- HomePlanet - The planet the passenger departed from, typically their planet of permanent residence.
- CryoSleep - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.
- Cabin - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.
- Destination - The planet the passenger will be debarking to.
- Age - The age of the passenger.
- VIP - Whether the passenger has paid for special VIP service during the voyage.
- RoomService, FoodCourt, ShoppingMall, Spa, VRDeck - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.
- Name - The first and last names of the passenger.
- Transported - Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.

In [None]:
# Missing values per training column (ascending): raw count and percentage of rows
missing = train.isnull().sum().sort_values().to_frame('sum_missing')
missing['percentage_null'] = missing['sum_missing'] / len(train) * 100
missing

In [None]:
# Missing values per test column (ascending): raw count and percentage of rows
missing = test.isnull().sum().sort_values().to_frame('sum_missing')
missing['percentage_null'] = missing['sum_missing'] / len(test) * 100
missing

In [None]:
# Count fully duplicated rows in each split and express them as a share of the split
for label, frame in (('train', train), ('test', test)):
    n_dup = frame.duplicated().sum()
    print(f'duplicated rows in {label} : {n_dup}  {n_dup/frame.shape[0]*100}%')

In [None]:
# checking datatypes: info() prints every column's dtype and non-null count
train.info()

**We will need to transform the data to be numeric (int64 or float64) so that we can train machine learning models. These models (in general) don't work on text.**

In [None]:
# Keep only the numeric-dtype columns of the training frame and display them
num_data = train.select_dtypes(include = 'number')
num_data

In [None]:
# Number of distinct values per training column (NaN is not counted by default)
train.nunique()

**There are 6 continuous features, 4 categorical features (excluding the target) and 3 descriptive/qualitative features.**

## EDA

In [None]:
# target distribution
# Take the wedge labels from the value_counts index so each label is guaranteed
# to match its wedge: value_counts orders by frequency, so a hard-coded
# ['True', 'False'] list is only correct while True is the majority class.
target_counts = train['Transported'].value_counts()
plt.figure(figsize = (6, 6))
plt.pie(target_counts, explode = [0, 0.1], shadow = True, autopct = '%.2f%%',
        labels = target_counts.index.astype(str).tolist())
plt.title('target_distribution')
plt.show()

**The target is highly balanced, so we luckily don't have to consider techniques like under/over-sampling**

### Continuous features

In [None]:
# Age histogram with a KDE overlay, split by the target
age_fig, age_ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=train, x='Age', hue='Transported', kde=True, ax=age_ax)
age_ax.set_title('Age Distribution')
plt.show()

**Notes:**
- 0-18 year olds were more likely to be transported than not.
- 18-25 year olds were less likely to be transported than not.
- Over 25 year olds were about as likely to be transported as not.

**Insight:**
- Create a new feature that indicates whether the passenger is a child (under 18), adolescent (18-25) or adult (over 25). Then we can drop the age feature to prevent overfitting.

In [None]:
# Expenditure features (this list is reused by later cells)
exp_features=['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# One row per amenity: full histogram on the left, zoomed histogram + KDE on the right
fig, exp_axes = plt.subplots(5, 2, figsize=(10, 20))
for (ax_full, ax_zoom), col in zip(exp_axes, exp_features):
    sns.histplot(data=train, x=col, bins=30, hue='Transported', ax=ax_full)
    sns.histplot(data=train, x=col, bins=30, hue='Transported', kde=True, ax=ax_zoom)
    # Cap the y-axis on the right so the tail is visible next to the huge zero-spend bar
    ax_zoom.set_ylim([0, 100])
fig.tight_layout()
plt.show()

**Notes:**

- Most people don't spend any money (as we can see on the left).
- The distribution of spending decays exponentially (as we can see on the right).
- There are a small number of outliers.
- People who were transported tended to spend less.
- RoomService, Spa and VRDeck have different distributions to FoodCourt and ShoppingMall - we can think of this as luxury vs essential amenities.

**Insight:**

- Create a new feature that tracks the total expenditure across all 5 amenities.
- Create a binary feature to indicate if the person has not spent anything. (i.e. total expenditure is 0).

### Categorical features

In [None]:
# Categorical features: one stacked count plot per feature, split by the target
cat_features=['HomePlanet', 'CryoSleep', 'Destination', 'VIP']
fig, cat_axes = plt.subplots(4, 1, figsize=(10, 20))
for ax, col in zip(cat_axes, cat_features):
    sns.countplot(data=train, x=col, hue='Transported', ax=ax, palette='viridis')
fig.tight_layout()
plt.show()

**Notes:**

- VIP does not appear to be a useful feature; the target split is more or less equal.
- CryoSleep appears to be a very useful feature in contrast.

**Insights:**

- We might consider dropping the VIP column to prevent overfitting.

### Qualitative features

We can't plot this data (yet). We need to transform it into more useful features.

In [None]:
# Qualitative features: free-text / identifier columns that cannot be plotted as-is
qual_features=['PassengerId', 'Cabin' ,'Name']

# Preview qualitative features (first five training rows)
train[qual_features].head()

Notes:

- PassengerId takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group.
- Cabin takes the form deck/num/side, where side can be either P for Port or S for Starboard.

Insights:

- We can extract the group and group size from the PassengerId feature.
- We can extract the deck, number and side from the cabin feature.
- We could extract the surname from the name feature to identify families.

## Missing values

In [None]:
# Rows = features, columns = passengers; highlighted cells mark missing entries
na_fig, na_ax = plt.subplots(figsize=(12, 6))
sns.heatmap(train.isna().T, cmap='flare', ax=na_ax)
na_ax.set_title('Heatmap of missing values')
plt.show()

Missing values make up about 2% of the data, which is a relatively small amount. For the most part, they don't seem to be happening at the same time, but let's inspect closer.

In [None]:
# Count missing entries per passenger on a temporary copy, so `train` itself
# never carries the helper column.
na_per_passenger = train.isna().sum(axis=1)
plt.figure(figsize=(10, 6))
sns.countplot(data=train.assign(na_counts=na_per_passenger), x='na_counts', hue='Transported')
plt.title('Number of missing entries by passenger')
plt.show()

This shows that missing values don't favour either outcome in the target and for the most part are isolated. This means it is reasonable to smartly 'guess' alternatives to the missing values as opposed to dropping these passengers entirely and losing a lot of training data.

### Continuous data

In [None]:
# Impute median (for continuous data)
# The median is computed once, from the training split only, and applied to
# both splits so no test-set statistic leaks into preprocessing.
# Assign the result back rather than calling `df[col].fillna(..., inplace=True)`:
# that chained in-place form is deprecated and does not update the frame once
# pandas Copy-on-Write is active.
age_median = train['Age'].median()
train['Age'] = train['Age'].fillna(age_median)
test['Age'] = test['Age'].fillna(age_median)

### Categorical data

In [None]:
# Find mode of each categorical feature (most frequent value per column);
# these are the values hard-coded in the imputation cell that follows
train[['HomePlanet','CryoSleep','Destination','VIP']].mode()

In [None]:
# Impute most frequent category (for categorical data).
# The fill values are the training-set modes found in the previous cell.
# Results are assigned back instead of using `df[col].fillna(..., inplace=True)`,
# which is deprecated chained assignment and a no-op under pandas Copy-on-Write.
cat_fill_values = {
    'HomePlanet': 'Earth',
    'CryoSleep': False,
    'Destination': 'TRAPPIST-1e',
    'VIP': False,
}
for col, fill_value in cat_fill_values.items():
    train[col] = train[col].fillna(fill_value)
    test[col] = test[col].fillna(fill_value)

# Impute 0's (mode) to Exp_features, because we will create a categorical column from this later
for col in exp_features:
    train[col] = train[col].fillna(0)
    test[col] = test[col].fillna(0)

### Qualitative data

In [None]:
# Impute sentinel placeholders (for qualitative data).
# 'Z/9999/Z' keeps the deck/num/side shape so the cabin can still be split later;
# 'No Name' marks passengers whose name is unknown.
# Results are assigned back instead of using `df[col].fillna(..., inplace=True)`,
# which is deprecated chained assignment and a no-op under pandas Copy-on-Write.
qual_fill_values = {'Cabin': 'Z/9999/Z', 'Name': 'No Name'}
for col, fill_value in qual_fill_values.items():
    train[col] = train[col].fillna(fill_value)
    test[col] = test[col].fillna(fill_value)

## Feature Engineering


### Age status

Identify children, adolescents and adults.

In [None]:
# New features - one-hot age-band indicators, added to both splits
for frame in (train, test):
    age = frame['Age']
    frame['Under_18'] = (age < 18).astype(int)
    frame['18_to_25'] = ((age >= 18) & (age <= 25)).astype(int)
    frame['Over_25'] = (age > 25).astype(int)

# Plot distribution of new features: encode the band as 1/2/3 on a temporary
# copy so `train` never carries the plotting-only column
age_band = train['Under_18'] + 2*train['18_to_25'] + 3*train['Over_25']
plt.figure(figsize=(10, 6))
g = sns.countplot(data=train.assign(Age_plot=age_band), x='Age_plot', hue='Transported')
plt.title('Age status distribution')
g.set_xticklabels(['Under 18', '18-25', 'Over 25']);

### Expenditure

Calculate total expenditure and identify passengers with no expenditure.

In [None]:
# New features - total spend across the five amenities and a zero-spend flag
for frame in (train, test):
    frame['Expenditure'] = frame[exp_features].sum(axis=1)
    frame['No_spending'] = (frame['Expenditure'] == 0).astype(int)

# Plot distribution of new features, side by side
exp_fig, (ax_total, ax_flag) = plt.subplots(1, 2, figsize=(12, 4))

sns.histplot(data=train, x='Expenditure', hue='Transported', kde=True, bins=200, ax=ax_total)
ax_total.set_title('Total expenditure (truncated)')
# Both axes are clipped, hence "truncated" in the title
ax_total.set_ylim([0, 200])
ax_total.set_xlim([0, 15000])

sns.countplot(data=train, x='No_spending', hue='Transported', palette='viridis', ax=ax_flag)
ax_flag.set_title('No spending indicator')
plt.show()

### Passenger group

Extract passenger group and group size from PassengerId.

In [None]:
# New features - travel group id (the gggg part of PassengerId) and the number
# of passengers sharing that id within the same split.
# value_counts() is computed once per frame and used as a lookup table;
# calling it inside a per-row lambda recomputed it for every passenger.
for frame in (train, test):
    frame['Group'] = frame['PassengerId'].str.split('_').str[0].astype(int)
    frame['Group_size'] = frame['Group'].map(frame['Group'].value_counts())

# Plot distribution of new features.
# The figure handle is kept so tight_layout() acts on this figure rather than
# on a stale `fig` left over from an earlier cell.
group_fig, (ax_group, ax_size) = plt.subplots(2, 1, figsize=(12, 12))
sns.histplot(data=train, x='Group', hue='Transported', bins=200, kde=True, ax=ax_group)
sns.countplot(data=train, x='Group_size', hue='Transported', ax=ax_size)
group_fig.tight_layout()
plt.show()

We can't really use the Group feature because it has too big of a cardinality (6217) and would explode the number of dimensions with one-hot encoding.

The Group size on the other hand should be a useful feature. In fact, we can compress the feature further by creating a 'Solo' column that tracks whether someone is travelling on their own or not. The lower figure shows that passengers with group size=1 are less likely to be transported than those with group size>1.

In [None]:
# New feature - 1 when the passenger is the only member of their group
for frame in (train, test):
    frame['Solo'] = (frame['Group_size'] == 1).astype(int)

# New feature distribution
solo_fig, solo_ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=train, x='Solo', hue='Transported', ax=solo_ax)
solo_ax.set_title('Passenger travelling solo or not')
solo_ax.set_ylim([0, 3000])

### Cabin location

Extract deck, number and side from cabin feature.

In [None]:
# New features - split 'deck/num/side' into its three parts for both splits
for frame in (train, test):
    cabin_parts = frame['Cabin'].str.split('/', expand=True)
    frame['Cabin_deck'] = cabin_parts[0]
    frame['Cabin_number'] = cabin_parts[1].astype(int)
    frame['Cabin_side'] = cabin_parts[2]

# Plot distribution of new features (deck, number, side stacked vertically)
fig, (ax_deck, ax_number, ax_side) = plt.subplots(3, 1, figsize=(12, 18))

sns.countplot(data=train, x='Cabin_deck', hue='Transported',
              order=['A','B','C','D','E','F','G','T','Z'], ax=ax_deck)
ax_deck.set_title('Cabin deck')

sns.histplot(data=train, x='Cabin_number', hue='Transported', binwidth=20, kde=True, ax=ax_number)
# Guide lines every 300 cabins
for boundary in range(300, 1801, 300):
    ax_number.vlines(boundary, ymin=0, ymax=200, color='black')
ax_number.set_title('Cabin number')
ax_number.set_xlim([0, 2000])

sns.countplot(data=train, x='Cabin_side', hue='Transported', ax=ax_side)
ax_side.set_title('Cabin side')
fig.tight_layout()

**Wow, this is interesting!** It appears that Cabin_number is grouped into chunks of 300 cabins. This means we can compress this feature into a categorical one, which indicates which chunk each passenger is in.

**Other notes:** The cabin deck 'T' seems to be an outlier (there are only 5 samples).

In [None]:
# New features - one-hot indicators for the 300-cabin chunks.
# Region k covers edges[k-1] <= Cabin_number < edges[k]; the open-ended first
# and last edges reproduce the plain "< 300" and ">= 1800" tests.
region_edges = [-np.inf, 300, 600, 900, 1200, 1500, 1800, np.inf]
for frame in (train, test):
    cabin_number = frame['Cabin_number']
    for k, (lower, upper) in enumerate(zip(region_edges[:-1], region_edges[1:]), start=1):
        frame[f'Cabin_region{k}'] = ((cabin_number >= lower) & (cabin_number < upper)).astype(int)

# Plot distribution of new features: collapse the indicators into a 1..7 region
# id on a temporary copy so `train` never carries the plotting-only column
region_id = sum(k * train[f'Cabin_region{k}'] for k in range(1, 8)).astype(int)
plt.figure(figsize=(12, 6))
sns.countplot(data=train.assign(Cabin_regions_plot=region_id), x='Cabin_regions_plot',
              hue='Transported', palette='viridis')
plt.title('Cabin regions');

### Last name

Calculate family size from last name.

In [None]:
# New features - surname (last whitespace-separated token of Name) and the
# number of passengers sharing that surname within the same split.
# value_counts() is computed once per frame and used as a lookup table.
for frame in (train, test):
    frame['Surname'] = frame['Name'].str.split().str[-1]
    frame['Family_size'] = frame['Surname'].map(frame['Surname'].value_counts())

    # Set placeholder rows (no name) to have no family.
    # Match on the 'No Name' placeholder itself rather than on a hard-coded
    # Family_size of 200: the number of missing names is specific to each
    # split, so a fixed count cannot be right for both train and test.
    frame.loc[frame['Name'] == 'No Name', 'Family_size'] = 0

# New feature distribution
plt.figure(figsize=(12,6))
sns.countplot(data=train, x='Family_size', hue='Transported')
plt.show()

## Preprocessing

**Drop unwanted features**

In [None]:
# Drop qualitative/redundant/high cardinality features
train.drop(['PassengerId', 'Cabin', 'Name', 'Surname', 'VIP', 'Group', 'Cabin_number'], axis=1, inplace=True)
test.drop(['PassengerId', 'Cabin', 'Name', 'Surname', 'VIP', 'Group', 'Cabin_number'], axis=1, inplace=True)

# Preview resulting training set
train.head()

We still need to encode the categorical columns.

### Labels and features

In [None]:
# Sklearn
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score, f1_score
from sklearn.metrics import roc_auc_score, plot_confusion_matrix, plot_roc_curve, roc_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.feature_selection import mutual_info_classif
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import eli5
from eli5.sklearn import PermutationImportance
from sklearn.utils import resample

In [None]:
# Separate the integer-encoded target from the feature matrix; work on copies so
# later preprocessing never touches `train` / `test`
target_col = 'Transported'
y = train[target_col].copy().astype(int)
X = train.drop(columns=[target_col]).copy()
X_test = test.copy()

### Encoding and scaling

We will use column transformers to be more professional. It's also good practice.

In [None]:
# Indentify numerical and categorical columns
numerical_cols = [cname for cname in X.columns if X[cname].dtype in ['int64', 'float64']]
categorical_cols = [cname for cname in X.columns if X[cname].dtype == "object"]

# Scale numerical data to have mean=0 and variance=1
numerical_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# One-hot encode categorical data
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(drop='if_binary', handle_unknown='ignore',sparse=False))])

# Combine preprocessing
ct = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)],
        remainder='passthrough')

# Apply preprocessing
X = ct.fit_transform(X)
X_test = ct.transform(X_test)

# Print new shape
print('Training set shape:', X.shape)

## PCA

Just for fun, let's look at the transformed data in PCA space. This gives a low dimensional representation of the data that retains as much of its variance as possible.

In [None]:
# Project the preprocessed features onto their first three principal components
pca = PCA(n_components=3)
components = pca.fit_transform(X)

# Share of the total variance captured by those three components, in percent
total_var = 100 * pca.explained_variance_ratio_.sum()

# Interactive 3D scatter coloured by the target; every marker gets the same small size
marker_sizes = np.full(len(X), 0.1)
fig = px.scatter_3d(
    components,
    x=0, y=1, z=2,
    color=y,
    size=marker_sizes,
    opacity=1,
    title=f'Total Explained Variance: {total_var:.2f}%',
    labels={'0': 'PC 1', '1': 'PC 2', '2': 'PC 3'},
    width=800,
    height=500,
)
fig.show()

In [None]:
# Explained variance (how important each additional principal component is)
pca = PCA().fit(X)
fig, ax = plt.subplots(figsize=(15,6))
xi = np.arange(1, 1+X.shape[1], step=1)
yi = np.cumsum(pca.explained_variance_ratio_)
plt.plot(xi, yi, marker='o', linestyle='--', color='b')

# Aesthetics
plt.ylim(0.0,1.1)
plt.xlabel('Number of Components')
plt.xticks(np.arange(1, 1+X.shape[1], step=1))
plt.ylabel('Cumulative variance (%)')
plt.title('Explained variance by each component')
plt.axhline(y=1, color='r', linestyle='-')
plt.text(0.5, 0.85, '100% cut-off threshold', color = 'red')
ax.grid(axis='x')

### Create a validation set

We will use this to choose which model(s) to use

In [None]:
# Train-validation split
X_train, X_valid, y_train, y_valid = train_test_split(X,y,stratify=y,train_size=0.8,test_size=0.2,random_state=0)

## Model selection

### To briefly mention the algorithms we will use,

**Logistic Regression:** Unlike regression which uses Least Squares, the model uses Maximum Likelihood to fit a sigmoid-curve on the target variable distribution. It uses a logistic function, and most commonly used when the data in question has binary output.

**K-Nearest Neighbors (KNN):** KNN predicts by selecting the majority class of the k-nearest neighbours. The metric used is usually Euclidean distance. It is a simple and effective algorithm but can be sensitive by many factors, e.g. the value of k, the preprocessing done to the data and the metric used.

**Support Vector Machine (SVM):** SVM finds the optimal hyperplane that separates the data in the feature space. Predictions are made by looking at which side of the hyperplane the test point lies on. Ordinary SVM assumes the data is linearly separable, which is not always the case. A kernel trick can be used when this assumption fails to transform the data into a higher dimensional space where it is linearly separable. SVM is a popular algorithm because it is computationally efficient and produces very good results.

**Random Forest (RF):** RF is a reliable ensemble of decision trees, which can be used for regression or classification problems. Here, the individual trees are built via bagging (i.e. aggregation of bootstraps which are nothing but multiple train datasets created via sampling with replacement) and split using fewer features. The resulting diverse forest of uncorrelated trees exhibits reduced variance; therefore, is more robust towards change in data and carries its prediction accuracy to new data. It works well with both continuous & categorical data.

**Extreme Gradient Boosting (XGBoost):** XGBoost is similar to RF in that it is made up of an ensemble of decision-trees. The difference arises in how those trees are derived. XGBoost uses extreme gradient boosting when optimising its objective function. It often produces the best results but is relatively slow compared to other gradient boosting algorithms.

**Light Gradient Boosting Machine (LGBM):** LGBM works essentially the same as XGBoost but with a different boosting technique. It usually produces similar results to XGBoost but is significantly faster.

**Categorical Boosting (CatBoost):** CatBoost is an open source algorithm based on gradient boosted decision trees. It supports numerical, categorical and text features. It works well with heterogeneous data and even relatively small data. Informally, it tries to take the best of both worlds from XGBoost and LGBM.

**Naive Bayes (NB):** Naive Bayes learns how to classify samples by using Bayes' Theorem, which uses prior information to 'update' the probability of an event by incorporating this information in a clever way. The algorithm is quite fast but a downside is that it assumes the input features are independent, which is not always the case.


We will train these models and evaluate them on the validation set to then choose which ones to carry through to the next stage (cross validation).

### Define classifiers

In [None]:
# Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.naive_bayes import GaussianNB

In [None]:
# Classifiers
classifiers = {
    "LogisticRegression" : LogisticRegression(random_state=0),
    "KNN" : KNeighborsClassifier(),
    "SVC" : SVC(random_state=0, probability=True),
    "RandomForest" : RandomForestClassifier(random_state=0),
    #"XGBoost" : XGBClassifier(random_state=0, use_label_encoder=False, eval_metric='logloss'), # XGBoost takes too long
    "LGBM" : LGBMClassifier(random_state=0),
    "CatBoost" : CatBoostClassifier(random_state=0, verbose=False),
    "NaiveBayes": GaussianNB()
}

# Grids for grid search
LR_grid = {'penalty': ['l1','l2'],
           'C': [0.25, 0.5, 0.75, 1, 1.25, 1.5],
           'max_iter': [50, 100, 150]}

KNN_grid = {'n_neighbors': [3, 5, 7, 9],
            'p': [1, 2]}

SVC_grid = {'C': [0.25, 0.5, 0.75, 1, 1.25, 1.5],
            'kernel': ['linear', 'rbf'],
            'gamma': ['scale', 'auto']}

RF_grid = {'n_estimators': [50, 100, 150, 200, 250, 300],
        'max_depth': [4, 6, 8, 10, 12]}

boosted_grid = {'n_estimators': [50, 100, 150, 200],
        'max_depth': [4, 8, 12],
        'learning_rate': [0.05, 0.1, 0.15]}

NB_grid={'var_smoothing': [1e-10, 1e-9, 1e-8, 1e-7]}

# Dictionary of all grids
grid = {
    "LogisticRegression" : LR_grid,
    "KNN" : KNN_grid,
    "SVC" : SVC_grid,
    "RandomForest" : RF_grid,
    "XGBoost" : boosted_grid,
    "LGBM" : boosted_grid,
    "CatBoost" : boosted_grid,
    "NaiveBayes": NB_grid
}

**Train and evaluate models**

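Train models with grid search to get a rough idea of which are the best models for this dataset. Note that `GridSearchCV` always cross-validates: with `cv=None` it uses its default 5-fold cross validation on the training split, and the held-out validation set is only used to score the best configuration of each model.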

In [None]:
# Results table (one row per model) and a dict that ends up holding each
# model's best grid-search parameters
n_models = len(classifiers)
clf_best_params = classifiers.copy()
valid_scores = pd.DataFrame({'Classifer': classifiers.keys(),
                             'Validation accuracy': np.zeros(n_models),
                             'Training time': np.zeros(n_models)})

for i, (key, classifier) in enumerate(classifiers.items()):
    start = time.time()

    # cv=None selects GridSearchCV's default 5-fold cross validation
    clf = GridSearchCV(estimator=classifier, param_grid=grid[key], n_jobs=-1, cv=None)

    # Train on the training split, score the refit best estimator on the validation split
    clf.fit(X_train, y_train)
    valid_scores.iloc[i, 1] = clf.score(X_valid, y_valid)

    # Replace the estimator placeholder with the winning parameter dict
    clf_best_params[key] = clf.best_params_

    # Record wall-clock time in minutes
    stop = time.time()
    valid_scores.iloc[i, 2] = np.round((stop - start)/60, 2)

    print('Model:', key)
    print('Training time (mins):', valid_scores.iloc[i, 2])
    print('')

In [None]:
# Show results
valid_scores

Motivated by this, we will take RandomForest, LGBM and CatBoost to the final stage of modelling

In [None]:
# Show best parameters from grid search
clf_best_params

## Modelling

We can finally train our best model on the whole training set using cross validation and ensembling predictions together to produce the most confident predictions.

**Define best models**

In [None]:
# Classifiers
best_classifiers = {
    "RandomForest" : RandomForestClassifier(**clf_best_params["RandomForest"], random_state=0),
    "LGBM" : LGBMClassifier(**clf_best_params["LGBM"], random_state=0),
    "CatBoost" : CatBoostClassifier(**clf_best_params["CatBoost"], verbose=False, random_state=0),
}

**Cross validation and ensembling predictions**

Predictions are ensembled together using soft voting. This averages the predicted probabilities to produce the most confident predictions.

In [None]:
# Number of folds in cross validation
FOLDS=10

preds=np.zeros(len(X_test))
for key, classifier in best_classifiers.items():
    start = time.time()
    
    # 5-fold cross validation
    cv = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=0)
    
    score=0
    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
        # Get training and validation sets
        X_train, X_valid = X[train_idx], X[val_idx]
        y_train, y_valid = y[train_idx], y[val_idx]

        # Train model
        clf = classifier
        clf.fit(X_train, y_train)

        # Make predictions and measure accuracy
        preds += clf.predict_proba(X_test)[:,1]
        score += clf.score(X_valid, y_valid)

    # Average accuracy    
    score=score/FOLDS
    
    # Stop timer
    stop = time.time()

    # Print accuracy and time
    print('Model:', key)
    print('Average validation accuracy:', np.round(100*score,2))
    print('Training time (mins):', np.round((stop - start)/60,2))
    print('')
    
# Ensemble predictions
preds=preds/(FOLDS*len(best_classifiers))

## Submission

### Post processing

In [None]:
# Round predictions to nearest integer
preds=np.round(preds).astype(int)

### Submit predictions

In [None]:
# Sample submission (to get right format)
sub=pd.read_csv('../input/spaceship-titanic/sample_submission.csv')

# Add predictions
sub['Transported']=preds

# Replace 0 to False and 1 to True
sub=sub.replace({0:False,1:True})

# Prediction distribution
plt.figure(figsize=(6,6))
sub['Transported'].value_counts().plot.pie(explode=[0.1,0.1], autopct='%1.1f%%', shadow=True, textprops={'fontsize':16}).set_title("Prediction distribution")

In [None]:
# Output to csv
sub.to_csv('submission.csv', index=False)