# Titanic competition

# 1. Importing libraries and loading the data

In [None]:
# Importing the required libraries
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties

# Importing the sklearn modules required
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold, learning_curve
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score
from sklearn.exceptions import ConvergenceWarning
import joblib

# Importing additional models
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostRegressor, CatBoostClassifier, Pool
import shap

# Importing optuna for model tuning
import optuna
from optuna.samplers import TPESampler

# Optuna progress is shown while these rows stay commented out; uncomment them to silence it:
#import warnings
#optuna.logging.set_verbosity(optuna.logging.WARNING)
#warnings.filterwarnings(action='ignore', category=ConvergenceWarning)

# Importing the ensemble builder
from mlens.ensemble import SuperLearner

# Setting the styles: seaborn 'notebook' context, 'darkgrid' style and the
# 'bright' colour palette for the charts that follow
sns.set_theme('notebook')
sns.set_style('darkgrid')
sns.set_palette('bright')
# IPython magic (not plain Python): draw matplotlib figures inline in the notebook
%matplotlib inline

In [None]:
# Loading the data: the training set goes into `df`, the test set into `test`
train_path = '../input/titanic/train.csv'
test_path = '../input/titanic/test.csv'
df = pd.read_csv(train_path)
test = pd.read_csv(test_path)

# 2. Exploratory data analysis

Let's have a peek at the data:

In [None]:
df.head(5)

Showing all the metrics visually will give a sense of all the distributions.

In [None]:
def graph(column, ax, continuous=True):
    """
    Draw the distribution of one column of the global `df` on the given axes.
    - column (string): name of the `df` column to plot
    - ax (matplotlib.axes.Axes): the axes to draw on
    - continuous (bool): True draws a histogram, False draws a count plot
    - returns the figure that owns `ax`
    """
    if continuous:
        sns.histplot(x=df[column], ax=ax)
    else:
        sns.countplot(x=df[column], ax=ax)
    # Return the figure owning `ax` instead of relying on a global `fig`
    # that only exists once the calling cell has created it
    return ax.figure

fig, axs = plt.subplots(ncols=4, nrows=2, figsize=(24, 12))
# Unpack the 2x4 grid row by row: ax1..ax4 are the top row, ax5..ax8 the bottom row
ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8 = axs.flatten()

# One entry per panel: (column, axes, is the variable continuous?, panel title)
panels = [
    ('Survived', ax1, False, 'Survival count'),
    ('Pclass', ax2, False, 'Passenger class'),
    ('Sex', ax3, False, 'Passenger sex'),
    ('Age', ax4, True, 'Passenger age'),
    ('SibSp', ax5, False, '# siblings/spouse'),
    ('Parch', ax6, False, '# children/parents'),
    ('Fare', ax7, True, 'Fare paid'),
    ('Embarked', ax8, False, 'Embarked'),
]
for panel_column, panel_ax, panel_continuous, panel_title in panels:
    graph(panel_column, panel_ax, continuous=panel_continuous)
    panel_ax.set_title(panel_title, fontsize=12, fontweight='bold')

In [None]:
# Defining a function to add values to my bar graphs:

def add_value_labels(ax, fontsize=12, label_format="{:.0f}", spacing=1):
    """
    Write the height of every bar next to the end of that bar.
    - ax (matplotlib.axes.Axes): the axes whose patches (bars) are annotated
    - fontsize (int): font size of the labels
    - label_format (string): format string applied to each bar height
    - spacing (int): offset, in points, between the end of the bar and its label
    """
    for bar in ax.patches:
        # The label is anchored at the horizontal centre of the bar, at its end
        height = bar.get_height()
        centre = bar.get_x() + bar.get_width() / 2

        # Labels go above bars that are not negative and below negative bars
        if height < 0:
            offset, anchor = spacing * -1, 'top'
        else:
            offset, anchor = spacing, 'bottom'

        ax.annotate(
            label_format.format(height),    # Text: the formatted bar height
            (centre, height),               # Anchor point: end of the bar
            xytext=(0, offset),             # Shift vertically by `offset`
            textcoords="offset points",     # `xytext` is an offset in points
            fontsize=fontsize,
            ha='center',
            va=anchor)

In [None]:
def graph_overview(column, title, continuous=True):
    """
    Draw a two-panel overview of one column of the global `df` against survival.
    - column (string): the column name of the metric to be analysed
    - title (string): readable name used in the chart titles and axis labels
    - continuous (bool): True draws a histogram plus a box plot split by
      'Survived'; False draws a count plot plus a bar chart of the survival
      rate per category
    - returns nothing; the two charts are drawn on a new figure
    """

    # Format the fonts
    font = FontProperties(size=12)
    font.set_style('italic')

    # Set the figure up
    fig, (ax1, ax2) = plt.subplots(ncols=2, nrows=1, figsize=(18, 4))

    if continuous:
        # Histogram
        sns.histplot(x=df[column], ax=ax1)
        ax1.set_title(title+' histogram\n', fontsize=14, fontweight='bold')
        # Boxplot
        sns.boxplot(data=df, x=column, y='Survived', ax=ax2, orient='h')
        ax2.set_title(title+' by survival rate\n', fontsize=14, fontweight='bold')
        ax2.set_ylabel('Survival status', fontproperties=font)

    else:
        # Survival rate per category. Assumes 'Survived' is coded 0/1, so the
        # group mean is the share of survivors. Unlike selecting the share of
        # rows equal to 1 from value_counts(), the mean keeps categories in
        # which nobody survived (they show as 0 instead of vanishing).
        survival_rate = df.groupby(column)['Survived'].mean()
        # Use one explicit category order so both panels line up
        categories = list(survival_rate.index)

        # Countplot
        sns.countplot(x=df[column], ax=ax1, order=categories)
        ax1.set_title(title+' count\n', fontsize=14, fontweight='bold')
        add_value_labels(ax1, spacing=2)  # Adds the values above the bars

        # Bar chart
        sns.barplot(y=survival_rate.values, x=survival_rate.index, ax=ax2, order=categories)
        ax2.set_title(title+" by survival rate\n", fontsize=14, fontweight='bold')
        ax2.set_ylabel('Survival rate', fontproperties=font)
        add_value_labels(ax2, label_format="{:.2f}")  # Adds the values above the bars

    ax1.set_ylabel('Count', fontproperties=font)
    ax1.set_xlabel(title, fontproperties=font)
    ax2.set_xlabel(title, fontproperties=font)

Running through each of the variables in turn and checking how they impact survival rate.

### 2.1. Passenger class

In [None]:
graph_overview('Pclass', 'Passenger class', continuous=False)

As expected, 1st class passengers had the highest survival rate, and were more than twice as likely to survive as 3rd class passengers.

### 2.2. Sex

In [None]:
graph_overview('Sex', 'Passenger sex', continuous=False)

Women and children first! Women are more than 3x as likely to survive as men. 

### 2.3. Age

In [None]:
graph_overview('Age', 'Passenger age', continuous=True)

Younger passengers are slightly more likely to survive but the effect is not as pronounced as class/sex.

### 2.4. Number of siblings/spouse onboard

In [None]:
graph_overview('SibSp', "# siblings/spouse onboard", continuous=False)

Those with 1 or 2 siblings/spouses onboard are much more likely to survive than those with none, but less likely once in a large family (3-4).

### 2.5. Number of children/parents onboard

In [None]:
graph_overview('Parch', '# children/parents onboard', continuous=False)

We see a similar relationship with children/parents as siblings/spouses, i.e. some relatives on board enhances the chance of survival, but only to a point. 

### 2.6. Fare paid

In [None]:
graph_overview('Fare', "Fare paid", continuous=True)

In [None]:
graph_overview('Embarked', "Port of embarkation", continuous=False)

# 3. Feature engineering
Where are the data missing? 

In [None]:
# Return the number of missing data by column
df.isna().sum(axis = 0)

### 3.1. Embarked

Where did the two people who are missing from the 'Embarked' column get onboard?

In [None]:
df[df['Embarked'].isna()].head()

Miss Amelie Icard is clearly French and hence is more likely to have boarded in Cherbourg, but Mrs Stone is probably English and hence more likely to have boarded in Southampton. However, they have the same ticket number and are both in cabin B28, so are unlikely to have boarded at different ports! As the vast majority of passengers boarded in Southampton, I will assume they both embarked in Southampton.

In [None]:
# Assign the filled column back to `df`. Calling fillna(inplace=True) through
# the attribute accessor acts on an intermediate object and is not guaranteed
# to update `df` (recent pandas versions warn about this chained pattern).
df['Embarked'] = df['Embarked'].fillna('S')

### 3.2. Name/Title
The 'Name' field may have some useful data contained within it, as there may be trends we could identify like title, surname, etc. 

In [None]:
# The name information may have some useful data contained within
def _extract_title(name):
    """Return the text after the first comma of `name`, cut at the next full stop and stripped."""
    after_comma = name.split(',')[1]
    before_stop = after_comma.split('.')[0]
    return before_stop.strip()

df['Title'] = df.Name.apply(_extract_title)

In [None]:
def draw_graph(title, x_data, x_label, y_label, x_rotation=0, sorted=False):
    """
    Draw a labelled count plot of one column of the global `df`.
    - title (string): chart title
    - x_data (string): name of the `df` column to count
    - x_label (string): x-axis label
    - y_label (string): y-axis label
    - x_rotation (number): rotation of the x tick labels, in degrees
    - sorted (bool): True orders the bars by descending count
    - returns nothing; the chart is drawn on a new figure
    """
    fig, ax = plt.subplots(ncols=1, nrows=1, figsize=(12,4))
    ax.set_title(title+'\n', fontsize=14, fontweight='bold')

    # Format the fonts:
    font = FontProperties(size=14)
    font.set_style('italic')
    plt.xticks(rotation=x_rotation, fontsize=12)
    plt.yticks(fontsize=12)

    # Setting up the plot: value_counts() is already ordered by descending
    # count; None keeps seaborn's default category order
    order = df[x_data].value_counts().index if sorted else None
    # Draw on `ax` explicitly instead of relying on it being the current axes
    sns.countplot(x=x_data, data=df, order=order, ax=ax)
    ax.set_ylabel(y_label, fontproperties=font)
    ax.set_xlabel(x_label, fontproperties=font)
    add_value_labels(ax, fontsize=12)

In [None]:
draw_graph('Passenger titles', x_rotation=45, x_data='Title', x_label='Passenger title', y_label='Count', sorted=True)

There are a large number of titles, including several that appear only once. I should therefore rationalise the titles to improve computational efficiency and reduce noise in the feature.

In [None]:
# Keep the common titles as they are and relabel every other title as 'Other'
titles = ['Mr', 'Mrs', 'Miss', 'Master', 'Other']
is_known_title = df['Title'].isin(titles)
df['Title'] = df['Title'].where(is_known_title, "Other")

Rerunning the analysis to check it has worked.

In [None]:
draw_graph('Passenger titles', x_data='Title', x_label='Passenger title', y_label='Count', sorted=True)

Now there is no useful information to glean from the 'Name' column, I will drop that from the dataset. 

In [None]:
df.drop('Name', axis=1, inplace=True)

Finally, let's see how the survival rate varies with the passengers' titles. 

In [None]:
graph_overview('Title', "Passenger title", continuous=False)

### 3.3. Cabin
Most of the cabin data appear to be missing. Perhaps we can categorise the cabins, split by their first letters.

In [None]:
df['Cabin_let'] = df['Cabin'].str[0]
df['Cabin_let'].value_counts(dropna=False)

Now I will replace the NaNs with 'Unknown' and group the tail of cabin letters as 'Other'

In [None]:
# Fill the unknowns with 'Unknown'. The result is assigned back to `df`: an
# in-place fill through the attribute accessor acts on an intermediate object
# and is not guaranteed to update `df`.
df['Cabin_let'] = df['Cabin_let'].fillna('Unknown')

# Now we can drop the old 'Cabin' column and change the name of the 'Cabin_let' column
df.drop('Cabin', axis=1, inplace=True)
df.rename(columns={'Cabin_let': 'Cabin'}, inplace=True)

# Change the tail of the cabin letters to 'Other'
let_to_replace = ['F','G','T']
replace_with = 'Other'

# One .loc assignment on `df` itself replaces the chained
# `df.Cabin.loc[...] = ...` pattern, which may write to a temporary copy
df.loc[df['Cabin'].isin(let_to_replace), 'Cabin'] = replace_with

# Checking the output:
draw_graph('Passenger cabin', x_data='Cabin', x_label='Passenger cabin', y_label='Count', sorted=True)

In [None]:
graph_overview('Cabin', 'Passenger cabin', continuous=False)

### 3.4. Family
I will now create a `'Family'` column which is simply the sum of the sibling/spouses and children/parent columns and kill the original columns. This will reduce the complexity and should help the predictive power of the model by reducing overfitting.

In [None]:
# 'Family' is the number of siblings/spouses plus the number of parents/children onboard
df['Family'] = df['SibSp'].add(df['Parch'])
# The two source columns are no longer needed
df.drop(columns=['SibSp', 'Parch'], inplace=True)
# Look at the distribution of the new feature
graph_overview('Family', '# family members onboard', continuous=False)

Finally, let's create simple none/small/large family groups to simplify the feature.

In [None]:
# Family sizes that count as a small family, and the bucket labels
small_num_to_replace = [1,2,3]
small_fam = 'Small_family'
# No longer needed by the bucketing below; retained in case a later cell refers to it
large_num_to_replace = [4,5,6,7,10]
large_fam = 'Large_family'


def _family_bucket(size):
    """Map a family-member count to 'None' (0), 'Small_family' (1-3) or 'Large_family' (any other count)."""
    # Leave missing values and already-bucketed labels (cell re-run) untouched
    if isinstance(size, str) or pd.isna(size):
        return size
    if size == 0:
        return 'None'
    if size in small_num_to_replace:
        return small_fam
    # Every remaining count is a large family, so sizes outside a fixed list
    # (for example 8 or 9) are not left as raw numbers
    return large_fam


# Build the bucketed column in one step. The original wrote strings into the
# numeric column through chained `df.Family.loc[...] = ...` assignments, which
# may act on a temporary copy.
df['Family'] = df['Family'].map(_family_bucket)

# Checking output
graph_overview('Family', '# family members onboard', continuous=False)

### 3.5. Fare

In [None]:
# Exploring the fare distribution: histogram of 'Fare' with its skewness in the legend
fig,ax = plt.subplots(ncols=1, nrows=1, figsize=(8,4))
font = FontProperties(size=12)
font.set_style('italic')
# NOTE(review): the axis labels are set before plotting; presumably seaborn
# leaves labels that are already set alone - confirm before reordering these lines
ax.set_ylabel('Count', fontproperties=font)
ax.set_xlabel('Fare', fontproperties=font);
# The skewness is attached as the series label so that plt.legend() displays it
sns.histplot(df["Fare"], ax=ax, label="Skewness : %.2f"%(df["Fare"].skew()))
plt.legend(loc="best");

This is a metric with a high degree of skew. This represents a problem for modelling as outliers in our features can have an outsized impact on feature importance and hence final accuracy. I will therefore take the log of fare for my modelling. 

In [None]:
# Taking the log of the fare to reduce skewness. Fares that are not strictly
# positive (including missing ones) are swapped for 1 first, so they map to log(1) = 0
fare = df["Fare"]
df["Fare"] = np.log(fare.where(fare > 0, 1.0))

In [None]:
# Checking that skewness has reduced: same histogram as before, drawn on the
# log-transformed 'Fare' column
fig,ax = plt.subplots(ncols=1, nrows=1, figsize=(8,4))
font = FontProperties(size=12)
font.set_style('italic')
# NOTE(review): the axis labels are set before plotting; presumably seaborn
# leaves labels that are already set alone - confirm before reordering these lines
ax.set_ylabel('Count', fontproperties=font)
ax.set_xlabel('Fare', fontproperties=font);
# The skewness is attached as the series label so that plt.legend() displays it
sns.histplot(df["Fare"], ax=ax, label="Skewness : %.2f"%(df["Fare"].skew()))
plt.legend(loc="best");

### 3.6. Ticket

In [None]:
df.Ticket.describe()

Ticket has 681 unique numbers in 891 records. This means that the vast majority are unique and, from looking at around 100 examples, the only information which could be gleaned from the ticket numbers is class or possible cabin, both of which we have already. As a result, I will drop this metric. 

In [None]:
df.drop('Ticket', axis=1, inplace=True)

### 3.7. Passenger ID

In [None]:
df.PassengerId.value_counts()

All the passenger IDs are unique and hence provide no value as model features. I will therefore drop the column from the dataframe.

In [None]:
df.drop('PassengerId', axis=1, inplace=True)

### 3.8. Age
There are quite a few missing age numbers. However, we could try and predict the age by building a model to fill in these numbers, using the other metrics as training data. 

In [None]:
# Split the passengers on whether their age is known: rows with a missing age
# are the ones to predict, the rest are used for training and validation
age_missing = df['Age'].isna()
df_test = df.loc[age_missing].copy()
df_train_val = df.loc[~age_missing].copy()

In [None]:
# Now we need to define our independent variables (the inputs of the age model)
age_var = ['Pclass', 'Sex', 'Fare', 'Embarked', 'Title', 'Family', 'Cabin']
# And label the categorical features for CatBoost
cat_features = ['Sex', 'Embarked', 'Cabin', 'Title', 'Family']

In [None]:
X = df_train_val[age_var]
y = df_train_val['Age']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Wrap the training and validation splits in CatBoost pools
pool_train = Pool(X_train, y_train, cat_features=cat_features)
pool_val = Pool(X_val, y_val, cat_features=cat_features)

# Settings of the age regressor
age_model_params = dict(
    iterations=1000,
    learning_rate=0.03,
    depth=6,
    loss_function='RMSE',
    eval_metric='RMSE',
    cat_features=cat_features,
    verbose=0,
    use_best_model=True,
)
cb_model = CatBoostRegressor(**age_model_params)

# Fit on the training pool, with the validation pool as the evaluation set
cb_model.fit(pool_train, eval_set=pool_val);

In [None]:
# Create the test set 
X_test = df_test[age_var]
# Make predictions
y_pred = cb_model.predict(X_test)

Now that I have created a prediction for the missing age range using CatBoost, let's see what the distribution looks like.

In [None]:
# Format the fonts:
font = FontProperties(size=12)
font.set_style('italic')

# Set up the figure:
fig, ax = plt.subplots(ncols=1, nrows=1, figsize=(12,4))
ax.set_title('Predicted passenger ages\n', fontsize=14, fontweight='bold')
ax.set_ylabel('Count', fontproperties=font)
ax.set_xlabel('Passenger age', fontproperties=font)
    
# What does the distribution look like? 
# No `ax` is passed, so the histogram is drawn on the current axes
# (the one created just above)
sns.histplot(y_pred);

Which features are most important in determining the passengers' missing age?

In [None]:
# Creating the pools for pulling the shap info
# (the model's own predictions `y_pred` are passed as the label here)
pool1 = Pool(data=X_test, label=y_pred, cat_features=cat_features)
# Get the feature importances
shap_info = cb_model.get_feature_importance(data=pool1, type='ShapValues', verbose=0)
# Split off the last column; NOTE(review): per CatBoost's ShapValues layout the
# last column should be the expected (base) value - confirm against the docs
shap_values = shap_info[:, : -1]
base_values = shap_info[:, -1]
# Plot the values
shap.summary_plot(shap_values, X_test)

In [None]:
# What are the model's most important features? 
importances = cb_model.get_feature_importance(prettified=True)

In [None]:
# Bar chart of the age model's feature importances
# (uses the `font` object created in an earlier cell)

# Set up the figure:
fig, ax = plt.subplots(ncols=1, nrows=1, figsize=(12,4))
ax.set_title('Importance score\n', fontsize=14, fontweight='bold')
ax.set_ylabel('Importances', fontproperties=font)
ax.set_ylim((0,40))
ax.set_xlabel('Model feature', fontproperties=font)
plt.yticks(fontsize=12)
plt.xticks(fontsize=12)

# One bar per feature; no `ax` is passed, so the bars go on the current axes
sns.barplot(x=importances['Feature Id'], y=importances['Importances']);
# The x label is set again after plotting
ax.set_xlabel('Model feature', fontproperties=font)
add_value_labels(ax, fontsize=12)

I now need to take the predicted ages, add an index and then put the predicted ages back into the training set.

In [None]:
# Turn the predictions into a data frame
y_df = pd.DataFrame(y_pred, columns=['Age'])
# Add the index of the rows that were predicted as a column
y_df['Index'] = X_test.index
# Set the index
y_df.set_index('Index', inplace=True)
# Put the predicted values back into the rows whose age was missing. Only the
# 'Age' column of those rows is written; the original frame-wide
# `df.fillna(y_df, axis=1, inplace=True)` depended on label alignment of a
# DataFrame fill value and passed an axis that does not describe the intent.
df.loc[y_df.index, 'Age'] = y_df['Age']

# 4. Modelling
## 4.1. One hot encoding
The first step I need to take is to use one hot encoding so that our models are dealing with only numerical variables. 

In [None]:
cat_features = ['Sex','Embarked','Title','Cabin','Family']
df = pd.get_dummies(df,columns=cat_features,prefix=cat_features)

And now I need to define our training features:

In [None]:
var = ['Pclass','Age','Fare','Sex_female','Sex_male','Embarked_C','Embarked_Q','Embarked_S',
       'Title_Master','Title_Miss','Title_Mr','Title_Mrs','Title_Other',
       'Cabin_A','Cabin_B','Cabin_C','Cabin_D','Cabin_E','Cabin_Other','Cabin_Unknown',
       'Family_Large_family','Family_None','Family_Small_family']
y_train = df['Survived']
X_train = df[var]

## 4.2. Scaling the data
Next I will scale the feature data.

In [None]:
scale_data = True
# Original note: needs to be turned off if we are scaling the data as the
# ensemble model fails to fit otherwise.
# NOTE(review): both flags are currently True, which contradicts that note - confirm
shuffle = True
scaler = MinMaxScaler()

# Scaling the data
if scale_data:
    # fit_transform returns a bare array, so rebuild the DataFrame with the
    # original column names AND the original row index; dropping the index
    # would misalign X_train with y_train whenever it is not 0..n-1
    X_train = pd.DataFrame(data=scaler.fit_transform(X_train), columns=var, index=X_train.index)

## 4.1. Model selection
I will trial the following classification models:
1. Support vector classifier
2. Decision Tree
3. AdaBoost
4. Random Forest
5. Extra Trees
6. Gradient Boosting
7. Multi-layer perceptron (neural network)
8. K-nearest neighbours
9. Logistic regression
10. Linear Discriminant Analysis
11. Gaussian Naive Bayes
12. Bagging classifier
13. Light GBM
14. XGBoost
15. Catboost

Let's perform a cross validation with K-folds and compare model performance.

In [None]:
# Let's set the class weights to add to our models.
# `n` is the share of survivors (class 1). Each class is weighted by the share
# of the OTHER class, so the less frequent class gets the larger weight; the
# previous {0: 1-n, 1: n} weighted each class by its own frequency, which
# up-weights the majority class.
n = y_train.sum()/len(y_train)
class_weights = {0: n, 1: (1-n)}

In [None]:
# Setting our random state:
random_state = 42

In [None]:
# Cross validate model with Kfold stratified cross val
folds = 10 
kfold = StratifiedKFold(n_splits=folds, shuffle=True, random_state=random_state)

In [None]:
%%time

# Setting up all the classifiers to iterate over below. Each estimator is
# stored next to its display name so the two cannot drift out of step.
named_classifiers = [
    ("SVC", SVC(random_state=random_state, max_iter=5000)),
    ("DecisionTree", DecisionTreeClassifier(random_state=random_state)),
    ("AdaBoost", AdaBoostClassifier(DecisionTreeClassifier(random_state=random_state),
                                    random_state=random_state, learning_rate=0.1)),
    ("RandomForest", RandomForestClassifier(random_state=random_state)),
    ("ExtraTrees", ExtraTreesClassifier(random_state=random_state, class_weight=class_weights)),
    ("GradientBoosting", GradientBoostingClassifier(random_state=random_state)),
    ("KNeighbours", KNeighborsClassifier()),
    ("LogisticRegression", LogisticRegression(random_state=random_state, max_iter=500)),
    ("LinearDiscriminantAnalysis", LinearDiscriminantAnalysis()),
    ("GaussianNB", GaussianNB()),
    # Seeded like the other estimators so that reruns give the same scores
    ("BaggingClassifier", BaggingClassifier(n_estimators=100, random_state=random_state)),
    ("MultipleLayerPerceptron", MLPClassifier(random_state=random_state, max_iter=1000)),
    ("LightGBM", LGBMClassifier(random_state=random_state)),
    # `verbosity=0` replaces the `silent` argument, which XGBoost no longer accepts
    ("XGBoost", XGBClassifier(random_state=random_state, use_label_encoder=False,
                              eval_metric='logloss', verbosity=0)),
    ("CatBoost", CatBoostClassifier(random_state=random_state, early_stopping_rounds=100,
                                    iterations=1000)),
]
model_names = [name for name, _ in named_classifiers]
classifiers = [classifier for _, classifier in named_classifiers]

# Accuracy of each classifier on every fold of the stratified K-fold split
cv_results = [
    cross_val_score(classifier, X_train, y=y_train, scoring="accuracy", cv=kfold, n_jobs=4)
    for classifier in classifiers
]

# Calculating the mean performance and standard deviation for comparison later:
cv_means = [fold_scores.mean() for fold_scores in cv_results]
cv_std = [fold_scores.std() for fold_scores in cv_results]

# Creating a dataframe of the results:
cv_res = pd.DataFrame({"CV_means":cv_means,"CV_std": cv_std,"Algorithm":model_names})

Looking at a mean of the accuracy scores across the 10 folds and including standard deviation of model performance too:

In [None]:
# Plotting the results: one bar per algorithm showing its mean CV accuracy
# (uses the `font` object created in an earlier cell)
fig, ax = plt.subplots(ncols=1, nrows=1, figsize=(16,4))
ax.set_title('Model performance\n', fontsize=14, fontweight='bold')
# The y-axis is restricted to 0.75-0.9
ax.set_ylim((0.75,0.9))
plt.yticks(fontsize=12)
plt.xticks(fontsize=12, rotation=45, horizontalalignment='right')
# NOTE(review): `yerr` is passed as an extra keyword, presumably forwarded to
# matplotlib to draw the per-model standard deviation as error bars - confirm
# with the installed seaborn version, together with `errcolor`/`errwidth`
sns.barplot(y="CV_means", x="Algorithm", data=cv_res, **{'yerr':cv_std}, errcolor='r', errwidth=1, capsize=5)
ax.set_xlabel('Algorithm', fontproperties=font)
ax.set_ylabel('Cross validation - mean score', fontproperties=font)
add_value_labels(ax, fontsize=12, spacing=5, label_format="{:.2f}");  # Adds the values above the bars

Next, we should check how correlated the predictions are, as ensemble models composed of uncorrelated input models often outperform those that are highly correlated.

In [None]:
# Fit every model on the full training set and keep its predictions for that
# same training set, keyed by model name (these are in-sample predictions)
predictions = {
    model_name: classifier.fit(X_train, y_train).predict(X_train)
    for model_name, classifier in zip(model_names, classifiers)
}

# One column of predictions per model
base_predictions_train = pd.DataFrame(predictions)

# Pairwise correlations between the models' predictions
data = base_predictions_train.corr()

In [None]:
# Setting up the plot
fig, ax = plt.subplots(ncols=1, nrows=1, figsize=(16,8))

# Set the font options:
font = FontProperties(size=12)
font.set_style('italic')

# Build the chart on `ax` explicitly. The returned Axes gets its own name
# instead of overwriting `fig`, so `fig` keeps referring to the Figure.
heatmap_ax = sns.heatmap(data=data, annot=True, cmap='rainbow', linewidths=1, linecolor='white',
                         fmt=".2f", annot_kws={"size":12},
                         cbar_kws={'label': '\nCorrelation of predictions'}, ax=ax)
# Enlarge the label of the last axes in the figure (presumably the colour bar)
fig.axes[-1].yaxis.label.set_size(14)

# Setting the font size for the colorbar
cbar = heatmap_ax.collections[0].colorbar
cbar.ax.tick_params(labelsize=12)

# X-axis and tick mark labels
heatmap_ax.set_xlabel('Model', fontproperties=font)
plt.xticks(fontsize=12)

# y-axis and tick mark labels
heatmap_ax.set_ylabel('Model', horizontalalignment='center', fontproperties=font)
plt.yticks(fontsize=12, verticalalignment='center')

# Setting the graph title
ax.set_title("Correlation of respective model predictions\n", fontsize=16, fontweight="bold");

So now I will calculate the mean correlation of each model's predictions with those of the models and subtract it from 1, so that a higher value means a model is less correlated with the rest (which is what we want in an ensemble).

In [None]:
# For each model: 1 minus the mean of its column in the correlation matrix
model_correlation_dict = {name: 1- data[name].mean() for name in model_names}
# Turning the dictionary into a series indexed by model name:
model_correlation = pd.Series(model_correlation_dict)

In [None]:
# Plotting the outputs: one bar per model showing 1 - its mean prediction correlation
# (uses the `font` object created in an earlier cell)
fig, ax = plt.subplots(ncols=1, nrows=1, figsize=(16,4))

# Setting the title
ax.set_title('Average model prediction correlations\n', fontsize=14, fontweight='bold')

# Setting y-axis range
ax.set_ylim((0.1,0.3))

# Setting label ticks size:
plt.yticks(fontsize=12)
plt.xticks(fontsize=12, rotation=45, horizontalalignment='right')

# Creating the bar plot (no `ax` is passed, so it is drawn on the current axes)
sns.barplot(x=model_correlation.index, y=model_correlation);

# Creating the axis labels:
ax.set_xlabel('Model', fontproperties=font)
ax.set_ylabel('1 - Mean correlation with other models', fontproperties=font)

# Adding the data label
add_value_labels(ax, fontsize=12, spacing=5, label_format="{:.2f}");  # Adds the values above the bars

So as we can see from the above there are a number of models which are less correlated than the others, notably the SVC, K-nearest neighbours, and Gaussian Naive Bayes. Let's compare the correlation metrics to the accuracy scores to see which models could form the best elements of an ensemble model.

In [None]:
# Combine the correlation metric with the CV results, one row per model.
# The unnamed correlation series arrives as column 0; the CV standard
# deviation is not needed for this analysis.
scatter_data = (
    pd.concat((model_correlation, cv_res.set_index('Algorithm')), axis=1)
    .drop('CV_std', axis=1)
    .rename(columns={0: 'Correlation', 'CV_means':'Mean CV score'})
)
# Add the model name as another column for use in the scatter chart below:
scatter_data['Model'] = scatter_data.index

In [None]:
def scatter_text(x, y, text_column, data, title, xlabel, ylabel):
    """
    Scatter plot with a text label next to every point.
    Based on this answer: https://stackoverflow.com/a/54789170/2641825
    - x (string): column of `data` holding the x coordinates
    - y (string): column of `data` holding the y coordinates
    - text_column (string): column of `data` holding the label of each point
    - data (pandas.DataFrame): one row per point
    - title (string): chart title
    - xlabel (string): x-axis label
    - ylabel (string): y-axis label
    - returns the axes returned by seaborn's scatterplot
    """
    # Local font (size 12, italic) so the function does not depend on a
    # global `font` left behind by another cell
    font = FontProperties(size=12)
    font.set_style('italic')

    # Creating the figure:
    fig, ax = plt.subplots(ncols=1, nrows=1, figsize=(16,10))

    # Create the scatter plot
    p1 = sns.scatterplot(x=x, y=y, data=data, ax=ax, s=100, legend=False)

    # Set title
    ax.set_title(title, fontsize=14, fontweight='bold')

    # Creating the axis labels:
    ax.set_xlabel(xlabel, fontproperties=font)
    ax.set_ylabel(ylabel, fontproperties=font)

    # Setting label ticks size:
    plt.yticks(fontsize=12)
    plt.xticks(fontsize=12)

    # Add text beside each point. The rows are walked by value rather than
    # with `data[x][line]`, which indexes a label-indexed Series by integer
    # position (deprecated in pandas).
    for x_pos, y_pos, label in zip(data[x], data[y], data[text_column]):
        p1.text(x_pos+0.002, y_pos,
                label,
                horizontalalignment='left',
                size='small',
                color='black')

    return p1

# Draw the scatter chart:
scatter_text(x='Correlation', 
             y='Mean CV score', 
             text_column='Model', 
             data=scatter_data, 
             title='Model performance vs. correlation\n', 
             ylabel='Mean CV score', 
             xlabel='1- mean correlation with other models');

So based on the above outputs, I will choose Gradient Boosting, CatBoost, Bagging classifier, LightGBM, MLP, Logistic Regression and LDA as they have the best cross-validation scores. I will also use SVC as it is uncorrelated with the other models and XGBoost as its performance can be significantly enhanced through tuning.

## 4.2. Hyperparameter tuning
### 4.2.1. Training and validation data sets

In [None]:
# First let's split our training data into training and validation:
X, X_val, y, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=random_state)

### 4.2.2. Optimiser class
Setting up an optimiser class

In [None]:
class Optimiser:
    """
    Thin wrapper around an Optuna study that tunes whichever model the
    module-level `create_model(trial)` currently builds.

    The objective fits the model on the module-level `X`, `y` and scores it on
    `X_val`, `y_val`, so those names must exist when `optimise()` is called.
    """

    def __init__(self, metric, trials=100, timeout=900, seed=42):
        """
        - metric (string): 'acc' maximises accuracy; any other value maximises the F1 score
        - trials (int): maximum number of Optuna trials
        - timeout (number): time limit passed to `study.optimize` (900 as before by default)
        - seed (int): seed of the TPE sampler (42 as before by default)
        """
        self.metric = metric
        self.trials = trials
        self.timeout = timeout
        self.sampler = TPESampler(seed=seed)

    def objective(self, trial):
        """Fit the model built for `trial` and return its validation score."""
        model = create_model(trial)
        model.fit(X, y)
        preds = model.predict(X_val)
        # 'acc' selects accuracy; everything else falls back to F1
        scorer = accuracy_score if self.metric == 'acc' else f1_score
        return scorer(y_val, preds)

    def optimise(self):
        """Run the study and return the best hyperparameters found, as a dict."""
        study = optuna.create_study(direction="maximize", sampler=self.sampler)
        study.optimize(self.objective, n_trials=self.trials, timeout=self.timeout)
        return study.best_params

## 4.3. XGBoost

In [None]:
%%time

# Baseline: XGBoost with default hyperparameters, scored on the validation set
xgb = XGBClassifier(random_state=random_state, use_label_encoder = False, eval_metric='logloss')
xgb.fit(X, y)
preds = xgb.predict(X_val)

# Calculating model scores:
xgb_acc_score = accuracy_score(y_val, preds)
xgb_f1_score = f1_score(y_val, preds)

def create_model(trial):
    """Build an XGBClassifier from hyperparameters sampled by an Optuna trial."""
    # `suggest_float` replaces the deprecated `suggest_uniform`; the names,
    # ranges and sampling order are unchanged
    params = {
        "max_depth": trial.suggest_int("max_depth", 2, 8),
        "n_estimators": trial.suggest_int("n_estimators", 1, 1500),
        "learning_rate": trial.suggest_float('learning_rate', 0.0001, 1),
        "gamma": trial.suggest_float('gamma', 0.0000001, 1),
        "subsample": trial.suggest_float('subsample', 0.8, 1.0),
    }
    return XGBClassifier(**params,
                         use_label_encoder=False,
                         eval_metric='logloss',
                         random_state=random_state)

# NOTE: the Optimiser scores every trial on X_val/y_val, the same split used
# for the scores below, so the tuned scores are optimistic.

# Optimise on accuracy
optimiser = Optimiser('acc')
xgb_acc_params = optimiser.optimise()
xgb_acc_params.update(random_state=random_state, use_label_encoder=False, eval_metric='logloss')
xgb_acc = XGBClassifier(**xgb_acc_params)
xgb_acc.fit(X, y)
preds = xgb_acc.predict(X_val)

# Calculating model scores:
xgb_acc_acc_score = accuracy_score(y_val, preds)
xgb_acc_f1_score = f1_score(y_val, preds)

# Optimise on F1 score
optimiser = Optimiser('f1')
xgb_f1_params = optimiser.optimise()
xgb_f1_params.update(random_state=random_state, use_label_encoder=False, eval_metric='logloss')
xgb_f1 = XGBClassifier(**xgb_f1_params)
xgb_f1.fit(X, y)
preds = xgb_f1.predict(X_val)

# Calculating model scores:
xgb_f1_acc_score = accuracy_score(y_val, preds)
xgb_f1_f1_score = f1_score(y_val, preds)

In [None]:
# Print accuracy and F1 score of the baseline and of the two tuned XGBoost models
xgb_report = [
    ('Pre-optimised XGBoost:', xgb_acc_score, xgb_f1_score),
    ('Accuracy optimised XGBoost:', xgb_acc_acc_score, xgb_acc_f1_score),
    ('F1 optimised XGBoost:', xgb_f1_acc_score, xgb_f1_f1_score),
]
for report_heading, report_acc, report_f1 in xgb_report:
    print(report_heading)
    print(' - accuracy:', f'{report_acc:.3f}')
    print(' - f1-score:', f'{report_f1:.3f}' + '\n')

## 4.4. LightGBM

In [None]:
%%time

# Setting up the classifier:
lgb = LGBMClassifier(random_state=random_state)
lgb.fit(X, y)
preds = lgb.predict(X_val)

# Calculating model scores:
lgb_acc_score = accuracy_score(y_val, preds)
lgb_f1_score = f1_score(y_val, preds)

def create_model(trial):
    """Return an unfitted LGBMClassifier configured from values suggested by ``trial``.

    ``trial`` must provide ``suggest_int`` and ``suggest_uniform``. Values are
    requested in a fixed order: max_depth, n_estimators, learning_rate,
    num_leaves, min_child_samples. The seed comes from the notebook-level
    ``random_state``.
    """
    # A dict literal is evaluated top to bottom, which keeps the suggestion order.
    sampled = {
        "max_depth": trial.suggest_int("max_depth", 2, 8),
        "n_estimators": trial.suggest_int("n_estimators", 1, 1000),
        "learning_rate": trial.suggest_uniform('learning_rate', 0.0000001, 1),
        "num_leaves": trial.suggest_int("num_leaves", 2, 3000),
        "min_child_samples": trial.suggest_int('min_child_samples', 3, 200),
    }
    return LGBMClassifier(random_state=random_state, **sampled)

# --- LightGBM, hyperparameters selected on accuracy ---
optimiser = Optimiser('acc')
lgb_acc_params = optimiser.optimise()
lgb_acc_params.update(random_state=random_state)
lgb_acc = LGBMClassifier(**lgb_acc_params)
lgb_acc.fit(X, y)
preds = lgb_acc.predict(X_val)

# Scores of the accuracy-selected model on X_val / y_val
lgb_acc_acc_score, lgb_acc_f1_score = accuracy_score(y_val, preds), f1_score(y_val, preds)

# --- LightGBM, hyperparameters selected on F1 ---
optimiser = Optimiser('f1')
lgb_f1_params = optimiser.optimise()
lgb_f1_params.update(random_state=random_state)
lgb_f1 = LGBMClassifier(**lgb_f1_params)
lgb_f1.fit(X, y)
preds = lgb_f1.predict(X_val)

# Scores of the F1-selected model on X_val / y_val
lgb_f1_acc_score, lgb_f1_f1_score = accuracy_score(y_val, preds), f1_score(y_val, preds)

In [None]:
# Report accuracy and F1 for the baseline and the two tuned LightGBM models.
# Each row is (heading, accuracy, f1-score).
_lgb_report = (
    ('Pre-optimised LightGBM:', lgb_acc_score, lgb_f1_score),
    ('Accuracy optimised LightGBM:', lgb_acc_acc_score, lgb_acc_f1_score),
    ('F1 optimised LightGBM:', lgb_f1_acc_score, lgb_f1_f1_score),
)
for _heading, _acc, _f1 in _lgb_report:
    print(_heading)
    print(' - accuracy:', f'{_acc:.3f}')
    print(' - f1-score:', f'{_f1:.3f}' + '\n')

## 4.5. Gradient boosting

In [None]:
%%time

# Baseline gradient boosting classifier with scikit-learn defaults, fitted on X / y
gbc = GradientBoostingClassifier()
gbc.fit(X, y)
preds = gbc.predict(X_val)

# Accuracy and F1 of the baseline against y_val
gbc_acc_score, gbc_f1_score = accuracy_score(y_val, preds), f1_score(y_val, preds)

def create_model(trial):
    """Return an unfitted GradientBoostingClassifier configured from ``trial``.

    ``trial`` must provide ``suggest_int`` and ``suggest_uniform``. Values are
    requested in a fixed order: max_depth, n_estimators, learning_rate,
    min_samples_leaf, max_features. The seed comes from the notebook-level
    ``random_state``.
    """
    max_depth = trial.suggest_int("max_depth", 2, 10)
    n_estimators = trial.suggest_int("n_estimators", 1, 300)
    learning_rate = trial.suggest_uniform('learning_rate', 0.00001, 1)
    # `step` is passed by keyword: newer Optuna releases deprecate passing it
    # positionally, and the keyword form is accepted by older releases too.
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 19, step=3)
    max_features = trial.suggest_uniform('max_features', 0.1, 1.0)
    model = GradientBoostingClassifier(
        max_depth=max_depth,
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        min_samples_leaf=min_samples_leaf,
        # Every sampled value must reach the model. Otherwise the trial scores a
        # different configuration from the one rebuilt later with
        # GradientBoostingClassifier(**params) (presumably from the same
        # suggested parameter names -- verify against Optimiser.optimise()).
        max_features=max_features,
        random_state=random_state
        )
    return model

# Optimising on accuracy
optimiser = Optimiser('acc')
gbc_acc_params = optimiser.optimise()
gbc_acc_params['random_state'] = random_state
gbc_acc = GradientBoostingClassifier(**gbc_acc_params)
gbc_acc.fit(X, y)
preds = gbc_acc.predict(X_val)

# Calculating model scores:
gbc_acc_acc_score = accuracy_score(y_val, preds)
gbc_f1_acc_score = f1_score(y_val, preds)

# Optimising on F1 score:
optimiser = Optimiser('f1')
gbc_f1_params = optimiser.optimise()
gbc_f1_params['random_state'] = random_state
gbc_f1 = GradientBoostingClassifier(**gbc_f1_params)
gbc_f1.fit(X, y)
preds = gbc_f1.predict(X_val)

# Calculating model scores:
gbc_f1_acc_score = accuracy_score(y_val, preds)
gbc_f1_f1_score = f1_score(y_val, preds)

In [None]:
# Printing scores
print('Pre-optimised Gradient Boosting classifier:')
print(' - accuracy:', f'{gbc_acc_score:.3f}')
print(' - f1-score:', f'{gbc_f1_score:.3f}' + '\n')

# Print the output:
print('Accuracy optimised Gradient Boosting:')
print(' - accuracy:', f'{gbc_acc_acc_score:.3f}')
print(' - f1-score:', f'{gbc_f1_acc_score:.3f}' + '\n')

# Print the output:
print('F1 optimised Gradient Boosting:')
print(' - accuracy:', f'{gbc_f1_acc_score:.3f}')
print(' - f1-score:', f'{gbc_f1_f1_score:.3f}'+ '\n')

## 4.6. Catboost

In [None]:
%%time

# CatBoost classifier with fixed settings, fitted on X / y
cb = CatBoostClassifier(
    iterations=1000,
    early_stopping_rounds=100,
    boosting_type='Plain',
    random_state=random_state,
    verbose=0,
)
cb.fit(X, y)
preds = cb.predict(X_val)

# Accuracy and F1 against y_val
cb_acc_score, cb_f1_score = accuracy_score(y_val, preds), f1_score(y_val, preds)

# Report the CatBoost scores
print('CatBoost classifier:')
print(' - accuracy:', f'{cb_acc_score:.3f}')
print(' - f1-score:', f'{cb_f1_score:.3f}' + '\n')

## 4.7. Logistic regression

In [None]:
%%time

# Logistic regression with a raised iteration cap, fitted on X / y
logreg = LogisticRegression(max_iter=2000, random_state=random_state)
logreg.fit(X, y)
preds = logreg.predict(X_val)

# Accuracy and F1 against y_val
lr_acc_score, lr_f1_score = accuracy_score(y_val, preds), f1_score(y_val, preds)

print('Logistic regression accuracy score:', f'{lr_acc_score:.3f}')
print('Logistic regression f1-score: ', f'{lr_f1_score:.3f}'+ '\n')

## 4.8. Multi layer perceptron

In [None]:
%%time

# Baseline multi-layer perceptron (defaults apart from seed and iteration cap)
mlp = MLPClassifier(max_iter=2000, random_state=random_state)
mlp.fit(X, y)
preds = mlp.predict(X_val)

# Accuracy and F1 of the baseline against y_val
mlp_acc_score, mlp_f1_score = accuracy_score(y_val, preds), f1_score(y_val, preds)

def create_model(trial):
    """Return an unfitted MLPClassifier configured from values suggested by ``trial``.

    ``trial`` must provide ``suggest_uniform`` and ``suggest_int``. Values are
    requested in a fixed order: learning_rate_init, hidden_layer_sizes (one
    integer), max_iter. The seed comes from the notebook-level ``random_state``.
    """
    # A dict literal is evaluated top to bottom, which keeps the suggestion order.
    sampled = {
        "learning_rate_init": trial.suggest_uniform("learning_rate_init", 0.0001, 0.01),
        "hidden_layer_sizes": trial.suggest_int('hidden_layer_sizes', 8, 128),
        "max_iter": trial.suggest_int('max_iter', 500, 3000),
    }
    return MLPClassifier(random_state=random_state, **sampled)

# --- MLP, hyperparameters selected on accuracy ---
optimiser = Optimiser('acc')
mlp_acc_params = optimiser.optimise()
mlp_acc_params.update(random_state=random_state)
mlp_acc = MLPClassifier(**mlp_acc_params)
mlp_acc.fit(X, y)
preds = mlp_acc.predict(X_val)

# Scores of the accuracy-selected model on X_val / y_val
mlp_acc_acc_score, mlp_acc_f1_score = accuracy_score(y_val, preds), f1_score(y_val, preds)

# --- MLP, hyperparameters selected on F1 ---
optimiser = Optimiser('f1')
mlp_f1_params = optimiser.optimise()
mlp_f1_params.update(random_state=random_state)
mlp_f1 = MLPClassifier(**mlp_f1_params)
mlp_f1.fit(X, y)
preds = mlp_f1.predict(X_val)

# Scores of the F1-selected model on X_val / y_val
mlp_f1_acc_score, mlp_f1_f1_score = accuracy_score(y_val, preds), f1_score(y_val, preds)

In [None]:
# Report accuracy and F1 for the baseline and the two tuned MLP models.
# Each row is (heading, accuracy, f1-score).
_mlp_report = (
    ('Pre-optimised MLP classifier:', mlp_acc_score, mlp_f1_score),
    ('Accuracy optimised MLP:', mlp_acc_acc_score, mlp_acc_f1_score),
    ('F1 optimised MLP:', mlp_f1_acc_score, mlp_f1_f1_score),
)
for _heading, _acc, _f1 in _mlp_report:
    print(_heading)
    print(' - accuracy:', f'{_acc:.3f}')
    print(' - f1-score:', f'{_f1:.3f}' + '\n')

## 4.9. Linear discriminant analysis

In [None]:
%%time

# Linear discriminant analysis with default settings, fitted on X / y
lda = LinearDiscriminantAnalysis()
lda.fit(X, y)
preds = lda.predict(X_val)

# Accuracy and F1 against y_val
lda_acc_score, lda_f1_score = accuracy_score(y_val, preds), f1_score(y_val, preds)

# Print the stored scores rather than recomputing them
print('Linear discriminant analysis accuracy score:', f'{lda_acc_score:.3f}')
print('Linear discriminant analysis f1-score: ', f'{lda_f1_score:.3f}'+ '\n')

## 4.10. Support vector classifier

In [None]:
%%time

# Support vector classifier; class_weights is defined earlier in the notebook
svc = SVC(class_weight=class_weights, max_iter=2000, random_state=random_state)
svc.fit(X, y)
preds = svc.predict(X_val)

# Accuracy and F1 against y_val
svc_acc_score, svc_f1_score = accuracy_score(y_val, preds), f1_score(y_val, preds)

# Report the SVC scores
print('SVC classifier:')
print(' - accuracy:', f'{svc_acc_score:.3f}')
print(' - f1-score:', f'{svc_f1_score:.3f}' + '\n')

## 4.11. Bagging classifier

In [None]:
%%time

# Baseline bagging classifier (defaults apart from the seed), fitted on X / y
bc = BaggingClassifier(random_state=random_state)
bc.fit(X, y)
preds = bc.predict(X_val)

# Accuracy and F1 of the baseline against y_val
bc_acc_score, bc_f1_score = accuracy_score(y_val, preds), f1_score(y_val, preds)

# Search space for the bagging classifier trials
def create_model(trial):
    """Return an unfitted BaggingClassifier configured from values suggested by ``trial``.

    ``trial`` must provide ``suggest_int``. Values are requested in a fixed
    order: n_estimators, then max_samples (an integer). The seed comes from
    the notebook-level ``random_state``.
    """
    # A dict literal is evaluated top to bottom, which keeps the suggestion order.
    sampled = {
        'n_estimators': trial.suggest_int('n_estimators', 2, 200),
        'max_samples': trial.suggest_int('max_samples', 1, 100),
    }
    return BaggingClassifier(random_state=random_state, **sampled)

# --- Bagging classifier, hyperparameters selected on accuracy ---
optimiser = Optimiser('acc')
bc_acc_params = optimiser.optimise()
bc_acc_params.update(random_state=random_state)
bc_acc = BaggingClassifier(**bc_acc_params)
bc_acc.fit(X, y)
preds = bc_acc.predict(X_val)

# Scores of the accuracy-selected model on X_val / y_val
bc_acc_acc_score, bc_acc_f1_score = accuracy_score(y_val, preds), f1_score(y_val, preds)

# --- Bagging classifier, hyperparameters selected on F1 ---
optimiser = Optimiser('f1')
bc_f1_params = optimiser.optimise()
bc_f1_params.update(random_state=random_state)
bc_f1 = BaggingClassifier(**bc_f1_params)
bc_f1.fit(X, y)
preds = bc_f1.predict(X_val)

# Scores of the F1-selected model on X_val / y_val
bc_f1_acc_score, bc_f1_f1_score = accuracy_score(y_val, preds), f1_score(y_val, preds)

In [None]:
# Report accuracy and F1 for the baseline and the two tuned bagging classifiers.
# Each row is (heading, accuracy, f1-score).
_bc_report = (
    ('Pre-optimised bagging classifier:', bc_acc_score, bc_f1_score),
    ('Accuracy optimised bagging classifier:', bc_acc_acc_score, bc_acc_f1_score),
    ('F1 optimised bagging classifier:', bc_f1_acc_score, bc_f1_f1_score),
)
for _heading, _acc, _f1 in _bc_report:
    print(_heading)
    print(' - accuracy:', f'{_acc:.3f}')
    print(' - f1-score:', f'{_f1:.3f}' + '\n')

## 4.12. Gaussian NB

In [None]:
%%time

# Gaussian naive Bayes with default settings, fitted on X / y
gnb = GaussianNB()
gnb.fit(X, y)
preds = gnb.predict(X_val)

# Accuracy and F1 against y_val
gnb_acc_score, gnb_f1_score = accuracy_score(y_val, preds), f1_score(y_val, preds)

# Print the stored scores rather than recomputing them
print('Gaussian naive bayes analysis accuracy score:', f'{gnb_acc_score:.3f}')
print('Gaussian naive bayes analysis f1-score: ', f'{gnb_f1_score:.3f}'+ '\n')

## 4.13. K-Nearest Neighbours

In [None]:
%%time

# Baseline k-nearest-neighbours classifier with default settings, fitted on X / y
knn = KNeighborsClassifier()
knn.fit(X, y)
preds = knn.predict(X_val)

# Accuracy and F1 of the baseline against y_val
knn_acc_score, knn_f1_score = accuracy_score(y_val, preds), f1_score(y_val, preds)

# Set the terms for the trials
def create_model(trial):
    """Return an unfitted KNeighborsClassifier configured from ``trial``.

    ``trial`` must provide ``suggest_int`` and ``suggest_categorical``.
    Values are requested in a fixed order: n_neighbors (even values from 2 to
    30), then weights ("uniform" or "distance").
    """
    # `step` is passed by keyword: newer Optuna releases deprecate passing it
    # positionally, and the keyword form is accepted by older releases too.
    n_neighbors = trial.suggest_int('n_neighbors', 2, 30, step=2)
    weights = trial.suggest_categorical("weights", ["uniform", "distance"])
    model = KNeighborsClassifier(
        n_neighbors=n_neighbors,
        weights=weights
    )
    return model

# --- KNN, hyperparameters selected on accuracy ---
# KNeighborsClassifier takes no seed, so the returned parameters are used as-is.
optimiser = Optimiser('acc')
knn_acc_params = optimiser.optimise()
knn_acc = KNeighborsClassifier(**knn_acc_params)
knn_acc.fit(X, y)
preds = knn_acc.predict(X_val)

# Scores of the accuracy-selected model on X_val / y_val
knn_acc_acc_score, knn_acc_f1_score = accuracy_score(y_val, preds), f1_score(y_val, preds)

# --- KNN, hyperparameters selected on F1 ---
optimiser = Optimiser('f1')
knn_f1_params = optimiser.optimise()
knn_f1 = KNeighborsClassifier(**knn_f1_params)
knn_f1.fit(X, y)
preds = knn_f1.predict(X_val)

# Scores of the F1-selected model on X_val / y_val
knn_f1_acc_score, knn_f1_f1_score = accuracy_score(y_val, preds), f1_score(y_val, preds)

In [None]:
# Print the scores for the baseline and the two tuned KNN models.
# The headings name KNN explicitly so this output cannot be confused with the
# bagging classifier's report.
print('Pre-optimised KNN classifier:')
print(' - accuracy:', f'{knn_acc_score:.3f}')
print(' - f1-score:', f'{knn_f1_score:.3f}' + '\n')


# Print the output:
print('Accuracy optimised KNN classifier:')
print(' - accuracy:', f'{knn_acc_acc_score:.3f}')
print(' - f1-score:', f'{knn_acc_f1_score:.3f}' + '\n')

# Print the output:
print('F1 optimised KNN classifier:')
print(' - accuracy:', f'{knn_f1_acc_score:.3f}')
print(' - f1-score:', f'{knn_f1_f1_score:.3f}'+ '\n')

## 4.14. Summary of tuned models' performance
Now I will build a chart showing how the accuracy of the models compares.

In [None]:
# Validation accuracy per model. In the keys, the 'OA' prefix marks the
# accuracy-optimised variant and 'OF' the F1-optimised variant.
acc_dict = {
    # Gradient boosting
    'GBC': gbc_acc_score, 'OAGBC': gbc_acc_acc_score, 'OFGBC': gbc_f1_acc_score,
    # Linear models and SVC
    'LR': lr_acc_score,
    'LDA': lda_acc_score,
    'SVC': svc_acc_score,
    # Bagging
    'BC': bc_acc_score, 'OABC': bc_acc_acc_score, 'OFBC': bc_f1_acc_score,
    # Naive Bayes and nearest neighbours
    'GNB': gnb_acc_score,
    'KNN': knn_acc_score, 'OAKNN': knn_acc_acc_score, 'OFKNN': knn_f1_acc_score,
    # Neural network
    'MLP': mlp_acc_score, 'OAMLP': mlp_acc_acc_score, 'OFMLP': mlp_f1_acc_score,
    # Boosting libraries
    'LGBM': lgb_acc_score, 'OALGBM': lgb_acc_acc_score, 'OFLGBM': lgb_f1_acc_score,
    'XGB': xgb_acc_score, 'OAXGB': xgb_acc_acc_score, 'OFXGB': xgb_f1_acc_score,
    'CB': cb_acc_score,
}

# One row per model; the single unnamed column (0) becomes 'Accuracy'
tuned_models = (
    pd.DataFrame
    .from_dict(acc_dict, orient='index')
    .rename(columns={0: 'Accuracy'})
)

In [None]:
# Plotting the outputs: one bar per entry of tuned_models, showing its accuracy
fig, ax = plt.subplots(ncols=1, nrows=1, figsize=(20,4))

# Setting the title
ax.set_title('Tuned model accuracy\n', fontsize=14, fontweight='bold')

# Setting y-axis range (fixed window so that small differences are visible)
ax.set_ylim((0.8,0.92))

# Setting label ticks size (x labels are rotated and right-aligned):
plt.yticks(fontsize=12)
plt.xticks(fontsize=12, rotation=45, horizontalalignment='right')

# Creating the bar plot (no ax is passed, so seaborn draws on the current axes)
sns.barplot(x=tuned_models.index, y=tuned_models['Accuracy']);

# Creating the axis labels (`font` is defined earlier in the notebook):
ax.set_xlabel('Model', fontproperties=font)
ax.set_ylabel('Accuracy', fontproperties=font)

# Adding the data label (`add_value_labels` is a helper defined earlier in the notebook)
add_value_labels(ax, fontsize=12, spacing=5, label_format="{:.3f}");  # Adds the values above the bars

# 5. Ensembling
We will now build an ensemble model using our base predictors above and a logistic regression meta model. 

In [None]:
# Set up our superlearner with folds (`folds` is defined earlier in the notebook)
ensemble = SuperLearner(folds=folds, random_state=random_state, 
                        shuffle=False, scorer = accuracy_score, sample_size=len(X))

# Add our already-instantiated baseline models as the base layer models
ensemble.add([
    gbc, 
    svc, 
    bc,
    gnb,
    knn,
    mlp, 
    lgb,
    cb, 
    xgb,
])

# Add logistic regression as the metalearner
ensemble.add_meta(LogisticRegression(max_iter=20000))

# Now to fit the model on the training data
ensemble.fit(X, y)

# Make the predictions using the validation data:
preds = ensemble.predict(X_val)

# Report accuracy and F1 of the ensemble against y_val
print('SuperLearner accuracy: ', f'{accuracy_score(y_val, preds): .3f}')
print('SuperLearner f1-score: ', f'{f1_score(y_val, preds): .3f}')

Optimising the SuperLearner:

In [None]:
# Catalogue of candidate estimators for the ensemble search, keyed by short name.
# Keys prefixed 'OA' are built from the accuracy-optimised parameter dicts and
# keys prefixed 'OF' from the F1-optimised ones; unprefixed keys are baselines.
mdict = {
    'GBC': GradientBoostingClassifier(random_state=random_state), 
    'OAGBC': GradientBoostingClassifier(**gbc_acc_params), 
    'OFGBC': GradientBoostingClassifier(**gbc_f1_params),
    'MLP': MLPClassifier(random_state=random_state, max_iter=2000), 
    'OAMLP': MLPClassifier(**mlp_acc_params), 
    'OFMLP': MLPClassifier(**mlp_f1_params),
    'LR': LogisticRegression(random_state=random_state,max_iter=2000),
    'LDA': LinearDiscriminantAnalysis(),
    'SVC': SVC(random_state=random_state, class_weight=class_weights), 
    'LGBM': LGBMClassifier(random_state=random_state), 
    'OALGBM': LGBMClassifier(**lgb_acc_params), 
    'OFLGBM': LGBMClassifier(**lgb_f1_params),
    'XGB': XGBClassifier(random_state=random_state,use_label_encoder=False), 
    'OAXGB': XGBClassifier(**xgb_acc_params), 
    'OFXGB': XGBClassifier(**xgb_f1_params),
    'CB': CatBoostClassifier(random_state=random_state, verbose=0, early_stopping_rounds=100), 
    'BC': BaggingClassifier(random_state=random_state), 
    'OABC': BaggingClassifier(**bc_acc_params), 
    'OFBC': BaggingClassifier(**bc_f1_params),
    'GNB': GaussianNB(),
    'KNN': KNeighborsClassifier(), 
    'OAKNN': KNeighborsClassifier(**knn_acc_params), 
    'OFKNN': KNeighborsClassifier(**knn_f1_params),
}

In [None]:
def create_model(trial):
    """Assemble a SuperLearner whose members and meta learner are chosen by ``trial``.

    The trial first suggests how many base-model slots to fill (3 to 15), then
    one ``mdict`` key per slot, and finally the key of the meta learner
    ('head'). A key picked for several slots is added to the ensemble once.

    Returns the configured, unfitted SuperLearner.
    """
    # Base learners and the head are both drawn from the same catalogue
    candidate_names = list(mdict.keys())
    head_candidates = list(mdict.keys())

    n_models = trial.suggest_int("n_models", 3, 15)
    picks = [
        trial.suggest_categorical('model_{}'.format(slot), candidate_names)
        for slot in range(n_models)
    ]
    # dict.fromkeys drops repeated names while keeping first-seen order
    chosen = list(dict.fromkeys(picks))

    stack = SuperLearner(folds=folds, random_state=random_state, shuffle=False,
                         scorer=accuracy_score, sample_size=len(X))
    stack.add([mdict[name] for name in chosen])

    # The meta learner is suggested last
    head = trial.suggest_categorical('head', head_candidates)
    stack.add_meta(mdict[head])

    return stack

def objective(trial):
    """Score one trial: build its ensemble, fit on X / y, return accuracy on X_val / y_val."""
    candidate = create_model(trial)
    candidate.fit(X, y)
    return accuracy_score(y_val, candidate.predict(X_val))

Now for the hyperparameter tuning! This process is very slow, so go and make a cup of tea once you click 'run'...

In [None]:
%%time
# Seeded TPE sampler
sampler = TPESampler(seed=random_state)

# The objective returns an accuracy, so the study maximises it
study = optuna.create_study(sampler=sampler, direction="maximize")

# Run the search, bounded by both a trial count and a time limit in seconds
study.optimize(objective, n_trials=350, timeout=7200)

In [None]:
# Split the best trial's parameters into the meta learner ('head') and the
# base-model picks; 'n_models' only recorded how many slots were filled.
params = study.best_params
head = params.pop('head')
params.pop('n_models')

# The remaining values are the model names; keep each once, in first-seen order
result = list(dict.fromkeys(params.values()))

print("Head model is: "+str(head))
print("Component models are: "+str(result))

In [None]:
# Instantiate our optimised ensemble model:
# NOTE(review): unlike the ensembles built during the search, this one does not
# pass shuffle, scorer or sample_size -- confirm the defaults are acceptable.
ensemble = SuperLearner(folds=folds, random_state=random_state)

# Add the models from the result above to the ensemble
# (`result` holds mdict keys, so each is looked up in the catalogue):
models = [mdict[item] for item in result]
ensemble.add(models)

# Add the meta model from the above:
ensemble.add_meta(mdict[head])

# Fit the optimised ensemble model:
ensemble.fit(X, y)

# Make the final predictions on the validation data:
preds = ensemble.predict(X_val)

# Print our output:
print('Optimised SuperLearner accuracy: ', f'{accuracy_score(y_val, preds):.3f}')
print('Optimised SuperLearner f1-score: ', f'{f1_score(y_val, preds):.3f}')

# 6. Making a prediction

First, I need to analyse the test set to see if any data are missing.

In [None]:
test.isna().sum()

So I will need to fill the missing age data and alter the missing Cabin data as I did before. 

Now I need to change the test set to match the inputs to our model.

In [None]:
# Feature engineering for the test set. Every write goes through `test[...]`
# or `test.loc[...]` directly: chained forms such as `test.Cabin.loc[mask] = v`
# or `test['Fare'].fillna(..., inplace=True)` may act on a temporary object and
# are silently ignored when pandas Copy-on-Write is active.

# Creating the 'Title' column: the text between the comma and the first full stop of the name
test['Title'] = test['Name'].apply(lambda name: name.split(',')[1].split('.')[0].strip())

# Any title outside the common four is grouped under 'Other'
titles = ['Mr', 'Mrs', 'Miss', 'Master', 'Other']
test.loc[~test['Title'].isin(titles), 'Title'] = 'Other'

# Dropping the name column
test.drop('Name', axis=1, inplace=True)

# Create a cabin letter column (first character of 'Cabin') and fill the unknowns with 'Unknown'
test['Cabin_let'] = test['Cabin'].str[0].fillna('Unknown')

# Now we can drop the old 'Cabin' column and change the name of the 'Cabin_let' column
test.drop('Cabin', axis=1, inplace=True)
test.rename(columns={'Cabin_let': 'Cabin'}, inplace=True)

# Change the tail of the cabin letters to 'Other'
let_to_replace = ['F','G','T']
replace_with = 'Other'
test.loc[test['Cabin'].isin(let_to_replace), 'Cabin'] = replace_with

# Let's create a 'family' column to show who had children/parents/spouse onboard
test['Family'] = test['SibSp'] + test['Parch']

# ...and kill the original columns
test.drop(['SibSp', 'Parch'], axis=1, inplace=True)

# Bucket the family size: 0 -> 'None', 1-3 -> 'Small_family', 4-10 -> 'Large_family'
small_num_to_replace = [1,2,3]
small_fam = 'Small_family'
large_num_to_replace = [4,5,6,7,8,9,10]
large_fam = 'Large_family'

family_labels = {0: 'None'}
family_labels.update(dict.fromkeys(small_num_to_replace, small_fam))
family_labels.update(dict.fromkeys(large_num_to_replace, large_fam))
# Sizes without a bucket are left unchanged
test['Family'] = test['Family'].map(lambda size: family_labels.get(size, size))

# Taking the log of the fare to reduce skewness:
# NOTE(review): the result of this expression is discarded, so 'Fare' stays on
# its raw scale. Assign it back only if the training 'Fare' was transformed the
# same way (not visible here). If it is assigned, impute the missing fare
# first, because this lambda maps NaN to 0.
test["Fare"].map(lambda i: np.log(i) if i > 0 else 0)

# Drop the ticket and Passenger ID
test.drop('Ticket', axis=1, inplace=True)
#test.drop('PassengerId', axis=1, inplace=True)

# Imputing the missing 'Fare' value with the median fare
test['Fare'] = test['Fare'].fillna(test['Fare'].median())

Filling in the missing 'Age' column.

In [None]:
# Partition the passengers by whether 'Age' is recorded: rows without an age
# get a prediction, rows with an age are used to fit the regressor.
age_missing = test['Age'].isna()
test_test = test.loc[age_missing].copy()
test_train_val = test.loc[~age_missing].copy()

# Predictors for the age model
age_var = ['Pclass', 'Sex', 'Fare', 'Embarked', 'Title', 'Family', 'Cabin']

# Columns CatBoost should treat as categorical
cat_features = ['Sex', 'Embarked', 'Cabin', 'Title', 'Family']

# Hold out 20% of the known-age rows as the evaluation set
X, y = test_train_val[age_var], test_train_val['Age']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Wrap both splits as CatBoost pools
pool_train = Pool(X_train, y_train, cat_features=cat_features)
pool_val = Pool(X_val, y_val, cat_features=cat_features)

# RMSE regressor; use_best_model needs the eval_set passed to fit below
cb_model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.03,
    depth=6,
    loss_function='RMSE',
    eval_metric='RMSE',
    cat_features=cat_features,
    use_best_model=True,
    verbose=0,
)
cb_model.fit(pool_train, eval_set=pool_val, plot=False);

# Predict an age for every passenger whose age is missing
X_test = test_test[age_var]
y_pred = cb_model.predict(X_test)

# Put the predictions in a one-column frame indexed like the rows they belong
# to (index named 'Index'), so they line up with `test` by label
y_test = pd.DataFrame({'Age': y_pred}, index=pd.Index(X_test.index, name='Index'))

# Fill the gaps in `test` from the aligned predictions
test.fillna(y_test, axis=1, inplace=True)

In [None]:
# One hot encoding on the test set:
# each listed column is replaced by indicator columns named '<column>_<value>'.
cat_features = ['Sex','Embarked','Title','Cabin','Family']
test = pd.get_dummies(test,columns=cat_features,prefix=cat_features)

In [None]:
# Columns fed to the model, in a fixed order.
# NOTE(review): presumably this must match the training feature order -- confirm.
# Selecting with test[var] raises KeyError if any dummy column is absent.
var = ['Pclass','Age','Fare','Sex_female','Sex_male','Embarked_C','Embarked_Q','Embarked_S',
       'Title_Master','Title_Miss','Title_Mr','Title_Mrs','Title_Other',
       'Cabin_A','Cabin_B','Cabin_C','Cabin_D','Cabin_E','Cabin_Other','Cabin_Unknown',
       'Family_Large_family','Family_None','Family_Small_family']

X_test = test[var]

In [None]:
# Now we need to scale the data in the same way that we did with the training data:
scale_data = True
# NOTE(review): `shuffle` is not used anywhere in this cell, and the comment
# below contradicts the value True -- confirm whether it is still needed.
shuffle = True # Needs to be turned off if we are scaling the data as the ensemble model fails to fit otherwise
scaler = MinMaxScaler()

# Scaling the data
# NOTE(review): a brand-new scaler is fitted on the test data here, so the
# min/max come from the test set rather than from the training data. To scale
# "in the same way", the scaler fitted on the training data should be reused
# with .transform() -- that scaler is not visible from this cell; confirm.
if scale_data:
    X_test = scaler.fit_transform(X_test)
    # fit_transform returns an array, so rebuild the frame with the column names
    X_test = pd.DataFrame(data=X_test, columns=var)

In [None]:
# Make the predictions with the fitted ensemble and cast them to integers
predictions = ensemble.predict(X_test).astype(int)

In [None]:
# Creating our output: one row per passenger, pairing the id with the predicted class
output = pd.DataFrame({'PassengerId': test['PassengerId'],
                       'Survived': predictions})

In [None]:
# Creating the csv output file for submission (the index is not written)
output.to_csv('ODP_submission.csv', index=False)

# 7. Conclusion 

This model achieves 78% accuracy on the test-set predictions, which places it at about 6,000th out of roughly 28,000 submissions (including the people who have obviously cheated!), a pretty good result! Feel free to copy/reuse elements of code from the above. 