# Loading packages and modules

In [None]:
# core packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.options.display.max_columns = None

# feature engineering
from sklearn.cluster import KMeans

# preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.decomposition import SparsePCA
import umap
from sklearn.pipeline import Pipeline

# modelling
from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMClassifier

# model evaluation
from sklearn.metrics import classification_report, fbeta_score, roc_auc_score

# hyperparameter optimization
import optuna

# understanding
from lightgbm import plot_importance, plot_metric
import shap

# Defining utility functions

In [None]:
# function to instantiate the model
def instantiate_model(model_hyperparameters):
    """Build an unfitted LightGBM classifier.

    Args:
        model_hyperparameters: mapping whose items are unpacked as keyword
            arguments of the ``LGBMClassifier`` constructor.

    Returns:
        The newly created ``LGBMClassifier`` instance.
    """
    return LGBMClassifier(**model_hyperparameters)


# function to instantiate a pipeline if the embedding is selected
def create_pipeline(try_embedding, embedding_comps, scaler_embedding):
    """Assemble an unfitted two-step pipeline: a scaler followed by an embedding.

    Args:
        try_embedding: name of the embedding; 'UMAP' and 'PCA' select the
            respective estimators, any other value selects SparsePCA.
        embedding_comps: number of components passed to the embedding.
        scaler_embedding: truthy to scale with StandardScaler, falsy to scale
            with MinMaxScaler.

    Returns:
        A ``Pipeline`` with the steps 'scaler' and 'embedding'.
    """
    # picking the scaler from the flag
    scaling_step = StandardScaler() if scaler_embedding else MinMaxScaler()

    # resolving the embedding class from its name, with SparsePCA as the fallback
    if try_embedding == 'UMAP':
        embedding_class = umap.UMAP
    elif try_embedding == 'PCA':
        embedding_class = PCA
    else:
        embedding_class = SparsePCA

    # every embedding takes the same arguments and the same fixed seed
    embedding_step = embedding_class(n_components = embedding_comps, random_state = 42)

    return Pipeline(steps = [('scaler', scaling_step), ('embedding', embedding_step)])


# function to generate predictions
def generate_predictions(model, X_values):
    """Return the probability and class predictions of a fitted model.

    Args:
        model: object exposing ``predict_proba`` and ``predict``.
        X_values: data handed unchanged to both prediction methods.

    Returns:
        Tuple ``(predicted_probabilities, predicted_classes)``.
    """
    # probabilities are requested first, then the class values
    return model.predict_proba(X_values), model.predict(X_values)


# function to calculate the desired metrics from the predictions
def extract_metrics(y_values, predicted_probabilities, predicted_classes, report = False):
    """Compute F1 and ROC AUC scores for a set of predictions.

    Args:
        y_values: true labels.
        predicted_probabilities: class probabilities, used for the AUC scores.
        predicted_classes: predicted labels, used for the F1 scores and the report.
        report: when truthy, the classification report is printed first.

    Returns:
        Dictionary with the keys 'f1_macro', 'f1_micro', 'auc_ovr' and 'auc_ovo'.
    """
    # optional classification report
    if report:
        print('Classification report:')
        print(classification_report(y_true = y_values, y_pred = predicted_classes, zero_division = 0))

    return {
        # F-beta with beta = 1 (the F1 score), macro and micro averaged
        'f1_macro': fbeta_score(y_true = y_values, y_pred = predicted_classes, average = 'macro', beta = 1),
        'f1_micro': fbeta_score(y_true = y_values, y_pred = predicted_classes, average = 'micro', beta = 1),
        # macro averaged ROC AUC, one-vs-rest and one-vs-one
        'auc_ovr': roc_auc_score(y_true = y_values, y_score = predicted_probabilities, multi_class = 'ovr', average = 'macro'),
        'auc_ovo': roc_auc_score(y_true = y_values, y_score = predicted_probabilities, multi_class = 'ovo', average = 'macro'),
    }



# function to evaluate a stratified k-fold to the data given a set of hyperparameters
def evaluate_stratified_kfold(X, y, n_folds, hyperparameters, pruning, weighting, try_embedding, embedding_comps, scaler_embedding, use_kmeans_features, final_score = False, val_report = False):
    """Cross-validate a LightGBM classifier with a shuffled stratified k-fold split.

    For every fold, an optional scaler + embedding pipeline is fitted on the training
    part, a classifier is trained with early stopping against the validation part, and
    the validation multi logloss and one-vs-one AUC are collected. When ``final_score``
    is set, the fitted models (and pipelines) are kept and the test set probabilities
    of all fold models are summed.

    Besides its arguments the function reads the module-level names ``default_weights``
    (for weighting values other than 0 and 1), ``feature_kmeans_train`` and
    ``feature_kmeans_test`` (when ``use_kmeans_features`` is set) and ``X_test``
    (when ``final_score`` is set).

    Args:
        X: dataframe with the training features.
        y: encoded target, indexed positionally with the fold indices.
        n_folds: number of splits of the stratified k-fold.
        hyperparameters: keyword arguments for the classifier. The mapping is not
            modified; a copy extended with ``class_weight`` is used.
        pruning: unused. Kept for interface compatibility, as the callback is not
            passed to the fit call.
        weighting: 0 disables class weights, 1 selects 'balanced', any other value
            divides ``default_weights`` by it and rounds the result up.
        try_embedding: None to skip the embedding, otherwise the embedding name
            forwarded to ``create_pipeline``.
        embedding_comps: number of embedding components (forwarded to ``create_pipeline``).
        scaler_embedding: scaler flag (forwarded to ``create_pipeline``).
        use_kmeans_features: whether to append the 'feature_kmeans' column.
        final_score: whether to keep the fitted objects and predict the test set.
        val_report: whether to print the classification report of each validation fold.

    Returns:
        ``(mean loss, mean AUC OvO)`` when ``final_score`` is falsy. Otherwise
        ``(mean loss, mean AUC OvO, models, pipelines, summed_probabilities)`` if an
        embedding is used, else ``(mean loss, mean AUC OvO, models, summed_probabilities)``.
    """
    # validation loss and validation AUC one-vs-one of each fold
    fold_losses = []
    ovo_losses = []

    # defining the weighting scheme
    if weighting == 0:
        ## turned off
        class_weight = None
    elif weighting == 1:
        ## the balanced choice
        class_weight = 'balanced'
    else:
        ## downweighting the default weights by the selected ratio
        class_weight = {label: np.ceil(weight / weighting) for label, weight in default_weights.items()}

    # extending a copy of the hyperparameters so the dictionary of the caller stays untouched
    hyperparameters = {**hyperparameters, 'class_weight': class_weight}

    # instantiating the K fold
    skf = StratifiedKFold(n_splits = n_folds, shuffle = True, random_state = 42)

    # printing the hyperparameters
    print(f'\nThese are the hyperparameters that will be used across folds for this trial:\n{hyperparameters}')
    print(f'\nThese are the preprocessing options that will be used across folds:\ntry_embedding: {try_embedding}; embedding_comps: {embedding_comps}; scale_embedding: {scaler_embedding}; use_kmeans_features: {use_kmeans_features}')

    # preparing the training features once, as they are the same for every fold
    X_trial = X.copy()
    if use_kmeans_features:
        X_trial['feature_kmeans'] = feature_kmeans_train

    # creating the objects that are only needed for the final fit
    if final_score:
        ## fitted model and fitted pipeline of each fold
        models = dict()
        pipelines = dict()
        ## test features, prepared once like the training features
        X_test_trial = X_test.copy()
        if use_kmeans_features:
            X_test_trial['feature_kmeans'] = feature_kmeans_test
        ## accumulator of the test probabilities, one column per class found in the target
        summed_probabilities = np.zeros(shape = (X_test_trial.shape[0], np.unique(y).size))

    # running through each of the folds
    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        print(f'\n--------------------------------------- Starting fold {fold + 1} ---------------------------------------')

        # getting the train and validation data for this fold
        X_train_fold, X_val_fold = X_trial.iloc[train_idx], X_trial.iloc[val_idx]
        y_train_fold, y_val_fold = y[train_idx], y[val_idx]

        # fitting the embedding on the training part only and transforming both parts
        pipeline = None
        if try_embedding is not None:
            print('Creating the pipeline for applying the embedding.')
            pipeline = create_pipeline(try_embedding = try_embedding, embedding_comps = embedding_comps, scaler_embedding = scaler_embedding)
            print('Training the pipeline with the embedding.')
            pipeline.fit(X_train_fold)
            print('Applying the embedding.')
            X_train_fold = pipeline.transform(X_train_fold)
            X_val_fold = pipeline.transform(X_val_fold)

        # creating a model instance
        clf = instantiate_model(model_hyperparameters = hyperparameters)

        # fitting the model; the pruning callback is deliberately not passed, as it
        # only works if there is a single metric
        # NOTE(review): early_stopping_rounds / verbose as fit arguments depend on the
        # installed LightGBM version - confirm before upgrading the package
        clf.fit(X = X_train_fold,
                y = y_train_fold,
                eval_set = [(X_train_fold, y_train_fold), (X_val_fold, y_val_fold)],
                eval_metric = 'multi_logloss',
                early_stopping_rounds = 10,
                verbose = False,
               )

        # generating the predictions for the train and validation sets
        train_probas, train_classes = generate_predictions(model = clf, X_values = X_train_fold)
        val_probas, val_classes = generate_predictions(model = clf, X_values = X_val_fold)

        # extracting the metrics for the training and validation sets
        train_metrics = extract_metrics(y_values = y_train_fold, predicted_probabilities = train_probas, predicted_classes = train_classes, report = False)
        val_metrics = extract_metrics(y_values = y_val_fold, predicted_probabilities = val_probas, predicted_classes = val_classes, report = val_report)

        # storing the validation loss ('valid_1' is the second entry of eval_set) and AUC
        fold_losses.append(clf.best_score_['valid_1']['multi_logloss'])
        ovo_losses.append(val_metrics["auc_ovo"])

        # extracting all necessary information if the choice is the final fit
        if final_score:
            ## saving the model in the dictionary
            models[f'model_{fold + 1}'] = clf
            if pipeline is not None:
                ## applying the pipeline of this fold to the test data and saving it
                X_test_fold = pipeline.transform(X_test_trial)
                pipelines[f'pipeline_{fold + 1}'] = pipeline
            else:
                X_test_fold = X_test_trial
            ## adding the test probabilities of this fold to the running sum
            test_probas, _ = generate_predictions(model = clf, X_values = X_test_fold)
            summed_probabilities = summed_probabilities + test_probas

        # printing the fold results
        print(f'Loss (Training|Validation)       : {np.round(clf.best_score_["training"]["multi_logloss"], 5)} | {np.round(clf.best_score_["valid_1"]["multi_logloss"], 5)}')
        print(f'F1 Macro (Training|Validation)   : {np.round(train_metrics["f1_macro"], 5)} | {np.round(val_metrics["f1_macro"], 5)}')
        print(f'AUC OvO (Training|Validation)    : {np.round(train_metrics["auc_ovo"], 5)} | {np.round(val_metrics["auc_ovo"], 5)}')

    # averaging the fold scores once
    mean_loss = np.mean(fold_losses)
    mean_auc_ovo = np.mean(ovo_losses)

    # printing the results obtained for this function evaluation
    print(f'\nEnd of Training!\nAverage loss: {mean_loss} | Average AUC OvO: {mean_auc_ovo}')
    print('-----------------------------------------------------------------------------------------------\n')

    # only the scores when this is not the final fit
    if not final_score:
        return (mean_loss, mean_auc_ovo)
    # scores, models and summed test probabilities, plus the pipelines if an embedding was used
    if try_embedding is not None:
        return (mean_loss, mean_auc_ovo, models, pipelines, summed_probabilities)
    return (mean_loss, mean_auc_ovo, models, summed_probabilities)

    
# function to perform the hyperparameter optimization
def objective(trial):
    """Objective function of the Optuna study.

    Samples a weighting scheme, the LightGBM hyperparameters and the preprocessing
    options from the trial, then evaluates them with ``evaluate_stratified_kfold``
    on the module-level ``X``, ``y`` and ``K``.

    Args:
        trial: Optuna trial used to suggest the values.

    Returns:
        Tuple ``(average AUC one-vs-one, average multi logloss)`` across the folds.
    """
    ############################################## HYPERPARAMETER SELECTION ##############################################

    # weighting scheme applied to the classes
    weighting_scheme = trial.suggest_int('weighting_scheme', 0, 6)

    # search space shared by both boosting types
    trial_hyperparameters = {
        'boosting_type': trial.suggest_categorical('boosting_type', ['gbdt', 'goss']),
        'num_leaves': trial.suggest_int('num_leaves', 3, 16),
        'max_depth': trial.suggest_int('max_depth', 3, 16),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.2),
        'n_estimators': trial.suggest_int('n_estimators', 50, 1500),
        'min_child_samples': trial.suggest_int('min_child_samples', 20, 1000),
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-4, 1e-2),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1.0),
        'colsample_bynode': trial.suggest_float('colsample_bynode', 0.4, 1.0),
        'extra_trees': trial.suggest_categorical('extra_trees', [False, True]),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 10.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0),
    }

    if trial_hyperparameters['boosting_type'] == 'goss':
        # hyperparameters specific to the goss boosting
        trial_hyperparameters['top_rate'] = trial.suggest_float('top_rate', 0.1, 0.4)
        trial_hyperparameters['other_rate'] = trial.suggest_float('other_rate', 0.05, 0.4)
    else:
        # bagging hyperparameters, only searched when the boosting is not goss
        trial_hyperparameters['subsample'] = trial.suggest_float('subsample', 0.4, 1.0)
        trial_hyperparameters['subsample_freq'] = trial.suggest_int('subsample_freq', 0, 50)

    # fixed settings that are not part of the search
    trial_hyperparameters.update({
        'objective': 'multiclass',
        'metric': 'multi_logloss',
        'num_class': 4,
        'random_state': 42,
        'silent': True,
        'verbosity': -1,
    })

    ############################################### PREPROCESSING SELECTION ##############################################

    # embedding layer to use, None meaning no embedding at all
    use_embedding = trial.suggest_categorical('use_embedding', [None, 'UMAP', 'PCA', 'SparsePCA'])

    if use_embedding is None:
        # without an embedding there is nothing to size or scale, but the kmeans feature may be added
        embedding_size = None
        use_std_scaler = None
        use_kmeans_features = trial.suggest_categorical('use_kmeans_features', [False, True])
    else:
        # with an embedding its size and scaler are searched and the kmeans feature is left out
        embedding_size = trial.suggest_int('embedding_size', 2, 15)
        use_std_scaler = trial.suggest_categorical('use_std_scaler', [False, True])
        use_kmeans_features = False

    ################################################# CALLBACK DEFINITION ################################################

    # pruning callback - won't have an effect here as we are optimizing more than one metric
    pruning_callback = optuna.integration.LightGBMPruningCallback(trial, metric = 'multi_logloss', valid_name = 'valid_1')

    ##################################################### MODEL FIT ######################################################

    # evaluating the sampled configuration through the stratified k-fold
    trial_loss, trial_auc = evaluate_stratified_kfold(X, y, n_folds = K, hyperparameters = trial_hyperparameters,
                                                      weighting = weighting_scheme,
                                                      pruning = pruning_callback,
                                                      final_score = False,
                                                      val_report = False,
                                                      try_embedding = use_embedding,
                                                      embedding_comps = embedding_size,
                                                      scaler_embedding = use_std_scaler,
                                                      use_kmeans_features = use_kmeans_features)

    # the AUC comes first and the loss second
    return trial_auc, trial_loss

# Loading the data

In [None]:
# reading the training, testing and sample submission data, in this order
train, test, submission = (
    pd.read_csv(filepath_or_buffer = csv_path)
    for csv_path in (
        '../input/tabular-playground-series-may-2021/train.csv',
        '../input/tabular-playground-series-may-2021/test.csv',
        '../input/tabular-playground-series-may-2021/sample_submission.csv',
    )
)

# reporting the dimensions of each dataset
print(f'Train data shape: {train.shape}')
print(f'Test data shape: {test.shape}')
print(f'Sample submission data shape: {submission.shape}')

# Exploring the data

Features with values below 0.

In [None]:
# names of the columns holding features
feature_columns = [name for name in train.columns if 'feature' in name]


# helper mapping each given column to the counts of its values below zero
def _count_negative_values(frame, columns):
    negative_counts = dict()
    for name in columns:
        # occurrences of every distinct value of the column
        occurrences = dict(frame[name].value_counts())
        negative_counts[name] = {value: count for value, count in occurrences.items() if value < 0}
    return negative_counts


# features of each dataset where the minimum is below 0
train_below = [name for name in feature_columns if train[name].min() < 0]
test_below = [name for name in feature_columns if test[name].min() < 0]

# number of instances below zero for each of those columns, per dataset
dict_train_below = _count_negative_values(train, train_below)
dict_test_below = _count_negative_values(test, test_below)

# printing the number of instances below 0 for each dataset
print(f'Number of instances below zero for each of the columns in the train set:\n{dict_train_below}.')
print(f'\nNumber of instances below zero for each of the columns in the test set:\n{dict_test_below}.')

# Preparing the data

Separating the target from the features.

In [None]:
# features: a copy of the training data restricted to the feature columns
X = train[feature_columns].copy()

# target column of the training data
target = train['target']

Encoding the target.

In [None]:
# label encoder fitted on the target
le = LabelEncoder().fit(target)
# encoded target
y = le.transform(target)

Setting up the default weighting scheme.

In [None]:
# distinct labels of the encoded target and how often each one occurs
labels, counts = np.unique(y, return_counts = True)
# count of the most frequent label
largest_count = np.max(counts)
# weight of each label: the largest count divided by the label count, rounded up
inverse_weights = [np.ceil(largest_count / count) for count in counts]
# default weight of each label
default_weights = {label: weight for label, weight in zip(labels, inverse_weights)}

Getting the test data.

In [None]:
# test features: a copy of the test data restricted to the feature columns
X_test = test[feature_columns].copy()

# Feature Engineering

Creating a feature based on KMeans.

In [None]:
# standardizing the features before clustering
scaler_cluster = StandardScaler()
X_scaled_cluster = scaler_cluster.fit_transform(X)

# inertia obtained for each number of clusters
sse = []

# setting the clusters values to be tried
cluster_values = [20, 30] + list(range(33, 80, 3))

# looping over the number of possible clusters
for n_cluster in cluster_values:
    # seeding the clustering like the other estimators of this notebook, so that
    # the curve below is reproducible between runs
    kmeans = KMeans(n_clusters = n_cluster, random_state = 42)
    # fitting the kmeans to the scaled data
    kmeans.fit(X_scaled_cluster)
    # extracting the inertia
    sse.append(kmeans.inertia_)

# plotting the inertia against the number of clusters
plt.plot(cluster_values, sse)
plt.xticks(cluster_values)
plt.xlabel("Number of Clusters")
plt.ylabel("Inertia")
plt.show()

In [None]:
# steps of the clustering pipeline: standardization followed by a seeded kmeans with 57 clusters
kmeans_steps = [('scaler', StandardScaler()), ('kmeans', KMeans(n_clusters = 57, random_state = 42))]

# building the pipeline and training it on the training features
kmeans_pipeline = Pipeline(steps = kmeans_steps)
kmeans_pipeline.fit(X)

# cluster assigned to each instance of the training and of the test data
feature_kmeans_train = kmeans_pipeline.predict(X)
feature_kmeans_test = kmeans_pipeline.predict(X_test)

# Hyperparameter Tuning

Subsetting best features (from version 5 of this notebook).

In [None]:
# features with most votes
feature_subset = [
    'feature_38', 'feature_6', 'feature_14', 'feature_15', 'feature_28', 'feature_31',
    'feature_34', 'feature_24', 'feature_9', 'feature_11', 'feature_12', 'feature_18',
    'feature_16', 'feature_23', 'feature_25', 'feature_37', 'feature_35',
]
## features that were also selected: feature_2, feature_1, feature_19, feature_22, feature_42, feature_48, feature_33, feature_7, feature_10, feature_17, feature_20, feature_43, feature_46

# keeping only the selected features in the training and test dataframes
X, X_test = X[feature_subset], X_test[feature_subset]

In [None]:
# number of splits, passed as n_folds to the stratified k-fold evaluation
K = 5

In [None]:
# creating a study with two objectives: the first value returned by the objective
# function (AUC one-vs-one) is maximized and the second (multi logloss) is minimized
study = optuna.create_study(directions = ['maximize', 'minimize'], pruner = optuna.pruners.MedianPruner())

In [None]:
# running the study for 100 trials, with a timeout of 60 * 60 * 7 (7 hours when expressed in seconds)
study.optimize(func = objective, n_trials = 100, timeout = 60 * 60 * 7)

In [None]:
# Pareto front of the trials over the two objectives (AUC one-vs-one and multi logloss)
optuna.visualization.plot_pareto_front(study, target_names = ['AUC (One vs. One)', 'Multi Logloss'])

In [None]:
# hyperparameter importances for the multi logloss (second value of each trial)
optuna.visualization.plot_param_importances(study, target = lambda x: x.values[1], target_name = 'Multi Logloss')

In [None]:
# hyperparameter importances for the AUC one-vs-one (first value of each trial)
optuna.visualization.plot_param_importances(study, target = lambda x: x.values[0], target_name = 'AUC (One vs One)')

In [None]:
# parallel coordinate plot of the hyperparameter combinations against the multi logloss (second value of each trial)
optuna.visualization.plot_parallel_coordinate(study, target = lambda x: x.values[1], target_name = 'Multi Logloss')

In [None]:
# parallel coordinate plot of the hyperparameter combinations against the AUC one-vs-one (first value of each trial)
optuna.visualization.plot_parallel_coordinate(study, target = lambda x: x.values[0], target_name = 'AUC (One vs One)')

# Training the tuned model for submission 

In [None]:
# taking the first entry of the best trials of the study
selected_trial = study.best_trials[0]
# parameters sampled in that trial
best_hyperparameters = selected_trial.params
print(f'The selected hyperparameter combination was:\n{best_hyperparameters}')

In [None]:
# unpacking the best hyperparameters: the keys that are not model arguments are
# removed from the dictionary and kept in their own variables
## weighting scheme
weighting_scheme = best_hyperparameters.pop('weighting_scheme')
## embedding choice
use_embedding = best_hyperparameters.pop('use_embedding')

if use_embedding is not None:
    ## embedding dimensions and scaler type, only sampled when an embedding is used
    embedding_size = best_hyperparameters.pop('embedding_size')
    use_std_scaler = best_hyperparameters.pop('use_std_scaler')
else:
    ## kmeans feature flag, only sampled when no embedding is used
    use_kmeans_features = best_hyperparameters.pop('use_kmeans_features')

# the trial parameters only hold the sampled values, so the fixed settings that the
# objective function adds to every trial are restored here; without them the final
# models would not be trained with the same seed and settings as the tuned ones
best_hyperparameters.update({'objective'   : 'multiclass',
                             'metric'      : 'multi_logloss',
                             'num_class'   : 4,
                             'random_state': 42,
                             'silent'      : True,
                             'verbosity'   : -1
                            })

In [None]:
# fitting the model with the selected hyperparameters and configurations
final_fit = evaluate_stratified_kfold(X = X, y = y, n_folds = K, hyperparameters = best_hyperparameters,
                                      pruning = None, final_score = True, val_report = True,
                                      weighting = weighting_scheme,
                                      try_embedding = use_embedding,
                                      embedding_comps = None if use_embedding is None else embedding_size,
                                      scaler_embedding = None if use_embedding is None else use_std_scaler,
                                      use_kmeans_features = use_kmeans_features if use_embedding is None else False)

# unpacking the results: the fitted pipelines are only returned when an embedding is used
if use_embedding is None:
    loss, auc_ovo, models, predictions = final_fit
else:
    loss, auc_ovo, models, pipelines, predictions = final_fit

In [None]:
# shap summary plot for the model of each fold
if use_embedding is None:
    ## without an embedding the models are explained on the features themselves
    for fitted_model in models.values():
        # instantiating the explainer for the model
        explainer = shap.TreeExplainer(model = fitted_model)
        # working on a copy so that the original dataframe is not changed
        X_shap = X.copy()
        # adding the kmeans feature if it is needed
        if use_kmeans_features:
            X_shap['feature_kmeans'] = feature_kmeans_train
        # extracting the shap values and plotting them
        shap_values = explainer.shap_values(X_shap)
        shap.summary_plot(shap_values, X_shap)
else:
    ## with an embedding each model is explained on the output of the pipeline of the same fold
    for fitted_pipeline, fitted_model in zip(pipelines.values(), models.values()):
        # instantiating the explainer for the model
        explainer = shap.TreeExplainer(model = fitted_model)
        # transforming the data with the pipeline of the fold
        X_shap = fitted_pipeline.transform(X)
        # extracting the shap values and plotting them
        shap_values = explainer.shap_values(X_shap)
        shap.summary_plot(shap_values, X_shap)

In [None]:
# lightgbm built-in feature importance, one plot per fold model limited to 15 features
for model_id, fitted_model in models.items():
    plot_importance(fitted_model, figsize = (5, 6), max_num_features = 15, title = f'Feature Importance for model: {model_id}')

In [None]:
# lightgbm training history, one plot per fold model
for model_id, fitted_model in models.items():
    plot_metric(fitted_model, title = f'Training history for model: {model_id}')

# Inserting the probabilities into the submission file

In [None]:
# dividing the test probabilities summed over the folds by the number of folds (their average)
# and putting the result into the respective class columns
submission[['Class_1', 'Class_2', 'Class_3', 'Class_4']] = predictions / K

# Saving the submission file

In [None]:
# writing the submission dataframe to 'submission.csv' without the index column
submission.to_csv(path_or_buf = 'submission.csv', index = False)