In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.options.display.float_format = '{:,.2f}'.format
import os

import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import kurtosis, skew
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import f_classif
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from matplotlib.ticker import PercentFormatter
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.inspection import permutation_importance
from sklearn import metrics
from scipy.fft import rfft

# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# 0. Helper functions <a name="help"></a>

In [None]:
def fit_model_using_classifier(alg,
                               dtrain,
                               predictors,
                               target="state",
                               performCV=True,
                               printFeatureImportance=True,
                               cv_folds=3,
                               repeat=5,
                               scoring='roc_auc',
                               only_top_x_feature=60,
                               cv_random_state=42
                              ):
    """
    Cross-validate a classifier, refit it on the whole training frame and print a report.

    Based on the function from
    https://www.analyticsvidhya.com/blog/2016/02/complete-guide-parameter-tuning-gradient-boosting-gbm-python/
    with small modifications.

    Parameters
    ----------
    alg : estimator exposing fit / predict / predict_proba
    dtrain : DataFrame holding both the predictor columns and the target column
    predictors : list of column names used as features
    target : name of the label column
    performCV : when True, run repeated cross-validation before the final fit
    printFeatureImportance : when True and the fitted model exposes
        ``feature_importances_``, draw a bar chart of the top features
    cv_folds : number of folds per repetition
    repeat : number of cross-validation repetitions
    scoring : scorer name forwarded to ``cross_val_score``
    only_top_x_feature : number of bars shown in the importance chart
    cv_random_state : seed for the shuffled cross-validation splits

    Returns
    -------
    (alg, feat_imp) : the fitted estimator and a Series of feature importances
        sorted in descending order, or an empty list when none are available.
    """
    # Imported here so the helper is self-contained and does not rely on the import cell.
    from sklearn.model_selection import RepeatedStratifiedKFold

    X = dtrain[predictors]
    y = dtrain[target]

    # Perform cross-validation.
    # Calling cross_val_score several times with an integer `cv` reuses the same
    # unshuffled folds, so the repetitions add no information. A repeated,
    # shuffled, stratified splitter makes every repetition use different folds
    # while still producing cv_folds * repeat scores.
    cv_score = []
    if performCV and repeat > 0:
        splitter = RepeatedStratifiedKFold(n_splits=cv_folds,
                                           n_repeats=repeat,
                                           random_state=cv_random_state)
        cv_score = list(cross_val_score(alg, X, y, cv=splitter, scoring=scoring))

    # Fit the algorithm on the full data
    alg.fit(X, y)

    # In-sample predictions (optimistic by construction; compare with the CV summary)
    dtrain_predictions = alg.predict(X)
    dtrain_predprob = alg.predict_proba(X)[:, 1]

    # Print model report:
    print("\nModel Report")
    print("Accuracy : " + str(round(metrics.accuracy_score(y.values, dtrain_predictions), 4)))
    print("AUC Score (Train): " + str(round(metrics.roc_auc_score(y, dtrain_predprob), 4)))

    if cv_score:
        print("\n Cross validation summary ("+scoring+")")
        print("Average: "+str(round(np.mean(cv_score), 4)))
        print("Std    : "+str(round(np.std(cv_score), 4)))
        print("Min    : "+str(round(np.min(cv_score), 4)))
        print("Max    : "+str(round(np.max(cv_score), 4)))

    # Plot feature importances when the model provides them (tree-based models do).
    if printFeatureImportance and hasattr(alg, "feature_importances_"):
        plt.figure(figsize=(20, 6))
        feat_imp = pd.Series(alg.feature_importances_, predictors).sort_values(ascending=False)
        feat_imp.head(only_top_x_feature).plot(kind='bar', title='Feature Importances', fontsize=12, color="#CBC3E3")
        plt.ylabel('Feature Importance Score')
        return alg, feat_imp
    return alg, []

def generate_features(df, metric_data, group_variables, sensor_identifiers, suffix=""):
    """
    Aggregate every requested sensor column per group using the metrics in metric_data.

    Parameters
    ----------
    df : DataFrame containing the grouping columns and ``sensor_<id>`` columns
    metric_data : dict mapping a column-name prefix (e.g. ``"mean_"``) to a
        one-element list holding the aggregation function
    group_variables : list of column names to group by
    sensor_identifiers : iterable of sensor id strings (e.g. ``"00"``); each one
        selects the column ``"sensor_" + id``
    suffix : text appended to every generated column name

    Returns
    -------
    (all_metrics, generated_columns) : a DataFrame with the grouping columns
        followed by ``<prefix><sensor id><suffix>`` columns, and the sorted list
        of the generated column names.
    """
    group_variables = list(group_variables)
    # Each dict value is a one-element list; its first item is the aggregation function.
    agg_funcs = [listv[0] for listv in metric_data.values()]

    per_sensor = []
    # Use the sensors passed by the caller rather than a notebook-level global.
    for sensor_number in sensor_identifiers:
        sensor_col = "sensor_" + sensor_number
        sensor_metrics = df.groupby(group_variables).agg({sensor_col: agg_funcs})
        sensor_metrics.columns = [key + sensor_number + suffix for key in metric_data]
        per_sensor.append(sensor_metrics)

    if not per_sensor:
        return pd.DataFrame(columns=group_variables), []

    # All frames share the same group index, so one column-wise concat replaces
    # the chain of outer merges and keeps the key columns' original dtype.
    all_metrics = pd.concat(per_sensor, axis=1).reset_index()

    # Finally we record the generated variable names as well.
    generated_columns = sorted(set(all_metrics.columns) - set(group_variables))
    return all_metrics, generated_columns

def create_frequencies(groups):
    """
    Compute the amplitude spectrum (absolute value of the real FFT) of every sensor column.

    The number of frequency bins follows the input length (``n // 2 + 1`` for
    ``n`` rows), so a 60-step sequence yields frequencies 0..30 as before, while
    groups of any other length no longer fail on a label/length mismatch.

    source https://www.kaggle.com/code/matanivanov/lgbm-with-fourier-transform

    Parameters
    ----------
    groups : DataFrame for one group; the columns 'sequence', 'subject' and
        'step' are skipped, every other column is transformed.

    Returns
    -------
    Series indexed by ``<column>_freq_<i>``.
    """
    id_columns = ('sequence', 'subject', 'step')
    spectra = []
    for col in groups.columns:
        if col in id_columns:
            continue
        amplitudes = np.abs(rfft(groups[col].values))
        labels = [f'{col}_freq_{i}' for i in range(len(amplitudes))]
        spectra.append(pd.Series(amplitudes, index=labels))
    return pd.concat(spectra)

def fit_ensemble(models, X_train, X_val, y_train, y_val, soft_vote=True):
    """
    Train each base model and fit a logistic-regression blender on their outputs.

    Every (name, model) pair in `models` is fitted in place on the training
    data. Its predictions on the hold-out data (class-1 probabilities when
    `soft_vote` is True, hard labels otherwise) form one column of the meta
    feature matrix on which the blender is trained. When `X_val` / `y_val` are
    None, the training data is reused as the hold-out data.

    Returns the fitted blender and the meta feature matrix.
    """
    holdout_X = X_train if X_val is None else X_val
    holdout_y = y_train if y_val is None else y_val

    columns = []
    for _name, estimator in models:
        estimator.fit(X_train, y_train)
        if soft_vote:
            scores = estimator.predict_proba(holdout_X)[:, 1]
        else:
            scores = estimator.predict(holdout_X)
        # One column per base model.
        columns.append(scores.reshape(len(scores), 1))

    meta_X = np.hstack(columns)
    blender = LogisticRegression()
    blender.fit(meta_X, holdout_y)
    return blender, meta_X

def predict_ensemble(models, blender, X_test, soft_vote=True):
    """
    Score X_test with the already fitted base models and blend their outputs.

    Each (name, model) pair contributes one column: its class-1 probability when
    `soft_vote` is True, otherwise its hard prediction. The stacked columns are
    passed to `blender`, and the second column of its predict_proba is returned.
    """
    def _base_output(estimator):
        if soft_vote:
            scores = estimator.predict_proba(X_test)[:, 1]
        else:
            scores = estimator.predict(X_test)
        return scores.reshape(len(scores), 1)

    stacked = np.hstack([_base_output(estimator) for _name, estimator in models])
    return blender.predict_proba(stacked)[:, 1]

# Table of contents
0. [Helper functions](#help)
1. [Load and explore data](#introduction)
2. [Variable construction](#vars)
3. [EDA for the created features & selection](#vars2)
4. [Estimate models](#modest)
5. [Ensemble approach](#ens)
6. [Acknowledgement](#ack)

# 1. Load and Explore data <a name="introduction"></a>

Let us keep the data files and their column descriptions in mind:
* train.csv - the training set, comprising ~26,000 60-second recordings of thirteen biological sensors for almost one thousand experimental participants
> 1. sequence - a unique id for each sequence
> 2. subject - a unique id for the subject in the experiment
> 3. step - time step of the recording, in one second intervals
> 4. sensor_00 - sensor_12 - the value for each of the thirteen sensors at that time step
* train_labels.csv - the class label for each sequence.
> 1. sequence - the unique id for each sequence.
> 2. state - the state associated to each sequence. This is the target which you are trying to predict.
* test.csv - the test set. For each of the ~12,000 sequences, you should predict a value for that sequence's state.
* sample_submission.csv - a sample submission file in the correct format.

In [None]:
# Class label (state) of every training sequence.
train_labels = pd.read_csv(
    "/kaggle/input/tabular-playground-series-apr-2022/train_labels.csv"
)
display(train_labels.head(5))

In [None]:
# Sensor readings of the training sequences, followed by a preview and summary statistics.
train = pd.read_csv(
    "/kaggle/input/tabular-playground-series-apr-2022/train.csv"
)
display(train.head(5), train.describe())

In [None]:
# At first glance there is no problem with missing values.
# info() prints its report and returns None, so it is called directly:
# wrapping it in display() only adds a stray "None" to the output.
train.info()

In [None]:
# Sensor readings of the test sequences, followed by a preview and summary statistics.
test = pd.read_csv(
    "/kaggle/input/tabular-playground-series-apr-2022/test.csv"
)
display(test.head(5), test.describe())

In [None]:
# At first glance there is no problem with missing values.
# info() prints its report and returns None, so it is called directly:
# wrapping it in display() only adds a stray "None" to the output.
test.info()

In [None]:
# For convenience: the sensor column names (columns 3..15 of the test frame)
# and the two-character id found at positions 7-8 of each name.
sensors = test.columns[3:16].tolist()
sensor_numbs = [column_name[7:9] for column_name in sensors]

In [None]:
# Number of distinct subjects and sequences in the training data
# (labels previously read "Number if ...").
print("Number of subjects " + str(train["subject"].nunique()))
print("Number of sequences " + str(train["sequence"].nunique()))

In [None]:
# Attach the subject to every labelled sequence, then average the state per subject.
subject_of_sequence = train[["subject", "sequence"]].drop_duplicates()
train_subject_check = train_labels.merge(subject_of_sequence, how="left", on="sequence")
state_by_subject = (
    train_subject_check
    .groupby("subject")[["state"]]
    .mean()
    .reset_index()
    .sort_values(by=["state"])
)
display(state_by_subject.head(10))
display(state_by_subject.tail(10))
# The outcome is correlated within a subject; we might want to exploit this later...

In [None]:
# temporary...
# train.head()

In [None]:
# import random
# subjects_list=sorted(set(train.subject))
# selected_subject=random.choices(subjects_list, k=50)

In [None]:
# train=train[train["subject"].isin(selected_subject)].copy()

In [None]:
# subjects_list_test=sorted(set(test.subject))
# selected_subject_test=random.choices(subjects_list_test, k=30)

In [None]:
# test=test[test["subject"].isin(selected_subject_test)].copy()

# 2. Variable construction <a name="vars"></a>

<p style="font-size:17px"> 
Each sequence is described by multiple measurements from 13 sensors.</p>
<p style="font-size:17px"> There is also a subject column, which indicates that subjects were involved in multiple measurements </p>

<p style="font-size:17px"> We need a methodology that is capable of generating features at both the subject level and the subject &amp; sequence level.</p>

In [None]:
%%time
train_freq = train.sort_values(['subject', 'sequence', 'step']).groupby(['sequence', 'subject']).apply(create_frequencies)
train_freq.reset_index(inplace=True)

test_freq = test.sort_values(['subject', 'sequence', 'step']).groupby(['sequence', 'subject']).apply(create_frequencies)
test_freq.reset_index(inplace=True)
freq_columns=list(train_freq.columns[2:])

In [None]:
# Preview of the generated frequency features for the test set.
test_freq.head(n=5)

In [None]:
# Minimal metric set: prefix -> one-element list with the aggregation function.
# mean and std alone achieve around 80-82% f1 score
metric_data_short = dict(
    mean_=[np.nanmean],
    std_=[np.nanstd],
)

# Let's extend these with new metrics.
def auto_corr_1(x):
    """Correlation between the series and itself shifted by one position."""
    later, earlier = x[1:], x[:-1]
    return np.corrcoef(later, earlier)[0, 1]


def auto_corr_2(x):
    """Correlation between the series and itself shifted by two positions."""
    later, earlier = x[2:], x[:-2]
    return np.corrcoef(later, earlier)[0, 1]

def p5(x):
    """5th percentile of x."""
    return np.percentile(x, q=5)


def p10(x):
    """10th percentile of x."""
    return np.percentile(x, q=10)


def p25(x):
    """25th percentile of x."""
    return np.percentile(x, q=25)


def p75(x):
    """75th percentile of x."""
    return np.percentile(x, q=75)


def p90(x):
    """90th percentile of x."""
    return np.percentile(x, q=90)


def p95(x):
    """95th percentile of x."""
    return np.percentile(x, q=95)


def _first_differences(x):
    """Step-to-step changes of a Series, without the leading missing value."""
    return x.diff().dropna()


def mean_diff(x):
    """Mean of the first differences."""
    return np.nanmean(_first_differences(x))


def std_diff(x):
    """Standard deviation of the first differences."""
    return np.nanstd(_first_differences(x))


def auto_corr_1_diff(x):
    """Lag-1 correlation of the first differences."""
    changes = _first_differences(x)
    return np.corrcoef(changes.iloc[1:], changes.iloc[:-1])[0, 1]


def auto_corr_2_diff(x):
    """Lag-2 correlation of the first differences."""
    changes = _first_differences(x)
    return np.corrcoef(changes.iloc[2:], changes.iloc[:-2])[0, 1]


def skew_diff(x):
    """Skewness of the first differences."""
    return skew(_first_differences(x))


def kurtosis_diff(x):
    """Kurtosis (scipy default settings) of the first differences."""
    return kurtosis(_first_differences(x))


def dp5(x):
    """5th percentile of the first differences."""
    return np.percentile(_first_differences(x), 5)


def dp10(x):
    """10th percentile of the first differences."""
    return np.percentile(_first_differences(x), 10)


def dp25(x):
    """25th percentile of the first differences."""
    return np.percentile(_first_differences(x), 25)


def dp75(x):
    """75th percentile of the first differences."""
    return np.percentile(_first_differences(x), 75)


def dp90(x):
    """90th percentile of the first differences."""
    return np.percentile(_first_differences(x), 90)


def dp95(x):
    """95th percentile of the first differences."""
    return np.percentile(_first_differences(x), 95)


# Sequence-level metrics as (column prefix, aggregation function) pairs:
# level statistics first, then the same kind of statistics on first differences.
_sequence_metric_specs = [
    ("mean_", np.nanmean),
    ("std_", np.nanstd),
    ("median_", np.median),
    ("p05_", p5),
    ("p10_", p10),
    ("p25_", p25),
    ("p75_", p75),
    ("p90_", p90),
    ("p95_", p95),
    ("min_", np.nanmin),
    ("max_", np.nanmax),
    ("skew_", skew),
    ("kurtosis_", kurtosis),
    ("corr1_", auto_corr_1),
    ("corr2_", auto_corr_2),
    ("d_mean_", mean_diff),
    ("d_std_", std_diff),
    ("d_corr1_", auto_corr_1_diff),
    ("d_corr2_", auto_corr_2_diff),
    ("d_skew_", skew_diff),
    ("d_kurtosis_", kurtosis_diff),
    ("d_p05_", dp5),
    ("d_p10_", dp10),
    ("d_p25_", dp25),
    ("d_p75_", dp75),
    ("d_p90_", dp90),
    ("d_p95_", dp95),
]
# generate_features expects each value to be a one-element list.
metric_data = {prefix: [func] for prefix, func in _sequence_metric_specs}

# Subject-level metrics: level statistics only (no autocorrelation or
# difference-based features). Values are one-element lists, as expected by
# generate_features.
metric_data_subj = dict(
    mean_=[np.nanmean],
    std_=[np.nanstd],
    median_=[np.median],
    p05_=[p5],
    p10_=[p10],
    p25_=[p25],
    p75_=[p75],
    p90_=[p90],
    p95_=[p95],
    min_=[np.nanmin],
    max_=[np.nanmax],
    skew_=[skew],
    kurtosis_=[kurtosis],
)


In [None]:
# Row count of the training frame before feature generation.
print(train.shape[0])

In [None]:
%%time
    train_features, gen_col_train = generate_features(df=train, 
                                                      metric_data=metric_data, 
                                                      group_variables=["sequence"], 
                                                      sensor_identifiers=sensor_numbs, 
                                                      suffix="")
    test_features, gen_col_test = generate_features(df=test, 
                                                    metric_data=metric_data, 
                                                    group_variables=["sequence"], 
                                                    sensor_identifiers=sensor_numbs, 
                                                    suffix="")

    train_features_s, gen_col_train_s = generate_features(df=train, 
                                                      metric_data=metric_data_subj, 
                                                      group_variables=["subject"], 
                                                      sensor_identifiers=sensor_numbs, 
                                                      suffix="_subj")
    test_features_s, gen_col_test_s = generate_features(df=test, 
                                                    metric_data=metric_data_subj, 
                                                    group_variables=["subject"], 
                                                    sensor_identifiers=sensor_numbs, 
                                                    suffix="_subj")

In [None]:
# Row count of the training frame after feature generation (should be unchanged).
print(train.shape[0])

In [None]:
# Number of sequences per subject.
# nunique() counts distinct sequences; the previous count() counted rows, i.e.
# one per time step of every sequence, which did not match the stated intent.
train_subj_len = train.groupby("subject")["sequence"].nunique().reset_index()
train_subj_len.columns = ["subject", "sequence_len"]
test_subj_len = test.groupby("subject")["sequence"].nunique().reset_index()
test_subj_len.columns = ["subject", "sequence_len"]

In [None]:
# Merge the tables together for train and test.
# Train starts from the labels; the subject of each sequence is attached first
# so that the subject-level tables can be joined.
train_feature_final = (
    train_labels
    .merge(train[["subject", "sequence"]].drop_duplicates(), how="left", on="sequence")
    .merge(train_features, how="left", on="sequence")
    .merge(train_features_s, how="left", on="subject")
    .merge(train_subj_len, how="left", on="subject")
    .merge(train_freq, how="left", on=['sequence', 'subject'])
)

# Full list of candidate predictors.
explanatory_variables = gen_col_train + gen_col_train_s + ["sequence_len"] + freq_columns

# Test has no labels, so it starts from the distinct (subject, sequence) pairs.
test_feature_final = (
    test[["subject", "sequence"]]
    .drop_duplicates()
    .merge(test_features, how="left", on="sequence")
    .merge(test_features_s, how="left", on="subject")
    .merge(test_subj_len, how="left", on="subject")
    .merge(test_freq, how="left", on=['sequence', 'subject'])
)

In [None]:
# Replace every missing feature value with 0 in both tables.
train_feature_final = train_feature_final.fillna(0)
test_feature_final = test_feature_final.fillna(0)

In [None]:
# Transform train and test to the 0-1 scale. The scaler is fitted on train only
# and then applied to test, so test values may fall outside [0, 1].
# Whole columns are assigned (instead of `.loc[:, cols] = ...`) so that the
# integer-typed sequence_len column is replaced by its scaled float values
# rather than having floats written into an integer column in place.
scaler = MinMaxScaler()
train_feature_final[explanatory_variables] = scaler.fit_transform(train_feature_final[explanatory_variables])
test_feature_final[explanatory_variables] = scaler.transform(test_feature_final[explanatory_variables])

In [None]:
# Some distributions are heavily skewed; taking the square root makes them less skewed.
# A column is transformed when its train skewness exceeds 3 and its train
# minimum is not negative. The same columns are transformed in test.
for var in explanatory_variables:
    train_column = train_feature_final[var]
    if train_column.skew() <= 3 or train_column.min() < 0.0:
        continue
    train_feature_final[var] = np.sqrt(train_column)
    test_feature_final[var] = np.sqrt(test_feature_final[var])

# Square roots of negative values are NaN; such entries are set to 0.
train_feature_final.fillna(0, inplace=True)
test_feature_final.fillna(0, inplace=True)

# 3. EDA for the created features  <a name="vars2"></a>

In [None]:
# Hand-picked features whose distributions are plotted below.
selected_features = [
    "kurtosis_04",
    "sequence_len",
    "std_02",
    "kurtosis_10",
    "sensor_09_freq_0",
    "sensor_09_freq_1",
    "sensor_01_freq_0",
    "sensor_02_freq_2",
    "p05_09",
    "max_05",
    "p25_10",
    "p10_04",
]

In [None]:
# Overlaid, normalised histograms of each selected feature, split by state.
is_state_0 = train_feature_final["state"] < 1
is_state_1 = train_feature_final["state"] > 0
for feature in selected_features:
    plt.figure(figsize=(20,6))
    plt.hist(train_feature_final.loc[is_state_0, feature],
             bins=200, density=True, label='State : 0', color='#CBC3E3')
    plt.hist(train_feature_final.loc[is_state_1, feature],
             bins=200, density=True, label='State : 1', color='#F4B123', alpha = 0.5)
    plt.ylabel('Frequency')
    plt.title('Distribution of values for feature: '+feature, fontsize=15)
    #plt.gca().yaxis.set_major_formatter(PercentFormatter(xmax=1, decimals=1))
    plt.legend()
    plt.show()

# I deploy a simple selection procedure using a logit regression as a basis.

In [None]:
%%time
log00 =LogisticRegression(random_state=42,max_iter=12000, C=1.6)
# here I increased iteration number from the low default value to avoid warnings
# regularization param, arbitrarily decreased to respect large number of variables (default C = 1.0)
log00, feat_imp=fit_model_using_classifier(log00, 
                                          dtrain=train_feature_final, 
                                          predictors=explanatory_variables,
                                          repeat=5,
                                          scoring="roc_auc")

In [None]:
%%time
perm_result = permutation_importance(log00, 
                                     X=train_feature_final[explanatory_variables],
                                     y=train_feature_final["state"], 
                                     n_repeats=20,
                                     scoring="roc_auc",
                                     random_state=42)

res_select=pd.DataFrame({
    "variable": explanatory_variables,
    "importances_mean": perm_result.importances_mean*100,
    "importances_std": perm_result.importances_std
})
res_select.sort_values(by=["importances_mean"],inplace=True,ascending=False)

In [None]:
# Thresholds apply to importances_mean, which was multiplied by 100 when
# res_select was built.
super_short_list = res_select.loc[res_select["importances_mean"] > 1, "variable"].tolist()
short_list = res_select.loc[res_select["importances_mean"] > 0.1, "variable"].tolist()
longer_list = res_select.loc[res_select["importances_mean"] > 0.05, "variable"].tolist()

print("Most important features")
print(super_short_list)

# Each label now reports the list it names (the "Short list" line previously
# printed the size of longer_list).
print("Original number of features: "+str(len(explanatory_variables)))
print("Short list number of features: "+str(len(short_list)))
print("Longer list number of features: "+str(len(longer_list)))

In [None]:
# From here on the predictors are the identified longer list.
# By default we have way too many variables (more than 900), so we need to scale down.
# On the other hand, some less important variables probably still have an impact,
# so the longer list is kept rather than the short one.
explanatory_variables = list(longer_list)

# 4. Estimate models  <a name="modest"></a>

In [None]:
# Shared cross-validation settings for the models below:
# scoring metric and base number of CV repetitions.
scoref, repeat_numb = "roc_auc", 3

In [None]:
%%time
gbm0 =GradientBoostingClassifier(random_state=42)
gbm0, feat_imp=fit_model_using_classifier(gbm0, 
                                          dtrain=train_feature_final, 
                                          predictors=explanatory_variables,
                                          repeat=repeat_numb,
                                          scoring=scoref)

In [None]:
# Submission from the gradient boosting model: second predict_proba column per test sequence.
xgb_model_submission = test_feature_final[["sequence"]].assign(
    state=gbm0.predict_proba(test_feature_final[explanatory_variables])[:, 1]
)
xgb_model_submission.to_csv("xgb_model_submission.csv", index=False)

In [None]:
# Last rows of the final test feature table.
test_feature_final.tail(n=5)

In [None]:
%%time
ext0 =ExtraTreesClassifier(random_state=42,min_samples_split=100,min_samples_leaf=50, n_jobs=-1)
# added arbitrary number for min_sample_split, min_samples_leaf to avoid (super) overfitting... (I got f1=1 insample with default settings)
ext0, feat_imp=fit_model_using_classifier(ext0, 
                                          dtrain=train_feature_final, 
                                          predictors=explanatory_variables,
                                          repeat=repeat_numb,
                                          scoring=scoref)

In [None]:
# Submission from the extra-trees model: second predict_proba column per test sequence.
extra_model_submission = test_feature_final[["sequence"]].assign(
    state=ext0.predict_proba(test_feature_final[explanatory_variables])[:, 1]
)
extra_model_submission.to_csv("extra_model_submission.csv", index=False)

In [None]:
%%time
log0 =LogisticRegression(random_state=42,max_iter=12000, C=1.6)
# here I increased iteration number from the low default value to avoid warnings
# regularization param, arbitrarily decreased to respect large number of variables (default C = 1.0)
log0, feat_imp=fit_model_using_classifier(log0, 
                                          dtrain=train_feature_final, 
                                          predictors=explanatory_variables,
                                          repeat=repeat_numb*3,
                                          scoring=scoref)

In [None]:
# Absolute logistic-regression coefficients, largest first, indexed by variable name.
log_coef = (
    pd.DataFrame({
        "variable": explanatory_variables,
        "coeff_abs": np.abs(log0.coef_[0]),
    })
    .sort_values(by="coeff_abs", ascending=False)
    .set_index("variable")
)

# Bar chart of the 60 largest coefficients.
plt.figure(figsize=(20,6))
log_coef["coeff_abs"].head(60).plot(kind='bar', title='Coefficient estimates',fontsize=12, color="#CBC3E3")
plt.ylabel('Coefficient estimates (absolute value)');

In [None]:
# Submission from the logistic regression: second predict_proba column per test sequence.
log_model_submission = test_feature_final[["sequence"]].assign(
    state=log0.predict_proba(test_feature_final[explanatory_variables])[:, 1]
)
log_model_submission.to_csv("log_model_submission.csv", index=False)

In [None]:
%%time
rfs0 =RandomForestClassifier(random_state=42,n_estimators=300, min_samples_split=200, min_samples_leaf=100, n_jobs=-1)
# added arbitrary number for min_sample_split to avoid overfitting... (I got f1=1 insample with default settings)
rfs0, feat_imp=fit_model_using_classifier(rfs0, 
                                          dtrain=train_feature_final, 
                                          predictors=explanatory_variables,
                                          repeat=repeat_numb,
                                          scoring=scoref)

In [None]:
# Submission from the random forest: second predict_proba column per test sequence.
rfs_model_submission = test_feature_final[["sequence"]].assign(
    state=rfs0.predict_proba(test_feature_final[explanatory_variables])[:, 1]
)
rfs_model_submission.to_csv("rfs_model_submission.csv", index=False)

In [None]:
# %%time
# svc0 = SVC(random_state=42, max_iter=12000,kernel="linear",probability=True)
# svc0, feat_imp=fit_model_using_classifier(svc0, 
#                                           dtrain=train_feature_final, 
#                                           predictors=explanatory_variables,
#                                           repeat=repeat_numb,
#                                           scoring=scoref)

In [None]:
# svc_model_submission=pd.DataFrame({
#     "sequence": test_feature_final["sequence"],
#     "state": svc0.predict(test_feature_final[explanatory_variables])})
# svc_model_submission.to_csv("svc_model_submission.csv",index=False)

In [None]:
%%time
lda0 = LinearDiscriminantAnalysis()
lda0, feat_imp=fit_model_using_classifier(lda0, 
                                          dtrain=train_feature_final, 
                                          predictors=explanatory_variables,
                                          repeat=repeat_numb,
                                          scoring=scoref)

In [None]:
# Submission from the LDA model: second predict_proba column per test sequence.
lda_model_submission = test_feature_final[["sequence"]].assign(
    state=lda0.predict_proba(test_feature_final[explanatory_variables])[:, 1]
)
lda_model_submission.to_csv("lda_model_submission.csv", index=False)

# 5. Ensemble approach <a name="ens"></a>

In [None]:
def get_models():
    """
    Return the base models of the ensemble as (name, estimator) pairs.

    The estimators are the notebook-level model objects themselves, not copies,
    so fitting them through the ensemble helpers refits those objects.
    """
    return [
        ('extra', ext0),
        ('xgb', gbm0),
        ('log', log0),
        ('rfs0', rfs0),
        ('lda0', lda0),
    ]

In [None]:
# Repeated evaluation of the blended ensemble over several random splits.
# Each iteration uses three disjoint parts: the base models are fitted on
# train_in, the blender is fitted on their predictions for train_blend, and the
# AUC is measured on train_out. Previously the blender was scored on the same
# rows it had been fitted on, which gives an optimistic estimate.
for i in range(42,50):
    train_in, train_rest = train_test_split(train_feature_final, test_size=0.33, random_state=i)
    train_blend, train_out = train_test_split(train_rest, test_size=0.5, random_state=i)

    blender, meta_X = fit_ensemble(models=get_models(),
                                   X_train=train_in[explanatory_variables],
                                   X_val=train_blend[explanatory_variables],
                                   y_train=train_in["state"],
                                   y_val=train_blend["state"],
                                   soft_vote=True)

    pred_state = predict_ensemble(models=get_models(),
                                  blender=blender,
                                  X_test=train_out[explanatory_variables])
    print(metrics.roc_auc_score(train_out["state"], pred_state))

In [None]:
# Final ensemble: refit every base model on the full training table. No
# hold-out set is passed, so the blender is trained on the models' predictions
# for the same training rows.
final_X = train_feature_final[explanatory_variables]
final_y = train_feature_final["state"]
blender_final, _ = fit_ensemble(
    models=get_models(),
    X_train=final_X,
    X_val=None,
    y_train=final_y,
    y_val=None,
)

In [None]:
# Submission from the blended ensemble, one row per test sequence.
ens_model_submission = test_feature_final[["sequence"]].assign(
    state=predict_ensemble(
        models=get_models(),
        blender=blender_final,
        X_test=test_feature_final[explanatory_variables],
    )
)
ens_model_submission.to_csv("ens_model_submission.csv", index=False)

# 6. Acknowledgement  <a name="ack"></a>

In [None]:
"""
    I got the idea to use frequencies and Fourier transform by looking at Pavel Salikov's notebook
    https://www.kaggle.com/code/matanivanov/lgbm-with-fourier-transform
    
    The fit_model_using_classifier function is based on this article
    https://www.analyticsvidhya.com/blog/2016/02/complete-guide-parameter-tuning-gradient-boosting-gbm-python/
    
    To create my ensemble solution I used this source:
    https://machinelearningmastery.com/blending-ensemble-machine-learning-with-python/

"""