In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.options.display.float_format = '{:,.2f}'.format
import os

import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import kurtosis, skew
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import f_classif
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from matplotlib.ticker import PercentFormatter
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.inspection import permutation_importance
from sklearn import metrics
from scipy.fft import rfft
from catboost import CatBoostClassifier
import scipy

!pip install seglearn tsflex antropy catch22
from seglearn.feature_functions import emg_features
from tsflex.features.integrations import seglearn_feature_dict_wrapper
from tsflex.features import MultipleFeatureDescriptors,FeatureCollection
from catch22 import catch22_all
import antropy as ent

# Print the full path of every file found under the Kaggle input directory.
input_file_paths = [
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
]
for input_file_path in input_file_paths:
    print(input_file_path)

# 0. Helper functions <a name="help"></a>

In [None]:
def fit_model_using_classifier(alg,
                               dtrain,
                               predictors,
                               target="state",
                               performCV=True, 
                               printFeatureImportance=True, 
                               cv_folds=3,
                               repeat=5,
                               scoring='roc_auc',
                               only_top_x_feature=60,
                               cv_random_state=42
                              ):
    """
    Cross-validate a classifier, refit it on all of ``dtrain`` and print a report.

    Adapted from
    https://www.analyticsvidhya.com/blog/2016/02/complete-guide-parameter-tuning-gradient-boosting-gbm-python/

    Parameters
    ----------
    alg : estimator exposing ``fit``, ``predict`` and ``predict_proba``.
    dtrain : DataFrame holding both the predictor columns and the target column.
    predictors : list of column names used as features.
    target : name of the label column.
    performCV : run repeated stratified cross-validation before the final fit.
    printFeatureImportance : plot the importances when the estimator exposes
        ``feature_importances_``.
    cv_folds : number of folds per repetition.
    repeat : number of cross-validation repetitions.
    scoring : scorer name passed to ``cross_val_score``.
    only_top_x_feature : number of features shown in the importance plot.
    cv_random_state : seed for the fold shuffling, so the report is reproducible.

    Returns
    -------
    (alg, feat_imp) : the estimator fitted on all of ``dtrain`` and a Series of
        importances sorted in descending order; ``feat_imp`` is an empty list
        when no importance plot was produced.
    """
    # Local import: only this helper needs the repeated splitter.
    from sklearn.model_selection import RepeatedStratifiedKFold

    # Perform cross-validation.
    # An integer ``cv`` yields the same unshuffled folds on every call, so looping
    # over it only duplicated scores. A repeated, shuffled splitter makes each
    # repetition use different folds.
    cv_score = []
    if performCV and repeat > 0:
        splitter = RepeatedStratifiedKFold(n_splits=cv_folds,
                                           n_repeats=repeat,
                                           random_state=cv_random_state)
        cv_score = list(cross_val_score(alg,
                                        dtrain[predictors],
                                        dtrain[target],
                                        cv=splitter,
                                        scoring=scoring))

    # Fit the algorithm on the full data (cross_val_score works on clones).
    alg.fit(dtrain[predictors], dtrain[target])

    # Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:,1]

    # Print model report (in-sample, so optimistic by construction):
    print("\nModel Report")
    print("Accuracy : " + str(round(metrics.accuracy_score(
        dtrain[target].values, dtrain_predictions),4)))
    print("AUC Score (Train): " + str(round(
        metrics.roc_auc_score(dtrain[target], dtrain_predprob),4)))

    if cv_score:
        print("\n Cross validation summary ("+scoring+")")
        print("Average: "+str(round(np.mean(cv_score),4)))
        print("Std    : "+str(round(np.std(cv_score),4)))
        print("Min    : "+str(round(np.min(cv_score),4)))
        print("Max    : "+str(round(np.max(cv_score),4)))

    # Plot feature importance when the estimator provides it:
    if printFeatureImportance and hasattr(alg, "feature_importances_"):
        plt.figure(figsize=(20,6))
        feat_imp = pd.Series(alg.feature_importances_, predictors).sort_values(ascending=False)
        feat_imp.head(only_top_x_feature).plot(kind='bar', title='Feature Importances',fontsize=12, color="#CBC3E3")
        plt.ylabel('Feature Importance Score')
        return alg, feat_imp
    return alg, list()

def generate_features(df, metric_data, group_variables, sensor_identifiers, suffix=""):
    """
    Aggregate each requested sensor column per group into one feature table.

    Parameters
    ----------
    df : DataFrame containing the group columns and ``sensor_<id>`` columns.
    metric_data : dict mapping a column-name prefix to a one-element list that
        holds the aggregation function, e.g. ``{"mean_": [np.nanmean]}``.
    group_variables : list of column names to group by.
    sensor_identifiers : iterable of sensor id strings (e.g. ``"02"``); only
        these sensors are aggregated.
    suffix : text appended to every generated column name.

    Returns
    -------
    (all_metrics, generated_columns) : the feature table (group columns first)
        and the sorted list of generated feature column names.
    """
    # The aggregation functions, in the same order as the prefixes.
    agg_funcs = [funcs[0] for funcs in metric_data.values()]

    all_metrics = None
    # Iterate over the sensors that were asked for (not a notebook-level global).
    for sensor_number in sensor_identifiers:
        sensor_col = "sensor_" + sensor_number
        metric_cols = [prefix + sensor_number + suffix for prefix in metric_data]

        sensor_metrics = df.groupby(group_variables).agg({
            sensor_col: agg_funcs
        }).reset_index()
        # Flatten the (column, function) header into plain feature names.
        sensor_metrics.columns = list(group_variables) + metric_cols

        if all_metrics is None:
            # Start from the first real table so the key columns keep their dtype.
            all_metrics = sensor_metrics
        else:
            all_metrics = all_metrics.merge(sensor_metrics, how="outer", on=group_variables)

    if all_metrics is None:
        # No sensors requested: return an empty table with just the key columns.
        all_metrics = pd.DataFrame(columns=group_variables)

    # Finally collect the generated variable names as well.
    generated_columns = sorted(set(all_metrics.columns) - set(group_variables))
    return all_metrics, generated_columns

def create_frequencies(groups):
    """
    Return the FFT amplitude spectrum of every sensor column in one group.

    For each column other than 'sequence', 'subject' and 'step', the real FFT
    is taken and its absolute values are labelled ``<col>_freq_<i>``. The
    number of labels follows the spectrum length, so a 60-row group still
    yields frequencies 0..30 and other group lengths no longer fail.

    source https://www.kaggle.com/code/matanivanov/lgbm-with-fourier-transform

    Parameters
    ----------
    groups : DataFrame with the rows of a single group.

    Returns
    -------
    Series holding all amplitude values, indexed by the generated labels.
    """
    spectra = []
    for col in groups.columns:
        if col in ('sequence', 'subject', 'step'):
            continue
        amplitudes = np.abs(rfft(groups[col].values))
        labels = [f'{col}_freq_{i}' for i in range(len(amplitudes))]
        spectra.append(pd.Series(amplitudes, index=labels))
    return pd.concat(spectra)

# 1. Load and Explore data <a name="introduction"></a>

In [None]:
# Read the competition files and take a first look at each of them.
DATA_DIR = "/kaggle/input/tabular-playground-series-apr-2022"

train_labels=pd.read_csv(os.path.join(DATA_DIR, "train_labels.csv"))
display(train_labels.head())

train=pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
display(train.head())
display(train.describe())
# At first glance there is no problem with missing values.
# info() prints its report itself and returns None, so it is not wrapped in
# display() (which would render a stray "None").
train.info()

test=pd.read_csv(os.path.join(DATA_DIR, "test.csv"))
display(test.head())
display(test.describe())
# At first glance there is no problem with missing values.
test.info()

In [None]:
# For convenience: the sensor column names (columns 3..15 of the test frame)
# and the two-character id taken from positions 7-8 of each name.
sensors = test.columns[3:16].tolist()
sensor_numbs = [sensor_name[7:9] for sensor_name in sensors]

# 2. Variable construction <a name="vars"></a>

In [None]:
%%time
# FFT amplitude features per (sequence, subject); rows are ordered by step
# inside each group before the transform is taken.
train_freq = (
    train.sort_values(['subject', 'sequence', 'step'])
    .groupby(['sequence', 'subject'])
    .apply(create_frequencies)
    .reset_index()
)

test_freq = (
    test.sort_values(['subject', 'sequence', 'step'])
    .groupby(['sequence', 'subject'])
    .apply(create_frequencies)
    .reset_index()
)

# Everything after the two group-key columns is a frequency feature.
freq_columns = train_freq.columns[2:].tolist()

In [None]:
# Sanity check: number of rows in the raw training frame.
print(len(train))

In [None]:
# Minimal metric map: NaN-aware mean and standard deviation.
# Keys are the column-name prefixes used by generate_features; each value is a
# one-element list holding the aggregation function.
metric_data_short={
    "mean_": [np.nanmean],
    "std_": [np.nanstd]
}
# mean and std alone achieve around 80-82% f1 score

# Let's extend these with new metrics.
def auto_corr_1(x):
    """Lag-1 autocorrelation: correlation of the series with itself shifted by one."""
    values = np.asarray(x)
    return np.corrcoef(values[1:], values[:-1])[0, 1]

def auto_corr_2(x):
    """Lag-2 autocorrelation: correlation of the series with itself shifted by two."""
    values = np.asarray(x)
    return np.corrcoef(values[2:], values[:-2])[0, 1]

def p5(x):
    """5th percentile of the values."""
    return np.percentile(x, q=5)

def p10(x):
    """10th percentile of the values."""
    return np.percentile(x, q=10)

def p25(x):
    """25th percentile (lower quartile) of the values."""
    return np.percentile(x, q=25)

def p75(x):
    """75th percentile (upper quartile) of the values."""
    return np.percentile(x, q=75)

def p90(x):
    """90th percentile of the values."""
    return np.percentile(x, q=90)

def p95(x):
    """95th percentile of the values."""
    return np.percentile(x, q=95)

def _step_changes(x):
    """Consecutive differences of the series with the leading NaN removed."""
    return x.diff().dropna()

def diqr(x):
    """Interquartile range of the consecutive differences."""
    return scipy.stats.iqr(_step_changes(x))

def mean_diff(x):
    """Mean consecutive difference (the leading NaN is skipped by nanmean)."""
    return np.nanmean(x.diff())

def std_diff(x):
    """Standard deviation of the consecutive differences (NaN-aware)."""
    return np.nanstd(x.diff())

def auto_corr_1_diff(x):
    """Lag-1 autocorrelation of the consecutive differences."""
    steps = _step_changes(x)
    return np.corrcoef(steps[1:], steps[:-1])[0, 1]

def auto_corr_2_diff(x):
    """Lag-2 autocorrelation of the consecutive differences."""
    steps = _step_changes(x)
    return np.corrcoef(steps[2:], steps[:-2])[0, 1]

def skew_diff(x):
    """Skewness of the consecutive differences."""
    return skew(_step_changes(x))

def kurtosis_diff(x):
    """Kurtosis (scipy default settings) of the consecutive differences."""
    return kurtosis(_step_changes(x))

def dp5(x):
    """5th percentile of the consecutive differences."""
    return np.percentile(x.diff().dropna(), q=5)

def dp10(x):
    """10th percentile of the consecutive differences."""
    return np.percentile(x.diff().dropna(), q=10)

def dp25(x):
    """25th percentile of the consecutive differences."""
    return np.percentile(x.diff().dropna(), q=25)

def dp75(x):
    """75th percentile of the consecutive differences."""
    return np.percentile(x.diff().dropna(), q=75)

def dp90(x):
    """90th percentile of the consecutive differences."""
    return np.percentile(x.diff().dropna(), q=90)

def dp95(x):
    """95th percentile of the consecutive differences."""
    return np.percentile(x.diff().dropna(), q=95)

def dmin(x):
    """Smallest (most negative) consecutive difference."""
    steps = x.diff().dropna()
    return np.nanmin(steps)

def dmax(x):
    """Largest consecutive difference."""
    steps = x.diff().dropna()
    return np.nanmax(steps)

# Full per-sequence metric map. Keys are column-name prefixes; each value is a
# one-element list holding the aggregation function. Entries prefixed "d_"
# operate on the consecutive differences of the sensor values.
metric_data={
    "mean_": [np.nanmean],
    "std_": [np.nanstd],
    "median_": [np.median],
    "p05_": [p5],
    "p10_": [p10],
    "p25_": [p25],
    "p75_": [p75],
    "p90_": [p90],
    "p95_": [p95],
    "min_": [np.nanmin],
    "max_": [np.nanmax],
    "iqr_": [scipy.stats.iqr],
    "skew_": [skew],
    "kurtosis_": [kurtosis],
    "corr1_": [auto_corr_1],
    "corr2_": [auto_corr_2],
    "d_mean_": [mean_diff],
    "d_std_": [std_diff],
    "d_corr1_": [auto_corr_1_diff],
    "d_corr2_": [auto_corr_2_diff],
    "d_skew_": [skew_diff],
    "d_kurtosis_": [kurtosis_diff],
    "d_min_": [dmin],
    "d_p05_": [dp5],
    "d_p10_": [dp10],
    "d_p25_": [dp25],
    "d_p75_": [dp75],
    "d_p90_": [dp90],
    "d_p95_": [dp95],
    "d_max_": [dmax],
    "d_iqr_": [diqr]
}

# Reduced metric map used for the per-subject aggregation: level statistics
# only, without the autocorrelation and difference-based metrics.
metric_data_subj={
    "mean_": [np.nanmean],
    "std_": [np.nanstd],
    "median_": [np.median],
    "p05_": [p5],
    "p10_": [p10],
    "p25_": [p25],
    "p75_": [p75],
    "p90_": [p90],
    "p95_": [p95],
    "min_": [np.nanmin],
    "max_": [np.nanmax],
    "skew_": [skew],
    "kurtosis_": [kurtosis]
}


In [None]:
%%time
    train_features, gen_col_train = generate_features(df=train, 
                                                      metric_data=metric_data, 
                                                      group_variables=["sequence"], 
                                                      sensor_identifiers=sensor_numbs, 
                                                      suffix="")
    test_features, gen_col_test = generate_features(df=test, 
                                                    metric_data=metric_data, 
                                                    group_variables=["sequence"], 
                                                    sensor_identifiers=sensor_numbs, 
                                                    suffix="")

    train_features_s, gen_col_train_s = generate_features(df=train, 
                                                      metric_data=metric_data_subj, 
                                                      group_variables=["subject"], 
                                                      sensor_identifiers=sensor_numbs, 
                                                      suffix="_subj")
    test_features_s, gen_col_test_s = generate_features(df=test, 
                                                    metric_data=metric_data_subj, 
                                                    group_variables=["subject"], 
                                                    sensor_identifiers=sensor_numbs, 
                                                    suffix="_subj")

In [None]:
# Sanity check: the raw training frame still has the same number of rows.
print(len(train))

In [None]:
# Entropy / fractal-dimension descriptors from antropy, computed per sensor over
# windows of 60 rows with a stride of 60.
# The descriptor gets its own name: binding it to `ent` would overwrite the
# antropy module alias and make this cell fail when it is run a second time.
ent_feats = MultipleFeatureDescriptors(
        functions=[ent.perm_entropy, ent.sample_entropy, ent.petrosian_fd, ent.svd_entropy],
        series_names=sensors,
        windows=60,
        strides=60,
    )
ent_collect = FeatureCollection(ent_feats)

train_ent= ent_collect.calculate(train.copy().astype(np.float32), show_progress=True, return_df=True, window_idx="begin")
test_ent= ent_collect.calculate(test.copy().astype(np.float32), show_progress=True, return_df=True, window_idx="begin")
# Infinite values are replaced by 0 so later scaling does not break.
test_ent.replace([np.inf, -np.inf], 0,inplace=True)
train_ent.replace([np.inf, -np.inf], 0,inplace=True)
# Feature names are captured before any key column is added to the frames.
ent_cols=list(test_ent.columns)

In [None]:
# EMG-style features from seglearn (wrapped for tsflex), computed per sensor over
# windows of 60 rows with a stride of 60, so we can experiment with them later.
emg_feats = MultipleFeatureDescriptors(
    functions=seglearn_feature_dict_wrapper(emg_features()),
    series_names=sensors,
    windows=60,
    strides=60,
)
emg_feature_collect = FeatureCollection(emg_feats)

train_emg = emg_feature_collect.calculate(
    train.copy().astype(np.float32),
    show_progress=True,
    return_df=True,
    window_idx="begin",
)
test_emg = emg_feature_collect.calculate(
    test.copy().astype(np.float32),
    show_progress=True,
    return_df=True,
    window_idx="begin",
)
# Feature names are captured before any key column is added to the frames.
emg_cols = list(test_emg.columns)

In [None]:
# Attach the sequence id to every window-level feature row BY POSITION.
# Assigning a Series aligns on index labels; the window-feature frames are
# presumably indexed by window start position (0, 60, 120, ... — TODO confirm
# against the tsflex output), which would pair rows with the wrong sequence.
# A plain array is assigned row by row and fails loudly on a length mismatch.
train_emg["sequence"]=train_features["sequence"].to_numpy()
test_emg["sequence"]=test_features["sequence"].to_numpy()

train_ent["sequence"]=train_features["sequence"].to_numpy()
test_ent["sequence"]=test_features["sequence"].to_numpy()

# Special features for sensor 2!

In [None]:
%%time
# Standard deviation is among the most important features, so the concept is
# extended a little: upward and downward moves are summarised separately.
def poz_mean(x):
    """Mean of the consecutive differences with non-positive (and NaN) steps set to 0."""
    steps = x.diff()
    upward = steps.where(steps > 0, 0).to_numpy()
    return np.nanmean(upward)

def neg_mean(x):
    """Mean of the consecutive differences with non-negative (and NaN) steps set to 0."""
    steps = x.diff()
    downward = steps.where(steps < 0, 0).to_numpy()
    return np.nanmean(downward)

def poz_std(x):
    """
    Standard deviation of the upward moves of a series.

    Consecutive differences are taken and every non-positive step (including
    the leading NaN) is replaced by 0 before the standard deviation is computed.
    This previously returned the mean, which duplicated ``poz_mean``.
    """
    y=x.diff()
    y=np.where(y>0,y,0)
    return np.nanstd(y)

def neg_std(x):
    """Standard deviation of the consecutive differences with non-negative (and NaN) steps set to 0."""
    steps = x.diff()
    downward = steps.where(steps < 0, 0).to_numpy()
    return np.nanstd(downward)

# Metric map for the direction-aware statistics defined above. Keys are
# column-name prefixes; each value is a one-element list with the function.
metric_data_s2={
    "poz_mean_": [poz_mean],
    "neg_mean_": [neg_mean],
    "poz_std_": [poz_std],
    "neg_std_": [neg_std]
}

# Direction-aware statistics per sequence, requested for sensor "02".
sensor2_kwargs = dict(
    metric_data=metric_data_s2,
    group_variables=["sequence"],
    sensor_identifiers=["02"],
    suffix="",
)
train_features_s2, gen_col_train_s2 = generate_features(df=train, **sensor2_kwargs)
test_features_s2, gen_col_test_s2 = generate_features(df=test, **sensor2_kwargs)

# Infinite values are replaced by 0 in both tables.
for s2_table in (train_features_s2, test_features_s2):
    s2_table.replace([np.inf, -np.inf], 0, inplace=True)

In [None]:
# Preview the first rows of the direction-aware feature table.
train_features_s2.head()

In [None]:
# Number of rows recorded per subject (one row per step, so this grows with the
# number of sequences a subject has).
train_subj_len = train.groupby("subject")["sequence"].count().reset_index(name="sequence_len")
test_subj_len = test.groupby("subject")["sequence"].count().reset_index(name="sequence_len")

# Build the training table: labels + subject id, then every feature table is
# left-joined on its key, in this order.
train_feature_final = train_labels.merge(
    train[["subject", "sequence"]].drop_duplicates(), how="left", on="sequence"
)
train_merge_plan = [
    (train_features, "sequence"),
    (train_features_s, "subject"),
    (train_subj_len, "subject"),
    (train_emg, "sequence"),
    (train_ent, "sequence"),
    (train_features_s2, "sequence"),
    (train_freq, ['sequence', 'subject']),
]
for feature_table, merge_key in train_merge_plan:
    train_feature_final = train_feature_final.merge(feature_table, how="left", on=merge_key)

# The list of feature columns used by the models.
explanatory_variables = (
    gen_col_train
    + gen_col_train_s
    + ["sequence_len"]
    + freq_columns
    + emg_cols
    + ent_cols
    + gen_col_test_s2
)

# Same construction for the test table, starting from the unique (subject, sequence) pairs.
test_feature_final = test[["subject", "sequence"]].drop_duplicates()
test_merge_plan = [
    (test_features, "sequence"),
    (test_features_s, "subject"),
    (test_subj_len, "subject"),
    (test_emg, "sequence"),
    (test_ent, "sequence"),
    (test_features_s2, "sequence"),
    (test_freq, ['sequence', 'subject']),
]
for feature_table, merge_key in test_merge_plan:
    test_feature_final = test_feature_final.merge(feature_table, how="left", on=merge_key)

# Any value left missing after the joins becomes 0.
test_feature_final.fillna(0, inplace=True)
train_feature_final.fillna(0, inplace=True)

In [None]:
# Transform train and test to the 0-1 scale; the scaler is fitted on train only.
scaler = MinMaxScaler()
scaled_train_values = scaler.fit_transform(train_feature_final.loc[:, explanatory_variables])
train_feature_final.loc[:, explanatory_variables] = scaled_train_values
scaled_test_values = scaler.transform(test_feature_final.loc[:, explanatory_variables])
test_feature_final.loc[:, explanatory_variables] = scaled_test_values

# Strongly right-skewed, non-negative features get a square-root transform,
# which makes their distribution less skewed. The decision is taken on train
# and applied to both tables.
for feature_name in explanatory_variables:
    train_column = train_feature_final[feature_name]
    is_heavily_skewed = train_column.skew() > 3
    has_negative_values = train_column.min() < 0.0
    if is_heavily_skewed and not has_negative_values:
        train_feature_final[feature_name] = np.sqrt(train_feature_final[feature_name])
        test_feature_final[feature_name] = np.sqrt(test_feature_final[feature_name])

# Missing values produced by the transform are set to 0.
test_feature_final.fillna(0, inplace=True)
train_feature_final.fillna(0, inplace=True)

In [None]:
# Hand-picked features whose class-conditional distributions are plotted below.
selected_features=["kurtosis_04", 
                   "sequence_len",
                   "std_02",
                   "kurtosis_10",
                   "sensor_09_freq_0",
                   "sensor_09_freq_1",
                   "sensor_01_freq_0",
                   "sensor_02_freq_2",
                   "p05_09",
                   "max_05",
                   "p25_10",
                   "p10_04",
                   "poz_std_02",
                   "neg_std_02",
                   "poz_mean_02",
                   "neg_mean_02",
                   "d_mean_04"
                  ]

In [None]:
# One figure per selected feature: overlaid normalised histograms for the two states.
is_state_0 = train_feature_final["state"] < 1
is_state_1 = train_feature_final["state"] > 0
for feature in selected_features:
    plt.figure(figsize=(20,6))
    plt.hist(train_feature_final.loc[is_state_0, feature],
             bins=200, density=True, label='State : 0', color='#CBC3E3')
    plt.hist(train_feature_final.loc[is_state_1, feature],
             bins=200, density=True, label='State : 1', color='#F4B123', alpha = 0.5)
    plt.ylabel('Frequency')
    plt.title('Distribution of values for feature: '+feature, fontsize=15)
    plt.legend()
    plt.show()

# 3. Variable selection <a name="sel"></a>

In [None]:
# Shared evaluation settings for every model below:
# the scorer name and the number of cross-validation repetitions.
scoref="roc_auc"
repeat_numb=5

In [None]:
# Here I create a reduced list of variables.
# First I estimate a logit model with the unreduced set of variables, then I estimate the same model with the reduced set.

In [None]:
%%time
# Logistic regression on the full feature set, cross-validated and refit by the helper.
log0 =LogisticRegression(random_state=42,max_iter=12000, C=1.6)
# max_iter is raised well above the low default to avoid convergence warnings.
# C is the inverse regularization strength: C=1.6 is above the default of 1.0,
# i.e. slightly weaker regularization than the default (chosen arbitrarily).
log0, feat_imp=fit_model_using_classifier(log0, 
                                          dtrain=train_feature_final, 
                                          predictors=explanatory_variables,
                                          repeat=repeat_numb,
                                          scoring=scoref)

In [None]:
%%time
# Permutation importance of every feature for the fitted logit model,
# measured on the training data with the shared scorer.
perm_result = permutation_importance(
    log0,
    X=train_feature_final[explanatory_variables],
    y=train_feature_final["state"],
    n_repeats=10,
    scoring=scoref,
    random_state=42,
)

In [None]:
# Permutation importances per feature, in percentage points of the score.
# Mean and standard deviation are both multiplied by 100 so the two columns
# are in the same unit (the std used to be left unscaled).
res_select=pd.DataFrame({
    "variable": explanatory_variables,
    "importances_mean": perm_result.importances_mean*100,
    "importances_std": perm_result.importances_std*100
})
# Most important features first; the table is also saved for later inspection.
res_select.sort_values(by=["importances_mean"],inplace=True,ascending=False)
res_select.to_csv("res_select.csv",index=False)

In [None]:
# Three nested feature lists, cut at decreasing importance thresholds
# (importances_mean is in percentage points).
mean_importance = res_select["importances_mean"]
super_short_list = res_select.loc[mean_importance > 1, "variable"].tolist()
short_list = res_select.loc[mean_importance > 0.1, "variable"].tolist()
longer_list = res_select.loc[mean_importance > 0.05, "variable"].tolist()

In [None]:
# Show the features whose permutation importance exceeds the highest threshold.
print("Most important features")
print(super_short_list)

In [None]:
# Compare the size of the full feature set with the short list.
print("Original number of features: "+str(len(explanatory_variables)))
print("Short list number of features: "+str(len(short_list)))

# 4. LGBM Classifier <a name="vars"></a>

In [None]:
# Shared LightGBM hyper-parameters, passed as keyword arguments to every
# LGBMClassifier below. max_depth is left commented out (not applied).
params_lgbm={
    "colsample_bytree": 0.8,
    "n_estimators": 500,
    "min_child_samples":50, 
    #"max_depth":3,
    "learning_rate":0.045
}

In [None]:
%%time
# LightGBM on the full feature set; params_lgbm adds some restrictions to avoid overfitting.
# NOTE(review): LightGBM's own name for this metric appears to be "auc" — confirm
# that "roc_auc" is accepted. The helper fits without an evaluation set.
lgbm0 = LGBMClassifier(random_state=42, metric="roc_auc", objective="binary", **params_lgbm)
lgbm0, feat_imp = fit_model_using_classifier(
    lgbm0,
    dtrain=train_feature_final,
    predictors=explanatory_variables,
    repeat=repeat_numb,
    scoring=scoref,
)

In [None]:
# Submission with hard class labels from the full-feature LightGBM model.
lgbm0_model_submission = test_feature_final[["sequence"]].assign(
    state=lgbm0.predict(test_feature_final[explanatory_variables])
)
lgbm0_model_submission.to_csv("lgbm0_model_submission.csv", index=False)

In [None]:
# Submission with the predicted probability of state 1 from the full-feature LightGBM model.
lgbm0_model_submission = test_feature_final[["sequence"]].assign(
    state=lgbm0.predict_proba(test_feature_final[explanatory_variables])[:, 1]
)
lgbm0_model_submission.to_csv("lgbm0_model_prob_submission.csv", index=False)

In [None]:
%%time
# LightGBM restricted to the features in longer_list; same restrictions against overfitting.
# NOTE(review): LightGBM's own name for this metric appears to be "auc" — confirm
# that "roc_auc" is accepted.
lgbm2 = LGBMClassifier(random_state=42, metric="roc_auc", objective="binary", **params_lgbm)
lgbm2, feat_imp = fit_model_using_classifier(
    lgbm2,
    dtrain=train_feature_final,
    predictors=longer_list,
    repeat=repeat_numb,
    scoring=scoref,
)

In [None]:
# Submission with hard class labels from the reduced-feature LightGBM model.
lgbm2_model_submission = test_feature_final[["sequence"]].assign(
    state=lgbm2.predict(test_feature_final[longer_list])
)
lgbm2_model_submission.to_csv("lgbm2_model_submission.csv", index=False)

In [None]:
# Submission with the predicted probability of state 1 from the reduced-feature LightGBM model.
lgbm2_model_submission = test_feature_final[["sequence"]].assign(
    state=lgbm2.predict_proba(test_feature_final[longer_list])[:, 1]
)
lgbm2_model_submission.to_csv("lgbm2_model_prob_submission.csv", index=False)

# 5. Catboost classifier

In [None]:
# Shared CatBoost settings, passed as keyword arguments to every
# CatBoostClassifier fitted through the helper below.
cat_params={
    "iterations":4000,
    "learning_rate":0.025,
    'loss_function' : 'Logloss',
    "eval_metric":"AUC",
    "verbose":False
}

In [None]:
%%time
# CatBoost on the full feature set, cross-validated and refit through the shared helper.
clf = CatBoostClassifier(**cat_params)
clf, feat_imp = fit_model_using_classifier(
    clf,
    dtrain=train_feature_final,
    predictors=explanatory_variables,
    repeat=repeat_numb,
    scoring=scoref,
)

In [None]:
# Keep every feature to which the last fitted model assigned a positive importance.
selected_features_cat = feat_imp.index[feat_imp > 0.00].tolist()

In [None]:
# Submission with hard class labels from the full-feature CatBoost model.
clf_model_submission = test_feature_final[["sequence"]].assign(
    state=clf.predict(test_feature_final[explanatory_variables])
)
clf_model_submission.to_csv("clf_model_submission.csv", index=False)

In [None]:
# Submission with the predicted probability of state 1 from the full-feature CatBoost model.
clf_model_submission = test_feature_final[["sequence"]].assign(
    state=clf.predict_proba(test_feature_final[explanatory_variables])[:, 1]
)
clf_model_submission.to_csv("clf_model_prob_submission.csv", index=False)

In [None]:
# CatBoost refit on the features in selected_features_cat.
clf1 = CatBoostClassifier(**cat_params)
clf1, feat_imp = fit_model_using_classifier(
    clf1,
    dtrain=train_feature_final,
    predictors=selected_features_cat,
    repeat=repeat_numb,
    scoring=scoref,
)

In [None]:
# Submission with hard class labels from the second CatBoost model.
clf1_model_submission = test_feature_final[["sequence"]].assign(
    state=clf1.predict(test_feature_final[selected_features_cat])
)
clf1_model_submission.to_csv("clf1_model_submission.csv", index=False)

In [None]:
# Submission with the predicted probability of state 1 from the second CatBoost model.
clf1_model_submission = test_feature_final[["sequence"]].assign(
    state=clf1.predict_proba(test_feature_final[selected_features_cat])[:, 1]
)
clf1_model_submission.to_csv("clf1_model_prob_submission.csv", index=False)

In [None]:
# Further decrease in variables: keep features with importance above 0.1 in the
# last fitted model, plus everything in longer_list.
selected_features_cat=list(feat_imp[feat_imp>0.1].index)
# The union is sorted because set iteration order for strings can change from
# run to run, which would change the column order fed to the models and make
# the results non-reproducible.
selected_features_cat=sorted(set(longer_list).union(selected_features_cat))

In [None]:
# CatBoost refit on the further reduced feature list.
clf2 = CatBoostClassifier(**cat_params)
clf2, feat_imp = fit_model_using_classifier(
    clf2,
    dtrain=train_feature_final,
    predictors=selected_features_cat,
    repeat=repeat_numb,
    scoring=scoref,
)

In [None]:
# Submission with the predicted probability of state 1 from the third CatBoost model.
clf2_model_submission = test_feature_final[["sequence"]].assign(
    state=clf2.predict_proba(test_feature_final[selected_features_cat])[:, 1]
)
clf2_model_submission.to_csv("clf2_model_prob_submission.csv", index=False)

In [None]:
# Show the feature list used by the remaining models and the ensemble.
print(selected_features_cat)

# 6. Gradient Boosting (scikit-learn)  <a name="xg"></a>

In [None]:
%%time
# scikit-learn gradient boosting with default settings on the reduced feature list.
gbm1 = GradientBoostingClassifier(random_state=42)
gbm1, feat_imp = fit_model_using_classifier(
    gbm1,
    dtrain=train_feature_final,
    predictors=selected_features_cat,
    repeat=repeat_numb,
    scoring=scoref,
)

In [None]:
# Submission with the predicted probability of state 1 from the gradient boosting model.
xgb_model_submission = test_feature_final[["sequence"]].assign(
    state=gbm1.predict_proba(test_feature_final[selected_features_cat])[:, 1]
)
xgb_model_submission.to_csv("xgb_model_submission_select.csv", index=False)

In [None]:
%%time
# LightGBM on the reduced feature list; same restrictions against overfitting.
# NOTE(review): LightGBM's own name for this metric appears to be "auc" — confirm
# that "roc_auc" is accepted.
lgbm1 = LGBMClassifier(random_state=42, metric="roc_auc", objective="binary", **params_lgbm)
lgbm1, feat_imp = fit_model_using_classifier(
    lgbm1,
    dtrain=train_feature_final,
    predictors=selected_features_cat,
    repeat=repeat_numb,
    scoring=scoref,
)

# 7. Ensemble  <a name="ens"></a>

In [None]:
def get_models():
    """Return the (name, estimator) pairs that make up the ensemble's base models."""
    return [
        ('lgbm1', lgbm1),
        ('xgb', gbm1),
        ('clf2', clf2),
    ]

def fit_ensemble(models, X_train, X_val, y_train, y_val, soft_vote=True):
    """
    Fit every base model on the training set and a blender on their hold-out predictions.

    When ``X_val`` / ``y_val`` is None the corresponding training data is used
    instead. With ``soft_vote`` the base models contribute their probability of
    class 1, otherwise their predicted labels.

    Returns the fitted blender and the matrix of base-model predictions
    (one column per model) it was fitted on.
    """
    if X_val is None:
        X_val = X_train
    if y_val is None:
        y_val = y_train

    prediction_columns = []
    for _, base_model in models:
        base_model.fit(X_train, y_train)
        if soft_vote:
            base_pred = base_model.predict_proba(X_val)[:, 1]
        else:
            base_pred = base_model.predict(X_val)
        # Each model becomes one column of the blender's input.
        prediction_columns.append(base_pred.reshape(len(base_pred), 1))

    meta_X = np.hstack(prediction_columns)
    blender = CatBoostClassifier(verbose=False)
    blender.fit(meta_X, y_val)
    return blender, meta_X

def predict_ensemble(models, blender, X_test, soft_vote=True):
    """
    Predict the probability of class 1 with the blended ensemble.

    Every (already fitted) base model predicts on ``X_test``: its probability
    of class 1 when ``soft_vote`` is true, otherwise its labels. The
    predictions are stacked column-wise and passed to the blender.
    """
    prediction_columns = []
    for _, base_model in models:
        if soft_vote:
            base_pred = base_model.predict_proba(X_test)[:, 1]
        else:
            base_pred = base_model.predict(X_test)
        prediction_columns.append(base_pred.reshape(len(base_pred), 1))
    stacked_predictions = np.hstack(prediction_columns)
    return blender.predict_proba(stacked_predictions)[:, 1]

In [None]:
%%time
# Validate the blended ensemble on three different random splits.
# The data is split three ways: base models are fitted on train_in, the blender
# is fitted on the base models' predictions for train_blend, and the AUC is
# measured on train_out, which neither the base models nor the blender has
# seen. (Scoring the blender on the rows it was fitted on is optimistic.)
for i in range(42,45):
    train_in, train_rest=train_test_split(train_feature_final,test_size=0.33, random_state=i)
    train_blend, train_out=train_test_split(train_rest,test_size=0.5, random_state=i)
    blender, meta_X=fit_ensemble(models=get_models(),
                     X_train=train_in[selected_features_cat],
                     X_val=train_blend[selected_features_cat],
                     y_train=train_in["state"],
                     y_val=train_blend["state"], soft_vote=True)

    pred_state=predict_ensemble(models=get_models(), blender=blender, X_test=train_out[selected_features_cat])
    print(metrics.roc_auc_score(train_out["state"],pred_state))

In [None]:
# Final ensemble: base models and blender are both fitted on the whole training
# table (no separate validation data is passed).
blender_final, _ = fit_ensemble(
    models=get_models(),
    X_train=train_feature_final[selected_features_cat],
    X_val=None,
    y_train=train_feature_final["state"],
    y_val=None,
)

In [None]:
# Submission with the blended probability of state 1 for every test sequence.
ens_model_submission = test_feature_final[["sequence"]].assign(
    state=predict_ensemble(
        models=get_models(),
        blender=blender_final,
        X_test=test_feature_final[selected_features_cat],
    )
)
ens_model_submission.to_csv("ens_model_submission.csv", index=False)

# 8. Acknowledgement  <a name="ack"></a>

In [None]:
"""
    I got the idea to use frequencies and the Fourier transform by looking at Pavel Salikov's notebook
    https://www.kaggle.com/code/matanivanov/lgbm-with-fourier-transform
    
    The fit_model_using_classifier function is based on this article
    https://www.analyticsvidhya.com/blog/2016/02/complete-guide-parameter-tuning-gradient-boosting-gbm-python/
    
    To create my ensemble solution I used this source:
    https://machinelearningmastery.com/blending-ensemble-machine-learning-with-python/
    
    I got the idea to use the LGBM classifier from Kelli Belcher's notebook
    https://www.kaggle.com/code/kellibelcher/time-series-classification-with-lstms-sensor-eda

"""

In [None]:
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SequentialFeatureSelector.html
# https://scikit-learn.org/stable/modules/generated/sklearn.inspection.permutation_importance.html
# https://www.kaggle.com/competitions/tabular-playground-series-apr-2022/discussion/318527
# https://predict-idlab.github.io/tsflex/features/
# https://www.kaggle.com/code/ahmetcelik158/tps-apr-22-lstm-with-pytorch#1.-Data-Preparation