# Notes and Summary of Kernel 

This is an experiment on feature extraction and feature selection. The initial idea and part of the code came from this fantastic notebook by [JEROENVDD](https://www.kaggle.com/code/jeroenvdd/tsflex-x-tsfresh-feature-extraction) and from the [tsflex GitHub repository](https://github.com/predict-idlab/tsflex).

In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import matplotlib.transforms as mtransforms
import seaborn as sns 

#from sklearn.ensemble import HistGradientBoostingRegressor
from xgboost  import XGBClassifier

from sklearn.calibration import calibration_curve, CalibratedClassifierCV
from sklearn.preprocessing import StandardScaler, QuantileTransformer
from sklearn.model_selection import train_test_split, GroupKFold, KFold
from sklearn.metrics import roc_auc_score, auc

import gc


In [None]:
# Upper bound on boosting rounds; passed to XGBoost as `n_estimators`.
EPOCHS = 4096
# Number of splits used by the cross-validation section.
FOLDS = 10

# When True, the raw per-step sensor columns are dropped after feature engineering.
DROP_SENSOR= False

# When True, features are standardised with StandardScaler before training.
SCALING = True

# When True, each CV fold's model is wrapped in an isotonic CalibratedClassifierCV.
CALIBRATION = True

Additional features are taken from my [own notebook](https://www.kaggle.com/code/slythe/feature-extraction-tsflex-catch22/notebook?scriptVersionId=93997916), with reference to [broccoli beef](https://www.kaggle.com/code/siukeitin/tps042022-fe-2500-features-with-tsfresh-catch22/notebook).


In [None]:
# Raw competition data in long format (pivoted on `step` further down).
train_original = pd.read_csv("../input/tabular-playground-series-apr-2022/train.csv")
test_original = pd.read_csv("../input/tabular-playground-series-apr-2022/test.csv")

# Pre-computed feature tables from a separate dataset.
# NOTE(review): presumably one row per sequence, in the same order as the pivoted raw data — confirm.
train_feats = pd.read_csv("../input/tps-april-powershap-features/train_add_feats.csv")
test_feats = pd.read_csv("../input/tps-april-powershap-features/test_add_feats.csv")
# The name `train_lables` (sic) is referenced by later cells, so the spelling is kept.
train_lables = pd.read_csv("../input/tabular-playground-series-apr-2022/train_labels.csv")
# Submission template, indexed by its first column.
sub= pd.read_csv("../input/tabular-playground-series-apr-2022/sample_submission.csv", index_col = 0)

In [None]:
# Every column of the raw training frame whose name contains "sensor".
sensor_cols = list(filter(lambda name: "sensor" in name, train_original.columns))
sensor_cols

In [None]:
# Report the size of the pre-computed feature tables, then display the training one.
shapes = {"train shape:": train_feats.shape, "test shape:": test_feats.shape}
for label, shape in shapes.items():
    print(label, shape)
train_feats

# Added features 


In [None]:
# Reshape long -> wide: one row per (sequence, subject), one column per (sensor, step).
pivot_kwargs = dict(index=['sequence', 'subject'], columns='step', values=sensor_cols)
train_pivoted = train_original.pivot(**pivot_kwargs)
test_pivoted = test_original.pivot(**pivot_kwargs)

In [None]:
import scipy

def add_features(df, sensors=None):
    """Append per-row summary statistics of each sensor to a pivoted frame, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide frame with two-level columns whose first level is the sensor name,
        so that ``df["sensor_02"]`` selects all step columns of that sensor.
        It must contain ``sensor_02``, ``sensor_04`` and ``sensor_12``, which
        get extra hand-crafted features.
    sensors : sequence of str, optional
        Sensor names to summarise. Defaults to the module-level ``sensor_cols``.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``; the new columns are added in place.
    """
    # Function-scope import: a bare `import scipy` does not guarantee that the
    # `scipy.stats` submodule is loaded.
    from scipy import stats

    if sensors is None:
        sensors = sensor_cols

    quantiles = (("q01", 0.01), ("q25", 0.25), ("q50", 0.5),
                 ("q75", 0.75), ("q95", 0.95), ("q99", 0.99))

    for col in sensors:
        steps = df[col]  # all step columns of this sensor
        mean = steps.mean(axis=1)
        std = steps.std(axis=1)

        df[f"mean_{col}"] = mean
        df[f"std_{col}"] = std
        df[f"max-min_{col}"] = steps.max(axis=1) - steps.min(axis=1)
        for tag, q in quantiles:
            df[f"{tag}_{col}"] = steps.quantile(q=q, axis=1)
        df[f"skew_{col}"] = steps.skew(axis=1)

        #From AMBROSM  --> https://www.kaggle.com/code/ambrosm/tpsapr22-best-model-without-nn#Cross-validation
        values = steps.to_numpy()
        df[col + '_iqr'] = stats.iqr(values, axis=1)
        # std / |mean|; 0/0 becomes 0 and infinities are clipped to +-1e30.
        df[col + '_sm'] = np.nan_to_num(std / mean.abs()).clip(-1e30, 1e30)
        df[f"kurtosis_{col}"] = stats.kurtosis(values, axis=1)

    # Step-to-step changes of sensor_02, computed once and reused below.
    sensor_02 = df["sensor_02"]
    delta = sensor_02.diff(axis=1)
    up_count = (delta > 0).sum(axis=1)
    down_count = (delta < 0).sum(axis=1)
    up_sum = delta.clip(0, None).sum(axis=1)
    down_sum = delta.clip(None, 0).sum(axis=1)

    df['sensor_02_up'] = up_count
    df['sensor_02_down'] = down_count
    df['sensor_02_upsum'] = up_sum
    df['sensor_02_downsum'] = down_sum
    df['sensor_02_upmax'] = delta.max(axis=1)
    df['sensor_02_downmax'] = delta.min(axis=1)
    # Rows without any up/down step give 0/0, which nan_to_num maps to 0.
    df['sensor_02_upmean'] = np.nan_to_num(up_sum / up_count, posinf=40)
    df['sensor_02_downmean'] = np.nan_to_num(down_sum / down_count, neginf=-40)

    ## Trying as per my EDA: 0/1 flags for rows above hand-picked thresholds.
    df['sensor_02_mean_up'] = (sensor_02.mean(axis=1) > -0.2).astype(int)
    df['sensor_02_std_up'] = (sensor_02.std(axis=1) > 2.0).astype(int)
    df['sensor_12_std_up'] = (df["sensor_12"].std(axis=1) > 39).astype(int)
    df['sensor_04_std_up'] = (df["sensor_04"].std(axis=1) > 1.68).astype(int)

    return df

# add_features adds its columns to the frame it is given, so the returned
# frame is not re-assigned here.
add_features(train_pivoted)
add_features(test_pivoted)

print("train pvt shape:",train_pivoted.shape)
print("test pvt  shape:",test_pivoted.shape)

In [None]:
# Optionally keep only the engineered features by removing the raw per-step readings.
if DROP_SENSOR:
    print("dropping raw sensor data")
    train_pivoted, test_pivoted = (
        frame.drop(columns=sensor_cols) for frame in (train_pivoted, test_pivoted)
    )

In [None]:
# sub_stat_train= train_original[['sequence', 'subject']].drop_duplicates().groupby('subject').agg({'sequence': 'count'}).rename(columns={'sequence': 'count'}).reset_index()
# train_pivoted["count"] = train_pivoted.merge(sub_stat_train,left_on= train_pivoted.index.get_level_values("subject"), right_on="subject")["count"].values

# sub_stat_test= test_original[['sequence', 'subject']].drop_duplicates().groupby('subject').agg({'sequence': 'count'}).rename(columns={'sequence': 'count'}).reset_index()
# test_pivoted["count"] = test_pivoted.merge(sub_stat_test,left_on= test_pivoted.index.get_level_values("subject"), right_on="subject")["count"].values

# train_pivoted

In [None]:
# Join the engineered pivot features onto the pre-computed feature tables.
# The left-hand keys are arrays taken from the *pivoted* frame's index rather than
# columns of train_feats/test_feats; the right-hand keys are the pivoted frame's
# index levels "sequence" and "subject".
# NOTE(review): this presumably relies on train_feats/test_feats having the same number
# of rows, in the same order, as the pivoted frames — confirm; a positional mismatch
# would pair features with the wrong sequence.
train_feats = train_feats.merge(train_pivoted,right_on = ["sequence", "subject"] , left_on = [train_pivoted.index.get_level_values(0), train_pivoted.index.get_level_values(1)])
test_feats = test_feats.merge(test_pivoted,right_on = ["sequence", "subject"] , left_on = [test_pivoted.index.get_level_values(0), test_pivoted.index.get_level_values(1)])

In [None]:
# Columns removed before modelling. Labels are (name, step) tuples for raw sensor
# readings and (name, '') tuples for engineered features.
# NOTE(review): presumably collected from the zero-importance list printed after
# training — confirm.
# Every entry on the two continuation lines repeats a label already present on the
# first line.
drop_cols = [('sensor_01', 50), ('sensor_12_std_up', ''), ('sensor_02_std_up', ''), ('sensor_02', 16), ('sensor_09_sm', ''), ('sensor_02_mean_up', ''), ('sensor_12_sm', ''), ('sensor_01', 46), ('sensor_01', 2), ('sensor_08_sm', ''), ('sensor_11_sm', ''), ('sensor_01', 38), ('sensor_01', 37), ('sensor_00', 3), ('sensor_01', 28), ('sensor_07_sm', ''), ('sensor_00', 26), ('sensor_00', 28), ('sensor_01', 23), ('sensor_00', 52), ('sensor_02', 35), ('sensor_05', 46), ('sensor_02', 36), ('sensor_07', 54), ('sensor_08', 14), ('sensor_11', 53), ('sensor_11', 55), ('sensor_08', 7), ('sensor_08', 6), ('sensor_12', 0), ('sensor_08', 2), ('sensor_12', 11), ('sensor_11', 41), ('sensor_07', 51), ('sensor_07', 38), ('sensor_07', 28), ('sensor_12', 26), ('sensor_12', 29), ('sensor_07', 1), ('sensor_06', 59), ('sensor_08', 17), ('sensor_08', 23), ('sensor_06', 54), ('sensor_09', 17), ('sensor_09', 53), ('sensor_10', 42), ('sensor_10', 48), ('sensor_10', 52), ('sensor_10', 56), ('sensor_09', 26), ('sensor_09', 20), ('sensor_11', 11), ('sensor_08', 27), ('sensor_11', 18), ('sensor_11', 20), ('sensor_09', 4), ('sensor_11', 25), ('sensor_08', 50), ('sensor_08', 36), ('sensor_08', 30), ('sensor_12', 38), ('sensor_06', 51), ('sensor_02', 49), ('sensor_03', 44), ('sensor_04', 30), ('sensor_03_sm', ''), ('sensor_04', 23), ('sensor_04', 19), ('sensor_04', 13), ('sensor_04', 1), ('sensor_03', 58), ('sensor_05_sm', ''), ('sensor_05', 2), ('sensor_03', 28), ('sensor_03', 22), ('sensor_03', 20), ('sensor_06_sm', ''), ('sensor_03', 18), ('sensor_02', 54), ('sensor_02', 53), ('sensor_04', 49), ('sensor_05', 11), ('sensor_06', 50), ('sensor_00_sm', ''), ('sensor_06', 47), ('sensor_06', 43), ('sensor_06', 42), ('sensor_06', 30), ('sensor_06', 23), ('sensor_05', 55), ('sensor_05', 50), ('sensor_10', 21), ('sensor_05', 13), ('sensor_05', 38), ('sensor_05', 37), ('sensor_05', 36), ('sensor_05', 30), ('sensor_01_sm', ''), ('sensor_05', 23), ('sensor_05', 16), ('sensor_04_std_up', ''),
            ('sensor_00_sm', ''), ('sensor_01_sm', ''), ('sensor_03_sm', ''),('sensor_05_sm', ''),('sensor_06_sm', ''), ('sensor_07_sm', ''),('sensor_08_sm', ''),
             ('sensor_09_sm', ''),('sensor_11_sm', ''),('sensor_12_sm', ''),('sensor_02_std_up', ''),('sensor_04_std_up', '')]
# In-place drops: re-running this cell on already-reduced frames raises a KeyError.
train_feats.drop(drop_cols, axis =1 , inplace = True)
test_feats.drop(drop_cols, axis =1 , inplace = True)

In [None]:
# Report the size of the merged, pruned feature tables, then display the training one.
shapes = {"train shape:": train_feats.shape, "test shape:": test_feats.shape}
for label, shape in shapes.items():
    print(label, shape)
train_feats

# Downcasting 

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` in place to narrower dtypes.

    Integer columns are cast to the narrowest of int8/int16/int32 whose range
    contains the column's min and max (bounds inclusive). float64 columns are
    cast to float32 when their min and max fit; this loses precision. Columns
    of any other dtype are left untouched, as are columns whose min/max are
    NaN or infinite, so nothing is ever widened.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; modified in place.
    verbose : bool, default True
        Print the resulting memory footprint and the relative reduction.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``.
    """
    numerics = ['int8','int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # Narrowest type first; inclusive bounds so that e.g. 127 still fits int8.
            for candidate in (np.int8, np.int16, np.int32):
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif col_type == 'float64':
            limits = np.finfo(np.float32)
            # NaN or infinite extremes fail both comparisons and keep float64.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(np.float32)

    end_mem = df.memory_usage().sum() / 1024**2

    if verbose:
        # Guard the ratio in case the frame reports zero bytes.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))

    return df

# Both frames are downcast in place. The second call is the cell's last expression,
# so the frame it returns (test_feats) is displayed as the cell output.
reduce_mem_usage(train_feats)
reduce_mem_usage(test_feats)

# Split & Scale

In [None]:
# Hold-out split without shuffling: the leading rows train, the trailing 33% validate.
# Features and labels are paired purely by row position.
# NOTE(review): assumes train_feats rows are in the same order as train_labels.csv — confirm.
X_train, X_test, y_train, y_test = train_test_split(train_feats, train_lables["state"], test_size=0.33, shuffle = False)

In [None]:
# Standardise using statistics fitted on the training part only, then apply the same
# transform to the hold-out part and to the test features.
# NOTE(review): in the visible notebook `test_s` is not read again before the CV loop
# re-assigns it, so the transform of test_feats here looks redundant — confirm.
if SCALING:
    print("scaling")
    scaler= StandardScaler()
    X_train = scaler.fit_transform(X_train) 
    X_test = scaler.transform(X_test)
    test_s = scaler.transform(test_feats)
    
# Alternative scaler kept for reference (disabled):
#     scaler = QuantileTransformer(n_quantiles=2000, 
#                              output_distribution='normal', 
#                              random_state=42).fit(X_train)
#     X_train = scaler.fit_transform(X_train) 
#     X_test = scaler.transform(X_test)
#     test_s = scaler.transform(test_feats.drop(["sequence", "subject"], axis =1))

# Build and Train

from [C4RL05/V](https://www.kaggle.com/code/cv13j0/tps-apr-2022-xgboost-model#2.-Setting-the-Notebook)  

In [None]:
# Hyper-parameters shared by the hold-out model and every CV fold model.
params = {'n_estimators': EPOCHS,
          'max_depth': 6,
          'learning_rate': 0.015,
          'subsample': 0.95,
          'colsample_bytree': 0.60,
          'reg_lambda': 1.5,
          'reg_alpha': 6.1,
          'gamma': 6.1, #6,
          'random_state': 69,
          'objective': 'binary:logistic',
          # Histogram-based tree construction.
          # NOTE(review): 'hist' by itself does not select leaf-wise growth; that is
          # controlled by `grow_policy`, which is not set here — confirm intent.
          'tree_method': 'hist'  #"hist"#'gpu_hist',
         }

In [None]:
# Train on the first part of the data; boosting stops once the hold-out AUC has not
# improved for 128 rounds.
# NOTE(review): `use_label_encoder`, and passing `eval_metric` / `early_stopping_rounds`
# to fit(), are version-sensitive in XGBoost (newer releases expect the latter two on
# the constructor) — confirm against the installed version.
xgb = XGBClassifier(**params,use_label_encoder=False )
xgb.fit(X_train, y_train, eval_set = [(X_test, y_test)], eval_metric = ['auc'], early_stopping_rounds = 128, verbose = 50)

In [None]:
# Class probabilities on both parts of the split; column 1 is the positive class.
train_preds = xgb.predict_proba(X_train)
val_preds = xgb.predict_proba(X_test)

val_auc = roc_auc_score(y_test, val_preds[:, 1])
train_auc = roc_auc_score(y_train, train_preds[:, 1])

print("Validation AUC:" , val_auc)
print("Intrinsic AUC:", train_auc)
val_preds

In [None]:
# Importance of every input column of the hold-out model, largest first.
feat_importance = pd.DataFrame(
    {"feature_importance": xgb.feature_importances_},
    index=train_feats.columns.get_level_values(0),
).sort_values("feature_importance", ascending=False)

# Top 20 among the features the model actually used.
feat_importance.loc[feat_importance["feature_importance"] > 0].head(20)

In [None]:
# Bottom 20 among the features with non-zero importance.
feat_importance.loc[feat_importance["feature_importance"] > 0].tail(20)

In [None]:
# List every feature the hold-out model never used.
print("FEATURES with 0 importance\n")
unused = feat_importance.loc[feat_importance["feature_importance"] == 0]
print(list(unused.index))

# Calibration plot & recalibration
https://wttech.blog/blog/2021/a-guide-to-model-calibration/

In [None]:
# Bin the hold-out probabilities into 10 bins: `prob_true` is the observed fraction
# of positives per bin, `prob_pred` the mean predicted probability per bin.
prob_true, prob_pred = calibration_curve(y_test, val_preds[:,1], n_bins=10)

In [None]:
fig, ax = plt.subplots(figsize=(20,10))
ax.plot(prob_pred, prob_true, marker='o', linewidth=1, label='xgb model probabilities')

# Reference line for perfect calibration. It is drawn in data coordinates so that it
# is the y = x line whatever the axis limits are; a line placed with ax.transAxes
# would only coincide with y = x when both axes span exactly [0, 1].
ax.plot([0, 1], [0, 1], color='black', label='perfect calibration')
# Marker at a predicted probability of 0.2, referred to in the notes below the plot.
ax.axvline(x=0.2, color="r")
fig.suptitle('Calibration plot')
ax.set_xlabel('Predicted probability (mean)')
ax.set_ylabel('Fraction of positives (%True  in each bin)')
ax.legend()
plt.show()

#### Notes on graph:
We have an S-shaped calibration plot.

This showcases that we are:
* **Underpredicting** where the fraction of True positive values is **higher** 
* **Over predicting** where the fraction of True positive values is **lower**

i.e. if 20% of the samples in a bin are actually positive (in other words, the fraction of positives is 20%), a well-calibrated model should predict a probability of about 20% for that bin.
However (see the red line on the graph), we are predicting a probability of just under 30% --> overpredicting.

Calibration tries to fix this by "shifting" the values using calibration methods such as:
* isotonic 
* sigmoid (Platt's method, also known as Platt scaling)

In [None]:
# Wrap the already-fitted model (cv='prefit') and learn an isotonic mapping of its
# probabilities on the hold-out part.
# NOTE(review): the calibrator is fitted and scored on the same X_test/y_test, so the
# "calibrated AUC" below is not an out-of-sample figure — confirm this is acceptable.
calibrator = CalibratedClassifierCV(xgb, method = "isotonic", cv='prefit')
calibrator.fit(X_test, y_test)
cal_preds = calibrator.predict_proba(X_test)

print("Validation AUC:" , roc_auc_score(y_test, val_preds[:, 1] ))
print("calibrated AUC:" , roc_auc_score(y_test, cal_preds[:, 1] ))

# Cross Validation

In [None]:
# Release the hold-out experiment's objects before cross-validation starts.
del xgb, X_train, y_train, X_test, y_test, val_preds, train_preds, calibrator

gc.collect()

In [None]:
# Full training set for cross-validation. `X` is another name for train_feats, not a
# copy, and is paired with the labels by row position.
X = train_feats
y = train_lables["state"]

In [None]:
# Plain K-fold; with the default shuffle=False the folds are contiguous row ranges.
# NOTE(review): GroupKFold is imported at the top of the notebook but not used. KFold
# ignores any `groups` passed to split(), so no subject/sequence grouping is applied —
# confirm that is intended.
kfold = KFold(n_splits = FOLDS)

In [None]:
auc_cv = []               # one validation AUC per fold
preds = []                # one vector of test-set probabilities per fold
feature_importances = []  # one importance vector per fold

# NOTE(review): KFold.split() ignores `groups`, so the argument below does not affect
# the folds; it is kept only so the call is unchanged. Use a group-aware splitter if
# grouped folds are wanted.
for fold, (train_idx, val_idx) in enumerate(kfold.split(X, y, groups=train_pivoted.index.get_level_values("sequence"))):

    print("\n","#"*10, f"Fold {fold+1}","#"*10)
    X_train, X_test = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[val_idx]

    if SCALING:
        # Scaler statistics come from this fold's training rows only.
        print("scaling")
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        test_s = scaler.transform(test_feats)
    else:
        test_s = test_feats.copy()

    model = XGBClassifier(**params,use_label_encoder=False)
    model.fit(X_train, y_train, eval_set = [(X_test, y_test)], eval_metric = ['auc'], early_stopping_rounds = 128, verbose = 50)

    #save feature importances
    feature_importances.append(model.feature_importances_)

    # Pick the object that produces this fold's probabilities.
    if CALIBRATION:
        # NOTE(review): the calibrator is fitted on the same validation fold it is
        # scored on below, so the reported AUC is not out-of-sample for it.
        calibrator = CalibratedClassifierCV(model, method = "isotonic", cv='prefit')
        calibrator.fit(X_test, y_test)
        predictor = calibrator
        auc_label = "\n Calibration AUC:"
    else:
        predictor = model
        auc_label = "\n Validation AUC:"

    # Named `fold_auc` so the `auc` function imported from sklearn.metrics is not
    # overwritten.
    fold_auc = roc_auc_score(y_test, predictor.predict_proba(X_test)[:, 1])
    print(auc_label, fold_auc)
    preds.append(predictor.predict_proba(test_s)[:, 1])
    auc_cv.append(fold_auc)

    # Free per-fold objects before the next iteration.
    del X_train, X_test, y_train, y_test, model, predictor

    gc.collect()

print("FINAL AUC: ", np.mean(auc_cv))

In [None]:
# One column of importances per CV fold, indexed by feature label, saved to disk.
importances = pd.DataFrame(
    {f"fold{i}": feature_importances[i] for i in range(FOLDS)},
    index=train_feats.columns.get_level_values(0),
)
importances.to_csv("feature_importances.csv")
importances

# Submission

In [None]:
# Average the per-fold test probabilities and write them into the submission template.
stacked_preds = np.vstack(preds)
final_preds = stacked_preds.sum(axis=0) / FOLDS
sub["state"] = final_preds
sub.to_csv("submission.csv")
sub

In [None]:
# Distribution of the submitted probabilities.
fig, ax = plt.subplots(figsize=(20, 8))
sns.histplot(sub["state"], ax=ax)
plt.show()