# Notes and Summary of Kernel 

This is an experiment on feature extraction and feature selection. The initial idea and part of the code came from this fantastic notebook by [JEROENVDD](https://www.kaggle.com/code/jeroenvdd/tsflex-x-tsfresh-feature-extraction) and from the [tsflex GitHub repository](https://github.com/predict-idlab/tsflex).

In [None]:
# Install the feature-extraction / feature-selection packages imported in the next cell.
!pip install tsflex tsfresh powershap catch22

In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import matplotlib.transforms as mtransforms
import seaborn as sns 

#from sklearn.ensemble import HistGradientBoostingRegressor
from xgboost  import XGBClassifier

from sklearn.calibration import calibration_curve, CalibratedClassifierCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GroupKFold, KFold
from sklearn.metrics import roc_auc_score, auc

import gc

#tsflex
from tsflex.features import FeatureCollection, MultipleFeatureDescriptors

# we use a tsfresh and catch22 wrappers to extract features 
from tsflex.features.integrations import tsfresh_settings_wrapper, catch22_wrapper

# for tsFRESH we use two settings to extract features: 'Comprehensive' includes many features, 'Minimal' only includes a smaller subset (of 'Comprehensive')
from tsfresh.feature_extraction import ComprehensiveFCParameters, MinimalFCParameters

from powershap import PowerShap
import catch22

In [None]:
# NOTE(review): EPOCHS and FOLDS are not referenced in the visible part of
# this notebook — presumably consumed by a later training step; confirm.
EPOCHS = 4096
FOLDS = 10

# When True, the raw per-step sensor columns are dropped from the pivoted
# frames after the summary features have been added.
DROP_SENSOR= True

# When True, add_features() is applied to the pivoted train/test frames.
ADD_FEATURES = True

# NOTE(review): not referenced in the visible code — purpose to be confirmed.
SHIFT_VALS = False

# NOTE(review): not referenced in the visible code — purpose to be confirmed.
CALIBRATION = False

In [None]:
# Load the competition files from the Kaggle input directory.
train_original = pd.read_csv("../input/tabular-playground-series-apr-2022/train.csv")
test_original = pd.read_csv("../input/tabular-playground-series-apr-2022/test.csv")
# NOTE(review): "lables" is a typo for "labels"; the name is kept because
# code outside this view may reference it.
train_lables = pd.read_csv("../input/tabular-playground-series-apr-2022/train_labels.csv")
# The sample submission uses its first column as the index.
sub= pd.read_csv("../input/tabular-playground-series-apr-2022/sample_submission.csv", index_col = 0)

In [None]:
# Every training column whose name contains "sensor" is treated as a sensor series.
sensor_cols = [name for name in train_original.columns if "sensor" in name]
sensor_cols

# Downcasting 

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place to smaller dtypes.

    Integer columns are converted to the smallest of int8/int16/int32 whose
    range contains the column's minimum and maximum (bounds inclusive).
    Float columns whose values fit the float32 range are converted to
    float32. A column is only ever converted to a dtype that is strictly
    smaller than its current one, so float16/float32 columns are never
    upcast. Non-numeric columns are left untouched.

    Note that float64 -> float32 conversion keeps the range but loses
    precision.

    Args:
        df: DataFrame to shrink. It is modified in place.
        verbose: When True, print the final memory usage and the percentage
            saved.

    Returns:
        The same ``df`` object, after downcasting.
    """
    numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Ordered from smallest to largest so the first fit is the tightest one.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtypes

        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # Inclusive bounds: a column spanning exactly [-128, 127] fits int8.
            # An empty column yields NaN min/max, every comparison is False,
            # and the column is left unchanged.
            target = next(
                (
                    candidate
                    for candidate in int_candidates
                    if np.iinfo(candidate).min <= c_min
                    and c_max <= np.iinfo(candidate).max
                ),
                None,
            )
        elif (np.finfo(np.float32).min <= c_min
              and c_max <= np.finfo(np.float32).max):
            target = np.float32
        else:
            # Values outside the float32 range (or inf / all-NaN): keep as is.
            target = None

        # Only convert when it actually saves memory; never upcast.
        if target is not None and np.dtype(target).itemsize < col_type.itemsize:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2

    if verbose:
        # Guard against a zero-byte frame so the report never divides by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))

    return df

# Downcast both raw frames; reduce_mem_usage modifies its argument in place,
# so the return values do not need to be rebound.
reduce_mem_usage(train_original)
reduce_mem_usage(test_original)

# Additional Features

In [None]:
# Reshape from long to wide: one row per (sequence, subject), and for every
# sensor one column per step (columns become a two-level sensor/step index).
train_pivoted = train_original.pivot(index=['sequence', 'subject'], columns='step', values=sensor_cols)
test_pivoted = test_original.pivot(index=['sequence', 'subject'], columns='step', values=sensor_cols)

In [None]:
import scipy

def add_features(df):
    """Append per-row summary statistics of every sensor to ``df`` in place.

    For each name in the module-level ``sensor_cols``, ``df[name]`` is
    expected to be a 2-D block of per-step values (as produced by the pivot
    earlier in this notebook). Row-wise statistics over that block are added
    as new columns: mean, std, range, several quantiles, skew, IQR,
    std/|mean| and kurtosis. Extra features are then derived from
    ``sensor_02`` (step-to-step increases/decreases) and from fixed
    thresholds on ``sensor_02``, ``sensor_04`` and ``sensor_12``.

    Args:
        df: Pivoted frame containing every sensor in ``sensor_cols``.
            It is modified in place.

    Returns:
        The same ``df`` object with the feature columns added.
    """
    # A bare ``import scipy`` does not guarantee that the ``stats`` submodule
    # is loaded on every SciPy version, so import it explicitly.
    import scipy.stats

    for col in sensor_cols:
        steps = df[col]  # per-step values of this sensor, one row per sequence
        mean = steps.mean(axis=1)
        std = steps.std(axis=1)

        df[f"mean_{col}"] = mean
        df[f"std_{col}"] = std
        df[f"max-min_{col}"] = steps.max(axis=1) - steps.min(axis=1)
        df[f"q25_{col}"] = steps.quantile(q=0.25, axis=1)
        df[f"q50_{col}"] = steps.quantile(q=0.5, axis=1)
        df[f"q75_{col}"] = steps.quantile(q=0.75, axis=1)
        df[f"q95_{col}"] = steps.quantile(q=0.95, axis=1)
        df[f"q99_{col}"] = steps.quantile(q=0.99, axis=1)
        df[f"skew_{col}"] = steps.skew(axis=1)

        # From AMBROSM --> https://www.kaggle.com/code/ambrosm/tpsapr22-best-model-without-nn#Cross-validation
        df[col + '_iqr'] = scipy.stats.iqr(steps, axis=1)
        # std / |mean|; NaN (0/0) becomes 0 and infinities are clipped.
        df[col + '_sm'] = np.nan_to_num(std / mean.abs()).clip(-1e30, 1e30)
        df[f"kurtosis_{col}"] = scipy.stats.kurtosis(steps, axis=1)

    sensor_02 = df["sensor_02"]
    # Step-to-step differences, computed once and reused by every feature below.
    sensor_02_diff = sensor_02.diff(axis=1)

    up_count = (sensor_02_diff > 0).sum(axis=1)
    down_count = (sensor_02_diff < 0).sum(axis=1)
    up_sum = sensor_02_diff.clip(0, None).sum(axis=1)
    down_sum = sensor_02_diff.clip(None, 0).sum(axis=1)

    df['sensor_02_up'] = up_count
    df['sensor_02_down'] = down_count
    df['sensor_02_upsum'] = up_sum
    df['sensor_02_downsum'] = down_sum
    df['sensor_02_upmax'] = sensor_02_diff.max(axis=1)
    df['sensor_02_downmax'] = sensor_02_diff.min(axis=1)
    # Rows with no increases/decreases divide 0 by 0; nan_to_num maps that to 0.
    df['sensor_02_upmean'] = np.nan_to_num(up_sum / up_count, posinf=40)
    df['sensor_02_downmean'] = np.nan_to_num(down_sum / down_count, neginf=-40)

    # Binary indicators from thresholds chosen during the author's EDA.
    df['sensor_02_mean_up'] = (sensor_02.mean(axis=1) > -0.2).astype(int)
    df['sensor_02_std_up'] = (sensor_02.std(axis=1) > 2.0).astype(int)
    df['sensor_12_std_up'] = (df["sensor_12"].std(axis=1) > 39).astype(int)
    df['sensor_04_std_up'] = (df["sensor_04"].std(axis=1) > 1.68).astype(int)

    return df

if ADD_FEATURES:
    print("Adding Features")
    # add_features modifies its argument in place and returns it, so
    # train/test refer to the same objects as train_pivoted/test_pivoted.
    train = add_features(train_pivoted)
    test = add_features(test_pivoted)
else:
    train = train_pivoted.copy()
    # Define `test` on this path as well so both branches leave the same names bound.
    test = test_pivoted.copy()

### Drop the sensor data
We will drop the original sensor columns, as we have a number of other features that should represent this data \
We have also already run an LSTM model on this data without any additional features (as neural networks prefer homogeneous data).

As such, my assumption is that this XGBoost model will learn additional inferences which, when ensembled with the LSTM, will give greater insights.


In [None]:
if DROP_SENSOR:
    print("dropping raw sensor data")
    # Remove the per-step sensor columns; columns added by add_features
    # (when ADD_FEATURES is enabled) are kept.
    train_pivoted = train_pivoted.drop(sensor_cols,axis =1)
    test_pivoted = test_pivoted.drop(sensor_cols,axis =1)

In [None]:
# Number of distinct sequences recorded for each subject, stored in a "count" column.
sub_stat_train= train_original[['sequence', 'subject']].drop_duplicates().groupby('subject').agg({'sequence': 'count'}).rename(columns={'sequence': 'count'}).reset_index()
# Attach the per-subject count to every row, matching on the "subject" index level.
# NOTE(review): merge() does not keep the (sequence, subject) index of the left
# frame, and the left frame has two column levels while the right has one —
# behaviour here may depend on the pandas version; confirm.
train_pivoted = train_pivoted.merge(sub_stat_train,left_on= train_pivoted.index.get_level_values("subject"), right_on="subject").drop("subject",axis =1)

# Same count for the test set, computed from the test subjects only.
sub_stat_test= test_original[['sequence', 'subject']].drop_duplicates().groupby('subject').agg({'sequence': 'count'}).rename(columns={'sequence': 'count'}).reset_index()
test_pivoted = test_pivoted.merge(sub_stat_test,left_on= test_pivoted.index.get_level_values("subject"),  right_on="subject").drop("subject",axis =1)

train_pivoted

The features below are adapted from https://www.kaggle.com/code/hasanbasriakcay/tpsapr22-fe-pseudo-labels-bi-lstm/notebook

In [None]:
def create_new_features(df):
    """Add a sensor_02 threshold flag and four sensor-sum columns to ``df``.

    The frame is modified in place and also returned. It must contain the
    columns sensor_00, sensor_01, sensor_02, sensor_03, sensor_04, sensor_06,
    sensor_07, sensor_09, sensor_10 and sensor_11.
    """
    def _add_in_order(names):
        # Left-to-right addition, in exactly the listed order.
        total = df[names[0]]
        for name in names[1:]:
            total = total + df[name]
        return total

    # 1 where sensor_02 is above -15, otherwise 0.
    df['sensor_02_num'] = (df['sensor_02'] > -15).astype(int)

    sum_groups = (
        ('sensor_sum1', ['sensor_00', 'sensor_09', 'sensor_06', 'sensor_01']),
        ('sensor_sum2', ['sensor_01', 'sensor_11', 'sensor_09', 'sensor_06', 'sensor_00']),
        ('sensor_sum3', ['sensor_03', 'sensor_11', 'sensor_07']),
        ('sensor_sum4', ['sensor_04', 'sensor_10']),
    )
    for target, members in sum_groups:
        df[target] = _add_in_order(members)

    return df

# train = create_new_features (train)
# test = create_new_features (test)

In [None]:
# Columns of train_pivoted that are neither raw sensor names nor identifier columns.
# NOTE(review): "subject" is listed twice in the exclusion list (harmless), and
# if the column labels are tuples at this point the string comparisons never
# match — confirm the column labels before relying on this list.
added_cols = [col for col in train_pivoted.columns if col not in sensor_cols and col not in ["sequence", "subject","subject", "step"]]

In [None]:
# Report the size of the hand-crafted feature frames.
print("train shape:",train_pivoted.shape)
print("test shape:",test_pivoted.shape)

# Run a garbage-collection pass before the heavier feature extraction below.
gc.collect()

## Tsflex

In [None]:
settings = ComprehensiveFCParameters()  # all the tsfresh features
del settings["linear_trend_timewise"]  # requires a time-index

#settings = MinimalFCParameters()  # small subset of tsfresh features


# Apply every configured tsfresh function to every sensor column, using a
# window of 60 with a stride of 60 (i.e. non-overlapping windows).
# NOTE(review): presumably 60 is the number of steps per sequence, so each
# window covers exactly one sequence — confirm against the data.
fc = FeatureCollection(
    MultipleFeatureDescriptors(
        functions=tsfresh_settings_wrapper(settings), 
        series_names=sensor_cols,
        windows=60,
        strides=60
    )
)

print("features to be extracted:", settings)

In [None]:
print("Extracting features on the training data")
# Compute the tsfresh features on the long-format frame; each output row is
# indexed by the start of its window.
train_feats = fc.calculate(train_original, show_progress=True, return_df=True, window_idx="begin")
# Give train_pivoted the index of train_feats and join the two on that index.
# NOTE(review): set_index assigns the new index positionally, so this assumes
# train_pivoted has the same number of rows as train_feats, in the same
# order — confirm.
train_feats = train_feats.merge(train_pivoted.set_index(train_feats.index), left_index=True, right_index=True)

print("Extracting features on the testing data")
test_feats = fc.calculate(test_original, show_progress=True, return_df=True, window_idx="begin")
# Same positional alignment assumption as for the training data.
test_feats = test_feats.merge(test_pivoted.set_index(test_feats.index), left_index=True, right_index=True)

In [None]:
# Report the size of the feature frames after adding the tsfresh features.
print("train shape:",train_feats.shape)
print("test shape:",test_feats.shape)

gc.collect()

# Catch 22

In [None]:
# Rebind `fc` to a collection that applies catch22 to every sensor column,
# using the same window/stride of 60 as the tsfresh collection above.
fc = FeatureCollection(
    MultipleFeatureDescriptors(
        functions=catch22_wrapper(catch22.catch22_all), 
        series_names=sensor_cols,
        windows=60,
        strides=60
    )
)

In [None]:
# Compute the catch22 features with the same calculate() arguments as the tsfresh pass.
train22= fc.calculate(train_original, show_progress=True, return_df=True, window_idx="begin")
test22= fc.calculate(test_original, show_progress=True, return_df=True, window_idx="begin")

# Append the catch22 columns side by side; concat with axis=1 aligns rows on the index.
train_feats = pd.concat([train_feats,train22], axis =1)
test_feats = pd.concat([test_feats,test22], axis =1)

In [None]:
# Report the size of the feature frames after adding the catch22 features.
print("train shape:",train_feats.shape)
print("test shape:",test_feats.shape)

gc.collect()

In [None]:
# Write the full feature sets to CSV in the working directory.
# NOTE(review): the "nopowershap" suffix suggests no PowerShap selection has
# been applied at this point; none is applied in the visible code.
train_feats.to_csv("train_feats_nopowershap.csv")
test_feats.to_csv("test_feats_nopowershap.csv")