**PLEASE UPVOTE https://www.kaggle.com/code/ambrosm/tpsapr22-best-model-without-nn/notebook**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
from cycler import cycler
from IPython.display import display
import datetime
import scipy.stats

from sklearn.model_selection import GroupKFold, cross_val_score
from sklearn.ensemble import HistGradientBoostingRegressor, \
    HistGradientBoostingClassifier
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.metrics import roc_auc_score, roc_curve
from xgboost import XGBClassifier
from sklearn.pipeline import make_pipeline

# Notebook-wide display settings.
# Use the fully qualified option key: the bare 'precision' pattern is
# ambiguous/removed on recent pandas releases and raises OptionError there.
pd.set_option('display.precision', 3)

# Dark axes background, and replace the first colour of the default colour
# cycle with 'deeppink' while keeping the remaining default colours.
plt.rcParams['axes.facecolor'] = '#575757'
_default_colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
plt.rcParams['axes.prop_cycle'] = cycler(color=['deeppink'] + _default_colors[1:])

In [None]:
def display_df(df, head=3):
    """Print the shape of *df* and render its first *head* rows.

    Args:
        df: Object exposing ``.shape`` and ``.head()`` (a DataFrame or Series).
        head: Number of leading rows to show.
    """
    print('SHAPE:', df.shape)
    preview = df.head(head)
    display(preview)

In [None]:
def pre_increment(name, local=None):
    """Increment a named counter by one and return the new value.

    The counter is looked up in *local* first; when *local* is not given or
    does not contain *name*, the module-level global of that name is
    incremented instead.

    Args:
        name: Name of the counter variable.
        local: Optional mapping holding counters; updated in place when it
            contains *name*.

    Returns:
        The counter value after the increment.

    Raises:
        KeyError: If *name* is neither in *local* nor a module global.
    """
    # ``None`` sentinel instead of a shared mutable default dict.
    if local is not None and name in local:
        local[name] += 1
        return local[name]

    module_globals = globals()
    module_globals[name] += 1
    return module_globals[name]

In [None]:
def info_df(df, count, order):
    """Print a numbered header and a preview of *df* when *count* is truthy.

    Args:
        df: Object to preview via ``display_df``.
        count: Flag; nothing is printed when it is falsy.
        order: Ignored on input -- the value is replaced by the incremented
            module-level ``order`` counter before it is printed.
    """
    if not count:
        return

    # Name of the first module global bound to this exact object ('' if none).
    module_globals = globals()
    name = next((key for key in module_globals if module_globals[key] is df), '')

    order = pre_increment('order')
    print('=' * 30)
    print(f'{order} INFO_DF {name}:\n')
    display_df(df)

In [None]:
# Training measurements; later cells use its `sequence`, `subject`, `step`
# and `sensor_*` columns.
train = pd.read_csv('../input/tabular-playground-series-apr-2022/train.csv')

In [None]:
# Training labels; later cells use its `sequence` and `state` columns.
train_labels = pd.read_csv('../input/tabular-playground-series-apr-2022/train_labels.csv')

In [None]:
# Test measurements; pivoted later exactly like `train`.
test = pd.read_csv('../input/tabular-playground-series-apr-2022/test.csv')

In [None]:
# Sample submission file; only displayed for reference further down.
ss = pd.read_csv('../input/tabular-playground-series-apr-2022/sample_submission.csv')

In [None]:
# Quick look at the raw training rows.
display_df(train, head=10)

In [None]:
# Label-based slice: rows with index labels 30 through 70 (inclusive).
train.loc[30:70, ]

In [None]:
# Number of rows per sequence, in order of appearance.
train.sequence.value_counts(sort=False)

In [None]:
# Number of rows per subject, in order of appearance.
train.subject.value_counts(sort=False)

In [None]:
# pd.set_option('display.max_rows', None)

In [None]:
# train[train.subject == 47]

In [None]:
# Number of rows per step value, in order of appearance.
train.step.value_counts(sort=False)

In [None]:
# pd.set_option('display.max_columns', 100)

In [None]:
# All sensor column names, then reshape to one row per (sequence, subject)
# with a (sensor, step) column MultiIndex.
sensors = [col for col in train.columns if 'sensor_' in col]
train_pivot_0 = train.pivot(index=['sequence', 'subject'], columns='step', values=sensors)
# Reset the global counter that info_df increments for its numbered headers.
order = 0
info_df(train_pivot_0, 1, order)

In [None]:
# pd.set_option('display.max_columns', None)

In [None]:
# train_pivot_0.loc[0, 'sensor_00']

In [None]:
# Reset the info_df header counter before defining/running feature engineering.
order = 0

def feature_engineering(df):
    """Build per-row summary features from a pivoted sensor frame.

    For every name in the module-level ``sensors`` list the row-wise mean,
    standard deviation, inter-quartile range, std/|mean| ratio and kurtosis
    are computed over the columns under that sensor.  Additional features
    describe the step-to-step differences of ``sensor_02``.

    ``info_df`` is called after each new column; previews are shown for the
    first sensor and for the ``sensor_02`` difference features only (the
    local ``count`` flag is zero for the remaining sensors).

    Args:
        df: Frame whose columns are grouped by sensor name (as produced by
            the ``pivot`` calls in this notebook).

    Returns:
        A new DataFrame sharing ``df.index`` with one column per feature.
    """
    new_df = pd.DataFrame([], index=df.index)
    count = 1
    for sensor in sensors:
        readings = df[sensor]

        new_df[sensor + '_mean'] = readings.mean(axis=1)
        info_df(new_df, count, order)

        new_df[sensor + '_std'] = readings.std(axis=1)
        info_df(new_df, count, order)

        new_df[sensor + '_iqr'] = scipy.stats.iqr(readings, axis=1)
        info_df(new_df, count, order)

        # std / |mean|; NaN (0/0) becomes 0 and infinities are capped at 1e30.
        ratio = new_df[sensor + '_std'] / new_df[sensor + '_mean'].abs()
        new_df[sensor + '_sm'] = np.nan_to_num(ratio).clip(-1e30, 1e30)
        info_df(new_df, count, order)

        new_df[sensor + '_kurtosis'] = scipy.stats.kurtosis(readings, axis=1)
        info_df(new_df, count, order)

        # Only preview the columns of the first sensor.
        count = 0

    count = 1
    # Differences between consecutive columns, computed once and reused by
    # every sensor_02 feature below.
    steps = df['sensor_02'].diff(axis=1)

    new_df['sensor_02_up'] = (steps > 0).sum(axis=1)
    info_df(new_df, count, order)

    new_df['sensor_02_down'] = (steps < 0).sum(axis=1)
    info_df(new_df, count, order)

    new_df['sensor_02_upsum'] = steps.clip(0, None).sum(axis=1)
    info_df(new_df, count, order)

    new_df['sensor_02_downsum'] = steps.clip(None, 0).sum(axis=1)
    info_df(new_df, count, order)

    new_df['sensor_02_upmax'] = steps.max(axis=1)
    info_df(new_df, count, order)

    new_df['sensor_02_downmax'] = steps.min(axis=1)
    info_df(new_df, count, order)

    # Mean size of an upward step; 0/0 -> 0, +inf replaced by 40.
    new_df['sensor_02_upmean'] = np.nan_to_num(
        new_df['sensor_02_upsum'] / new_df['sensor_02_up'], posinf=40)
    info_df(new_df, count, order)

    # Mean size of a downward step; 0/0 -> 0, -inf replaced by -40.
    new_df['sensor_02_downmean'] = np.nan_to_num(
        new_df['sensor_02_downsum'] / new_df['sensor_02_down'], neginf=-40)
    info_df(new_df, count, order)
    return new_df

In [None]:
# Engineered features for the training sequences (one row per sequence/subject).
train_pivot = feature_engineering(train_pivot_0)

In [None]:
# Reset the info_df header counter and preview the engineered features.
order = 0
info_df(train_pivot, 1, order)

In [None]:
display_df(train_labels)

In [None]:
# Spot-check the label row of a single sequence id.
train_labels[train_labels['sequence'] == 10611]

In [None]:
# Occurrences of each sequence id in the label table.
train_labels.sequence.value_counts()

In [None]:
# Shuffle the feature rows reproducibly.
train_shuffle = train_pivot.sample(frac=1.0, random_state=1)

# Align the labels with the shuffled rows by looking each row's sequence id
# up in an index built from the `sequence` column.  (Reindexing the raw label
# table would match sequence ids against row positions and only be correct
# when row i happens to hold sequence i.)
labels_shuffle = train_labels.set_index('sequence').reindex(
    train_shuffle.index.get_level_values('sequence'))

display_df(labels_shuffle)

In [None]:
display_df(train)

In [None]:
# Attach a subject to every label row: the minimum `subject` per sequence is
# taken from `train` and left-joined on `sequence`.
# NOTE(review): presumably every sequence belongs to exactly one subject, so
# min() just picks that subject -- confirm.
labels_shuffle = labels_shuffle[['state']]\
    .merge(train[['sequence', 'subject']].groupby('sequence').min(),
           how='left', 
           on='sequence')
display_df(labels_shuffle)

In [None]:
# Add `sequence_count`: the number of label rows that share the row's subject.
labels_shuffle = labels_shuffle.merge(
    labels_shuffle.groupby('subject').size().rename('sequence_count'),
    how='left', 
    on='subject')
display_df(labels_shuffle)

In [None]:
# Copy the count onto the feature frame positionally (via .values), so this
# relies on labels_shuffle still being in the same row order as train_shuffle.
train_shuffle['subject_sequence_count'] = labels_shuffle['sequence_count'].values
display_df(train_shuffle)

In [None]:
# Start from all feature columns; later cells narrow this selection down.
selected_columns = train_shuffle.columns
len(selected_columns)

In [None]:
# One small panel per feature: rows are sorted by the feature value and the
# rolling mean (window 1000) of `state` is plotted against the sorted position.
n_plots = len(train_shuffle.columns)
ncols = max(1, n_plots // 13)
# Ceiling division so the grid always has enough slots for every feature
# (the previous fixed 15 rows only fit by coincidence).
nrows = -(-n_plots // ncols)

plt.subplots(nrows, ncols, sharey=True, sharex=True, figsize=(15, 40))
count = 1
order = 0
for i, col in enumerate(train_shuffle.columns):
    temp = pd.DataFrame({col: train_shuffle[col].values,
                         'state': labels_shuffle.state.values})
    # info_df itself does nothing when `count` is falsy, so only the first
    # feature is previewed.
    info_df(temp, count, order)

    temp = temp.sort_values(col)
    temp.reset_index(inplace=True)
    info_df(temp, count, order)

    plt.subplot(nrows, ncols, i + 1)
    plt.scatter(temp.index, temp.state.rolling(1000).mean(), s=2)
    plt.xlabel(col)
    plt.xticks([])

    count = 0

plt.show()

In [None]:
# Features excluded from modelling.
features_drop = [
    'sensor_05_kurtosis', 'sensor_08_mean',
    'sensor_05_std', 'sensor_06_kurtosis',
    'sensor_06_std', 'sensor_03_std',
    'sensor_02_kurtosis', 'sensor_03_kurtosis',
    'sensor_09_kurtosis', 'sensor_03_mean',
    'sensor_00_mean', 'sensor_02_iqr',
    'sensor_05_mean', 'sensor_06_mean',
    'sensor_07_std', 'sensor_10_iqr',
    'sensor_11_iqr', 'sensor_12_iqr',
    'sensor_09_mean', 'sensor_02_sm',
    'sensor_03_sm', 'sensor_05_iqr',
    'sensor_06_sm', 'sensor_09_iqr',
    'sensor_07_iqr', 'sensor_10_mean',
]
# Keep the remaining columns in their original order.
_drop_lookup = frozenset(features_drop)
selected_columns = [f for f in selected_columns if f not in _drop_lookup]
len(selected_columns)

In [None]:
# Greedy sequential feature selection scored by subject-grouped 5-fold AUC.
# Forward mode adds the best feature each round; backward mode removes the
# feature whose removal yields the best score.
estimator = HistGradientBoostingClassifier(learning_rate=0.05,
                                           max_leaf_nodes=25,
                                           max_iter=1000,
                                           min_samples_leaf=500,
                                           l2_regularization=1,
                                           max_bins=255,
                                           random_state=4,
                                           verbose=0)
X, y = train_shuffle[selected_columns], labels_shuffle.state
n_iterations, backward = 48, False

# Never run more rounds than there are candidates: forward selection can add
# at most every feature, backward elimination must leave at least one column.
n_features = X.shape[1]
max_rounds = n_features - 1 if backward else n_features
n_iterations = max(0, min(n_iterations, max_rounds))

if n_iterations != 0:
    # In forward mode True marks a selected feature, in backward mode a
    # removed one.
    current_mask = np.zeros(shape=n_features, dtype=bool)
    history = []

    # Loop invariants.
    X_values = X.values
    subject_groups = train_shuffle.index.get_level_values('subject')
    cv = GroupKFold(n_splits=5)

    for _ in range(n_iterations):
        candidate_feature_indices = np.flatnonzero(~current_mask)
        scores = {}
        for feature_idx in candidate_feature_indices:
            candidate_mask = current_mask.copy()
            candidate_mask[feature_idx] = True
            X_new = X_values[:, ~candidate_mask if backward else candidate_mask]
            scores[feature_idx] = cross_val_score(
                estimator,
                X_new,
                y,
                cv=cv,
                groups=subject_groups,
                scoring='roc_auc',
                n_jobs=-1
            ).mean()

        new_feature_idx = max(scores, key=scores.get)
        current_mask[new_feature_idx] = True
        history.append(scores[new_feature_idx])

        new = 'Deleted' if backward else 'Added'
        print(f'{new} FEATURE: {str(X.columns[new_feature_idx]):30}'
              f' {scores[new_feature_idx]:.3f}')

    print()
    plt.figure(figsize=(12, 6))
    plt.scatter(np.arange(len(history)) + (0 if backward else 1), history)
    plt.ylabel('AUC')
    plt.xlabel('FEATURES REMOVED' if backward else 'FEATURES ADDED')
    plt.title('SEQUENTIAL FEATURE SELECTION')
    plt.gca().xaxis.set_major_locator(MaxNLocator(integer=True))

    plt.show()

    # Turn the "removed" mask into a "kept" mask for backward elimination.
    if backward:
        current_mask = ~current_mask
    selected_columns = np.array(selected_columns)[current_mask]
    print(selected_columns)

In [None]:
%%time

print(f'{len(selected_columns)} FEATURES')
score_list = []
kf = GroupKFold(n_splits=5)

for fold, (idx_tr, idx_va) in enumerate(kf.split(
    train_shuffle, groups=train_shuffle.index.get_level_values('subject'))):
    X_tr = train_shuffle.iloc[idx_tr][selected_columns]
    X_va = train_shuffle.iloc[idx_va][selected_columns]
    
    y_tr = labels_shuffle.iloc[idx_tr].state
    y_va = labels_shuffle.iloc[idx_va].state
    
    model = HistGradientBoostingClassifier(learning_rate=0.05,
                                           max_leaf_nodes=25,
                                           max_iter=1000, 
                                           min_samples_leaf=500,
                                           l2_regularization=1,
                                           validation_fraction=0.05,
                                           max_bins=63,
                                           random_state=3, 
                                           verbose=0)
    
#     model = XGBClassifier(n_estimators=500, n_jobs=-1,
#                           eval_metric=['logloss'],
#                           #max_depth=10,
#                           colsample_bytree=0.8,
#                           #gamma=1.4,
#                           reg_alpha=6, reg_lambda=1.5,
#                           tree_method='hist',
#                           learning_rate=0.03,
#                           verbosity=1,
#                           use_label_encoder=False, random_state=3)

    if True or type(model) != XGBClassifier:
        model.fit(X_tr.values, y_tr)
    else:
        model.fit(X_tr.values,
                  y_tr,
                  eval_set=[(X_va.values, y_va)],
                  eval_metric=['auc'],
                  early_stopping_rounds=30,
                  verbose=10)
    try:
        y_va_pred = model.decision_function(X_va.values)
    except AttributeError:
        try:
            y_va_pred = model.predict_proba(X_va.values)[:, 1]
        except AttributeError:
            y_va_pred = model.predict(X_va.values)
      
    score = roc_auc_score(y_va, y_va_pred)
    
    try:
        print(f'FOLD {fold}: n_iter ={model.n_iter_:5d}    AUC = {score:.3f}')
    except AttributeError:
        print(f'FOLD {fold}:             AUC = {score:.3f}')
        
    score_list.append(score)
    
    print(f'OOF AUC:                        {np.mean(score_list):.3f}')

In [None]:
def plot_roc_curve(y_va, y_va_pred):
    """Draw the ROC curve of the given predictions on a square figure.

    Args:
        y_va: True binary labels.
        y_va_pred: Predicted scores for the same samples.
    """
    false_pos, true_pos, _ = roc_curve(y_va, y_va_pred)

    plt.figure(figsize=(8, 8))
    ax = plt.gca()
    ax.plot(false_pos, true_pos, color='r', lw=2)
    # Diagonal reference line from (0, 0) to (1, 1).
    ax.plot([0, 1], [0, 1], color='lime', lw=1, linestyle='--')
    ax.set_aspect('equal')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.0])
    ax.set_xlabel('FALSE POSITIVE RATE')
    ax.set_ylabel('TRUE POSITIVE RATE')
    ax.set_title('RECEIVER OPERATING CHARACTERISTIC')

    plt.show()


# y_va / y_va_pred hold the values of the last cross-validation fold.
plot_roc_curve(y_va, y_va_pred)

In [None]:
display_df(test)

In [None]:
# Same reshape as for the training data: one row per (sequence, subject).
test_pivot_0 = test.pivot(index=['sequence', 'subject'], columns='step', values=sensors)
display_df(test_pivot_0)

In [None]:
# Same engineered features as for the training data.
test_pivot = feature_engineering(test_pivot_0)
display_df(test_pivot)

In [None]:
# Number of test rows (sequences) per subject, named like the training feature.
sequence_count = test_pivot.index.to_frame(index=False)\
    .groupby('subject').size().rename('subject_sequence_count')
sequence_count

In [None]:
display_df(ss)

In [None]:
# Capture the sequence ids from the index now, before the merge below
# replaces test_pivot.
# NOTE(review): the merge on the `subject` index level presumably drops the
# `sequence` level from the result -- confirm.
submission = pd.DataFrame({'sequence': test_pivot.index.get_level_values('sequence')})
display_df(submission)

In [None]:
# Attach the per-subject sequence count as a feature column.
test_pivot = test_pivot.merge(sequence_count, how='left', on='subject')
display_df(test_pivot)

In [None]:
# Final model: fit the same classifier with 100 different seeds on all
# training rows, rank-transform each model's test scores and average the
# ranks to form the submission.
print(f'{len(selected_columns)} FEATURES')
pred_list = []

# The training and test matrices do not depend on the seed; build them once.
X_tr = train_shuffle[selected_columns]
y_tr = labels_shuffle.state
X_tr_values = X_tr.values
X_test_values = test_pivot[selected_columns].values

for seed in range(100):
    model = HistGradientBoostingClassifier(learning_rate=0.05,
                                           max_leaf_nodes=25,
                                           max_iter=1000,
                                           min_samples_leaf=500,
                                           validation_fraction=0.05,
                                           l2_regularization=1,
                                           max_bins=63,
                                           random_state=seed,
                                           verbose=0)
    model.fit(X_tr_values, y_tr)
    # Ranks (not raw scores) are averaged across seeds.
    pred_list.append(
        scipy.stats.rankdata(model.decision_function(X_test_values)))
    print(f'{seed:2}', pred_list[-1])

print()
submission['state'] = sum(pred_list) / len(pred_list)
submission.to_csv('hist_gradient__.csv', index=False)
submission