In [None]:
import numpy as np 
import pandas as pd
import plotly as py
import plotly.graph_objs as go
import plotly.express as px
from plotly.offline import init_notebook_mode
init_notebook_mode(connected = True)
import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import roc_auc_score

from sklearn.model_selection import KFold, GroupKFold, StratifiedKFold

import optuna

from tqdm import tqdm
tqdm.pandas()

import tensorflow as tf
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.layers import GlobalMaxPooling1D
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.layers import Concatenate, LSTM, GRU
from tensorflow.keras.layers import Bidirectional, Multiply

from xgboost import XGBClassifier
import lightgbm as lgb

# Seed NumPy and TensorFlow so that repeated runs start from the same random state.
np.random.seed(2022)
tf.random.set_seed(2022)

# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
#########################################################
# Competition data: sensor readings for train/test, the train labels (one `state` per `sequence`),
# and the sample submission whose `state` column is filled in after training.
train = pd.read_csv('../input/tabular-playground-series-apr-2022/train.csv')
t_lbls = pd.read_csv('../input/tabular-playground-series-apr-2022/train_labels.csv')
test = pd.read_csv('../input/tabular-playground-series-apr-2022/test.csv')
ss = pd.read_csv('../input/tabular-playground-series-apr-2022/sample_submission.csv')

# EDA

In [None]:
# Overlay the train (red) and test (blue) kernel density estimate of every sensor column,
# one panel per sensor.
sensor_cols = train.columns.tolist()[3:]  # everything after the first three columns
n_cols = 6
n_rows = -(-len(sensor_cols) // n_cols)  # ceiling division, so every sensor gets a panel

fig = plt.figure(figsize = (15, 10))
for i, sensor in enumerate(sensor_cols):
    ax = fig.add_subplot(n_rows, n_cols, i + 1)
    ax.set_title(sensor, size = 13)
    sns.kdeplot(train[sensor], color='#c21b1b', linewidth = 0.6, ax=ax)
    sns.kdeplot(test[sensor], color='#21a5de', linewidth = 0.6, ax=ax)
    # Strip the frame and the y axis: only the shape of the curves matters here.
    for side in ['right', 'left', 'top', 'bottom']:
        ax.spines[side].set_visible(False)
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.tick_params(axis='x', labelsize=8)
    ax.set_yticks([])

# Figure-level title and a colour legend, placed above the panels.
fig.text(0.5, 1.05, 'Distribution of sensors', size = 23, ha = 'center')
fig.text(0.48, 1.01, 'Train', size = 15, ha = 'center', color='#c21b1b')
fig.text(0.52, 1.01, 'Test', size = 15, ha = 'center', color='#21a5de')
fig.tight_layout(pad = 3)
plt.show()

In [None]:
# Lower-triangle heatmap of the pairwise correlation between sensor columns.
sensor_corr = train[train.columns.tolist()[3:]].corr()  # computed once, reused for mask and plot

# Boolean mask of the upper triangle (diagonal included). Building it from ones rather than
# from the correlation values keeps a cell masked even if its correlation is exactly 0.
matrix = np.triu(np.ones_like(sensor_corr, dtype=bool))

plt.figure(figsize = (14, 14))
plt.title('Correlation between sensors', size = 23)
a = sns.heatmap(sensor_corr, annot = True, cmap = 'Blues',
            mask = matrix, vmin = -0.2, vmax = 0.6, linewidths = 0.2, linecolor = 'white', cbar = False)

# Short tick labels s_0, s_1, ... - one per plotted column.
short_labels = ['s_' + str(i) for i in range(len(sensor_corr.columns))]
a.set_xticklabels(short_labels)
a.set_yticklabels(short_labels)
plt.xticks(size = 12)
plt.yticks(size = 12)
plt.show()

In [None]:
# Plot every sensor (rows) for six example sequences (columns):
# the first three sequences labelled 0 followed by the first three labelled 1.
state0_seqs = list(t_lbls[t_lbls['state']==0]['sequence'][:3])
state1_seqs = list(t_lbls[t_lbls['state']==1]['sequence'][:3])
seqs = state0_seqs + state1_seqs
colors = ['#c21b1b'] * 3 + ['#21a5de'] * 3

fig = plt.figure(figsize = (15, 20))
i = 0  # running 1-based panel index in the 13 x 6 grid
for sensor in train.columns.tolist()[3:]:
    for line_color, seq in zip(colors, seqs):
        i += 1
        ax = plt.subplot(13, 6, i)
        sns.set_style("white")

        # Column headers only on the first row of panels.
        if i <= 6:
            ax.set_title(f"Sequence {seq}", size = 12, fontname = 'monospace')

        seq_rows = train[train['sequence']==seq]
        a = sns.lineplot(data=seq_rows[sensor], color = line_color, linewidth = 1)

        ax.set_xlabel('')
        # Row label (sensor name) only on the first panel of each row.
        if i % 6 == 1:
            ax.set_ylabel(sensor, size = 12, fontname = 'monospace')
        else:
            ax.set_ylabel('')
        ax.set_xticks([])
        ax.set_yticks([])

fig.tight_layout(h_pad = 3)

plt.figtext(0.5, 1.04, 'Sequences examples', fontsize = 23, fontname = 'monospace', ha='center')
plt.figtext(0.22, 1.02, 'Target 0', fontsize = 20, fontname = 'monospace', color = '#c21b1b')
plt.figtext(0.71, 1.02, 'Target 1', fontsize = 20, fontname = 'monospace', color = '#21a5de')

plt.show()

In [None]:
def color(x):
    """Map a mean state `x` to a colour-scale position.

    Returns 0 when x <= 0.25, 0.5 when x >= 0.75 and 1 for anything else.
    """
    if x <= 0.25:
        return 0
    if x >= 0.75:
        return 0.5
    return 1

# Per-subject statistics: mean target state and number of labelled sequences.
# De-duplicate the (sequence, subject) pairs *before* merging so the join stays
# one row per sequence instead of one row per time step.
seq_subject = train[['sequence', 'subject']].drop_duplicates()
sub_stat = (
    t_lbls.merge(seq_subject, on='sequence', how='left')
    .groupby('subject')
    .agg({'state': ['mean', 'count']})
    .reset_index()
)
# Flatten the two-level header into 'subject_', 'state_mean', 'state_count'.
sub_stat.columns = sub_stat.columns.map('_'.join)

# Hover text shown for each marker.
sub_stat['text'] = (
    'Subject - <b>' + sub_stat["subject_"].astype('str')
    + '</b> <br>State - <b>' + sub_stat["state_mean"].round(2).astype('str')
    + '</b> <br>Count - <b>' + sub_stat["state_count"].astype('str')
    + '</b> <extra></extra>'
)

# Bubble chart: x = subject id, y = mean state, bubble size = number of sequences,
# colour = which of the three mean-state bands the subject falls in (see `color`).
fig = go.Figure()
fig.add_trace(go.Scatter(
    x = sub_stat['subject_'],
    y = sub_stat['state_mean'],
    mode = 'markers',
    marker=dict(
        size=sub_stat['state_count']*0.3,
        color=sub_stat['state_mean'].apply(color),
        colorscale=[[0, '#c21b1b'], [0.5, '#21a5de'], [1, '#ffdc2b']],
        # Pin the colour range so 0 / 0.5 / 1 always hit the three colour stops,
        # even if one of the bands has no subjects.
        cmin=0,
        cmax=1,
        line=dict(width=0.1, color='black')
    ),
    hovertemplate = sub_stat['text']
))

fig.update_layout(width = 1150, height=600, plot_bgcolor = 'white', title = 'Subject states',
                  title_font_size = 27, title_x = 0.5, title_y = 0.9,
                  font_family="Calibri", font_color="black")

fig.update_yaxes(title_text='Mean state', showline = True, linecolor = '#f5f2f2',
                 showgrid = True, gridwidth = 1, gridcolor = '#f5f2f2',
                 linewidth = 2, tickfont_size = 12, tickvals=[0.0, 0.25, 0.50, 0.75, 1.0])

fig.update_xaxes(title_text='Subject', showline = True, linecolor = '#f5f2f2')

fig.show()

In [None]:
# Share of each target value among the labelled sequences, drawn as one stacked horizontal bar.
state_counts = t_lbls.groupby(['state'])['state'].count()
state_share = (state_counts / len(t_lbls)).round(4)

# One row labelled 'state', one column per target value (0 and 1).
mf_ratio = pd.DataFrame(state_share).T
share_0 = mf_ratio[0]['state']
share_1 = mf_ratio[1]['state']

fig, ax = plt.subplots(1,1,figsize=(19, 2))
ax.set_title('Target distribution', size=23, y=1.05)

ax.barh(mf_ratio.index, mf_ratio[0],
        color='#c21b1b', alpha=0.9, label='Zero')
ax.barh(mf_ratio.index, mf_ratio[1], left=mf_ratio[0],
        color='#21a5de', alpha=0.9, label='One')

ax.set_xlim(0, 1)
ax.set_xticks([])
ax.set_yticks([])

# Centre a percentage and a caption inside each bar segment.
# The percentage is formatted with two decimals so float artefacts
# (e.g. 49.879999999999995) never leak into the label.
segments = [
    (share_0 / 2, share_0, 'Target "0"'),
    (share_0 + share_1 / 2, share_1, 'Target "1"'),
]
for x_center, share, caption in segments:
    ax.annotate(f"{share * 100:.2f}%",
                xy=(x_center, 'state'),
                va = 'center', ha='center', fontsize=40,
                color='white')
    ax.annotate(caption, xy=(x_center, -0.25),
                va = 'center', ha='center', fontsize=15,
                color='white')

for side in ['top', 'left', 'right', 'bottom']:
    ax.spines[side].set_visible(False)

plt.show()

# Preprocessing

The first and most obvious features that came to mind in my first [notebook](https://www.kaggle.com/code/dmitryuarov/tps-sensors-2xlstm-xgb-auc-0-976) at the beginning of the competition were lags and differences between steps. After experimenting, I found new features that gave a better mean validation AUC on 12 folds: *rolling* + *mean/std/sum*. I also tried other rolling functions, such as *var*, *min* and *max*, as well as the *expanding* window function, but they only made the result worse. Then I added 3 experimental features because of the ~0.5+ correlation between some sensors.

The main thing, which turned out to be very important, is the correlation between the state and the number of sequences that a subject had. As you can see in my plot, subjects who had more than ~95 sequences were more likely to get target "1", and subjects who had fewer than ~25 sequences were more likely to get target "0". Therefore, dividing the sequences into three groups turned out to be a great idea. Although this improved the result, it would not work in real life: most likely we would be predicting the result of a single sequence for a single subject, so a feature based on the number of sequences would be useless.

Below I describe in more detail how the new features affected the result.

In [None]:
# Raw sensor columns: every column after the first three
# (presumably "sequence", "subject" and "step", which are dropped by name later on).
features = train.columns.tolist()[3:]

def sub_imp(x):
    """Bucket a subject's sequence count: 0 if x < 25, 2 if x > 95, otherwise 1."""
    if x < 25:
        return 0
    elif x > 95:
        return 2
    else:
        return 1

def prep(df, feature_cols=None):
    """Add the engineered features to `df` in place and return it.

    For every column in `feature_cols` (per `sequence`): previous/next step value,
    difference to the previous step, and rolling mean/std/sum over windows of 3, 6 and 12.
    Also adds three sensor-product features with their lag, and `sub_imp`, the bucketed
    number of distinct sequences of the row's subject.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `sequence`, `subject`, the columns in `feature_cols` and
        `sensor_00`, `sensor_03`, `sensor_06`, `sensor_07`, `sensor_11`.
        It is modified in place; remaining NaNs are replaced by 0.
    feature_cols : list of str, optional
        Columns to derive lag/diff/rolling features from. Defaults to the
        module-level `features` list.

    Returns
    -------
    pandas.DataFrame
        The same object as `df` (callers that ignore the return value still
        get every new column, because nothing is added to a copy).
    """
    if feature_cols is None:
        feature_cols = features

    for feature in feature_cols:
        by_sequence = df.groupby('sequence')[feature]
        df[feature+'_lag1'] = by_sequence.shift(1)
        df[feature+'_back_lag1'] = by_sequence.shift(-1)

        # The first/last step of a sequence has no neighbour: use 0 instead of NaN.
        df.fillna(0, inplace=True)
        df[feature+'_diff1'] = df[feature] - df[feature+'_lag1']

        # New features
        for window in [3,6,12]:
            rolled = df.groupby('sequence')[feature].rolling(window=window, min_periods=1)
            prefix = feature+'_roll_'+str(window)
            # Dropping the group level restores the original row index for alignment.
            df[prefix+'_mean'] = rolled.mean().reset_index(level=0,drop=True)
            df[prefix+'_std'] = rolled.std().reset_index(level=0,drop=True)
            df[prefix+'_sum'] = rolled.sum().reset_index(level=0,drop=True)

    # Experimental features
    df['sens_00_06'] = df['sensor_00'] * df['sensor_06']
    df['sens_03_07'] = df['sensor_03'] * df['sensor_07']
    df['sens_03_11'] = df['sensor_03'] * df['sensor_11']

    for feature in ['sens_00_06', 'sens_03_07', 'sens_03_11']:
        df[feature + '_lag1'] = df.groupby('sequence')[feature].shift(1)
    # Also clears the NaN that a rolling std produces on a one-element window.
    df.fillna(0, inplace=True)

    # Subject feature.
    # Written straight into `df` (no merge): a merge returns a new frame, and rebinding
    # the local name to it would leave the caller's frame without the `sub_imp` column.
    seq_per_subject = df.groupby('subject')['sequence'].nunique()
    df['sub_imp'] = df['subject'].map(seq_per_subject.apply(sub_imp))

    return df
     
# Feature engineering. prep is called for the columns it adds to the frame in place;
# its return value is not used here.
prep(train)
prep(test)

# Re-read the feature list so it covers the engineered columns present in `train`,
# then standardise: the scaler is fitted on train only and re-used for test.
features = train.columns.tolist()[3:]
sc = StandardScaler()
train[features] = sc.fit_transform(train[features])
test[features] = sc.transform(test[features])

# Kept for cross-validation before `train` is turned into an array.
groups = train["sequence"]
# NOTE(review): assumes the rows of t_lbls are in the same sequence order as train - confirm.
labels = t_lbls["state"]

# Drop the id columns and reshape to (n_sequences, 60, n_features).
# Assumes every sequence consists of exactly 60 consecutive rows - TODO confirm.
train = train.drop(["sequence", "subject", "step"], axis=1).values
train = train.reshape(-1, 60, train.shape[-1])

test = test.drop(["sequence", "subject", "step"], axis=1).values
test = test.reshape(-1, 60, test.shape[-1])

# LSTM 

In [None]:
# Pick the distribution strategy and batch size: a TPU when one can be resolved,
# otherwise TensorFlow's current default strategy.
# Both branches bind `tpu_strategy` (used by the training cell) and `strategy`,
# so the notebook also runs on machines without a TPU.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)
    strategy = tpu_strategy
    BATCH_SIZE = tpu_strategy.num_replicas_in_sync * 64
    print("Running on TPU:", tpu.master())
    print(f"Batch Size: {BATCH_SIZE}")

except ValueError:
    # No TPU could be resolved: fall back to the default strategy under the same name.
    strategy = tf.distribute.get_strategy()
    tpu_strategy = strategy
    BATCH_SIZE = 256
    print(f"Running on {strategy.num_replicas_in_sync} replicas")
    print(f"Batch Size: {BATCH_SIZE}")
    
def hist_plot(history):
    """Plot train and validation AUC per epoch and mark the best validation epoch.

    Parameters
    ----------
    history : object
        Anything with a `.history` dict that holds the per-epoch lists
        `'auc'` and `'val_auc'` (in this notebook: the value returned by `model.fit`).
    """
    train_auc = history.history['auc']
    val_auc = history.history['val_auc']

    # First epoch (0-based) that reached the highest validation AUC.
    best_val = max(val_auc)
    best_epoch = val_auc.index(best_val)

    plt.figure(figsize = (5, 3))
    plt.plot(train_auc)
    plt.plot(val_auc)
    plt.grid(color = 'gray', linestyle = '-', axis = 'both', linewidth=0.5, visible=True)
    # Red dot on the best epoch. The colour is given once: combining a colour in the
    # format string with the `color` keyword is redundant.
    plt.plot(best_epoch, best_val, 'o', markersize = 5,
             fillstyle = 'full', color = 'r')
    plt.title('model training')
    plt.ylabel('AUC')
    plt.xlabel('epoch')
    plt.legend(['train', 'val'], loc='lower right')
    plt.show()

In [None]:
def dnn_model():
    """Build the multi-branch bidirectional-LSTM binary classifier.

    The input shape is taken from the last two dimensions of the module-level
    `train` array. Returns an uncompiled Keras `Model` named 'lstm_model' whose
    single output is a sigmoid unit.
    """

    x_input = Input(shape=(train.shape[-2:]))
    # Level 1: one wide BiLSTM over the raw input.
    x1 = Bidirectional(LSTM(768, return_sequences=True))(x_input)
        
    # Level 2: one branch on top of level 1, one branch straight from the input.
    x21 = Bidirectional(LSTM(512, return_sequences=True))(x1)
    x22 = Bidirectional(LSTM(512, return_sequences=True))(x_input)
    l2 = Concatenate(axis=2)([x21, x22])
        
    # Level 3: one branch on the merged level 2, one on its first branch only.
    x31 = Bidirectional(LSTM(384, return_sequences=True))(l2)
    x32 = Bidirectional(LSTM(384, return_sequences=True))(x21)
    l3 = Concatenate(axis=2)([x31, x32])
        
    # Level 4: same pattern with narrower layers.
    x41 = Bidirectional(LSTM(256, return_sequences=True))(l3)
    x42 = Bidirectional(LSTM(128, return_sequences=True))(x32)
    l4 = Concatenate(axis=2)([x41, x42])
        
    # Concatenate the outputs of all four levels along the feature axis,
    # max-pool over the sequence axis, then a small dense head.
    l5 = Concatenate(axis=2)([x1, l2, l3, l4])
    g = GlobalMaxPooling1D()(l5)
    x7 = Dense(128, activation='selu')(g)
    d = Dropout(0.05)(x7)
    x_output = Dense(units=1, activation="sigmoid")(d)
    
    model = Model(inputs=x_input, outputs=x_output, name='lstm_model')
    
    return model

# Build one instance here to render the architecture diagram;
# the training loop builds a new model for every fold.
model = dnn_model()

plot_model(
    model, 
    show_shapes=True,
    show_layer_names=True
)

I use strict learning conditions: if the validation score has not improved in an epoch, I halve the learning rate. Practice with the available data has shown that this is acceptable and that it significantly speeds up training.

In [None]:
# 12-fold cross-validated training. Every fold builds, trains and checkpoints its own
# model inside the distribution-strategy scope, scores it on the held-out fold and
# predicts the test set; the per-fold test predictions are averaged at the end.
with tpu_strategy.scope():
    VERBOSE = False
    predictions, scores = [], []
    k = GroupKFold(n_splits = 12)

    # NOTE(review): the groups passed here are the unique sequence ids, i.e. presumably one
    # group per sample, so sequences of the same subject can land in different folds -
    # confirm that this is intended.
    for fold, (train_idx, val_idx) in enumerate(k.split(train, labels, groups.unique())):
        print('-'*17, '>', f'Fold {fold+1}', '<', '-'*17)
    
        X_train, X_val = train[train_idx], train[val_idx]
        y_train, y_val = labels.iloc[train_idx].values, labels.iloc[val_idx].values
        
        model = dnn_model()
        model.compile(optimizer="adam", loss="binary_crossentropy", metrics='AUC')

        # Halve the learning rate after one epoch without val_auc improvement.
        lr = ReduceLROnPlateau(monitor="val_auc", factor=0.5, 
                               patience=1, verbose=VERBOSE, mode="max")

        # Stop after three epochs without val_auc improvement and restore the best weights.
        es = EarlyStopping(monitor="val_auc", patience=3, 
                           verbose=VERBOSE, mode="max", 
                           restore_best_weights=True)
        
        # Keep only the best-val_auc model of this fold in a per-fold .h5 file;
        # the save options set the I/O device to '/job:localhost'.
        save_locally = tf.saved_model.SaveOptions(experimental_io_device='/job:localhost')
        chk_point = ModelCheckpoint(f'./TPS_model_2022_{fold+1}C.h5', options=save_locally, 
                                    monitor='val_auc', verbose=VERBOSE, 
                                    save_best_only=True, mode='max')
        
        training = model.fit(X_train, y_train, 
                  validation_data=(X_val, y_val), 
                  epochs=20,
                  verbose=VERBOSE,
                  batch_size=BATCH_SIZE, 
                  callbacks=[lr, chk_point, es])
        
        hist_plot(training)
        
        # Reload the checkpoint written for this fold before scoring and predicting.
        load_locally = tf.saved_model.LoadOptions(experimental_io_device='/job:localhost')
        model = load_model(f'./TPS_model_2022_{fold+1}C.h5', options=load_locally)
        
        # AUC on the held-out fold, then this fold's prediction for the test set.
        y_pred = model.predict(X_val, batch_size=BATCH_SIZE).squeeze()
        score = roc_auc_score(y_val, y_pred)
        scores.append(score)
        predictions.append(model.predict(test, batch_size=BATCH_SIZE).squeeze())
        print(f"Fold-{fold+1} | OOF Score: {score}")
    
    print('-'*40)
    print(f'Mean AUC on {k.n_splits} folds - {np.mean(scores)}')

# Drop the references to the last fold's arrays, then average the fold predictions into the submission.
del X_train, X_val, y_train, y_val
ss["state"] = sum(predictions)/k.n_splits 

So, how new features improved the result of the model:

1. Rolling features: *0.968 - 0.970*
2. Experimental features: *0.970 - 0.971*
3. Subject feature: *0.971 -* **0.977**

# Blending and postprocessing

Looking ahead, I will do what many people like to do.

In [None]:
# Equal-weight blend with an earlier submission
# (per the author: the blended XGB + LSTM results of the first notebook, built on the old features).
# NOTE(review): the two frames are combined by row index - assumes both files list the
# sequences in the same order; confirm.
s2 = pd.read_csv('../input/tps-apr/en_blend_0977.csv')
ss['state'] = ss['state']*0.5 + s2['state']*0.5

The idea of postprocessing is based on the number of sequences that the subjects had. There is a very dangerous point here, so in order not to take too much risk, I have set conditions suitable only for those sequences in which we can be very confident. This is a minor improvement that can improve the result on LB by about 0.0005-0.001.

In [None]:
# `test` was replaced by a NumPy array during preprocessing, so re-read the raw file
# to recover which subject each sequence belongs to.
test = pd.read_csv('../input/tabular-playground-series-apr-2022/test.csv')

# One (sequence, subject) pair per sequence; de-duplicating before the merge keeps the
# join at one row per submission row instead of one row per time step.
seq_subject = test[['sequence', 'subject']].drop_duplicates()
post = ss.merge(seq_subject, on='sequence', how='left')

# Number of predicted sequences of the row's subject.
post['count'] = post.groupby('subject')['state'].transform('count')

plt.title('Basic predictions')
sns.histplot(post['state'])
plt.show()

def repredict(row):
    """Snap a confident prediction to 0.0 / 1.0 based on the subject's sequence count.

    `row` is indexed by 'count' and 'state'. Returns 0.0 when count < 20 and
    state < 0.3, 1.0 when count > 100 and state > 0.7, and the unchanged
    state otherwise.
    """
    n_sequences, state = row['count'], row['state']
    if n_sequences < 20 and state < 0.3:
        return 0.0
    if n_sequences > 100 and state > 0.7:
        return 1.0
    return state
    
# Apply the rounding rule row by row.
post['repredict'] = post[['state', 'subject', 'count']].apply(repredict, axis=1)

# Report how many predictions were changed by the rule.
separator = '-'*35
n_unchanged = sum(post['repredict'] == post['state'])
print(separator)
print(f"{len(ss) - n_unchanged} predictions have been rounded")
print(separator)
print()

plt.title('Postprocessing predictions')
sns.histplot(post['repredict'])
plt.show()

# Keep only `sequence` and the post-processed prediction (renamed to `state`) and write the submission.
ss = post.drop(columns=['state', 'subject', 'count']).rename(columns={'repredict': 'state'})
ss.to_csv('blend_sub31_exp.csv', index=False)
ss

**I think I have added new ideas for new experiments and hope you enjoyed my work, so I will be glad for the upvote :)**