In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, roc_auc_score

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the three competition CSV files: the training sensor readings, the
# held-out test readings, and the per-sequence training labels.
train = pd.read_csv("../input/tabular-playground-series-apr-2022/train.csv")
test = pd.read_csv("../input/tabular-playground-series-apr-2022/test.csv")
labels = pd.read_csv("../input/tabular-playground-series-apr-2022/train_labels.csv")

# Quick sanity check on the (rows, columns) of each frame.
print(
    "Train shape = {}\nTest shape = {}\nLabels shape = {}".format(
        train.shape,
        test.shape,
        labels.shape,
    )
)

In [None]:
# Peek at the first two rows of the training readings.
train.head(n = 2)

In [None]:
# Peek at the first two rows of the test readings.
test.head(n = 2)

In [None]:
# One row per distinct (sequence, subject) combination in each data set.
train_subjects = train.loc[:, ["sequence", "subject"]].drop_duplicates()
test_subjects = test.loc[:, ["sequence", "subject"]].drop_duplicates()

In [None]:
# Number of distinct (sequence, subject) rows in train and in test.
train_pairs_shape = train_subjects.shape
test_pairs_shape = test_subjects.shape
print(train_pairs_shape, test_pairs_shape)

There are a total of 25968 unique sequences in the train data and 12218 unique sequences in the test data.

In [None]:
# Subject ids that occur in both the test and the train data.
shared_subjects = set(test_subjects["subject"]).intersection(set(train_subjects["subject"]))
print(shared_subjects)

# Sequence ids that occur in both the test and the train data.
shared_sequences = set(test_subjects["sequence"]).intersection(set(train_subjects["sequence"]))
print(shared_sequences)

Subjects in train data are not present in test data.

#### Let's see how the mean, std, max and min of the sequences relate to the target
As explained in [this](https://www.kaggle.com/code/ambrosm/tpsapr22-eda-which-makes-sense#PCA) notebook, outliers may have a significant effect and can give misleading results. So I clip the outliers (to the 2nd–98th percentile range of each sensor) and then look at the signal statistics.

In [None]:
# Clip every sensor column to its own 2nd-98th percentile range so that extreme
# readings do not dominate the per-sequence statistics below.
# The column is selected through the Series *name*: `"sensor" in x` would test
# the Series' index labels (row numbers), which is never true, and would leave
# every column unclipped.
outlier_clip = train.apply(
    lambda x: x.clip(x.quantile(0.02), x.quantile(0.98)) if "sensor" in str(x.name) else x
)

# The grouping is identical for every statistic, so build it once.
per_sequence = outlier_clip.drop(["subject", "step"], axis = 1).groupby("sequence", as_index = False)

plt.figure(figsize = (18,12))
for panel, stat in enumerate(["mean", "std", "max", "min"], start = 1):
    plt.subplot(2,2,panel)

    # Per-sequence statistic of every sensor, joined with the sequence label.
    aggr = per_sequence.agg(stat)
    aggr = aggr.merge(labels, on = "sequence")

    # Absolute value of the class-wise average of that statistic: one row per
    # state, one column per sensor. Absolute values are needed for the log axis.
    class_means = abs(aggr.drop(["sequence"], axis = 1).groupby("state").mean())
    n_sensors = class_means.shape[1]

    # Two horizontal bars per sensor, offset so both classes sit side by side.
    # groupby sorts its keys, so row 0 is the smaller state value.
    plt.barh([k+0.2 for k in range(n_sensors)], class_means.iloc[0,:], height = 0.4, label = "0")
    plt.barh([k-0.2 for k in range(n_sensors)], class_means.iloc[1,:], height = 0.4, label = "1")
    plt.yticks(range(n_sensors), class_means.columns)
    plt.xscale("log")
    plt.legend()
    plt.title(f"Mean of signal {stat}")

plt.show()

* There is a significant difference in the signal means and min values between the two classes.
* The std and max values differ much less, except for a few sensors.


#### Let's see how well these stats do for modelling

In [None]:
# Sensor readings without the subject identifier.
train_sensor_data = train.drop(columns = ["subject"])

# Per-sequence mean / std / max / min of every remaining column
# (the step counter is dropped first so it is not aggregated).
sensor_stats = (
    train_sensor_data
    .drop(columns = ["step"])
    .groupby("sequence")
    .agg(["mean", "std", "max", "min"])
)

# Flatten the (column, statistic) MultiIndex into "<column>_<statistic>" names.
sensor_stats.columns = ["_".join(pair) for pair in sensor_stats.columns]

# Bring "sequence" back as an ordinary column so it can be used as a merge key.
sensor_stats = sensor_stats.reset_index()

# Attach the target label of each sequence.
sensor_stats_label = sensor_stats.merge(labels, on = "sequence", how = "left")

In [None]:
# Cross-validation splitter built with scikit-learn's default arguments.
# NOTE(review): the number of folds and the shuffle behaviour come from the
# installed scikit-learn version's defaults — confirm they are what is intended.
kf = KFold()

In [None]:
# Cross-validate a random forest on the per-sequence statistics and collect,
# for every fold, the accuracy, the ROC AUC and the feature importances.
# NOTE(review): the folds are split by row, so sequences of the same subject can
# land in both the training and the validation part — confirm whether a
# subject-grouped split is wanted, since test subjects are unseen.

# Feature matrix and target are the same for every fold, so build them once.
x = sensor_stats_label.drop(["sequence", "state"], axis = 1)
y = sensor_stats_label["state"]

acc = []
auc = []
feat_imp = []
for tr_id, te_id in kf.split(sensor_stats_label):
    # The splitter yields positional row numbers, so select with .iloc;
    # label-based .loc is only equivalent while the index is 0..n-1.
    train_x, train_y = x.iloc[tr_id], y.iloc[tr_id].values
    test_x, test_y = x.iloc[te_id], y.iloc[te_id].values

    model = RandomForestClassifier(random_state = 42)
    model.fit(train_x, train_y)

    preds = model.predict(test_x)
    # Second probability column, used as the score for ROC AUC.
    proba = model.predict_proba(test_x)[:,1]

    acc.append(accuracy_score(y_true=test_y, y_pred=preds))
    auc.append(roc_auc_score(test_y, proba))
    feat_imp.append(model.feature_importances_)

    # One star per finished fold as a lightweight progress indicator.
    print("*", end = '')

print("\nAccuracy = ", np.mean(acc), "\nAUC = ", np.mean(auc))

So it looks like these basic signal statistics give pretty good cross-validated accuracy on the training data. But considering that more advanced feature engineering and deep learning models reach accuracies above 90%, this model's performance is not so good.

#### Feature importances:

In [None]:
# Average the per-fold feature importances and sort them in ascending order.
importances = np.mean(feat_imp, axis = 0)
order = importances.argsort()

# Feature names in the same column order the model was trained on.
feature_names = sensor_stats_label.drop(["sequence", "state"], axis = 1).columns
bar_positions = range(len(importances))

# Bar chart of the sorted importances on a log scale, one bar per feature.
plt.figure(figsize = (18,6))
plt.bar(bar_positions, importances[order], width = 0.8)
plt.xticks(bar_positions, feature_names[order], rotation = 90)
plt.yscale("log")
plt.show()