In [None]:
!pip install swifter

In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns 

import tensorflow as tf
from tensorflow.keras import layers, models,callbacks
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GroupKFold
from sklearn.metrics import roc_auc_score, auc
from sklearn.decomposition import PCA

from scipy.stats import boxcox

#speed up pandas apply
import swifter

In [None]:
# Competition inputs: sensor readings for train and test, the label table, and the
# sample submission (its first column becomes the index).
# NOTE(review): `train_lables` is a misspelling of "labels", but the name is used
# throughout the notebook, so it is kept.
train_original = pd.read_csv("../input/tabular-playground-series-apr-2022/train.csv")
test_original = pd.read_csv("../input/tabular-playground-series-apr-2022/test.csv")
train_lables = pd.read_csv("../input/tabular-playground-series-apr-2022/train_labels.csv")
sub= pd.read_csv("../input/tabular-playground-series-apr-2022/sample_submission.csv", index_col = 0)

In [None]:
# Training hyper-parameters shared by the baseline fit and the cross-validation loop.
BATCH_SIZE = 256
EPOCHS = 100

In [None]:
# Feature-engineering switches.
DROP_SENSOR = False  # True: drop the raw per-step sensor columns once the aggregates exist
PCA_RUN = True  # NOTE(review): not referenced anywhere else in the visible notebook

SHIFT_VALUES = False  # True: run shift_vals() to add lag-1 and first-difference columns

# EDA

* Sequence is the unique ID of each recorded sequence of activity 
* Subject is the person involved in the experiment 
* Step is the timeseries step taken 
* Sensors are the biological measurements taken during the step for each subject 

In each sequence there are 60 time steps for each subject 

The target is stored separately from the training dataset; the sequence ID is the unique key used to join the labels to the training data 

In [None]:
# Preview the first five rows of the label table.
train_lables.head(5)

In [None]:
# Preview the raw training readings.
train_original.head()

In [None]:
# Attach the label of each sequence to every one of its rows.
# validate="many_to_one" makes the merge fail loudly if a sequence id appears more
# than once in the label table, instead of silently duplicating training rows.
train = train_original.merge(
    train_lables, how="left", on="sequence", validate="many_to_one"
)
# Show a small preview rather than dumping the whole frame.
train.head()

## Descriptive info

In [None]:
# Column dtypes and non-null counts of the merged training frame.
train.info()

In [None]:
# Number of missing values per column.
train.isnull().sum()

In [None]:
# Row counts of the merged training frame and the raw test frame.
print("Length train: ", train.shape[0])
print("Length test: ", test_original.shape[0])

In [None]:
# Summary statistics for every numeric column.
train.describe()

# Target EDA

In [None]:
# Class balance of the target. countplot does the counting itself, so it must be
# given the raw labels; passing value_counts() would plot how often each *count*
# occurs (one bar of height 1 per class).
# The label table has one row per sequence, so bar heights are sequence counts.
sns.countplot(x=train_lables["state"])
plt.title("Target (state) distribution")
plt.show()

We will concatenate test and train 

In [None]:
# Stack the labelled training rows on top of the test rows.
all_df = pd.concat((train, test_original), axis="index")
all_df

# Sensor data exploration 
## Histograms

### Zoomed in on outliers

In [None]:
# Lower tail: for every sensor, overlay the distribution of the readings at or below
# that sensor's 10th percentile, split by target state.
plt.figure(figsize=(20,12))
for sensor in range(13):
    sensor_name = f"sensor_{sensor:02d}"
    plt.subplot(4, 4, sensor+1)
    # The tail mask does not depend on the state, so build it once per sensor.
    in_lower_tail = train[sensor_name] <= train[sensor_name].quantile(q=0.1)
    for state in (1, 0):
        plt.hist(train.loc[in_lower_tail & (train["state"] == state), sensor_name],
                 bins=50, alpha=0.7, label=f"state {state}")
    plt.title(f"{sensor_name} histogram")
    # Without a legend the two overlaid histograms cannot be told apart.
    plt.legend()
plt.tight_layout(pad=3.08)
plt.suptitle('Sensor Histograms by target(state) - less than 0.1 quantile')
plt.show()

In [None]:
# Upper tail: for every sensor, overlay the distribution of the readings at or above
# that sensor's 90th percentile, split by target state.
plt.figure(figsize=(20,12))
for sensor in range(13):
    sensor_name = f"sensor_{sensor:02d}"
    plt.subplot(4, 4, sensor+1)
    # The tail mask does not depend on the state, so build it once per sensor.
    in_upper_tail = train[sensor_name] >= train[sensor_name].quantile(q=0.9)
    for state in (1, 0):
        plt.hist(train.loc[in_upper_tail & (train["state"] == state), sensor_name],
                 bins=50, alpha=0.7, label=f"state {state}")
    plt.title(f"{sensor_name} histogram")
    # Without a legend the two overlaid histograms cannot be told apart.
    plt.legend()
plt.tight_layout(pad=3.08)
plt.suptitle('Sensor Histograms by target(state) - greater than 0.9 quantile')
plt.show()

## Linear plots 
As our data has 'steps' (a time series), let's see if we can find any changes over time \
We therefore need to groupby timesteps and use an aggregator (i.e. mean, median etc..)  

In [None]:
# Per-step standard deviation of every sensor, one line per target state.
# Group each state's rows once up front and aggregate only the sensor being drawn,
# instead of re-aggregating every column of the frame for each subplot.
rows_by_step = {state: train.loc[train["state"] == state].groupby("step")
                for state in (1, 0)}

plt.figure(figsize=(20,12))
for sensor in range(13):
    sensor_name = f"sensor_{sensor:02d}"
    plt.subplot(4, 4, sensor+1)
    for state, grouped in rows_by_step.items():
        grouped[sensor_name].std().plot(label=f"state {state}")
    plt.title(f"{sensor_name} lineplot")
    # Label the two lines so the states can be told apart.
    plt.legend()
plt.tight_layout(pad=4.08)
plt.suptitle('Standard Deviation of sensor data groupby step vs Target (state)')
plt.show()

In [None]:
# Per-step mean of every sensor, one line per target state.
# Group each state's rows once up front and aggregate only the sensor being drawn,
# instead of re-aggregating every column of the frame for each subplot.
rows_by_step = {state: train.loc[train["state"] == state].groupby("step")
                for state in (1, 0)}

plt.figure(figsize=(20,12))
for sensor in range(13):
    sensor_name = f"sensor_{sensor:02d}"
    plt.subplot(4, 4, sensor+1)
    for state, grouped in rows_by_step.items():
        grouped[sensor_name].mean().plot(label=f"state {state}")
    plt.title(f"{sensor_name} lineplot")
    # Label the two lines so the states can be told apart.
    plt.legend()
plt.tight_layout(pad=3.08)
# The figure shows means, so the title names the mean.
plt.suptitle('Mean of sensor data groupby step vs Target (state)')
plt.show()

The **mean** and **standard deviation** show a clear distinction between the two states for some sensor columns - we should include these in our features

# Subjects 

In [None]:
# Number of distinct subjects in each split.
print("subjects train:", train["subject"].nunique())
print("subjects test:", test_original["subject"].nunique())

In [None]:
print("Subjects in train and test:")
# A set intersection walks each column once; re-evaluating unique() for every
# training subject made the original membership test quadratic.
shared_subjects = sorted(set(train["subject"]) & set(test_original["subject"]))
print(shared_subjects)

There are no overlapping subjects between test and train, so we can ignore "subject" when training our models

## Sensor Correlation 

In [None]:
# Pairwise correlation of the raw sensor channels over train and test together.
# Columns are selected by name rather than by position, so the result does not
# change when extra columns (labels, engineered features) are added to either frame.
corr_sensor = (
    pd.concat([train, test_original])
      .filter(regex=r"^sensor_\d+$")
      .corr()
)

In [None]:
# Annotated heatmap of the sensor correlation matrix on a fixed [-1, 1] colour scale.
fig, ax = plt.subplots(figsize=(25, 12))
sns.heatmap(corr_sensor, cmap="Spectral", vmin=-1, vmax=1, annot=True, ax=ax)
plt.show()

In [None]:
# Keep only correlations of at least 0.45; every other cell is shown as NaN.
corr_sensor.where(corr_sensor >= 0.45)

# Additional Features

We will include a few additional features and processes

In [None]:
# NOTE(review): this only displays the repr of a GroupBy object; nothing is
# aggregated or assigned, so the cell looks like a leftover from exploration.
train[train["state"]==1].groupby("step")

In [None]:
# Per-step mean and standard deviation of sensor_02, computed separately for each
# target state. These are the reference statistics used by the z-score features.
mean_vals_0 = train[train["state"]==0].groupby("step")["sensor_02"].mean()
mean_vals_1 = train[train["state"]==1].groupby("step")["sensor_02"].mean()

# std_vals_0 is taken from the state-0 rows so that it pairs with mean_vals_0;
# filtering on state 1 here would make it an exact copy of std_vals_1.
std_vals_0  = train[train["state"]==0].groupby("step")["sensor_02"].std()
std_vals_1  = train[train["state"]==1].groupby("step")["sensor_02"].std()

std_vals_1[:5]

### z-score

In [None]:
%%time
def z_score_0(x):
    """Z-score of one reading against the state-0 statistics of its step.

    Parameters
    ----------
    x : pandas.Series
        A row whose first element is the step and whose second element is the
        sensor reading (the column order used by the callers).

    Returns
    -------
    float
        ``(reading - mean_vals_0[step]) / std_vals_0[step]``.
    """
    # Explicit positional access: plain x[0] / x[1] on a label-indexed row relies on
    # pandas' deprecated integer-key fallback.
    step, reading = x.iloc[0], x.iloc[1]
    # Direct label lookup replaces building a boolean mask over the whole index.
    return (reading - mean_vals_0.loc[step]) / std_vals_0.loc[step]

def z_score_1(x):
    """Z-score of one reading against the state-1 statistics of its step.

    Parameters
    ----------
    x : pandas.Series
        A row whose first element is the step and whose second element is the
        sensor reading (the column order used by the callers).

    Returns
    -------
    float
        ``(reading - mean_vals_1[step]) / std_vals_1[step]``.
    """
    # Explicit positional access: plain x[0] / x[1] on a label-indexed row relies on
    # pandas' deprecated integer-key fallback.
    step, reading = x.iloc[0], x.iloc[1]
    # Direct label lookup replaces building a boolean mask over the whole index.
    return (reading - mean_vals_1.loc[step]) / std_vals_1.loc[step]


# Vectorised z-scores of sensor_02 against the per-step statistics of each state.
# Mapping the step column onto the statistics handles all rows in one pass, which
# replaces a Python-level function call per row.
# NOTE(review): these columns are written to `train` and `test_original`, but the
# model features are later pivoted from `train_original` using only the sensor
# columns, so the z-scores do not appear to reach the model - confirm intent.
for frame in (train, test_original):
    frame["z_score_0"] = (
        frame["sensor_02"] - frame["step"].map(mean_vals_0)
    ) / frame["step"].map(std_vals_0)
    frame["z_score_1"] = (
        frame["sensor_02"] - frame["step"].map(mean_vals_1)
    ) / frame["step"].map(std_vals_1)

train.head()

### Shift Values

In [None]:
# Names of the raw sensor channels (sensor_00, sensor_01, ...).
# They are read from `train_original` and matched exactly, because `train` is later
# rebound to a pivoted frame with tuple column labels; re-running this cell after
# that point would otherwise produce an empty list.
sensor_cols = train_original.filter(regex=r"^sensor_\d+$").columns.tolist()

def shift_vals(df, cols=None):
    """Add lag-1 and first-difference columns for each sensor, in place.

    For every column ``c`` two columns are written to ``df``:
    ``c_shift1`` (the previous value within the same sequence/subject group,
    back-filled where there is no previous value) and ``c_diff1``
    (``c - c_shift1``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``sequence`` and ``subject`` columns plus the sensor columns.
    cols : list of str, optional
        Columns to process. Defaults to the notebook-level ``sensor_cols``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, mutated.
    """
    if cols is None:
        cols = sensor_cols
    # The grouping is identical for every column, so build it once.
    grouped = df.groupby(["sequence", "subject"])
    for col in cols:
        shifted_name = f"{col}_shift1"
        df[shifted_name] = grouped[col].shift(1).bfill()
        df[col + '_diff1'] = df[col] - df[shifted_name]

    return df

if SHIFT_VALUES:
    print("Shifting")
    shift_vals(train_original)
    shift_vals(test_original)
    # Register the new *_shift1 / *_diff1 columns. The pivot and aggregate cells
    # select columns through `sensor_cols`, so without this the shifted values
    # would be computed and then ignored.
    # This is done only after both frames are shifted, so shift_vals itself never
    # iterates over the derived columns.
    sensor_cols = [col for col in train_original.columns if "sensor" in col]

In [None]:
# Reshape from long to wide: one row per sequence, one column per (sensor, step) pair.
# NOTE(review): `train` is rebound here, so the merged frame built earlier (with its
# `state` column) is no longer reachable under this name.
train = train_original.pivot(index = "sequence", columns ="step", values = sensor_cols)
test = test_original.pivot(index = "sequence", columns ="step", values = sensor_cols)

In [None]:
def add_features(df, cols=None, stats=("mean", "median", "std")):
    """Add per-sequence summary statistics for each sensor, in place.

    ``df[col]`` is expected to select all step columns of one sensor (the pivoted
    layout). For every ``col`` and every statistic ``s`` a column named
    ``f"{s}_{col}"`` is written, holding that statistic taken across the steps
    of each row.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide frame, one row per sequence.
    cols : list of str, optional
        Sensors to summarise. Defaults to the notebook-level ``sensor_cols``.
    stats : sequence of str, optional
        Names of DataFrame reduction methods that accept ``axis=1``
        (for example ``"max"``, ``"min"``, ``"skew"``). The default reproduces
        the original mean/median/std features.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, mutated.
    """
    if cols is None:
        cols = sensor_cols

    for col in cols:
        # Take the step columns once; the statistics added below use different
        # top-level labels and therefore never leak into this selection.
        per_step = df[col]
        for stat in stats:
            df[f"{stat}_{col}"] = getattr(per_step, stat)(axis=1)
    return df

# add_features mutates the frame it is given, so the return value of the first
# call is simply discarded; the second call is the cell's last expression and its
# returned frame is what gets displayed.
add_features(train)
add_features(test)

### Drop the base sensor data
We will play around with dropping this data as we have numerous additional features 

In [None]:
# Optionally discard the raw per-step readings and keep only the other columns.
if DROP_SENSOR:
    train = train.drop(columns=sensor_cols)
    test = test.drop(columns=sensor_cols)

In [None]:
# Width of the feature matrix, followed by the full list of column labels.
num_features = train.shape[1]
print(list(train.columns))

# Baseline Model

In [None]:
X = train
# Align the labels to the rows of X by sequence id instead of trusting that both
# tables happen to be in the same order. A missing id raises a KeyError.
# The index is reset so that y keeps a plain 0..n-1 index.
y = (
    train_lables.set_index("sequence")["state"]
                .loc[X.index]
                .reset_index(drop=True)
)

# Ordered hold-out: the last 33% of the sequences form the validation split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, shuffle = False)

In [None]:
# Standardise with statistics learned from the training split only, then apply the
# same transform to the training, validation and test matrices.
scaler = StandardScaler().fit(X_train)
X_train_s, X_test_s, test_s = (
    scaler.transform(part) for part in (X_train, X_test, test)
)

In [None]:
def build_model(n_features=None, units=600, dropout=0.5):
    """Build the (uncompiled) dense binary classifier.

    Architecture: three SELU dense layers of ``units`` neurons, with dropout after
    the first two, followed by a single sigmoid output.

    Parameters
    ----------
    n_features : int, optional
        Width of the input vector. Defaults to ``X.shape[1]`` of the notebook-level
        feature frame, which is what the original hard-wired.
    units : int, optional
        Neurons in each hidden layer.
    dropout : float, optional
        Dropout rate applied after the first two hidden layers.

    Returns
    -------
    tensorflow.keras.Sequential
    """
    if n_features is None:
        # Backward-compatible default: read the width from the global feature frame.
        n_features = X.shape[1]

    model = models.Sequential()
    model.add(layers.Input(shape=(n_features,)))
    model.add(layers.Dense(units, activation="selu"))
    model.add(layers.Dropout(dropout))
    model.add(layers.Dense(units, activation="selu"))
    model.add(layers.Dropout(dropout))
    model.add(layers.Dense(units, activation="selu"))
    model.add(layers.Dense(1, activation="sigmoid"))
    return model

# Baseline network instance for the single train/validation split.
model = build_model()

In [None]:
# Layer-by-layer overview of the baseline network.
model.summary()

In [None]:
# The metric is passed as a list and given an explicit name, so the history keys
# are always "auc" / "val_auc", also when this cell is re-run in the same kernel.
model.compile(optimizer="adam", loss="binary_crossentropy",
              metrics=[tf.keras.metrics.AUC(name="auc")])
model.fit(X_train_s,y_train,epochs= EPOCHS,
          callbacks= [
              # restore_best_weights keeps the weights of the best validation epoch
              # rather than those of the last epoch before stopping.
              callbacks.EarlyStopping(patience=20, monitor='val_loss', mode="min",
                                      restore_best_weights=True),
              # NOTE(review): this uses the same patience as early stopping and
              # multiplies the learning rate by 0.001 - confirm both are intended.
              callbacks.ReduceLROnPlateau(monitor="val_loss", patience=20, factor=0.001)],
          validation_data=(X_test_s,y_test), batch_size = BATCH_SIZE)

In [None]:
# Per-epoch training log of the fitted model as a DataFrame (one row per epoch).
history = pd.DataFrame(model.history.history)
history

In [None]:
# Model outputs (sigmoid scores) for the held-out split.
val_preds = model.predict(X_test_s)
val_preds

In [None]:
# Model outputs (sigmoid scores) for the rows the model was fitted on.
train_preds = model.predict(X_train_s)
train_preds

In [None]:
# ROC AUC on the held-out split, and on the training split ("intrinsic") for an
# overfitting comparison.
print("Validation AUC:" , roc_auc_score(y_test, val_preds))
print("Intrinsic AUC:", roc_auc_score(y_train, train_preds))

In [None]:
# Loss curves for the training and validation data across epochs.
ax = history[["loss", "val_loss"]].plot(figsize=(20, 8))
ax.set_title("Training vs Validation Loss")
plt.show()

In [None]:
# AUC curves for the training and validation data across epochs.
ax = history[["auc", "val_auc"]].plot(figsize=(20, 8))
ax.set_title("Training vs Validation AUC")
plt.show()

# Cross Validation

In [None]:
# Splitter used by cross_val(); the group labels are supplied when split() is called.
FOLDS = 5
kfold = GroupKFold(n_splits = FOLDS)

In [None]:
# Display the raw training frame that supplies the sequence ids used for grouping.
train_original

In [None]:
def cross_val(X, y, groups=None):
    """Train one model per GroupKFold fold; collect validation AUCs and test predictions.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix, one row per sequence, indexed by sequence id.
    y : array-like
        Binary targets in the same row order as ``X``.
    groups : array-like, optional
        One group label per row of ``X``. Defaults to the subject recorded for each
        sequence in ``train_original``, so that no subject contributes to both the
        training and the validation part of a fold. Using the (unique) sequence ids
        as groups puts every row in its own group and therefore separates nothing.

    Returns
    -------
    tuple
        ``(auc_cv, preds)``: the validation ROC AUC of every fold, and the
        predictions for the global ``test`` frame made by every fold's model.
    """
    if groups is None:
        groups = (
            train_original.groupby("sequence")["subject"].first()
                          .reindex(X.index)
                          .to_numpy()
        )
    # Positional indexing below must not depend on what index `y` carries.
    y_values = np.asarray(y)

    auc_cv = []
    preds = []

    for fold, (train_idx, val_idx) in enumerate(kfold.split(X, y_values, groups=groups)):

        print("\n","#"*10, f"Fold {fold+1}","#"*10)
        X_train, X_test = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_test = y_values[train_idx], y_values[val_idx]

        # Scaling statistics come from this fold's training rows only.
        scaler = StandardScaler()
        X_train_s = scaler.fit_transform(X_train)
        X_test_s = scaler.transform(X_test)
        test_s = scaler.transform(test)

        model = build_model()
        # An explicitly named metric keeps the monitored key "val_auc" stable;
        # string-resolved metrics can be renamed (auc_1, auc_2, ...) when several
        # models are compiled in one session.
        model.compile(optimizer="adam", loss="binary_crossentropy",
                      metrics=[tf.keras.metrics.AUC(name="auc")])
        model.fit(X_train_s, y_train, epochs=EPOCHS,
                  callbacks=[
                      callbacks.EarlyStopping(patience=10, monitor='val_auc', mode="max",
                                              restore_best_weights=True),
                      # mode must be "max": the default "auto" treats every monitor
                      # without "acc" in its name as something to minimise, which
                      # would cut the learning rate while AUC is still improving.
                      # NOTE(review): factor=0.001 is a very steep reduction - confirm.
                      callbacks.ReduceLROnPlateau(monitor="val_auc", mode="max",
                                                  patience=10, factor=0.001)],
                  validation_data=(X_test_s, y_test), batch_size=BATCH_SIZE)

        # Named fold_auc so the imported sklearn `auc` function is not shadowed.
        fold_auc = roc_auc_score(y_test, model.predict(X_test_s, batch_size=BATCH_SIZE))
        print("\n Validation AUC:" , fold_auc)

        auc_cv.append(fold_auc)
        preds.append(model.predict(test_s, batch_size=BATCH_SIZE).squeeze())

    print("FINAL AUC: ", np.mean(auc_cv))

    return auc_cv, preds

# Run the cross-validation; keeps the per-fold AUCs and the per-fold test predictions.
auc_cv, preds = cross_val(X,y)

In [None]:
# Average the per-fold test predictions. np.mean divides by the number of
# prediction arrays actually collected, not by the FOLDS constant.
final_preds = np.mean(preds, axis=0)

# `test` is indexed by sequence id; assigning a Series aligns on that id instead of
# relying on the submission rows being in the same order.
# Assumes the first column of sample_submission.csv (the index of `sub`) is the
# sequence id - TODO confirm.
sub["state"] = pd.Series(final_preds, index=test.index)
assert sub["state"].notna().all(), "some submission rows did not receive a prediction"

sub.to_csv("submission.csv")
sub.head()

In [None]:
# Distribution of the averaged test predictions.
fig, ax = plt.subplots(figsize=(20, 8))
sns.histplot(sub["state"], ax=ax)
plt.show()