In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from keras.models import  Sequential
from keras.layers.core import  Reshape , Dense, Flatten, Dropout
from keras.callbacks import EarlyStopping
from keras.layers import BatchNormalization, Convolution2D , MaxPooling2D, TextVectorization, Embedding, LSTM, Bidirectional
from keras import Model, Input, layers
from keras.metrics import AUC
import tensorflow as tf
from keras import regularizers
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the three competition tables: sensor readings for train and test, plus
# the per-sequence labels for train.
train = pd.read_csv('/kaggle/input/tabular-playground-series-apr-2022/train.csv')
test = pd.read_csv('/kaggle/input/tabular-playground-series-apr-2022/test.csv')
train_labels = pd.read_csv('/kaggle/input/tabular-playground-series-apr-2022/train_labels.csv')

# Column names, dtypes and memory footprint of the training readings.
train.info()

In [None]:
# Summary statistics of the number of distinct sequences recorded per subject.
train.groupby('subject')['sequence'].nunique().describe()

In [None]:
# First ten distinct subject ids of `train`, in order of first appearance.
# NOTE(review): `subjects_10_test` is also taken from `train`, so both names hold
# the same ten ids — confirm whether it was meant to be drawn from `test`.
train_subject_ids = train['subject'].unique()
subjects_10_train = train_subject_ids[:10]
subjects_10_test = train_subject_ids[:10]

In [None]:
# Names of the thirteen sensor columns: 'sensor_00' through 'sensor_12'.
SENSORS = [f'sensor_{number:02d}' for number in range(13)]

In [None]:
def transform_long(df):
    """Attach sequence labels to `df` and reshape its sensor columns to long form.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'sequence' column and one column per sensor named
        'sensor_<number>'. Any other columns are carried through unchanged.

    Returns
    -------
    pandas.DataFrame
        One row per (input row, sensor): the sensor number is in 'sensor_id' and
        its reading in 'sensor_value'. The label columns come from the
        module-level `train_labels`; they are NaN for sequences without a label.
    """
    # Match labels on the 'sequence' key itself. `df.join(train_labels, on='sequence')`
    # looks the key up in train_labels' *index*, which is only right while that
    # index happens to equal the sequence id; it also replaced the frame's own
    # 'sequence' values with NaN for unlabelled sequences.
    labelled = df.merge(train_labels, on='sequence', how='left')

    # wide_to_long needs a column that uniquely identifies each wide row.
    labelled['row_id'] = labelled.index
    long_df = pd.wide_to_long(labelled, 'sensor_', i='row_id', j='sensor_id')

    return (
        long_df
        .reset_index()
        .drop('row_id', axis=1)
        .rename(columns={'sensor_': 'sensor_value'})
    )

In [None]:
# Columns transform_long needs for the per-sensor sample plots.
sample_columns = SENSORS + ['sequence', 'subject', 'step']

long_sample_train = transform_long(train.loc[train['subject'].isin(subjects_10_train), sample_columns])

# The test sample must be selected with a mask built from `test` itself, using
# subject ids that actually occur in `test`. Masking `test` with
# `train['subject'].isin(...)` aligns the mask on row labels, so it picks the
# test rows that share a row label with the sampled *train* rows instead of the
# rows of a sample of test subjects.
test_sample_subjects = test['subject'].unique()[:len(subjects_10_test)]
long_sample_test = transform_long(test.loc[test['subject'].isin(test_sample_subjects), sample_columns])

In [None]:
# Bar plot of sensor_value for each sensor_id in the sampled train rows.
fig, ax = plt.subplots(figsize=(15, 7))
sns.barplot(data=long_sample_train, x='sensor_id', y='sensor_value', ax=ax)

In [None]:
# Same bar plot for the sampled test rows, for comparison with the train plot.
fig, ax = plt.subplots(figsize=(15, 7))
sns.barplot(data=long_sample_test, x='sensor_id', y='sensor_value', ax=ax)

In [None]:
# Rows of `train` whose subject id or sequence id also appears in `test`.
train.loc[(train['subject'].isin(test['subject'])) | (train['sequence'].isin(test['sequence']))]

In [None]:
# Number of rows in train_labels for each `state` value.
train_labels['state'].value_counts()

In [None]:
# Random 0.1% of the training rows, reduced to subject / sensor_00 and joined
# with the label columns looked up through each row's sequence value.
sensor_00 = (
    train.sample(frac=0.001)[['subject', 'sequence', 'step', 'sensor_00']]
    .join(train_labels, on='sequence', how='left', lsuffix='_left')
    .drop(['step', 'sequence_left'], axis=1)
)

In [None]:
# sensor_00 per sampled train subject, with one bar per `state` value.
sns.barplot(
    data=sensor_00[sensor_00['subject'].isin(subjects_10_train)],
    x='subject',
    y='sensor_00',
    hue='state',
)

In [None]:
 # Per-subject mean of every sensor, reshaped to long form: indexed by
 # (subject, sensor_id) with the mean stored in 'sensor_value'.
 sub_mean = pd.wide_to_long(train.groupby('subject')[SENSORS].mean().reset_index(), 'sensor_', i='subject', j='sensor_id').rename(columns={'sensor_': 'sensor_value'})

In [None]:
# One scatter panel per sensor_id (three panels per row): the per-subject mean
# sensor value plotted against the subject id.
g = sns.FacetGrid(sub_mean.reset_index(), col="sensor_id", col_wrap=3)
g.map(sns.scatterplot, "subject", "sensor_value")

## Feature Generation

In [None]:
# Sensors that are left out of the aggregate features.
FILTERED_SENSORS = ['sensor_02']
# Every other sensor, kept in the order given by SENSORS.
SELECTED_SENSORS = list(filter(lambda name: name not in FILTERED_SENSORS, SENSORS))

In [None]:
def transform_to_agg_sequence_features(df, sensors=None):
    """Aggregate sensor readings into one feature row per (subject, sequence).

    Parameters
    ----------
    df : pandas.DataFrame
        Readings with 'subject' and 'sequence' columns plus one column per sensor.
    sensors : list of str, optional
        Sensor columns to aggregate. Defaults to the module-level SELECTED_SENSORS.

    Returns
    -------
    pandas.DataFrame
        Indexed by (subject, sequence). For every sensor there are seven columns,
        named '<sensor>_<stat>_seq' with stat in mean, median, min, max, std, kur,
        skew. Columns are grouped by statistic, in that order.
    """
    if sensors is None:
        sensors = SELECTED_SENSORS

    per_sequence = df.groupby(['subject', 'sequence'])[list(sensors)]

    # Keyed by the suffix used in the output column names; dict order fixes the
    # column order of the result.
    stats = {
        'mean': per_sequence.mean(),
        'median': per_sequence.median(),
        'min': per_sequence.min(),
        'max': per_sequence.max(),
        # Previously computed with .max(), which duplicated the max features.
        'std': per_sequence.std(),
        'kur': per_sequence.apply(pd.DataFrame.kurt),
        'skew': per_sequence.apply(pd.DataFrame.skew),
    }

    # Each frame is renamed from its own sensor columns, so the skew columns are
    # '<sensor>_skew_seq' rather than being derived from the kurtosis names.
    renamed = [frame.add_suffix(f'_{stat}_seq') for stat, frame in stats.items()]
    return pd.concat(renamed, axis=1, join='inner')

In [None]:
def transform_to_agg_subject_features(df, sensors=None):
    """Aggregate sensor readings into one feature row per subject.

    Parameters
    ----------
    df : pandas.DataFrame
        Readings with a 'subject' column plus one column per sensor.
    sensors : list of str, optional
        Sensor columns to aggregate. Defaults to the module-level SELECTED_SENSORS.

    Returns
    -------
    pandas.DataFrame
        Indexed by subject. For every sensor there are five columns, named
        '<sensor>_<stat>_sub' with stat in mean, median, min, max, std. Columns
        are grouped by statistic, in that order.
    """
    if sensors is None:
        sensors = SELECTED_SENSORS

    per_subject = df.groupby(['subject'])[list(sensors)]

    # Keyed by the suffix used in the output column names; dict order fixes the
    # column order of the result.
    stats = {
        'mean': per_subject.mean(),
        'median': per_subject.median(),
        'min': per_subject.min(),
        'max': per_subject.max(),
        # Previously computed with .max(), which duplicated the max features.
        'std': per_subject.std(),
    }

    renamed = [frame.add_suffix(f'_{stat}_sub') for stat, frame in stats.items()]
    return pd.concat(renamed, axis=1, join='inner')

In [None]:
def transform_to_agg_inter_subs_features(df, sensors=None):
    """Summarise, per subject, how the per-sequence sensor medians are distributed.

    Each sensor is first reduced to its median within every (subject, sequence);
    those medians are then aggregated across the sequences of each subject.

    Parameters
    ----------
    df : pandas.DataFrame
        Readings with 'subject' and 'sequence' columns plus one column per sensor.
    sensors : list of str, optional
        Sensor columns to aggregate. Defaults to the module-level SELECTED_SENSORS.

    Returns
    -------
    pandas.DataFrame
        Indexed by subject. For every sensor there are five columns, named
        '<sensor>_<stat>_inter_subs' with stat in mean, median, min, max, std.
        Columns are grouped by statistic, in that order.
    """
    if sensors is None:
        sensors = SELECTED_SENSORS

    # Compute the per-sequence medians once; every statistic below starts from them.
    sequence_medians = df.groupby(['subject', 'sequence'])[list(sensors)].median()
    per_subject = sequence_medians.groupby('subject')

    # Keyed by the suffix used in the output column names; dict order fixes the
    # column order of the result.
    stats = {
        'mean': per_subject.mean(),
        'median': per_subject.median(),
        'min': per_subject.min(),
        'max': per_subject.max(),
        'std': per_subject.std(),
    }

    renamed = [frame.add_suffix(f'_{stat}_inter_subs') for stat, frame in stats.items()]
    return pd.concat(renamed, axis=1, join='inner')

In [None]:
# Build the three training feature tables from the readings with the filtered
# sensors removed.
train_without_filtered = train.drop(columns=FILTERED_SENSORS)

agg_seq_features = transform_to_agg_sequence_features(train_without_filtered)
agg_sub_features = transform_to_agg_subject_features(train_without_filtered)
agg_inter_subs_features = transform_to_agg_inter_subs_features(train_without_filtered)

In [None]:
# Combine the per-sequence, per-subject and inter-sequence features into one
# row per (subject, sequence), then attach the label of each sequence.
# The labels are keyed by their 'sequence' value before joining:
# `.join(train_labels, on='sequence')` matches against train_labels' index, so
# joining the raw frame would only be right while its positional index happens
# to equal the sequence id. `drop=False` keeps the 'sequence' column in the result.
labels_by_sequence = train_labels.set_index('sequence', drop=False)

full_train = (
    agg_seq_features
    .merge(agg_sub_features, left_index=True, right_index=True)
    .merge(agg_inter_subs_features, left_index=True, right_index=True)
    .join(labels_by_sequence, on='sequence', lsuffix='_left')
)

# Model inputs: every feature column, without the label and the sequence id.
full_sensor_data = full_train.drop(['state', 'sequence'], axis=1)

In [None]:
from sklearn.feature_selection import mutual_info_classif

def make_mi_scores(X, y):
    """Score every column of X by its mutual information with y.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; its column names label the scores.
    y : array-like
        Target passed straight to `mutual_info_classif`.

    Returns
    -------
    pandas.Series
        Series named "MI Scores", indexed by feature name and sorted from the
        highest score to the lowest.
    """
    raw_scores = mutual_info_classif(X, y)
    return pd.Series(raw_scores, name="MI Scores", index=X.columns).sort_values(ascending=False)

# scaler = StandardScaler()
# scaled_train_array_for_feature_importance = scaler.fit_transform(full_sensor_data)
# mi_scores = make_mi_scores(pd.DataFrame(data=scaled_train_array_for_feature_importance, columns=full_sensor_data.columns, index=full_sensor_data.index), full_train[['state']])
# mi_scores

In [None]:
# Fit the scaler on the training features, then standardise them with it.
# The fitted `scaler` is kept so the same scaling can be applied later.
scaler = StandardScaler().fit(full_sensor_data)
scaled_train_array = scaler.transform(full_sensor_data)

In [None]:
# Reshape each feature row into a (17, n_sensors, 1) grid for the 2D convolutions.
# 17 is the number of statistics per sensor: 7 per-sequence + 5 per-subject +
# 5 inter-sequence, each contributing one column per selected sensor.
n_train_rows = scaled_train_array.shape[0]
scaled_shaped_train_array = scaled_train_array.reshape(n_train_rows, 17, len(SELECTED_SENSORS), 1)

## Model

In [None]:
# Small CNN over the statistics grid: two 2x2 convolutions, one max-pooling
# step, then a dense head ending in a single sigmoid unit.
model = Sequential()

# Convolutional feature extractor.
for _ in range(2):
    model.add(Convolution2D(64, kernel_size=(2, 2), padding='Same', activation='relu'))
model.add(MaxPooling2D((2, 2), strides=(2, 2)))
model.add(Dropout(0.25))

model.add(Flatten())

# Dense classification head.
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

In [None]:
# Stop training once the validation loss has not improved for 20 epochs.
callbacks = [EarlyStopping(monitor='val_loss', patience=20)]

# Binary cross-entropy loss with Adam; AUC is reported under the name 'auc'.
adam = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(loss='binary_crossentropy', optimizer=adam, metrics=[AUC(name='auc')])

# 20% of the rows are held out for validation.
history = model.fit(
    x=scaled_shaped_train_array,
    y=full_train['state'],
    batch_size=64,
    epochs=40,
    validation_split=0.2,
    callbacks=callbacks,
)

## Submission

In [None]:
# Build the same three feature tables for the test readings.
test_without_filtered = test.drop(columns=FILTERED_SENSORS)

agg_seq_test = transform_to_agg_sequence_features(test_without_filtered)
agg_sub_test = transform_to_agg_subject_features(test_without_filtered)
agg_inter_subs_test = transform_to_agg_inter_subs_features(test_without_filtered)

# One row per (subject, sequence), columns in the same order as the training features.
agg_test = (
    agg_seq_test
    .merge(agg_sub_test, left_index=True, right_index=True)
    .merge(agg_inter_subs_test, left_index=True, right_index=True)
)

# Apply the scaler that was fitted on the training features.
X_test = scaler.transform(agg_test)

In [None]:
# Same (17, n_sensors, 1) layout as the training array: 17 statistics per sensor.
n_test_rows = X_test.shape[0]
X_test = X_test.reshape(n_test_rows, 17, len(SELECTED_SENSORS), 1)

In [None]:
# Model outputs for the reshaped test features.
preds = model.predict(X_test)

In [None]:
# Flatten the predictions to one value per row of agg_test and threshold them
# at 0.5 to get a 0/1 state.
flat_preds = preds.reshape((len(agg_test),))
agg_test['state'] = (flat_preds > 0.5).astype('int')

In [None]:
# Write the submission file: one (sequence, state) row per row of agg_test.
submission = agg_test.reset_index()[['sequence', 'state']]
submission.to_csv('submit.csv', index=False)