In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.preprocessing import sequence
from sklearn.model_selection import KFold

<center> <h2> Given Data </h2> </center>

***sequence*** -> a unique id for each sequence

***subject*** -> a unique id for the subject in the experiment

***step*** -> time step of the recording, in one second intervals

***sensor_00 - sensor_12*** -> the value for each of the thirteen sensors at that time step.

***What to do*** -> The train and test datasets each have 16 columns, and the target column (state) is given in train_labels. We need to predict the state for each sequence.

In [None]:
# Load the three competition files from a single, easily changed location.
DATA_DIR = '../input/tabular-playground-series-apr-2022'

train = pd.read_csv(f'{DATA_DIR}/train.csv')
train_labels = pd.read_csv(f'{DATA_DIR}/train_labels.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')

# Show the shape and a short preview of each frame; the full row count is
# already reported by the shape line, so only the head is displayed.
print('Train')
print(f'Train shape is - {train.shape}')
display(train.head())
print('Test')
print(f'Test shape is - {test.shape}')
display(test.head())
print('Train_Labels')
print(f'Train label shape is - {train_labels.shape}')
display(train_labels.head())

<center> <h2> Storyline/EDA </h2></center>

***What are subjects and steps*** -> We are given the data of an experiment performed on almost 1000 (991 to be precise) experimental participants (subjects). The data collected from each subject is recorded by 13 sensors. In each recording, readings are taken once per second for 1 minute (60 seconds).

***What is a sequence*** -> If only a single experiment had been run for each subject, then subject and sequence would be the same thing. Here, however, each subject has multiple experiments. Therefore, the unique ID of each experiment is the sequence, and the unique ID of each participant is the subject.

In [None]:
# Almost 1000 subjects: distinct subject ids in train plus distinct subject ids in test.
# The two counts are simply added, so an id present in both frames would be counted twice.
sum(frame['subject'].nunique(dropna=False) for frame in (train, test))

In [None]:
# Minimum and maximum of the step, subject and sequence columns of train.
step_lo, step_hi = train['step'].min(), train['step'].max()
subject_lo, subject_hi = train['subject'].min(), train['subject'].max()
sequence_lo, sequence_hi = train['sequence'].min(), train['sequence'].max()

print(f'The step ranges from {step_lo} to {step_hi}')
print(f'The subject ranges from {subject_lo} to {subject_hi}')
print(f'The sequence ranges from {sequence_lo} to {sequence_hi}')

In [None]:
# A single subject owns several sequences: list the sequence ids recorded for subject 0.
train.loc[train['subject'] == 0, 'sequence'].unique()

In [None]:
# Derive the expected row count from the data instead of hard-coding it.
n_sequences = train['sequence'].nunique()
steps_per_sequence = train['step'].nunique()

# Sanity check: every sequence should contribute exactly one row per step.
assert n_sequences * steps_per_sequence == len(train), 'train does not hold one row per (sequence, step)'

print(f'The number of rows given in train dataset is -> Number of sequences * Number of data points taken in each second for 1 min ----- {n_sequences * steps_per_sequence}')

**How many times a subject is repeating?**

In [None]:
# Rows per subject, in ascending order, scaled down by 60.
# Presumably 60 rows == one sequence, so this is sequences per subject — confirm.
rows_per_subject = train['subject'].value_counts().sort_values()
rows_per_subject / 60

#### Sensors ->

In [None]:
# Columns whose name contains 'sensor'.
is_sensor_column = train.columns.str.contains('sensor')
list_sensor = train.columns[is_sensor_column]

# One histogram per sensor, laid out on a 4x4 grid.
fig = plt.figure(figsize=(20, 10))
for position, sensor_name in enumerate(list_sensor, start=1):
    ax = fig.add_subplot(4, 4, position)
    sns.histplot(data=train, x=sensor_name, bins=100, color='limegreen', ax=ax)
    ax.set_title(sensor_name)

In [None]:
# Summary statistics restricted to the sensor columns.
train.loc[:, list_sensor].describe()

**Observations -**

1. It seems like all the sensors have a central tendency of zero, except sensor 2, which differs slightly.
2. Sensor 2 performs very differently from others, also it looks like it has larger variance than others. It might be possible that sensor 2 is collecting the data which is important for the model.

### Correlations ->

In [None]:
# Pairwise correlation of every column in train, annotated with the coefficients.
corr_matrix = train.corr()
fig, ax = plt.subplots(figsize=(20, 8))
sns.heatmap(corr_matrix, annot=True, ax=ax)

**Observations -**
1. High correlation is not found between any sensors.
2. Moderate correlation is found between:

    a. (sensor_00, sensor_06), (sensor_00, sensor_09)
    
    b. (sensor_03, sensor_07), (sensor_03, sensor_11)

### Outlier Treatment ->

In [None]:
# Columns whose name contains 'sensor'.
is_sensor_column = train.columns.str.contains('sensor')
list_sensor = train.columns[is_sensor_column]

# One box plot per sensor on a 4x4 grid, to make extreme values visible.
fig = plt.figure(figsize=(20, 10))
for position, sensor_name in enumerate(list_sensor, start=1):
    ax = fig.add_subplot(4, 4, position)
    sns.boxplot(data=train, x=sensor_name, color='limegreen', ax=ax)
    ax.set_title(sensor_name)

In [None]:
# Lower/upper cut-off quantiles. Despite the Q1/Q3/IQR names these are the
# 0.45% and 99.55% quantiles, not the quartiles.
LOWER_Q, UPPER_Q = 0.0045, 0.9955

# Per-column quantiles over all columns of train, and the spread between them.
Q1, Q3 = (train.quantile(q) for q in (LOWER_Q, UPPER_Q))
IQR = Q3 - Q1
print(IQR)

In [None]:
# Fences: 1.5 spreads below the lower quantile and above the upper quantile.
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# A row is an outlier as soon as any one of its columns falls outside the fences.
is_outlier_row = ((train < lower_fence) | (train > upper_fence)).any(axis=1)

# Keep only the rows with no out-of-fence value.
train_out = train.loc[~is_outlier_row]
train_out

**Observation -**

Outlier Treatment - InterQuantileRange
1. Removing outlier decreased our train size significantly (1558080 -> 515505). 
2. The outliers can also carry important information regarding the data. So, whether we should remove outliers or not is very sensitive and depends largely on the domain knowledge.
3. For EDA, though, I have plotted the data after outlier treatment below, which shows a perfect normal distribution for every sensor.

Update - 
1. The number of sequences decreased (25967 -> 22007) when outliers are removed using the inter-quantile range (25% - 75%).
2. We should try decreasing threshold value for handling outliers.
3. New weights updated above.

In [None]:
# Columns of the filtered frame whose name contains 'sensor'.
is_sensor_column = train_out.columns.str.contains('sensor')
list_sensor = train_out.columns[is_sensor_column]

# Histogram of each sensor after the outlier filter, on a 4x4 grid.
fig = plt.figure(figsize=(20, 10))
for position, sensor_name in enumerate(list_sensor, start=1):
    ax = fig.add_subplot(4, 4, position)
    sns.histplot(data=train_out, x=sensor_name, color='limegreen', ax=ax)
    ax.set_title(sensor_name)

In [None]:
# Summary statistics of train_out, i.e. the rows kept by the outlier filter.
train_out.describe()

**Observation**

Almost every sensor contained outliers. After removing them, our data follows a perfect normal distribution. However, when we build the model we should try it with both versions of the data (with and without outliers).

In [None]:
# Pairwise correlation of every column after the outlier filter, annotated with the coefficients.
corr_matrix_out = train_out.corr()
fig, ax = plt.subplots(figsize=(20, 8))
sns.heatmap(corr_matrix_out, annot=True, ax=ax)

Observation -
1. Strong Correlation - (sensor_00,sensor_06), (sensor_03,sensor_07)
2. Moderate Correlation - (sensor_00,sensor_07),  (sensor_00,sensor_09), (sensor_01,sensor_06), (sensor_01,sensor_11), (sensor_03,sensor_11), (sensor_03,sensor_06), (sensor_06,sensor_09)

In [None]:
# Inspect the frame that remains after the outlier filter.
train_out

In [None]:
# Row counts before and after the outlier filter.
for frame in (train, train_out):
    print(frame['sequence'].shape)

In [None]:
# Number of distinct sequences before and after the outlier filter.
for frame in (train, train_out):
    print(frame['sequence'].nunique())

In [None]:
# Number of non-null `subject` entries left in each sequence after filtering.
# Despite the name, this counts rows per sequence, not distinct subjects.
count_subject = (
    train_out
    .groupby('sequence')['subject']
    .count()
    .reset_index(name='subject_count')
)

In [None]:
# Collapse the row-level frame to one row per sequence:
#   1. sum every column within a sequence and discard the summed subject/step ids,
#   2. attach the per-sequence row count,
#   3. attach the target label.
# NOTE: this overwrites train_out, so the cell cannot be re-run without
# re-running the outlier filter first.
train_out = (
    train_out
    .groupby('sequence')
    .sum()
    .drop(columns=['subject', 'step'])
    .reset_index()
    .merge(count_subject, how='left', on='sequence')
    .merge(train_labels, how='left', on='sequence')
)

In [None]:
# Inspect the per-sequence frame produced by the aggregation and merges.
train_out

In [None]:
# Features: every column except the sequence id and the target. Selecting by
# name (rather than by position) keeps the split correct if column order changes.
X = train_out.drop(columns=['sequence', 'state'])

# Target: the state label merged in from train_labels.
y = train_out['state']

<center> <h3>I will be updating this notebook soon. In the meantime, if you found this notebook helpful, please do upvote.</h3> </center>

In [None]:
# Draft hyperparameters, kept commented out (not active); time_steps and features are still unset.
# epochs= 20
# batch_size= 10
# time_steps= 
# features=  

In [None]:
# NOTE(review): draft K-fold training loop, kept commented out. Before enabling it:
#   - `es`, `lr`, `roc_auc_score`, `plotHist` and `gc` are not defined or imported in the cells shown here.
#   - `train[train_idx]` indexes a DataFrame with positional indices; `.iloc[...]` is presumably intended — confirm.
#   - `y` comes from the per-sequence `train_out` frame while `train` is the raw row-level frame,
#     so `kf.split(train, y)` looks mismatched — verify which feature matrix is meant.
#   - `input_shape = ()` is still a placeholder and the draft never compiles the model.
# kf = KFold(n_splits=10)
# auc = []
# test_preds = []
# for fold, (train_idx, test_idx) in enumerate(kf.split(train, y)):
#     print(f"** fold: {fold+1} ** ........training ...... \n")
#     X_train, X_valid = train[train_idx], train[test_idx]
#     y_train, y_valid = y[train_idx], y[test_idx]
    
#     model = Sequential()
#     model.add(LSTM(100, dropout=0.2, input_shape = ()))
#     history = model.fit(X_train, y_train, validation_data=(X_valid, y_valid), epochs=30, batch_size = 64, callbacks = [es,lr],verbose = False)
    
#     y_pred = model.predict(X_valid).squeeze()
#     auc.append(roc_auc_score(y_valid, y_pred))
#     print(f"auc: {auc[fold]} \n")
#     test_preds.append(model.predict(test).squeeze())
#     plotHist(history)
#     del X_train, X_valid, y_train, y_valid, model, history
#     gc.collect()  