In [None]:

import numpy as np
import pandas as pd
import random
from scipy.fft import fft, ifft
from scipy import stats
from statsmodels import robust

from IPython.display import display
from cycler import cycler
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

# Understanding Data

In [None]:
# Read the three competition CSVs in one pass: labels, training rows, test rows.
train_labels, train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/tabular-playground-series-apr-2022/train_labels.csv",
        "../input/tabular-playground-series-apr-2022/train.csv",
        "../input/tabular-playground-series-apr-2022/test.csv",
    )
)

In [None]:
# Report (rows, columns) for the labels, train and test frames, in that order.
for frame in (train_labels, train, test):
    print(frame.shape)

In [None]:
# Rich-display the train, test and label frames one after another.
display(train, test, train_labels)

In [None]:
# Number of rows in the labels frame.
train_labels.shape[0]

Plotting the sensor data of a few randomly chosen sequences

In [None]:
# Global plotting defaults: figure size and the colour cycle used by later plots.
plt.rcParams["figure.figsize"] = (12, 8)
plt.rcParams["axes.prop_cycle"] = cycler(color=['r', 'm', 'g', 'k', 'b'])

## Checking for class balance

In [None]:
# Count of sequences per state, one coloured bar per state with the count printed on it.
state_histogram = px.histogram(
    train_labels,
    x="state",
    color="state",
    title="State Histogram",
    text_auto=True,
)
state_histogram

* So we have 12954 examples with state 0 and 13014 with state 1; that's a difference of 60 training samples. The difference is small, so we can treat this as a balanced classification problem.

In [None]:
random.seed(12)
# Sensor columns: everything after the first three columns of `train`.
sensors = train.columns[3:]

# Sequences to plot: one hand-picked id plus four drawn at random (seeded above).
sequences_curve = [25963] + random.choices(range(0, len(train_labels)), k=4)

# One subplot per (sensor, sequence) pair; the grid size follows the data
# instead of being hard-coded. squeeze=False keeps `ax` two-dimensional.
fig, ax = plt.subplots(nrows=len(sensors), ncols=len(sequences_curve),
                       figsize=(18, 12), sharex=True, squeeze=False)
font = {'family': 'serif', 'color': 'darkred', 'size': 30}
palette = plt.rcParams['axes.prop_cycle'].by_key()['color']

for j, sequence in enumerate(sequences_curve):
    # Filter the rows and look up the label once per sequence rather than once
    # per (sensor, sequence) pair: each filter scans the whole frame.
    sequence_rows = train[train.sequence == sequence]
    state = train_labels[train_labels.sequence == sequence]["state"].values[0]
    curve_color = palette[j % len(palette)]

    for i, sensor in enumerate(sensors):
        readings = sequence_rows[sensor]
        # x-axis is the position of the reading within the sequence
        ax[i][j].plot(range(len(readings)), readings, color=curve_color)
        if i == 0:
            # column header on the top row only
            ax[i][j].set_title(f"sequence {sequence} state {state}")
        if j == 0:
            # sensor name on the left-most column only
            ax[i][j].set_ylabel(f"{sensor}")
fig.tight_layout(w_pad=0.2)
fig.suptitle('Time Vs Audio', fontdict=font, y=1.01);

* It seems that sequences with state 1 change their amplitudes rapidly, whereas the state 0 sequences show less change in some of the sensors (this might be selection bias, so we need to verify it).
* Sensor_02 seems to be the odd one out.
* Sensor_02 takes constant values for certain time intervals
* Sensor_12 shows the largest variation between its peaks and troughs.


# Measure of location

In [None]:
# Summary statistics for the sensor columns only.
train.loc[:, sensors].describe()

In [None]:
def data_location(train, columns=None):
    """Summarise the central tendency of each selected column.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame holding the columns to summarise (despite the name, any frame
        with those columns works, e.g. the test set).
    columns : iterable of column labels, optional
        Columns to summarise. When omitted, the notebook-level ``sensors``
        index is used, which is the original behaviour.

    Returns
    -------
    pandas.DataFrame
        One row per column, with the 10% and 20% trimmed means, the mean and
        the median in the columns ``"trimmed_mean 0.1"``,
        ``"trimmed_mean 0.2"``, ``"mean"`` and ``"median"``.
    """
    if columns is None:
        # Fall back to the notebook-level sensor list.
        columns = sensors
    columns = pd.Index(columns)

    location = pd.DataFrame(
        {
            # trim_mean drops the given fraction from each tail before averaging
            "trimmed_mean 0.1": [stats.trim_mean(train[col], 0.1) for col in columns],
            "trimmed_mean 0.2": [stats.trim_mean(train[col], 0.2) for col in columns],
            "mean": [train[col].mean() for col in columns],
            "median": [train[col].median() for col in columns],
        },
        index=columns,
    )
    return location
# Central-tendency tables for the train and test sets, shown one after the other.
train_loc, test_loc = (data_location(frame) for frame in (train, test))
display(train_loc, test_loc)

In [None]:
# Central-tendency measures per sensor for the training set.
# Title typo fixed, and the data set is named so this figure can be told
# apart from the matching test-set figure.
px.line(train_loc, title="Central Tendency (train)")

* Sensor 2 mean is heavily influenced by its outliers when compared to its median, trimmed mean of 10% and 20%.
* For sensor 5, sensor 7 and sensor 11, the mean deviates somewhat from the median and the trimmed means. It seems these sensors also have outliers influencing their mean.



In [None]:
# Central-tendency measures per sensor for the test set.
# Title typo fixed, and the data set is named so this figure can be told
# apart from the matching training-set figure.
px.line(test_loc, title="Central Tendency (test)")

# Measure of Dispersion

In [None]:

def dispersion(train, columns=None):
    """Summarise the spread of each selected column.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame holding the columns to summarise (despite the name, any frame
        with those columns works, e.g. the test set).
    columns : iterable of column labels, optional
        Columns to summarise. When omitted, the notebook-level ``sensors``
        index is used, which is the original behaviour.

    Returns
    -------
    pandas.DataFrame
        One row per column, with the columns ``"Interquartile Range"``,
        ``"Median Absolute Deviation"`` and ``"Standard Deviation"`` in that
        order (callers select the first two positionally).
    """
    if columns is None:
        # Fall back to the notebook-level sensor list.
        columns = sensors
    columns = pd.Index(columns)

    # Interquartile range: distance between the 75th and 25th percentiles.
    IQR = [train[col].quantile(.75) - train[col].quantile(.25) for col in columns]
    # MAD as computed by statsmodels' robust.scale.mad with its default arguments.
    mad = [robust.scale.mad(train[col]) for col in columns]
    std_dev = [train[col].std() for col in columns]

    data_dispersion = pd.DataFrame({"Interquartile Range": IQR,
                                    "Median Absolute Deviation": mad,
                                    "Standard Deviation": std_dev},
                                   index=columns)
    return data_dispersion
# Dispersion tables for the train and test sets.
train_range, test_range = (dispersion(frame) for frame in (train, test))

In [None]:
# Plot only the first two dispersion columns of the training-set table.
train_range_subset = train_range.iloc[:, :2]
px.line(train_range_subset, title="Dispersion")

We used only MAD and IQR since these are robust and not sensitive to outliers
* IQR gives the range of the middle 50% of the data. Since the median is around 0, most of the data points are clustered in a neighborhood of 0 with a width of approximately 0.97.
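* Median absolute deviation is simply $\text{median}\{|m-x_1|, \cdots, |m-x_n|\}$ where $m$ is the median. Note that `robust.scale.mad` from `statsmodels` by default additionally divides this value by $\Phi^{-1}(3/4) \approx 0.6745$, so that it is comparable to the standard deviation for normally distributed data. Again, MAD is a robust statistical measure of variability.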
* Both MAD and IQR show that sensor_12 has maximum variation compared to other sensors.
* Standard deviation is sensitive to outliers, which can be seen for sensor 12 in the dataframe returned by `dispersion` (`train_range`).

In [None]:
# Plot only the first two dispersion columns of the test-set table.
test_range_subset = test_range.iloc[:, :2]
px.line(test_range_subset, title="Dispersion")

In [None]:
with plt.style.context('default'):
    # One row per sensor; left column shows state-1 rows, right column state-0 rows.
    fig_c, ax_c = plt.subplots(nrows=len(sensors), ncols=2, figsize=(16, 8),
                               sharex=True, squeeze=False)

    # Row masks do not depend on the sensor, so build them once instead of
    # re-scanning the whole frame for every sensor.
    in_state_1 = train['sequence'].isin(train_labels[train_labels.state==1]["sequence"])
    in_state_0 = train['sequence'].isin(train_labels[train_labels.state==0]["sequence"])

    for i, sensor in enumerate(sensors):

        #sequence with state 1
        pos = train.loc[in_state_1, sensor]

        #sequence with state 0
        neg = train.loc[in_state_0, sensor]

        ax_c[i][0].plot(range(len(pos)), pos, color='g')
        ax_c[i][0].set_ylabel(f"{sensor}", size=8)

        ax_c[i][1].plot(range(len(neg)), neg, color='b')

    # Column headers so the reader can tell which side is which state.
    ax_c[0][0].set_title("state 1")
    ax_c[0][1].set_title("state 0")

    fig_c.tight_layout(w_pad=0.2)
    # `font` is the title font dict defined in the earlier sequence-plot cell.
    fig_c.suptitle('State 1 vs State 0', fontdict=font, y=1.01);


* I just plotted it to see whether I can see some anomaly but I don't see much.


# Train and test distribution

In [None]:
with plt.style.context('ggplot'):
    fig_dist = plt.figure(figsize=(12, 12))
    ax_list = []
    n_cols = 4
    # Enough rows to hold every sensor (ceiling division).
    n_rows = -(-len(sensors) // n_cols)
    for i, sensor in enumerate(sensors):
        ax = fig_dist.add_subplot(n_rows, n_cols, i + 1)
        ax_list.append(ax)
        # Overlay the train and test density of the same sensor on one axis.
        sns.kdeplot(train[sensor], fill=True, ax=ax, label="train")
        sns.kdeplot(test[sensor], fill=True, ax=ax, color='blue', label="test")
        # Label the y-axis on the left-most column only. The previous
        # `ax_list[i % 4]` always addressed the first four axes (the top row).
        # Set it after plotting so seaborn's own axis labelling cannot override it.
        ax.set_ylabel("Density" if i % n_cols == 0 else "")
        if i == 0:
            # One legend is enough: colours are the same in every panel.
            ax.legend()
    plt.tight_layout()
    plt.show()

* sensor_2 has a skewed distribution in both sets
* All sensors have similar distributions in the train and test sets.
* Initial guess: handling sensor_2 well will probably lead to better accuracy.