In [None]:
# import libraries
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Labels and test readings are read first; both are independent of the training frame.
label = pd.read_csv("../input/tabular-playground-series-apr-2022/train_labels.csv")
test_data = pd.read_csv("../input/tabular-playground-series-apr-2022/test.csv")

# Read the training readings and join the labels on the shared 'sequence' key.
# The outer join keeps rows from either side that have no match.
train_data = pd.read_csv(
    "../input/tabular-playground-series-apr-2022/train.csv"
).merge(label, how='outer', on='sequence')

In [None]:
# Preview the first five rows of the training frame
train_data.head(n=5)

# Details of columns
**sequence :** Unique ID of each sequence

**subject :** Unique id for the subject of each experiment

**step :** Time step of the recording with one second time interval

**sensor_00 - sensor_12 :** Value for each of the thirteen sensors at a particular time step.

**Our aim :** We will predict the state of each sequence.

In [None]:
# Report how many distinct values each identifier column holds.
# dropna=False counts NaN as a value, matching len(Series.unique()).
for caption, column in (
    ("total no of subjects: ", 'subject'),
    ("total no of steps: ", 'step'),
    ("total no of sequence: ", 'sequence'),
):
    print(caption, train_data[column].nunique(dropna=False))


# Total number of subjects:  672
# Total number of steps:  60
# Total number of sequences:  25,968



# Check for missing values in the training data

In [None]:
# Per-column count of missing entries; the resulting Series is the cell's displayed output.
print("Total missing value in training data: ")
train_data.isnull().sum()


### There are no missing values in the training and test data

# Check the distribution of state (the target label) to see whether it is balanced

In [None]:
# Bar chart of how many rows fall into each state, with each bar labelled
# by its share of the labelled rows.
sns.set(style="whitegrid")
plt.figure(figsize=(15,6))
# countplot ignores rows whose state is missing, so the denominator must
# ignore them too; otherwise the percentages would not add up to 100.
total = float(train_data['state'].notna().sum())
ax = sns.countplot(x="state", data=train_data)
plt.title('Distribution of State', fontsize=20)
for p in ax.patches:
    percentage = '{:.1f}%'.format(100 * p.get_height()/total)
    # Anchor at the horizontal middle of the bar: get_x() is the left edge,
    # so half the width (not the full width) must be added for ha='center'.
    x = p.get_x() + p.get_width() / 2
    y = p.get_height()
    # va='bottom' keeps the text just above the top of the bar.
    ax.annotate(percentage, (x, y), ha='center', va='bottom')
plt.show()

### The output class (state) is balanced in the training dataset

# Distribution of all 13 sensors

In [None]:
# Every column whose name contains 'sensor'
sensor_columns = train_data.columns[train_data.columns.str.contains('sensor')]

# Only rows that carry a state value are plotted; the mask is the same for every sensor.
has_state = train_data.state.notnull()

# One 100-bin histogram per sensor, laid out on a 4x4 grid.
figure = plt.figure(figsize=(20, 10))
for position, sensor_name in enumerate(sensor_columns, start=1):
    plt.subplot(4, 4, position)
    plt.hist(train_data.loc[has_state, sensor_name], bins=100, color='orange')
    plt.title(sensor_name)
figure.tight_layout(h_pad=1.0, w_pad=0.5)
plt.suptitle("Distributions of value of sensor_00 - sensor_12", y=1.02)
plt.show()

From the above probability distributions, we can say, 
1. Most of the distributions look symmetric with the center at 0. 
2. Sensor_08 has discrete values, multiple values around 0.1

# Finding outliers in the data

In [None]:
# Histogram of each sensor restricted to its 2nd-98th percentile range,
# so the extreme tails do not stretch the x-axis.
figure = plt.figure(figsize=(16,8))
for sensor in range(13):
    sensor_name = f"sensor_{sensor:02d}"
    readings = train_data[sensor_name]
    lower_bound = readings.quantile(0.02)
    upper_bound = readings.quantile(0.98)

    plt.subplot(4,4,sensor+1)
    plt.hist(readings, bins=100, range=(lower_bound, upper_bound), color='orange')
    plt.title(f'{sensor_name} Histogram')

figure.tight_layout(h_pad=1.0, w_pad=0.5)
plt.suptitle('Sensor histogram after outlier removal', y=1.02);

# Correlation of Train Features

In [None]:
# Pairwise correlation between the sensor columns, drawn as an annotated heatmap.
# The columns are picked by name rather than by position, so the plot stays
# correct if the column order of train_data changes.
heatmap_columns = train_data.columns[train_data.columns.str.contains('sensor')]

# Compute the matrix once and reuse it for the plot (previously a full-frame
# correlation was computed and then discarded in favour of a second one).
corr = train_data[heatmap_columns].corr()

fig, ax = plt.subplots(1,1, figsize = (15,6))

hm = sns.heatmap(corr,
                ax = ax,
                cmap = 'coolwarm',
                annot = True,
                fmt = '.2f',
                linewidths = 0.05)
# Leave room above the axes for the figure-level title.
fig.subplots_adjust(top=0.93)
fig.suptitle('Correlation Heatmap for Train dataset', 
              fontsize=14, 
              fontweight='bold')
# Render the figure explicitly so the cell does not echo the Text object repr.
plt.show()

From the above heatmap we can say,
1. we don't have any missing value
2. All the sensors have equal length of time series value
3. sensor 0 and sensor 6 have a weak correlation
4. sensor 3 and sensor 7 have a weak correlation

# Time series Visualization of Sensor Data

In [None]:
# Column names sensor_00 ... sensor_12
sensor_cols = [f"sensor_{i:02d}" for i in range(13)]

# One stacked panel per sensor, all sharing the 'step' x-axis.
fig, axes = plt.subplots(len(sensor_cols), figsize=(16, 32), sharex=True)
for panel, col in zip(axes, sensor_cols):
    sns.lineplot(data=train_data, x='step', y=col, ax=panel)

# Work in progress...
I'll keep updating this notebook as I find more insights into the dataset. Feel free to upvote if you like it.
This is my first Kaggle Competition. Thanks ❤️
