In [None]:
import numpy as np
import pandas as pd
from pandas import DataFrame

import missingno

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
%matplotlib inline

import warnings
# Silence every warning category for the rest of the notebook.
# NOTE(review): this also hides deprecation warnings from pandas/seaborn — consider narrowing the filter.
warnings.filterwarnings("ignore")

# Use seaborn's "darkgrid" theme for all figures created below.
sns.set_theme(style="darkgrid")

In [None]:
# Load the training set; the path is relative to the notebook's working directory.
data_train = pd.read_csv('../input/tabular-playground-series-jul-2021/train.csv')

In [None]:
# Preview the first rows to check the column layout.
data_train.head()

In [None]:
# Column dtypes and non-null counts.
data_train.info()

In [None]:
# Number of missing values in each column.
data_train.isna().sum()

In [None]:
# Summary statistics of the numeric columns.
data_train.describe()

In [None]:
# Distribution of every numeric column: one histogram per column, 20 bins each.
data_train.hist(
    figsize=(15, 10),
    bins=20,
)
plt.show()

Let's have a look at all the variables and their trends.

In [None]:
# One subplot per numeric column, drawn in row order, to compare the overall shape of each series.
data_train.plot(subplots=True, figsize=(20,40),color='darkslateblue')

The graphs of deg_C and relative humidity appear to mirror each other. The graphs of target_benzene and target_carbon_monoxide show a noticeable resemblance.

## Let us look at the correlation values of the variables.

In [None]:
# Pairwise correlation between the numeric columns.
# `date_time` holds strings; pandas >= 2.0 raises when DataFrame.corr() meets a
# non-numeric column, so restrict the frame to numeric dtypes first.
correlation = data_train.select_dtypes(include="number").corr()

# Draw on an explicit Axes so the heatmap is tied to this figure.
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(correlation, annot=True, ax=ax)
ax.set_title("Correlation matrix of the numeric columns")
plt.show()

In [None]:
# Correlation of every column with target_benzene, largest value first.
correlation['target_benzene'].sort_values(ascending = False)

In [None]:
# Correlation of every column with target_nitrogen_oxides, largest value first.
correlation['target_nitrogen_oxides'].sort_values(ascending = False)

In [None]:
# Correlation of every column with target_carbon_monoxide, largest value first.
correlation['target_carbon_monoxide'].sort_values(ascending = False)

 **The values show that target_benzene is highly correlated with sensor_2, target_carbon_monoxide is highly correlated with target_benzene, and target_nitrogen_oxides is correlated with target_carbon_monoxide.**

In [None]:
# The year is the first "-"-separated field of the date_time string.
date_fields = data_train['date_time'].str.split("-")
data_train['year'] = date_fields.str[0]
data_train.head()

In [None]:
# Distinct year values found in the data.
data_train['year'].unique()

In [None]:
# The month is the second "-"-separated field of the date_time string.
date_fields = data_train['date_time'].str.split("-")
data_train['month'] = date_fields.str[1]
data_train.head()

In [None]:
# Distinct month values found in the data.
data_train['month'].unique()

In [None]:
# year/month were extracted as strings; cast both to integers so they can be
# compared against numeric month values.
convert_dict = dict(year='int', month='int')
data_train = data_train.astype(convert_dict)

In [None]:
# target_benzene split by month: a 5x2 grid with one panel per month number 3..12.
# NOTE(review): month 1 is not covered by this grid — confirm that is intended.
# Requires the integer `month` column (must run before months are renamed).
fig, axs = plt.subplots(ncols=2, nrows=5, figsize=(16, 20))
plt.set_cmap("Set2")
plt.subplots_adjust(hspace=0.3)
fig.suptitle('target_benzene', fontsize=20)

# axs.flat walks the grid row by row, so panels fill left-to-right, top-to-bottom.
for ax, month in zip(axs.flat, range(3, 13)):
    month_values = data_train.loc[data_train["month"] == month, 'target_benzene']
    # A label is required here; without one ax.legend() has nothing to display.
    ax.plot(month_values, color="purple", label='target_benzene')
    ax.set_title(f"Month #{month}", fontsize=15)
    ax.legend(fontsize=13)

There are some noticeable flat areas in the 4th, 6th, 8th and 12th months. We need to figure out what is special about them, or whether they are just garbage values that need to be fixed before applying ML.

In [None]:
# target_carbon_monoxide split by month: a 5x2 grid with one panel per month number 3..12.
# NOTE(review): month 1 is not covered by this grid — confirm that is intended.
# Requires the integer `month` column (must run before months are renamed).
fig, axs = plt.subplots(ncols=2, nrows=5, figsize=(16, 20))
plt.set_cmap("Set2")
plt.subplots_adjust(hspace=0.3)
fig.suptitle('target_carbon_monoxide', fontsize=20)

# axs.flat walks the grid row by row, so panels fill left-to-right, top-to-bottom.
for ax, month in zip(axs.flat, range(3, 13)):
    month_values = data_train.loc[data_train["month"] == month, 'target_carbon_monoxide']
    # A label is required here; without one ax.legend() has nothing to display.
    ax.plot(month_values, color="crimson", label='target_carbon_monoxide')
    ax.set_title(f"Month #{month}", fontsize=15)
    ax.legend(fontsize=13)

In [None]:
# target_nitrogen_oxides split by month: a 5x2 grid with one panel per month number 3..12.
# NOTE(review): month 1 is not covered by this grid — confirm that is intended.
# Requires the integer `month` column (must run before months are renamed).
fig, axs = plt.subplots(ncols=2, nrows=5, figsize=(16, 20))
plt.set_cmap("Set2")
plt.subplots_adjust(hspace=0.3)
fig.suptitle('target_nitrogen_oxides', fontsize=20)

# axs.flat walks the grid row by row, so panels fill left-to-right, top-to-bottom.
for ax, month in zip(axs.flat, range(3, 13)):
    month_values = data_train.loc[data_train["month"] == month, 'target_nitrogen_oxides']
    # A label is required here; without one ax.legend() has nothing to display.
    ax.plot(month_values, color="goldenrod", label='target_nitrogen_oxides')
    ax.set_title(f"Month #{month}", fontsize=15)
    ax.legend(fontsize=13)

**Mapping the month numbers to month names**




In [None]:
# Month number -> display name. Values without an entry are left unchanged by replace().
monthly = {
    1: 'Jan',
    3: 'March', 4: 'April', 5: 'May',
    6: 'June', 7: 'July', 8: 'Aug',
    9: 'Sept', 10: 'Oct', 11: 'Nov',
    12: 'Dec',
}
# Overwrites the integer month column with the names above.
data_train['month'] = data_train['month'].replace(monthly)
data_train.head()

In [None]:
# Average target_benzene for each month.
data_train.groupby('month')['target_benzene'].mean()

In [None]:
# Average target_carbon_monoxide for each month.
data_train.groupby('month')['target_carbon_monoxide'].mean()

In [None]:
# Average target_nitrogen_oxides for each month.
data_train.groupby('month')['target_nitrogen_oxides'].mean()

In [None]:
# Month name -> calendar quarter (Q1 = Jan-Mar, Q2 = Apr-Jun, Q3 = Jul-Sep, Q4 = Oct-Dec).
# The earlier mapping put April in 1, July/Aug in 2 and Dec in 3, and never produced
# a fourth quarter, which contradicts the column name `quarterly`.
# NOTE(review): if a three-period grouping was intentional, restore it and rename the column.
# The dict name keeps its original spelling in case other cells reference it.
quaterly = {
    'Jan': 1, 'March': 1,
    'April': 2, 'May': 2, 'June': 2,
    'July': 3, 'Aug': 3, 'Sept': 3,
    'Oct': 4, 'Nov': 4, 'Dec': 4,
}
# Expects `month` to already hold the month names used as keys above.
data_train['quarterly'] = data_train['month'].replace(quaterly)
data_train.head()

In [None]:
# Copy of the frame without the derived month/year columns, where the timestamp is
# reduced to its date part (the text before the first space) in a new `date` column.
d = (
    data_train
    .drop(columns=['month', 'year'])
    .assign(date=lambda frame: frame['date_time'].str.split(" ").str[0])
    .drop(columns='date_time')
)
d.head()

In [None]:
# Scatter plot of each target against the timestamp, one figure per target.
target = ['target_benzene', 'target_carbon_monoxide', 'target_nitrogen_oxides']
my_alpha = 0.25  # marker transparency; loop-invariant, so set once

for t in target:
    fig, ax = plt.subplots(figsize=(16, 4))
    ax.scatter(data_train.date_time, data_train[t], alpha=my_alpha,
               color='darkred', s=4)
    ax.xaxis.set_major_locator(plt.MaxNLocator(20))  # reduce number of x-axis labels
    ax.tick_params(axis='x', labelrotation=90)
    ax.set_title(t)
    # The darkgrid theme already turns the grid on, and a bare plt.grid() toggles
    # visibility (switching it off), so request the grid explicitly.
    ax.grid(True)
    plt.show()

In [None]:
    # Three pairwise scatter plots, one panel per (x, y) column pair.
    fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(20, 6))

    scatter_pairs = [
        ('sensor_2', 'target_benzene'),
        ('target_benzene', 'target_carbon_monoxide'),
        ('target_carbon_monoxide', 'target_nitrogen_oxides'),
    ]
    for panel, (x_col, y_col) in zip(ax, scatter_pairs):
        # x/y are passed by keyword: seaborn >= 0.12 rejects them as positional arguments.
        sns.scatterplot(data=data_train, x=x_col, y=y_col, ax=panel)
        # Title is built from the plotted columns so it cannot drift from the data
        # (the first panel used to name target_carbon_monoxide while plotting target_benzene).
        panel.set_title(f'scatterplot \n {x_col} vs {y_col}', fontsize=10, loc='center')
        panel.set_xlabel(x_col, fontsize=10, fontdict=dict(weight='bold'))
        panel.set_ylabel(y_col, fontsize=10, fontdict=dict(weight='bold'))

In [None]:
# For each target: a seasonal line plot, then a month-wise scatter plot and box plot.
# Relies on the `target` list of column names defined in an earlier cell.
bold = dict(weight='bold')

for t in target:
    # Line plot of the target per month.
    # x/y are passed by keyword (seaborn >= 0.12 rejects positional vectors) and the
    # plot is bound to `ax`. `palette` was dropped because it has no effect without `hue`.
    fig, ax = plt.subplots(figsize=(16, 6))
    sns.lineplot(data=data_train, x='month', y=t, ax=ax)
    ax.set_title('Seasonal plot of ' + t, fontsize=20, loc='center', fontdict=bold)
    ax.set_xlabel('Month', fontsize=16, fontdict=bold)
    # The y axis shows the target itself (the old label was an unrelated copy-paste).
    ax.set_ylabel(t, fontsize=16, fontdict=bold)

    fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(16, 6))

    sns.scatterplot(data=data_train, x='month', y=t, ax=ax[0])
    ax[0].set_title('Month-wise Scatter Plot', fontsize=15, loc='center', fontdict=bold)
    ax[0].set_xlabel('month', fontsize=16, fontdict=bold)
    ax[0].set_ylabel(t, fontsize=16, fontdict=bold)

    sns.boxplot(data=data_train, x='month', y=t, ax=ax[1])
    ax[1].set_title('Month-wise Box Plot', fontsize=15, loc='center', fontdict=bold)
    ax[1].set_xlabel('Month', fontsize=16, fontdict=bold)
    ax[1].set_ylabel(t, fontsize=16, fontdict=bold)

In [None]:
# Month-wise box plot for every sensor and target column, stacked vertically.
box_columns = ['sensor_1', 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5',
               'target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']

# One row per column. With a fixed 4-row grid, zip() stopped after four axes and
# the remaining four columns were silently never plotted.
n_rows = len(box_columns)
fig, axes = plt.subplots(n_rows, 1, figsize=(15, 5 * n_rows), sharex=True)

for name, ax in zip(box_columns, axes):
    sns.boxplot(data=data_train, x='month', y=name, ax=ax)
    ax.set_ylabel("")
    ax.set_title(name)
    # The x axis is shared, so keep the axis label on the bottom panel only.
    if ax is not axes[-1]:
        ax.set_xlabel('')