# TPS-Jul 21 -- Data Exploration

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Environment Setup

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from tqdm import tqdm
from datetime import datetime

%matplotlib inline

## Data Import

In [None]:
# Read the competition's train and test tables.
train = pd.read_csv("/kaggle/input/tabular-playground-series-jul-2021/train.csv")
test = pd.read_csv("/kaggle/input/tabular-playground-series-jul-2021/test.csv")

# Convert the timestamp strings to datetime64 in both frames so the `.dt`
# accessor can be used on them later.
for frame in (train, test):
    frame["date_time"] = pd.to_datetime(frame["date_time"], format="%Y-%m-%d %H:%M:%S")

train.head()

In [None]:
# Preview the first five rows of the test set.
test.head(n=5)

## Data Exploration

In [None]:
# Input features: three weather readings followed by the five sensor channels.
x_vars = ['deg_C', 'relative_humidity', 'absolute_humidity'] + [
    f'sensor_{sensor_id}' for sensor_id in range(1, 6)
]

# Display names of the three pollutants, and the matching target column names
# (lower-cased display name prefixed with "target_").
target_names = ['Carbon_Monoxide', 'Benzene', 'Nitrogen_Oxides']
targets = [f'target_{pollutant.lower()}' for pollutant in target_names]

In [None]:
def date_feature_creation(data):
    """Return a copy of ``data`` with calendar features derived from ``date_time``.

    The input frame is not modified: the features are attached to a new frame,
    so it is safe to pass a slice or selection of another DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a datetime64 ``date_time`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with every original column plus ``hour``, ``day``,
        ``day_of_year``, ``day_of_week``, ``month``, ``week_of_year`` and
        ``quarter``, in that order.
    """
    timestamps = data["date_time"].dt
    # `assign` builds a new frame instead of writing into the caller's object,
    # which also avoids SettingWithCopy issues when `data` is itself a selection.
    return data.assign(
        hour=timestamps.hour,
        day=timestamps.day,
        day_of_year=timestamps.dayofyear,
        day_of_week=timestamps.dayofweek,
        month=timestamps.month,
        # isocalendar() yields a nullable UInt32 week; cast to a plain integer.
        week_of_year=timestamps.isocalendar().week.astype('int'),
        quarter=timestamps.quarter,
    )
    
# Attach the calendar features to a copy so the raw `train` frame keeps its
# original columns, then display the result.
train_copy = date_feature_creation(train.copy())
train_copy

## Basic Pairplot

In [None]:
# Pairwise view of the input features: scatter plots above the diagonal,
# bivariate KDE contours below it.
g = sns.PairGrid(train[x_vars])
g.map_upper(sns.scatterplot)
g.map_lower(sns.kdeplot)
# Draw the diagonal in a single call: histplot(kde=True) rescales the density
# curve to the histogram's count axis. Mapping kdeplot and histplot separately
# puts a unit-area density and raw counts on one shared y-axis, which squashes
# the KDE curve flat against zero.
g.map_diag(sns.histplot, kde=True, legend=False);

In [None]:
# Pairwise correlation between every input feature and every target.
var_in_data = x_vars + targets
corr = train[var_in_data].corr()

# vmin/vmax pin the colour scale to the [-1, 1] range; each cell is annotated
# with its coefficient.
fig, ax = plt.subplots(figsize=(16, 6))
heatmap = sns.heatmap(corr, vmin=-1, vmax=1, annot=True, ax=ax)
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize':12}, pad=12);

## Variable Specific analysis

In [None]:
def _plot_moving_averages(series, var):
    """Plot ``var`` against ``date_time`` raw and as 24-, 168- and 720-row rolling means."""
    # (rolling window in rows, line colour, panel title); None means "no smoothing".
    # NOTE(review): the day/week/month titles presume one row per hour - confirm.
    panels = [
        (None, 'red', 'Orginal Time Series'),
        (24, 'blue', '1 Day Moving Average Time Series'),
        (24 * 7, 'gold', 'Weekly Moving Average Time Series'),
        (24 * 30, 'green', 'Monthly Moving Average Time Series'),
    ]
    fig, axes = plt.subplots(len(panels), 1, figsize=(20, 20), sharex=False)
    for ax, (window, color, title) in zip(axes, panels):
        if window is None:
            panel_data = series
        else:
            # assign() returns a new frame, so the raw series is never overwritten.
            panel_data = series.assign(**{var: series[var].rolling(window).mean()})
        sns.lineplot(
            data=panel_data,
            x="date_time", y=var,
            color=color, ax=ax
        ).set(title=title)
    return fig


def _plot_calendar_profiles(featured, var):
    """Plot, per calendar feature, the mean of ``var`` (left) and its boxplot (right)."""
    # (calendar feature column, line colour, mean-plot title, boxplot title)
    profiles = [
        ("hour", 'brown', 'Hourly Average Series', 'Hourly Boxplot Series'),
        ("day", 'red', 'Per Day Average Series Over Month', 'Per Day Boxplot Series Over Month'),
        ("day_of_year", 'blue', 'Per Day Average Series Over Year', "Per Day Boxplot Series Over Year"),
        ("day_of_week", 'gold', "Per Day Average Series Over Week", "Per Day Boxplot Series Over Week"),
        ("month", 'black', "Per Month Average Series Over Year", "Per Month Boxplot Series Over Year"),
        ("week_of_year", 'green', "Per Week Average Series Over Year", "Per Week Boxplot Series Over Year"),
        ("quarter", 'peru', "Quarterly Average Series", "Quarterly Boxplot Series"),
    ]
    fig, axes = plt.subplots(len(profiles), 2, figsize=(40, 40), sharex=False)
    for (mean_ax, box_ax), (feature, color, mean_title, box_title) in zip(axes, profiles):
        # Average only the analysed variable; the other columns are irrelevant here.
        means = featured.groupby(feature, as_index=False)[var].mean()
        sns.lineplot(data=means,
                     x=feature, y=var,
                     color=color, ax=mean_ax).set(title=mean_title)
        sns.boxplot(x=feature, y=var,
                    data=featured, ax=box_ax).set(title=box_title)

    # Strip the bottom spines and the y ticks from every panel of this figure.
    sns.despine(fig=fig, bottom=True)
    plt.setp(fig.axes, yticks=[])
    return fig


def var_plot(train, var):
    """Draw the time-series and calendar-profile plots for one column.

    Two figures are produced:

    1. Four stacked line plots of ``var`` against ``date_time``: the raw
       series and its rolling means over 24, 24*7 and 24*30 rows.
    2. A 7x2 grid with, for each calendar feature added by
       ``date_feature_creation`` (hour, day, day_of_year, day_of_week, month,
       week_of_year, quarter), the per-group mean of ``var`` on the left and a
       boxplot of ``var`` per group on the right. The y ticks are removed from
       this second figure.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame containing a ``date_time`` column and the column named by ``var``.
        It is not modified.
    var : str
        Name of the column to analyse.

    Returns
    -------
    None
        The figures are rendered with ``plt.show()``.
    """
    # Copy only the two columns that are needed rather than the whole frame.
    series = train.loc[:, ["date_time", var]].copy()
    _plot_moving_averages(series, var)

    # Pass a private copy so adding calendar columns can never leak into `series`
    # or `train`, whether or not the helper writes into its argument.
    featured = date_feature_creation(series.copy())
    _plot_calendar_profiles(featured, var)

    plt.show()

### Analysis for the variable Deg_C

In [None]:
# Time-series and calendar-profile plots for the temperature column.
var_plot(train, var="deg_C")

1. From the plots above we can see that the temperature (in degrees Celsius) increases as the year progresses and starts decreasing around August, so the temperature follows a seasonal pattern.
2. From the hourly average series we can see that the temperature keeps increasing from about 10 AM until 3-4 PM and decreases after that, which is expected.
3. As noted above, the temperature varies significantly over the year, so the per-day average over the month averages out this yearly effect and therefore shows no clear pattern.
4. The per-day average series over the year clearly shows the seasonal pattern described in point 1, and the box plot shows that the pattern also holds for the first and third quartiles.
5. The per-day average series over the week suggests that the average temperature increases as the week progresses, but the box plot shows no such pattern, so we can conclude that the average is affected by extreme values.
6. In the last three plots we can see the same seasonal pattern described above.

### Analysis for the variable relative_humidity

In [None]:
# Time-series and calendar-profile plots for relative humidity.
var_plot(train, var="relative_humidity")

In the same way as for the temperature, we can analyse the graphs above for relative humidity.
From the plots above we can see that relative humidity shows the opposite pattern to the temperature in degrees Celsius, so there is a negative relationship between humidity and temperature. This is also evident from the correlation matrix shown earlier: the correlation is -0.67.

### Analysis for the variable sensor_1

In [None]:
# Time-series and calendar-profile plots for sensor 1.
var_plot(train, var="sensor_1")

1. From the weekly moving average we can see a clear cyclic pattern over the month.
2. From the monthly moving average we can see a seasonal effect over the year.
3. From the hourly average series we can see that the values from sensor 1 are higher during working hours and lower during non-working hours.
4. In the per-day average series over the week we can see that the average is lower on weekends. The same pattern is present in the box plot, although it is less clear.
5. The last three plots show the same pattern over the year: the value declines as the year progresses and starts increasing after the 3rd quarter.

### Analysis for the variable sensor_2

In [None]:
# Time-series and calendar-profile plots for sensor 2.
var_plot(train, var="sensor_2")

1. From the weekly moving average time series we can see a cyclic pattern in sensor 2.
2. From the monthly moving average time series we can see that the values are similar for the majority of the months, but they start increasing in the 3rd quarter as the end of the year approaches.
3. The hourly average series looks similar to that of sensor 1.
4. The per-day average series over the week shows that the value decreases as the weekend approaches and increases significantly after the weekend.
5. The correlation between sensor 2 and sensor 1 is 0.81. Hence we can conclude that their hourly patterns are very similar, although, as noted above, there are some differences.

### Analysis for the variable sensor_3

In [None]:
# Time-series and calendar-profile plots for sensor 3.
var_plot(train, var="sensor_3")

1. From the weekly and monthly moving average time series we can see cyclic and seasonal patterns in this variable.
2. In the hourly average series, the per-day average over the week, and the last three graphs, we can see a pattern opposite to that of sensors 1 and 2.
3. The correlation of sensor 3 with sensor 1 and sensor 2 is -0.59 and -0.82 respectively.

### Analysis for the variable sensor_4

In [None]:
# Time-series and calendar-profile plots for sensor 4.
var_plot(train, var="sensor_4")

1. From the weekly and monthly moving average time series we can see cyclic and seasonal patterns in this variable.
2. In the hourly average series and the per-day average over the week, we can see a pattern similar to that of sensors 1 and 2.
3. In the last three graphs we can see a pattern opposite to that of sensors 1 and 2 but similar to that of sensor 3.
4. The correlation of sensor 4 with sensor 1, sensor 2 and sensor 3 is 0.64, 0.81 and -0.74 respectively.

### Analysis for the variable sensor_5

In [None]:
# Time-series and calendar-profile plots for sensor 5.
var_plot(train, var="sensor_5")

Sensor 5 is highly positively correlated with sensors 1, 2 and 4, but highly negatively correlated with sensor 3.

### Analysis for the variable target_carbon_monoxide

In [None]:
# Time-series and calendar-profile plots for the carbon monoxide target.
var_plot(train, var="target_carbon_monoxide")

From the correlation matrix and the plots above we can conclude that the sensors are correlated with carbon monoxide, whereas temperature and humidity are not strongly correlated with it.

### Analysis for the variable target_benzene

In [None]:
# Time-series and calendar-profile plots for the benzene target.
var_plot(train, var="target_benzene")

Benzene is highly correlated with all the sensors, the temperature in degrees Celsius and the absolute humidity. All the patterns that appear in the graphs above have already been seen in the different sensor series.

### Analysis for the variable target_nitrogen_oxides

In [None]:
# Time-series and calendar-profile plots for the nitrogen oxides target.
var_plot(train, var="target_nitrogen_oxides")

Nitrogen oxides is most correlated with sensor 5, and then with sensor 1 and sensor 2.

A very different pattern appears at the end of the year, which appears in sensor 5 as well.

## Thank You!