In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory lazily and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

import datetime
import calendar

from matplotlib import pyplot as plt
from plotnine import *
import plotnine as pn

from warnings import filterwarnings
filterwarnings('ignore')

import seaborn as sns

In [None]:
# Folder holding the competition CSVs; both splits are read with default parsing.
path = '/kaggle/input/tabular-playground-series-jul-2021/'
train, test = (pd.read_csv(f'{path}{split}.csv') for split in ('train', 'test'))

## 1. Data Review

**Findings:**
* No missing data in either the train or the test data. How awesome is that!
* Total number of features: 8
* Total training data available: 7,111

In [None]:
# One row per dataset with its number of records, drawn as a horizontal bar chart.
data_shape = pd.DataFrame({
    'Dataset': ['Train Data', 'Test Data'],
    'Records': [len(train), len(test)],
})

fig = px.bar(
    data_shape,
    x="Records",
    y="Dataset",
    color='Dataset',
    orientation='h',
    width=800,
    height=400,
)
fig.show()

In [None]:
# Count missing cells over all columns of each dataset, then report both totals.
train_null_total = train.isna().sum().sum()
test_null_total = test.isna().sum().sum()
print('Total null values in the train data:', train_null_total)
print('Total null values in the test data: ', test_null_total)

## 2. Feature Exploration

### 2.1 Date & Time

**Findings:**
* All three target variables have peak observations in Nov and Dec, significantly higher than the other months.
* Average and median values are slightly higher in Nov and Dec for Carbon Monoxide and Benzene. Values are significantly higher for Nitrogen Oxides. This indicates that month may be a factor for predicting the targets.
* Average and median values are comparable across the weekdays. Values are lower on weekends. This parameter can further be converted to a weekday/weekend flag for modeling.
* The target values are lowest between 12 - 5 AM and maximum between 7-9 PM. This looks like a significant factor.

In [None]:
# Parse the timestamp strings once, store them back, and derive the calendar month from the parsed values.
parsed_stamps = pd.to_datetime(train['date_time'], format='%Y-%m-%d %H:%M:%S')
train['date_time'] = parsed_stamps
train['month'] = parsed_stamps.dt.month

def findDay(date):
    """Return the weekday name for a timestamp.

    Parameters
    ----------
    date : str, datetime.datetime or pandas.Timestamp
        Any value ``pandas.Timestamp`` can interpret, including strings in
        the ``'%Y-%m-%d %H:%M:%S'`` layout used by the data files.

    Returns
    -------
    str
        The entry of ``calendar.day_name`` for that date (Monday is index 0).
    """
    # Converting through Timestamp avoids the str() -> strptime round-trip,
    # which raised ValueError for values carrying fractional seconds.
    return calendar.day_name[pd.Timestamp(date).weekday()]
# Weekday name (via findDay) and hour of day for every reading.
reading_times = train['date_time']
train['weekday'] = reading_times.apply(findDay)
train['hour'] = reading_times.dt.hour

In [None]:
# Carbon monoxide over time, with a dashed horizontal reference line at y = 7.5.
(
    ggplot(train, aes(x='date_time', y='target_carbon_monoxide'))
    + geom_line(color='blue', size=0.2)
    + labs(x='Date', y='Carbon Monoxide')
    + theme(figure_size=(16, 4))
    + ggtitle('Carbon Monoxide - Timeseries')
    + geom_hline(yintercept=7.5, linetype="dashed", color="black", size=0.5)
)

In [None]:
# Benzene over time, with a dashed horizontal reference line at y = 40.
(
    ggplot(train, aes(x='date_time', y='target_benzene'))
    + geom_line(color='red', size=0.2)
    + labs(x='Date', y='Benzene')
    + theme(figure_size=(16, 4))
    + ggtitle('Benzene - Timeseries')
    + geom_hline(yintercept=40, linetype="dashed", color="black", size=0.5)
)

In [None]:
# Nitrogen oxides over time, with a dashed horizontal reference line at y = 750.
(
    ggplot(train, aes(x='date_time', y='target_nitrogen_oxides'))
    + geom_line(color='green', size=0.2)
    + labs(x='Date', y='Nitrogen Oxides')
    + theme(figure_size=(16, 4))
    + ggtitle('Nitrogen Oxides - Timeseries')
    + geom_hline(yintercept=750, linetype="dashed", color="black", size=0.5)
)

In [None]:
# Monthly mean and median of carbon monoxide in long form: one row per (month, statistic).
co_by_month = train.groupby('month')['target_carbon_monoxide']
co_monthly_avg = co_by_month.mean().rename('values').reset_index().assign(type='Average')
co_monthly_median = co_by_month.median().rename('values').reset_index().assign(type='Median')

# Month number (1-12) -> three-letter label used on the x axis.
month_map = dict(enumerate(
    ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
    start=1,
))

co_df = pd.concat([co_monthly_avg, co_monthly_median])
co_df['month_text'] = co_df['month'].map(month_map)

# reorder() keeps the labels in calendar order rather than alphabetical order.
(
    ggplot(co_df, aes(x='reorder(month_text,month)', y='values', fill='type'))
    + geom_bar(stat='identity', position="dodge")
    + theme(figure_size=(16, 4))
    + labs(fill='', x='', y='Value')
    + ggtitle('Carbon Monoxide - Monthly Median & Average')
    + scale_fill_manual(values=("#21618c", "#5dade2"))
)

In [None]:
# Monthly mean and median of benzene in long form: one row per (month, statistic).
ben_by_month = train.groupby('month')['target_benzene']
ben_monthly_avg = ben_by_month.mean().rename('values').reset_index().assign(type='Average')
ben_monthly_median = ben_by_month.median().rename('values').reset_index().assign(type='Median')

# Month number (1-12) -> three-letter label used on the x axis.
month_map = dict(enumerate(
    ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
    start=1,
))

ben_df = pd.concat([ben_monthly_avg, ben_monthly_median])
ben_df['month_text'] = ben_df['month'].map(month_map)

# reorder() keeps the labels in calendar order rather than alphabetical order.
(
    ggplot(ben_df, aes(x='reorder(month_text,month)', y='values', fill='type'))
    + geom_bar(stat='identity', position="dodge")
    + theme(figure_size=(16, 4))
    + labs(fill='', x='', y='Value')
    + ggtitle('Benzene - Monthly Median & Average')
    + scale_fill_manual(values=("#943126", "#ec7063"))
)

In [None]:
# Monthly mean and median of nitrogen oxides in long form: one row per (month, statistic).
nit_by_month = train.groupby('month')['target_nitrogen_oxides']
nit_monthly_avg = nit_by_month.mean().rename('values').reset_index().assign(type='Average')
nit_monthly_median = nit_by_month.median().rename('values').reset_index().assign(type='Median')

# Month number (1-12) -> three-letter label used on the x axis.
month_map = dict(enumerate(
    ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
    start=1,
))

nit_df = pd.concat([nit_monthly_avg, nit_monthly_median])
nit_df['month_text'] = nit_df['month'].map(month_map)

# reorder() keeps the labels in calendar order rather than alphabetical order.
(
    ggplot(nit_df, aes(x='reorder(month_text,month)', y='values', fill='type'))
    + geom_bar(stat='identity', position="dodge")
    + theme(figure_size=(16, 4))
    + labs(fill='', x='', y='Value')
    + ggtitle('Nitrogen Oxides - Monthly Median & Average')
    + scale_fill_manual(values=("#117864", "#48c9b0"))
)

In [None]:
# Mean and median carbon monoxide per weekday in long form: one row per (weekday, statistic).
co_by_weekday = train.groupby('weekday')['target_carbon_monoxide']
co_weekday_avg = co_by_weekday.mean().rename('values').reset_index().assign(type='Average')
co_weekday_median = co_by_weekday.median().rename('values').reset_index().assign(type='Median')

# Weekday name -> position in the week (Monday = 0), used only to order the x axis.
weekday_map = {
    day: position
    for position, day in enumerate(
        ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    )
}

co_weekday_df = pd.concat([co_weekday_avg, co_weekday_median])
co_weekday_df['weekday_value'] = co_weekday_df['weekday'].map(weekday_map)

(
    ggplot(co_weekday_df, aes(x='reorder(weekday,weekday_value)', y='values', fill='type'))
    + geom_bar(stat='identity', position="dodge")
    + theme(figure_size=(16, 4))
    + labs(fill='', x='', y='Value')
    + ggtitle('Carbon Monoxide - Weekday Median & Average')
    + scale_fill_manual(values=("#21618c", "#5dade2"))
)

In [None]:
# Mean and median benzene per weekday in long form: one row per (weekday, statistic).
ben_by_weekday = train.groupby('weekday')['target_benzene']
ben_weekday_avg = ben_by_weekday.mean().rename('values').reset_index().assign(type='Average')
ben_weekday_median = ben_by_weekday.median().rename('values').reset_index().assign(type='Median')

# Weekday name -> position in the week (Monday = 0), used only to order the x axis.
weekday_map = {
    day: position
    for position, day in enumerate(
        ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    )
}

ben_weekday_df = pd.concat([ben_weekday_avg, ben_weekday_median])
ben_weekday_df['weekday_value'] = ben_weekday_df['weekday'].map(weekday_map)

(
    ggplot(ben_weekday_df, aes(x='reorder(weekday,weekday_value)', y='values', fill='type'))
    + geom_bar(stat='identity', position="dodge")
    + theme(figure_size=(16, 4))
    + labs(fill='', x='', y='Value')
    + ggtitle('Benzene - Weekday Median & Average')
    + scale_fill_manual(values=("#943126", "#ec7063"))
)

In [None]:
# Mean and median nitrogen oxides per weekday in long form: one row per (weekday, statistic).
nit_by_weekday = train.groupby('weekday')['target_nitrogen_oxides']
nit_weekday_avg = nit_by_weekday.mean().rename('values').reset_index().assign(type='Average')
nit_weekday_median = nit_by_weekday.median().rename('values').reset_index().assign(type='Median')

# Weekday name -> position in the week (Monday = 0), used only to order the x axis.
weekday_map = {
    day: position
    for position, day in enumerate(
        ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    )
}

nit_weekday_df = pd.concat([nit_weekday_avg, nit_weekday_median])
nit_weekday_df['weekday_value'] = nit_weekday_df['weekday'].map(weekday_map)

(
    ggplot(nit_weekday_df, aes(x='reorder(weekday,weekday_value)', y='values', fill='type'))
    + geom_bar(stat='identity', position="dodge")
    + theme(figure_size=(16, 4))
    + labs(fill='', x='', y='Value')
    + ggtitle('Nitrogen Oxides - Weekday Median & Average')
    + scale_fill_manual(values=("#117864", "#48c9b0"))
)

In [None]:
# Mean and median carbon monoxide per hour of day in long form: one row per (hour, statistic).
co_by_hour = train.groupby('hour')['target_carbon_monoxide']
co_hour_avg = co_by_hour.mean().rename('values').reset_index().assign(type='Average')
co_hour_median = co_by_hour.median().rename('values').reset_index().assign(type='Median')

co_hour_df = pd.concat([co_hour_avg, co_hour_median])

(
    ggplot(co_hour_df, aes(x='hour', y='values', fill='type'))
    + geom_bar(stat='identity', position="dodge")
    + theme(figure_size=(16, 4))
    + labs(fill='', x='Time (Hour) - 24 Hr Clock', y='Value')
    + ggtitle('Carbon Monoxide - Hour of Day - Median & Average')
    + scale_fill_manual(values=("#21618c", "#5dade2"))
)

In [None]:
# Mean and median benzene per hour of day in long form: one row per (hour, statistic).
ben_by_hour = train.groupby('hour')['target_benzene']
ben_hour_avg = ben_by_hour.mean().rename('values').reset_index().assign(type='Average')
ben_hour_median = ben_by_hour.median().rename('values').reset_index().assign(type='Median')

ben_hour_df = pd.concat([ben_hour_avg, ben_hour_median])

(
    ggplot(ben_hour_df, aes(x='hour', y='values', fill='type'))
    + geom_bar(stat='identity', position="dodge")
    + theme(figure_size=(16, 4))
    + labs(fill='', x='Time (Hour) - 24 Hr Clock', y='Value')
    + ggtitle('Benzene - Hour of Day - Median & Average')
    + scale_fill_manual(values=("#943126", "#ec7063"))
)

In [None]:
# Mean and median nitrogen oxides per hour of day in long form: one row per (hour, statistic).
nit_by_hour = train.groupby('hour')['target_nitrogen_oxides']
nit_hour_avg = nit_by_hour.mean().rename('values').reset_index().assign(type='Average')
nit_hour_median = nit_by_hour.median().rename('values').reset_index().assign(type='Median')

nit_hour_df = pd.concat([nit_hour_avg, nit_hour_median])

(
    ggplot(nit_hour_df, aes(x='hour', y='values', fill='type'))
    + geom_bar(stat='identity', position="dodge")
    + theme(figure_size=(16, 4))
    + labs(fill='', x='Time (Hour) - 24 Hr Clock', y='Value')
    + ggtitle('Nitrogen Oxides - Hour of Day - Median & Average')
    + scale_fill_manual(values=("#117864", "#48c9b0"))
)

### 2.2 Deg C

**Findings**
* The test data is right-skewed for deg_C as compared to the train data, which is near-normal.
* Deg_C is highly positively correlated with absolute humidity and sensor 4. And highly negatively correlated with relative humidity.
* There seems to be no relationship between Deg_C and target variables.

In [None]:
# Label each split so the two deg_C distributions can be overlaid in one density plot.
# assign() returns new frames; adding a column to a slice of train/test instead
# triggers pandas' SettingWithCopyWarning (hidden here by filterwarnings('ignore')).
train_deg = train[['deg_C']].assign(Source='Train Deg C')
test_deg = test[['deg_C']].assign(Source='Test Deg C')

deg_df = pd.concat([train_deg, test_deg])

(
    ggplot(deg_df, aes(x='deg_C', fill='Source'))
    + geom_density(alpha=0.4)
    + theme(figure_size=(12, 4))
    + labs(fill='', x='Deg C', y='Density')
    # The curves compare the test and train sets, so the title names those two.
    + ggtitle('Deg C - Test & Train Distribution')
)

In [None]:
# Feature columns compared against each other in the correlation plots.
column_list = [
    'deg_C', 'relative_humidity', 'absolute_humidity',
    'sensor_1', 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5',
]

# Pairwise correlation matrix of those features on the training data.
corr_df = train[column_list].corr()

def ind_corr(df, col):
    """Return one feature's correlations with every other feature in long form.

    Parameters
    ----------
    df : pandas.DataFrame
        Correlation matrix whose index and columns hold the feature names
        (the notebook passes the result of ``DataFrame.corr()``).
    col : str
        Feature whose row is extracted.

    Returns
    -------
    pandas.DataFrame
        One row per remaining feature, with columns ``col`` (the value of
        that feature's own column in the selected row), ``'variable'`` (the
        other feature's name) and ``'value'`` (the correlation).
    """
    feature_row = df[df.index == col]
    # list.remove() returns None, so its result must not be handed to melt();
    # build the list of remaining columns explicitly instead.
    other_features = [name for name in df.columns if name != col]
    return pd.melt(feature_row, id_vars=col, value_vars=other_features)

In [None]:
# Correlation of deg_C with each other feature, drawn as grey bars.
deg_c_corr = ind_corr(corr_df, 'deg_C')

(
    ggplot(deg_c_corr, aes(x='variable', y='value'))
    + geom_bar(stat='identity', position="dodge", fill="grey")
    + theme(figure_size=(16, 4))
    + labs(x='features', y='Value')
    + ggtitle('Correlation - Deg_C with other features')
)

In [None]:
f, axes = plt.subplots(ncols=3, figsize=(20, 6))

# (target column, y-axis label, marker colour) for each panel, left to right.
panel_specs = [
    ('target_carbon_monoxide', 'Carbon Monoxide', 'blue'),
    ('target_benzene', 'Benzene', 'red'),
    ('target_nitrogen_oxides', 'Nitrogen Oxides', 'green'),
]
drawn_axes = []
for panel_ax, (target_col, target_label, colour) in zip(axes, panel_specs):
    drawn_axes.append(
        sns.scatterplot(data=train, x='deg_C', y=target_col, ax=panel_ax, color=colour)
    )
    panel_ax.set(xlabel='Deg_C', ylabel=target_label)
p1, p2, p3 = drawn_axes

plt.suptitle('Deg_C vs Target Variables', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

### 2.3 Relative Humidity

**Findings**
* Both test and train data show a near-normal distribution.
* Relative humidity shows a strong negative correlation with deg_C.
* There seems to be no relationship between relative humidity and target variables.

In [None]:
# Label each split so the two relative-humidity distributions can be overlaid.
# assign() returns new frames; adding a column to a slice of train/test instead
# triggers pandas' SettingWithCopyWarning (hidden here by filterwarnings('ignore')).
train_rh = train[['relative_humidity']].assign(Source='Train')
test_rh = test[['relative_humidity']].assign(Source='Test')

rh_df = pd.concat([train_rh, test_rh])

(
    ggplot(rh_df, aes(x='relative_humidity', fill='Source'))
    + geom_density(alpha=0.4)
    + theme(figure_size=(16, 4))
    + labs(fill='', x='Relative Humidity', y='Density')
    # The curves compare the test and train sets, so the title names those two.
    + ggtitle('Relative Humidity - Test & Train Distribution')
)

In [None]:
# Correlation of relative humidity with each other feature, drawn as grey bars.
rel_hum_corr = ind_corr(corr_df, 'relative_humidity')

(
    ggplot(rel_hum_corr, aes(x='variable', y='value'))
    + geom_bar(stat='identity', position="dodge", fill="grey")
    + theme(figure_size=(16, 4))
    + labs(x='features', y='Value')
    + ggtitle('Correlation - Relative Humidity with other features')
)

In [None]:
f, axes = plt.subplots(ncols=3, figsize=(20, 6))

# (target column, y-axis label, marker colour) for each panel, left to right.
panel_specs = [
    ('target_carbon_monoxide', 'Carbon Monoxide', 'blue'),
    ('target_benzene', 'Benzene', 'red'),
    ('target_nitrogen_oxides', 'Nitrogen Oxides', 'green'),
]
drawn_axes = []
for panel_ax, (target_col, target_label, colour) in zip(axes, panel_specs):
    drawn_axes.append(
        sns.scatterplot(data=train, x='relative_humidity', y=target_col, ax=panel_ax, color=colour)
    )
    panel_ax.set(xlabel='Relative Humidity', ylabel=target_label)
p1, p2, p3 = drawn_axes

plt.suptitle('Relative Humidity vs Target Variables', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

### 2.4 Absolute Humidity

**Findings**
* The test data is right-skewed for absolute humidity as compared to the train data.
* Absolute humidity is highly positively correlated with sensor 4, and negatively correlated with sensor 3.
* There seems to be no relationship between absolute humidity and target variables.

In [None]:
# Label each split so the two absolute-humidity distributions can be overlaid.
# assign() returns new frames; adding a column to a slice of train/test instead
# triggers pandas' SettingWithCopyWarning (hidden here by filterwarnings('ignore')).
train_abs_h = train[['absolute_humidity']].assign(Source='Train')
test_abs_h = test[['absolute_humidity']].assign(Source='Test')

abs_h_df = pd.concat([train_abs_h, test_abs_h])

(
    ggplot(abs_h_df, aes(x='absolute_humidity', fill='Source'))
    + geom_density(alpha=0.4)
    + theme(figure_size=(16, 4))
    + labs(fill='', x='Absolute Humidity', y='Density')
    # The curves compare the test and train sets, so the title names those two.
    + ggtitle('Absolute Humidity - Test & Train Distribution')
)

In [None]:
# Correlation of absolute humidity with each other feature, drawn as grey bars.
abs_hum_corr = ind_corr(corr_df, 'absolute_humidity')

(
    ggplot(abs_hum_corr, aes(x='variable', y='value'))
    + geom_bar(stat='identity', position="dodge", fill="grey")
    + theme(figure_size=(16, 4))
    + labs(x='features', y='Value')
    + ggtitle('Correlation - Absolute Humidity with other features')
)

In [None]:
f, axes = plt.subplots(ncols=3, figsize=(20, 6))

# (target column, y-axis label, marker colour) for each panel, left to right.
panel_specs = [
    ('target_carbon_monoxide', 'Carbon Monoxide', 'blue'),
    ('target_benzene', 'Benzene', 'red'),
    ('target_nitrogen_oxides', 'Nitrogen Oxides', 'green'),
]
drawn_axes = []
for panel_ax, (target_col, target_label, colour) in zip(axes, panel_specs):
    drawn_axes.append(
        sns.scatterplot(data=train, x='absolute_humidity', y=target_col, ax=panel_ax, color=colour)
    )
    panel_ax.set(xlabel='Absolute Humidity', ylabel=target_label)
p1, p2, p3 = drawn_axes

plt.suptitle('Absolute Humidity vs Target Variables', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

### 2.5 Sensor 1

**Findings**
* The train and test distributions are similar.
* High positive correlation with Sensor 2, Sensor 4 and Sensor 5. Only negatively correlated with Sensor 3.
* There is some linear relationship with the target variables.

In [None]:
# Label each split so the two sensor_1 distributions can be overlaid.
# assign() returns new frames; adding a column to a slice of train/test instead
# triggers pandas' SettingWithCopyWarning (hidden here by filterwarnings('ignore')).
train_sen_1 = train[['sensor_1']].assign(Source='Train')
test_sen_1 = test[['sensor_1']].assign(Source='Test')

sen_1_df = pd.concat([train_sen_1, test_sen_1])

(
    ggplot(sen_1_df, aes(x='sensor_1', fill='Source'))
    + geom_density(alpha=0.4)
    + theme(figure_size=(16, 4))
    + labs(fill='', x='Sensor 1', y='Density')
    # The curves compare the test and train sets, so the title names those two.
    + ggtitle('Sensor 1 - Test & Train Distribution')
)

In [None]:
# Correlation of sensor_1 with each other feature, drawn as grey bars.
sen_1_corr = ind_corr(corr_df, 'sensor_1')

(
    ggplot(sen_1_corr, aes(x='variable', y='value'))
    + geom_bar(stat='identity', position="dodge", fill="grey")
    + theme(figure_size=(16, 4))
    + labs(x='features', y='Value')
    + ggtitle('Correlation - Sensor 1 with other features')
)

In [None]:
f, axes = plt.subplots(ncols=3, figsize=(20, 6))

# (target column, y-axis label, marker colour) for each panel, left to right.
panel_specs = [
    ('target_carbon_monoxide', 'Carbon Monoxide', 'blue'),
    ('target_benzene', 'Benzene', 'red'),
    ('target_nitrogen_oxides', 'Nitrogen Oxides', 'green'),
]
drawn_axes = []
for panel_ax, (target_col, target_label, colour) in zip(axes, panel_specs):
    drawn_axes.append(
        sns.scatterplot(data=train, x='sensor_1', y=target_col, ax=panel_ax, color=colour)
    )
    panel_ax.set(xlabel='Sensor 1', ylabel=target_label)
p1, p2, p3 = drawn_axes

plt.suptitle('Sensor 1 vs Target Variables', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

### 2.6 Sensor 2

In [None]:
# Label each split so the two sensor_2 distributions can be overlaid.
# assign() returns new frames; adding a column to a slice of train/test instead
# triggers pandas' SettingWithCopyWarning (hidden here by filterwarnings('ignore')).
train_sen_2 = train[['sensor_2']].assign(Source='Train')
test_sen_2 = test[['sensor_2']].assign(Source='Test')

sen_2_df = pd.concat([train_sen_2, test_sen_2])

(
    ggplot(sen_2_df, aes(x='sensor_2', fill='Source'))
    + geom_density(alpha=0.4)
    + theme(figure_size=(16, 4))
    + labs(fill='', x='Sensor 2', y='Density')
    # The curves compare the test and train sets, so the title names those two.
    + ggtitle('Sensor 2 - Test & Train Distribution')
)

In [None]:
# Correlation of sensor_2 with each other feature, drawn as grey bars.
sen_2_corr = ind_corr(corr_df, 'sensor_2')

(
    ggplot(sen_2_corr, aes(x='variable', y='value'))
    + geom_bar(stat='identity', position="dodge", fill="grey")
    + theme(figure_size=(16, 4))
    + labs(x='features', y='Value')
    + ggtitle('Correlation - Sensor 2 with other features')
)

In [None]:
f, axes = plt.subplots(ncols=3, figsize=(20, 6))

# (target column, y-axis label, marker colour) for each panel, left to right.
panel_specs = [
    ('target_carbon_monoxide', 'Carbon Monoxide', 'blue'),
    ('target_benzene', 'Benzene', 'red'),
    ('target_nitrogen_oxides', 'Nitrogen Oxides', 'green'),
]
drawn_axes = []
for panel_ax, (target_col, target_label, colour) in zip(axes, panel_specs):
    drawn_axes.append(
        sns.scatterplot(data=train, x='sensor_2', y=target_col, ax=panel_ax, color=colour)
    )
    panel_ax.set(xlabel='Sensor 2', ylabel=target_label)
p1, p2, p3 = drawn_axes

plt.suptitle('Sensor 2 vs Target Variables', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

### 2.7 Sensor 3

In [None]:
# Label each split so the two sensor_3 distributions can be overlaid.
# assign() returns new frames; adding a column to a slice of train/test instead
# triggers pandas' SettingWithCopyWarning (hidden here by filterwarnings('ignore')).
train_sen_3 = train[['sensor_3']].assign(Source='Train')
test_sen_3 = test[['sensor_3']].assign(Source='Test')

sen_3_df = pd.concat([train_sen_3, test_sen_3])

(
    ggplot(sen_3_df, aes(x='sensor_3', fill='Source'))
    + geom_density(alpha=0.4)
    + theme(figure_size=(16, 4))
    + labs(fill='', x='Sensor 3', y='Density')
    # The curves compare the test and train sets, so the title names those two.
    + ggtitle('Sensor 3 - Test & Train Distribution')
)

In [None]:
# Correlation of sensor_3 with each other feature, drawn as grey bars.
sen_3_corr = ind_corr(corr_df, 'sensor_3')

(
    ggplot(sen_3_corr, aes(x='variable', y='value'))
    + geom_bar(stat='identity', position="dodge", fill="grey")
    + theme(figure_size=(16, 4))
    + labs(x='features', y='Value')
    + ggtitle('Correlation - Sensor 3 with other features')
)

In [None]:
f, axes = plt.subplots(ncols=3, figsize=(20, 6))

# (target column, y-axis label, marker colour) for each panel, left to right.
panel_specs = [
    ('target_carbon_monoxide', 'Carbon Monoxide', 'blue'),
    ('target_benzene', 'Benzene', 'red'),
    ('target_nitrogen_oxides', 'Nitrogen Oxides', 'green'),
]
drawn_axes = []
for panel_ax, (target_col, target_label, colour) in zip(axes, panel_specs):
    drawn_axes.append(
        sns.scatterplot(data=train, x='sensor_3', y=target_col, ax=panel_ax, color=colour)
    )
    panel_ax.set(xlabel='Sensor 3', ylabel=target_label)
p1, p2, p3 = drawn_axes

plt.suptitle('Sensor 3 vs Target Variables', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

### 2.8 Sensor 4

In [None]:
# Label each split so the two sensor_4 distributions can be overlaid.
# assign() returns new frames; adding a column to a slice of train/test instead
# triggers pandas' SettingWithCopyWarning (hidden here by filterwarnings('ignore')).
train_sen_4 = train[['sensor_4']].assign(Source='Train')
test_sen_4 = test[['sensor_4']].assign(Source='Test')

sen_4_df = pd.concat([train_sen_4, test_sen_4])

(
    ggplot(sen_4_df, aes(x='sensor_4', fill='Source'))
    + geom_density(alpha=0.4)
    + theme(figure_size=(16, 4))
    + labs(fill='', x='Sensor 4', y='Density')
    # The curves compare the test and train sets, so the title names those two.
    + ggtitle('Sensor 4 - Test & Train Distribution')
)

In [None]:
# Correlation of sensor_4 with each other feature, drawn as grey bars.
sen_4_corr = ind_corr(corr_df, 'sensor_4')

(
    ggplot(sen_4_corr, aes(x='variable', y='value'))
    + geom_bar(stat='identity', position="dodge", fill="grey")
    + theme(figure_size=(16, 4))
    + labs(x='features', y='Value')
    + ggtitle('Correlation - Sensor 4 with other features')
)

In [None]:
f, axes = plt.subplots(ncols=3, figsize=(20, 6))

# (target column, y-axis label, marker colour) for each panel, left to right.
panel_specs = [
    ('target_carbon_monoxide', 'Carbon Monoxide', 'blue'),
    ('target_benzene', 'Benzene', 'red'),
    ('target_nitrogen_oxides', 'Nitrogen Oxides', 'green'),
]
drawn_axes = []
for panel_ax, (target_col, target_label, colour) in zip(axes, panel_specs):
    drawn_axes.append(
        sns.scatterplot(data=train, x='sensor_4', y=target_col, ax=panel_ax, color=colour)
    )
    panel_ax.set(xlabel='Sensor 4', ylabel=target_label)
p1, p2, p3 = drawn_axes

plt.suptitle('Sensor 4 vs Target Variables', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

### 2.9 Sensor 5

In [None]:
# Label each split so the two sensor_5 distributions can be overlaid.
# assign() returns new frames; adding a column to a slice of train/test instead
# triggers pandas' SettingWithCopyWarning (hidden here by filterwarnings('ignore')).
train_sen_5 = train[['sensor_5']].assign(Source='Train')
test_sen_5 = test[['sensor_5']].assign(Source='Test')

sen_5_df = pd.concat([train_sen_5, test_sen_5])

(
    ggplot(sen_5_df, aes(x='sensor_5', fill='Source'))
    + geom_density(alpha=0.4)
    + theme(figure_size=(16, 4))
    + labs(fill='', x='Sensor 5', y='Density')
    # The curves compare the test and train sets, so the title names those two.
    + ggtitle('Sensor 5 - Test & Train Distribution')
)

In [None]:
# Correlation of sensor_5 with each other feature, drawn as grey bars.
sen_5_corr = ind_corr(corr_df, 'sensor_5')

(
    ggplot(sen_5_corr, aes(x='variable', y='value'))
    + geom_bar(stat='identity', position="dodge", fill="grey")
    + theme(figure_size=(16, 4))
    + labs(x='features', y='Value')
    + ggtitle('Correlation - Sensor 5 with other features')
)

In [None]:
f, axes = plt.subplots(ncols=3, figsize=(20, 6))

# (target column, y-axis label, marker colour) for each panel, left to right.
panel_specs = [
    ('target_carbon_monoxide', 'Carbon Monoxide', 'blue'),
    ('target_benzene', 'Benzene', 'red'),
    ('target_nitrogen_oxides', 'Nitrogen Oxides', 'green'),
]
drawn_axes = []
for panel_ax, (target_col, target_label, colour) in zip(axes, panel_specs):
    drawn_axes.append(
        sns.scatterplot(data=train, x='sensor_5', y=target_col, ax=panel_ax, color=colour)
    )
    panel_ax.set(xlabel='Sensor 5', ylabel=target_label)
p1, p2, p3 = drawn_axes

plt.suptitle('Sensor 5 vs Target Variables', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

*The EDA shows the sensors are likely the strongest predictors. The day of week (especially weekends) and the time of day may have some influence on the targets.*

**Next Step: Model**