In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import StandardScaler
from math import sqrt
from sklearn.model_selection import LeaveOneGroupOut
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Load Data

In [None]:
# Location and file names of the competition data (Kaggle read-only input directory).
sDir = '../input/tabular-playground-series-jul-2021/'
sTrain, sTest = 'train.csv', 'test.csv'

# Read both splits and remember their row counts; trainSize is used later to
# cut the combined frame back into its train and test parts.
pdTrain = pd.read_csv(f'{sDir}{sTrain}')
pdTest = pd.read_csv(f'{sDir}{sTest}')
trainSize = len(pdTrain)
testSize, col = pdTest.shape

# Stack train on top of test so feature engineering is applied to both at once.
# The original row labels are kept (no ignore_index), so labels repeat across the two parts.
pdAll = pd.concat([pdTrain, pdTest])
pdAll.info()

In [None]:
# Size of the combined frame, followed by summary statistics with one row per column.
rows = pdAll.shape[0]
cols = pdAll.shape[1]
print(f'rows: {rows} | columns: {cols}\n')
pdAll.describe().transpose()

# Check Stats

In [None]:
# Fraction of missing values in each raw feature column; the cell output is the
# number of feature columns that contain at least one missing value.
# The fraction is taken with .mean() on the boolean mask instead of dividing by
# the global `rows`, which a later cell reuses for the subplot grid.
featureCols = ['deg_C', 'relative_humidity', 'absolute_humidity',
               'sensor_1', 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5']
stats = pdAll[featureCols].isna().mean()
stats[stats > 0].count()

In [None]:
# Alternative check with pd.isna(): count the missing cells in every column of
# the combined frame, then report how many columns have at least one.
# (Applying pd.isna to pdAll.count() can never flag anything, because count()
# returns plain non-null totals.)
# NOTE(review): columns that exist only in train.csv are presumably all-NaN for
# the test rows after the concat, so they are expected to be counted here.
stats = pd.isna(pdAll).sum()
stats[stats > 0].count()

## Add time series columns


In [None]:
# Parse the timestamp once, then derive calendar features from it.
pdAll['date_time'] = pd.to_datetime(pdAll['date_time'])
stamp = pdAll['date_time']

pdAll['year'] = stamp.dt.year
pdAll['month'] = stamp.dt.month
# Series.dt.week is deprecated (and removed in recent pandas); isocalendar().week
# is the documented replacement. It returns a nullable UInt32 column, so cast it
# to a plain integer dtype for the modelling code.
# NOTE(review): the cast assumes date_time has no missing values.
pdAll['week'] = stamp.dt.isocalendar().week.astype('int64')
pdAll['day'] = stamp.dt.day
pdAll['dayofweek'] = stamp.dt.dayofweek

# 'time' = whole days elapsed since the earliest calendar date in the frame,
# computed on datetime64 values instead of Python date objects + apply.
dayStart = stamp.dt.normalize()
pdAll['time'] = (dayStart - dayStart.min()).dt.days
pdAll['hour'] = stamp.dt.hour

## Add Derived features

In [None]:
# Two engineered features built from temperature and the humidity readings.
relHum = pdAll['relative_humidity']
tempC = pdAll['deg_C']

# 'Humidity': absolute humidity scaled by 100 and divided by relative humidity.
pdAll['Humidity'] = (pdAll['absolute_humidity'] * 100) / relHum

# 'Dew_Point': Magnus-type approximation 243.12 * g / (17.62 - g), where
# g = ln(RH / 100) + 17.62 * T / (243.12 + T).
gamma = np.log(relHum * 0.01) + (17.62 * tempC) / (243.12 + tempC)
pdAll['Dew_Point'] = 243.12 * gamma / (17.62 - gamma)
pdAll.head()

In [None]:
# Columns used as model inputs: raw readings, derived features, calendar features.
stdCol = [
    'deg_C', 'relative_humidity', 'absolute_humidity',
    'sensor_1', 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5',
    'Humidity', 'Dew_Point',
    'year', 'month', 'week', 'day', 'dayofweek', 'time', 'hour',
]
# The three prediction targets.
tgtCol = [
    'target_carbon_monoxide',
    'target_benzene',
    'target_nitrogen_oxides',
]
# Feature-only view of the combined (train + test) frame, in stdCol order.
pdStd = pdAll.reindex(columns=stdCol)
pdStd

# EDA

### Sensor Trend charts

In [None]:
sns.set()
# Weekly mean of sensor_3 and sensor_4, resampled on the timestamp.
# NOTE(review): the series comes from pdAll (train + test rows), although the
# plot label below says "(train)".
weeklyMean = (
    pdAll[['date_time', 'sensor_3', 'sensor_4']]
    .set_index('date_time')
    .resample('W')
    .mean()
)
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(15, 8))
# One line per column of weeklyMean, drawn on the shared axes.
weeklyMean.plot(ax=ax, label='by week(train)', alpha=1)
ax.set_xlabel('by week', fontsize=10)
ax.set_ylabel('sensors', fontsize=10)
ax.set_title('sensor_3 sensor_4 weekly trend', fontsize=12);

In [None]:
# Weekly mean of sensor_1, sensor_2 and sensor_5, resampled on the timestamp.
# NOTE(review): the series comes from pdAll (train + test rows), although the
# plot label below says "(train)".
weeklyMean = (
    pdAll[['date_time', 'sensor_1', 'sensor_2', 'sensor_5']]
    .set_index('date_time')
    .resample('W')
    .mean()
)
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(15, 8))
# One line per column of weeklyMean, drawn on the shared axes.
weeklyMean.plot(ax=ax, label='by week(train)', alpha=1)
ax.set_xlabel('by week', fontsize=10)
ax.set_ylabel('sensors', fontsize=10)
ax.set_title('sensor_1 sensor_2 sensor_5 weekly trend', fontsize=12);

### Histogram

In [None]:
# Histogram (with KDE overlay) for every continuous feature, laid out two per row.
stdCol2 = ['deg_C', 'relative_humidity', 'absolute_humidity',
           'sensor_1', 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5',
           'Humidity', 'Dew_Point']

# Use dedicated names for the grid size: `rows` already holds the row count of
# pdAll from an earlier cell and must not be overwritten here.
# Ceiling division keeps the last column when the list has an odd length.
nGridCols = 2
nGridRows = (len(stdCol2) + nGridCols - 1) // nGridCols
fig, axe = plt.subplots(nrows=nGridRows, ncols=nGridCols, figsize=(15, 20),
                        sharex=False, sharey=False, squeeze=False)

# Iterate over stdCol2 itself (the list this cell defines), filling the grid row by row.
flatAxes = axe.ravel()
for histAx, colName in zip(flatAxes, stdCol2):
    sns.histplot(pdStd[colName], kde=True, ax=histAx)

# Hide any panel left over when the number of columns does not fill the grid.
for spareAx in flatAxes[len(stdCol2):]:
    spareAx.set_visible(False)

In [None]:
# Joint distribution of temperature and dew point with a regression fit.
# x / y are passed by keyword: positional use is deprecated in seaborn 0.11 and
# rejected by later releases.
sns.jointplot(x="deg_C", y="Dew_Point", data=pdStd, kind="reg");

### Heatmap

In [None]:
# Pairwise correlation of all feature columns, drawn as a square heatmap on a
# diverging palette centred at zero and clipped to [-1, 1].
corr = pdStd.corr()
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
divergingCmap = sns.diverging_palette(20, 220, n=200)
ax = sns.heatmap(corr, vmin=-1, vmax=1, center=0, cmap=divergingCmap, square=True)
# Tilt the x tick labels so the column names stay readable.
tickLabels = ax.get_xticklabels()
ax.set_xticklabels(tickLabels, rotation=45, horizontalalignment='right');

# Training

## Train test split

In [None]:
# Cut the feature frame back into its train and test parts by row position
# (train rows were stacked first in pdAll). This rebinds pdTrain / pdTest, which
# previously held the raw CSV contents.
# NOTE(review): the train slice stops at trainSize - 1, so the last training row
# is left out of pdTrain — confirm this is intentional.
pdTrain = pdStd.iloc[:trainSize - 1]
pdTest = pdStd.iloc[trainSize:]
pdTrain

In [None]:
def _with_log_target(targetName):
    """Return a deep copy of pdTrain with log1p(target) added as column `targetName`.

    The target values are read from the first trainSize - 1 rows of pdAll, the
    same row range pdTrain was sliced from, and attached by position.
    """
    frame = pdTrain.copy(deep=True)
    frame[targetName] = np.log1p(pdAll[: trainSize-1][targetName].tolist())
    return frame


# One training frame per target: features plus that target on the log1p scale.
pdTrain_cab = _with_log_target('target_carbon_monoxide')
pdTrain_ben = _with_log_target('target_benzene')
pdTrain_nio = _with_log_target('target_nitrogen_oxides')

## Train - Carbon monoxide

In [None]:
!pip install pycaret

In [None]:
# Import the PyCaret regression helpers used throughout the notebook by name
# rather than with `import *`, so it is clear where each function comes from.
from pycaret.regression import (
    blend_models,
    compare_models,
    finalize_model,
    load_model,
    plot_model,
    predict_model,
    save_model,
    setup,
)

# Carbon monoxide experiment: features are normalised and cross-validation
# leaves out one calendar week (the 'week' column) per fold.
# session_id fixes the random seed so repeated runs give the same results.
exp_reg = setup(
    data=pdTrain_cab,
    target='target_carbon_monoxide',
    normalize=True,
    fold_strategy=LeaveOneGroupOut(),
    fold_groups=pdTrain_cab['week'],
    silent=True,
    session_id=42,
)

In [None]:

%%time
top5_carbin = compare_models(n_select = 5)

In [None]:
%%time
blender_et_lightgbm_boosting = finalize_model(blend_models(estimator_list = top5_carbin, optimize ='RMSLE'))

In [None]:
# Predict with the finalized blend without passing data, then inspect the
# structure of the returned frame.
# NOTE(review): the model was already finalized, so scores reported here are
# presumably optimistic — confirm against the PyCaret documentation.
holdoutPred = predict_model(estimator=blender_et_lightgbm_boosting)
holdoutPred.info()

In [None]:
# Default diagnostic plot for the carbon monoxide blend.
plot_model(estimator=blender_et_lightgbm_boosting)

In [None]:
# Prediction-error plot for the carbon monoxide blend.
plot_model(estimator=blender_et_lightgbm_boosting, plot='error')

In [None]:
# Persist the carbon monoxide blend; the scoring section reloads it by this name.
save_model(model=blender_et_lightgbm_boosting, model_name='blender_carbin')

## Train - Benzene

In [None]:
%%time
from pycaret.regression import *
exp_reg_ben = setup(data = pdTrain_ben, target = 'target_benzene', normalize = True,
                fold_strategy= LeaveOneGroupOut(), fold_groups = pdTrain_ben['week'],  silent = True )
print('start comparing models')
top5_ben = compare_models(n_select = 5)
print('start blending models')
blender_boosting_ben = finalize_model( blend_models(estimator_list = top5_ben, optimize ='RMSLE'))

In [None]:
# Predict with the finalized benzene blend without passing data; the returned
# frame is the cell output.
predict_model(estimator=blender_boosting_ben)

In [None]:
# Persist the benzene blend; the scoring section reloads it by this name.
save_model(model=blender_boosting_ben, model_name='blender_benzene')

## Train - Nitrogen oxides

In [None]:
%%time
from pycaret.regression import *
exp_reg_nig = setup(data = pdTrain_nio, target = 'target_nitrogen_oxides', normalize = True,
                fold_strategy= LeaveOneGroupOut(), fold_groups = pdTrain_nio['week'],  silent = True )
print('start comparing models')
top5_nig = compare_models(n_select = 5)
print('start blending models')
blender_boosting_nig = finalize_model(blend_models(estimator_list = top5_nig, optimize='RMSLE'))
print('start saving model')
save_model(blender_boosting_nig,'blender_nitrogen')

# Scoring

In [None]:
# Reload the three saved blends by the names they were stored under.
blender_carbin = load_model(model_name='blender_carbin')
blender_benzene = load_model(model_name='blender_benzene')
blender_nitrogen = load_model(model_name='blender_nitrogen')

# Score the test features with each model; the submission cell reads the
# 'Label' column of these frames.
predict_carbin = predict_model(estimator=blender_carbin, data=pdTest)
predict_benzene = predict_model(estimator=blender_benzene, data=pdTest)
predict_nitrogen = predict_model(estimator=blender_nitrogen, data=pdTest)

In [None]:
# Assemble the submission frame. pdSub is not created anywhere earlier in the
# notebook, so build it here from the timestamps of the test rows (the rows of
# pdAll after the first trainSize, matching how pdTest was sliced).
# NOTE(review): assumes the expected submission layout is date_time followed by
# the three target columns — confirm against the competition's sample submission.
pdSub = pd.DataFrame({'date_time': pdAll['date_time'].iloc[trainSize:].to_numpy()})

# The models were fitted on log1p(target); expm1 is its exact inverse.
# Values are attached by position (.to_numpy()) so row labels cannot misalign.
pdSub['target_carbon_monoxide'] = np.expm1(predict_carbin['Label'].to_numpy())
pdSub['target_benzene'] = np.expm1(predict_benzene['Label'].to_numpy())
pdSub['target_nitrogen_oxides'] = np.expm1(predict_nitrogen['Label'].to_numpy())
pdSub.to_csv('submission4.csv', index=False)