# Tabular Playground Series - Jul 2021
Continued from [last time](https://www.kaggle.com/astashiro/tps-jul2021-04pycaret-and-prophet).

## Divide and predict

Use PyCaret to build separate predictions for the periods when the sensors are on and the periods when they are off.

In [None]:
!pip install pycaret

In [None]:
!pip install shap

In [None]:
# Numerical / tabular stack.
import numpy as np
import pandas as pd
# Plotting; the magic renders matplotlib figures inline in the notebook.
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Suppress FutureWarning messages so they do not clutter cell output.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# PyCaret regression helpers used by the modelling cells further down.
from pycaret.regression import setup, blend_models, create_model, finalize_model, plot_model, predict_model, interpret_model
import shap
# NOTE(review): Prophet is imported but not referenced in any cell visible here — confirm it is still needed.
from fbprophet import Prophet

In [None]:
# Read the competition's train and test tables, in that order.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-jul-2021/train.csv',
        '../input/tabular-playground-series-jul-2021/test.csv',
    )
)

In [None]:
# Flag the origin of every row so the two sets can be separated again once the
# shared feature engineering is done. The index is intentionally left as-is so
# that the test rows keep their original labels.
df_train['IsTrain'] = 1
df_test['IsTrain'] = 0
df = pd.concat([df_train, df_test], axis=0, sort=False)

df['date_time'] = pd.to_datetime(df['date_time'])
stamp = df['date_time'].dt

# Calendar features derived from the timestamp.
df['day_of_week'] = stamp.dayofweek
df['hour'] = stamp.hour
# 1 for hours 8 through 20 inclusive, 0 otherwise.
df['working_hours'] = df['hour'].between(8, 20).astype("int")

# Season code by month: 1 = Dec-Feb, 2 = Mar-May, 3 = Jun-Aug, 4 = Sep-Nov.
month = stamp.month
season_masks = (
    (1, (month == 12) | (month <= 2)),
    (2, month.between(3, 5)),
    (3, month.between(6, 8)),
    (4, month.between(9, 11)),
)
for season_code, in_season in season_masks:
    df.loc[in_season, 'season'] = season_code

# Split back into train / test; the test frame carries no target columns.
train = df[df['IsTrain'] == 1].drop(columns=['IsTrain'])
test = df[df['IsTrain'] == 0].drop(
    columns=['IsTrain', 'target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']
)

In [None]:
# Preview the first five engineered training rows.
train.head(n=5)

### Why divide?

The plots below show a stretch where the thermometer, the hygrometer, and the gas sensors are not working.
During that stretch benzene stays close to zero together with the sensor readings, but carbon monoxide and nitrogen oxides keep spiking independently of every sensor. I therefore decided to predict carbon monoxide and nitrogen oxides with separate models for the periods when the sensors are off.

In [None]:
# Positional window (rows 6600-6899 of the combined frame) plotted one series
# per figure: the three targets next to temperature, humidity and sensor_2.
sel_train = df.iloc[6600:6900].copy()
cols = ['target_carbon_monoxide','target_benzene','target_nitrogen_oxides','deg_C','absolute_humidity', 'sensor_2']
for col in cols:
    fig, ax = plt.subplots(figsize=(16, 4))
    ax.plot(sel_train['date_time'], sel_train[col])
    ax.set_ylabel(col)
    plt.show()

### Prediction when the sensor is on
#### Carbon monoxide

In [None]:
# Keep only rows with absolute_humidity >= 0.24 and the columns used to model
# carbon monoxide (weather, sensors, calendar features, target).
co_on_columns = [
    'deg_C', 'relative_humidity', 'absolute_humidity',
    'sensor_1', 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5',
    'season', 'working_hours', 'target_carbon_monoxide',
]
train1 = train.loc[train['absolute_humidity'] >= 0.24, co_on_columns]
train1

In [None]:
# Initialise the PyCaret experiment for carbon monoxide (sensor-on rows).
reg1 = setup(
    data=train1,
    target='target_carbon_monoxide',
    categorical_features=['season'],
    silent=True,
    session_id=1,
)
# Fit the five base learners in a fixed order, then blend them.
catboost1, et1, lightgbm1, gbr1, rf1 = [
    create_model(algo) for algo in ("catboost", "et", "lightgbm", "gbr", "rf")
]
blend1 = blend_models(
    estimator_list=[catboost1, et1, lightgbm1, gbr1, rf1],
    optimize='RMSLE',
)
# Called without `data`, predict_model scores the blend on PyCaret's hold-out split.
predh1 = predict_model(blend1)
predh1

In [None]:
# Finalize the carbon monoxide blend, then predict on the test frame.
final1 = finalize_model(estimator=blend1)
pred1 = predict_model(estimator=final1, data=test)
pred1

In [None]:
# Summary interpretation plot for the CatBoost carbon monoxide model.
interpret_model(catboost1, plot='summary')

#### Benzene

In [None]:
# Benzene is modelled from sensor_2 alone, on every training row.
train2 = train[['sensor_2', 'target_benzene']]
train2.head(n=5)

In [None]:
# Initialise the PyCaret experiment for benzene.
reg2 = setup(
    data=train2,
    target='target_benzene',
    silent=True,
    session_id=2,
)
# Fit the five base learners in a fixed order, then blend them.
catboost2, et2, lightgbm2, gbr2, rf2 = [
    create_model(algo) for algo in ("catboost", "et", "lightgbm", "gbr", "rf")
]
blend2 = blend_models(
    estimator_list=[catboost2, et2, lightgbm2, gbr2, rf2],
    optimize='RMSLE',
)
# Called without `data`, predict_model scores the blend on PyCaret's hold-out split.
predh2 = predict_model(blend2)
predh2

In [None]:
# Finalize the benzene blend, then predict on the test frame.
final2 = finalize_model(estimator=blend2)
pred2 = predict_model(estimator=final2, data=test)
pred2

In [None]:
# Summary interpretation plot for the CatBoost benzene model.
interpret_model(catboost2, plot='summary')

#### Nitrogen oxides

In [None]:
# Keep only rows with absolute_humidity >= 0.24 and the columns used to model
# nitrogen oxides (weather, sensors, calendar features, target).
nox_on_columns = [
    'deg_C', 'relative_humidity', 'absolute_humidity',
    'sensor_1', 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5',
    'season', 'working_hours', 'target_nitrogen_oxides',
]
train3 = train.loc[train['absolute_humidity'] >= 0.24, nox_on_columns]
train3

In [None]:
# Initialise the PyCaret experiment for nitrogen oxides (sensor-on rows).
reg3 = setup(
    data=train3,
    target='target_nitrogen_oxides',
    categorical_features=['season'],
    silent=True,
    session_id=3,
)
# Fit the five base learners in a fixed order, then blend them.
catboost3, et3, lightgbm3, gbr3, rf3 = [
    create_model(algo) for algo in ("catboost", "et", "lightgbm", "gbr", "rf")
]
blend3 = blend_models(
    estimator_list=[catboost3, et3, lightgbm3, gbr3, rf3],
    optimize='RMSLE',
)
# Called without `data`, predict_model scores the blend on PyCaret's hold-out split.
predh3 = predict_model(blend3)
predh3

In [None]:
# Finalize the nitrogen oxides blend, then predict on the test frame.
final3 = finalize_model(estimator=blend3)
pred3 = predict_model(estimator=final3, data=test)
pred3

In [None]:
# Summary interpretation plot for the CatBoost nitrogen oxides model.
interpret_model(catboost3, plot='summary')

In [None]:
# Sensor-on submission: fill the sample file with the three sensor-based
# predictions. Values are matched to rows by index label.
sub1 = pd.read_csv('../input/tabular-playground-series-jul-2021/sample_submission.csv').assign(
    target_carbon_monoxide=pred1['Label'],
    target_benzene=pred2['Label'],
    target_nitrogen_oxides=pred3['Label'],
)
sub1

### Prediction when the sensor is off
#### Carbon monoxide

In [None]:
# Calendar-only feature set for carbon monoxide (no sensor or weather columns).
co_off_columns = ['day_of_week', 'hour', 'season', 'working_hours', 'target_carbon_monoxide']
train4 = train[co_off_columns]
train4

In [None]:
# Initialise the PyCaret experiment for carbon monoxide (calendar features only).
reg4 = setup(
    data=train4,
    target='target_carbon_monoxide',
    categorical_features=['season'],
    silent=True,
    session_id=4,
)
# Fit the five base learners in a fixed order, then blend them.
catboost4, et4, lightgbm4, gbr4, rf4 = [
    create_model(algo) for algo in ("catboost", "et", "lightgbm", "gbr", "rf")
]
blend4 = blend_models(
    estimator_list=[catboost4, et4, lightgbm4, gbr4, rf4],
    optimize='RMSLE',
)
# Called without `data`, predict_model scores the blend on PyCaret's hold-out split.
predh4 = predict_model(blend4)
predh4

In [None]:
# Finalize the calendar-only carbon monoxide blend, then predict on the test frame.
final4 = finalize_model(estimator=blend4)
pred4 = predict_model(estimator=final4, data=test)
pred4

In [None]:
# Summary interpretation plot for the calendar-only CatBoost carbon monoxide model.
interpret_model(catboost4, plot='summary')

#### Nitrogen oxides

In [None]:
# Calendar-only feature set for nitrogen oxides (no sensor or weather columns).
nox_off_columns = ['day_of_week', 'hour', 'season', 'working_hours', 'target_nitrogen_oxides']
train5 = train[nox_off_columns]
train5

In [None]:
# Initialise the PyCaret experiment for nitrogen oxides (calendar features only).
reg5 = setup(
    data=train5,
    target='target_nitrogen_oxides',
    categorical_features=['season'],
    silent=True,
    session_id=5,
)
# Fit the five base learners in a fixed order, then blend them.
catboost5, et5, lightgbm5, gbr5, rf5 = [
    create_model(algo) for algo in ("catboost", "et", "lightgbm", "gbr", "rf")
]
blend5 = blend_models(
    estimator_list=[catboost5, et5, lightgbm5, gbr5, rf5],
    optimize='RMSLE',
)
# Called without `data`, predict_model scores the blend on PyCaret's hold-out split.
predh5 = predict_model(blend5)
predh5

In [None]:
# Finalize the calendar-only nitrogen oxides blend, then predict on the test frame.
final5 = finalize_model(estimator=blend5)
pred5 = predict_model(estimator=final5, data=test)
pred5

In [None]:
# Summary interpretation plot for the calendar-only CatBoost nitrogen oxides model.
interpret_model(catboost5, plot='summary')

In [None]:
# Sensor-off submission: calendar-only predictions for carbon monoxide and
# nitrogen oxides; benzene reuses the sensor_2-based prediction (pred2).
# Values are matched to rows by index label.
sub2 = pd.read_csv('../input/tabular-playground-series-jul-2021/sample_submission.csv').assign(
    target_carbon_monoxide=pred4['Label'],
    target_benzene=pred2['Label'],
    target_nitrogen_oxides=pred5['Label'],
)
sub2

### Merge predictions

In [None]:
# Stitch the final submission from alternating time slices: sub1 outside the
# three hard-coded windows, sub2 inside them (window bounds inclusive).
# date_time is compared as text against the literal timestamps below.
on_time = sub1['date_time']
off_time = sub2['date_time']

sub_temp1 = sub1[on_time < '2011-01-02 21:00:00']
sub_temp2 = sub2[off_time.between('2011-01-02 21:00:00', '2011-01-05 00:00:00')]
sub_temp3 = sub1[(on_time > '2011-01-05 00:00:00') & (on_time < '2011-01-28 17:00:00')]
sub_temp4 = sub2[off_time.between('2011-01-28 17:00:00', '2011-01-29 01:00:00')]
sub_temp5 = sub1[(on_time > '2011-01-29 01:00:00') & (on_time < '2011-02-08 17:00:00')]
sub_temp6 = sub2[off_time.between('2011-02-08 17:00:00', '2011-02-11 20:00:00')]
sub_temp7 = sub1[on_time > '2011-02-11 20:00:00']

# Concatenate the slices in chronological order.
slices = [sub_temp1, sub_temp2, sub_temp3, sub_temp4, sub_temp5, sub_temp6, sub_temp7]
submission = pd.concat(slices, axis=0, sort=False)

In [None]:
# Display the merged submission frame for a visual check before writing it out.
submission

In [None]:
# Write the merged predictions to disk without the index column.
submission.to_csv(path_or_buf='pycaret_prophet_submission.csv', index=False)