# Tabular Playground Series - Jul 2021
Continued from [last time](https://www.kaggle.com/astashiro/tps-jul2021-05divide-and-predict).

## Remove unwanted data and features

Removed sensor_3 and sensor_4 from the training features for carbon monoxide prediction.  
Removed sensor_1 from the training features for nitrogen oxides prediction.

In [None]:
!pip install pycaret

In [None]:
!pip install shap

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from pycaret.regression import setup, blend_models, create_model, finalize_model, plot_model, predict_model, interpret_model
import shap
from fbprophet import Prophet

In [None]:
# Load the competition's train and test tables (paths are relative to the
# notebook's working directory). date_time is parsed later, after the two
# frames have been concatenated.
df_train = pd.read_csv('../input/tabular-playground-series-jul-2021/train.csv')
df_test = pd.read_csv('../input/tabular-playground-series-jul-2021/test.csv')

In [None]:
# Tag each frame so the rows can be separated again once the shared
# calendar features have been derived on the combined table.
df_train['IsTrain'] = 1
df_test['IsTrain'] = 0
df = pd.concat([df_train, df_test], axis=0, sort=False)

# Parse the timestamps once and reuse the datetime accessor below.
df['date_time'] = pd.to_datetime(df['date_time'])
stamp = df['date_time'].dt

df['day_of_week'] = stamp.dayofweek
df['hour'] = stamp.hour
# 1 for hours 8..20 inclusive, 0 otherwise.
df['working_hours'] = df['hour'].isin(np.arange(8, 21, 1)).astype("int")

# Season code by month: 1 = Mar-May, 2 = Jun-Aug, 3 = Sep-Nov, 4 = Dec-Feb.
month = stamp.month
season_masks = (
    (1, (month >= 3) & (month <= 5)),
    (2, (month >= 6) & (month <= 8)),
    (3, (month >= 9) & (month <= 11)),
    (4, (month == 12) | (month <= 2)),
)
for season_code, in_season in season_masks:
    df.loc[in_season, 'season'] = season_code

# Split back into train/test; the test rows never had targets, so the
# (empty) target columns are dropped along with the helper flag.
train = df.query('IsTrain == 1').drop(columns=['IsTrain'])
test = df.query('IsTrain == 0').drop(
    columns=['IsTrain', 'target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']
)

In [None]:
train.head()

### Why divide?

Look at the stretch shown below, where the thermometer, hygrometer, and sensors are not working.
Benzene follows the sensors down to a value close to zero, but carbon monoxide and nitrogen oxides keep spiking independently of every sensor. Therefore, I thought it would be better to predict carbon monoxide and nitrogen oxides separately for the periods where the sensors are off.

In [None]:
# Take rows 6600..6899 (by position) of the combined frame and draw one
# time-series figure per column of interest.
sel_train = df.iloc[6600:6900].copy()
cols = ['target_carbon_monoxide','target_benzene','target_nitrogen_oxides','deg_C','absolute_humidity', 'sensor_2']
for col in cols:
    fig, ax = plt.subplots(figsize=(16, 4))
    ax.plot(sel_train['date_time'], sel_train[col])
    ax.set_ylabel(col)
    plt.show()

Check the correlation by excluding the area where the sensor is not working.

In [None]:
# Targets plus the five raw sensor readings, restricted to rows whose
# absolute_humidity is at least 0.24.
corr_columns = [
    'target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides',
    'sensor_1', 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5',
]
sel_train2 = train.query('absolute_humidity >= 0.24')[corr_columns]

# Annotated correlation matrix on a square 10x10 canvas.
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(sel_train2.corr(), vmax=1, square=True, annot=True, cmap='RdBu', ax=ax)
plt.show()

In [None]:
sns.pairplot(sel_train2)

### Predict with Pycaret

In [None]:
def do_pycaret(target, train, test, categorical_features, session_id=None):
    """Fit a blended pycaret regressor for one target and predict on ``test``.

    Steps, in order:

    1. ``setup`` is called on ``train`` with ``target`` as the label column.
    2. Five models are created by pycaret ID: ``catboost``, ``et``,
       ``lightgbm``, ``gbr`` and ``rf``.
    3. They are combined with ``blend_models(optimize='RMSLE')``.
    4. ``predict_model`` is called on the blend without ``data`` (pycaret
       documents this as scoring the hold-out split); only its displayed
       output is used.
    5. The blend is passed through ``finalize_model`` and used to predict
       on ``test``.

    Args:
        target: Name of the label column in ``train``.
        train: Training frame containing the feature columns and ``target``.
        test: Frame to predict on.
        categorical_features: List of column names to pass to ``setup`` as
            categorical, or ``None``.
        session_id: Optional seed forwarded to ``setup`` so a run can be
            reproduced. ``None`` (the default) leaves seeding to pycaret,
            as before.

    Returns:
        A 4-tuple ``(pred, catboost, lightgbm, rf)``: the output of
        ``predict_model`` on ``test`` (callers read its ``Label`` column)
        followed by three of the individually created models, returned so
        they can be inspected with ``interpret_model``.
    """
    setup(
        data=train,
        target=target,
        categorical_features=categorical_features,
        silent=True,
        session_id=session_id,
    )

    # Create the base models in a fixed order; dicts preserve insertion
    # order, so the blend receives them exactly in this sequence.
    model_ids = ("catboost", "et", "lightgbm", "gbr", "rf")
    models = {model_id: create_model(model_id) for model_id in model_ids}

    blend = blend_models(estimator_list=list(models.values()), optimize='RMSLE')

    # Return value intentionally discarded: this call is kept only for the
    # metrics it displays in the notebook.
    predict_model(blend)

    final = finalize_model(blend)
    pred = predict_model(final, data=test)
    return pred, models["catboost"], models["lightgbm"], models["rf"]

### Prediction when the sensor is on
#### Carbon monoxide
I use sensors 1, 2, and 5, which are highly correlated with the target.

In [None]:
# Sensor-on training set for carbon monoxide: weather readings, sensors
# 1/2/5, two calendar features and the target, limited to rows with
# absolute_humidity >= 0.24.
co_on_columns = [
    'deg_C', 'relative_humidity', 'absolute_humidity',
    'sensor_1', 'sensor_2', 'sensor_5',
    'season', 'working_hours',
    'target_carbon_monoxide',
]
train1 = train.query('absolute_humidity >= 0.24')[co_on_columns]
train1

In [None]:
pred1, catboost1, lightgbm1, rf1 = do_pycaret('target_carbon_monoxide', train1, test, ['season'])
pred1

In [None]:
interpret_model(catboost1)

In [None]:
interpret_model(lightgbm1)

In [None]:
interpret_model(rf1)

#### Benzene
I use only sensor 2, which has a high correlation.

In [None]:
# Benzene training set: sensor_2 is the only feature; no row filter is applied.
benzene_columns = ['sensor_2', 'target_benzene']
train2 = train[benzene_columns]
train2.head()

In [None]:
pred2, catboost2, lightgbm2, rf2 = do_pycaret('target_benzene', train2, test, None)
pred2

In [None]:
interpret_model(catboost2)

In [None]:
interpret_model(lightgbm2)

In [None]:
interpret_model(rf2)

#### Nitrogen oxides
I tried excluding the sensors from the feature set one by one, and the score was better when sensor_1 was excluded.

In [None]:
# Sensor-on training set for nitrogen oxides: sensor_1 is left out, and
# rows are limited to absolute_humidity >= 0.24 with season code 3 or 4
# (Sep-Feb under the season encoding used in this notebook).
nox_on_columns = [
    'deg_C', 'relative_humidity', 'absolute_humidity',
    'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5',
    'working_hours',
    'target_nitrogen_oxides',
]
train3 = train.query('absolute_humidity >= 0.24 & season >= 3')[nox_on_columns]
train3

In [None]:
pred3, catboost3, lightgbm3, rf3 = do_pycaret('target_nitrogen_oxides', train3, test, None)
pred3

In [None]:
interpret_model(catboost3)

In [None]:
interpret_model(lightgbm3)

In [None]:
interpret_model(rf3)

In [None]:
# Sensor-on submission: start from the sample file and overwrite the three
# target columns with the "Label" predictions (aligned on the row index).
sub1 = pd.read_csv('../input/tabular-playground-series-jul-2021/sample_submission.csv').assign(
    target_carbon_monoxide=pred1['Label'],
    target_benzene=pred2['Label'],
    target_nitrogen_oxides=pred3['Label'],
)
sub1

### Prediction when the sensor is off
#### Carbon monoxide

In [None]:
# Sensor-off training set for carbon monoxide: calendar features only (no
# weather or sensor readings), limited to season code 3 or 4.
co_off_columns = ['day_of_week', 'hour', 'season', 'working_hours', 'target_carbon_monoxide']
train4 = train.query('season >= 3')[co_off_columns]
train4

In [None]:
pred4, catboost4, lightgbm4, rf4 = do_pycaret('target_carbon_monoxide', train4, test, ['season'])
pred4

In [None]:
interpret_model(catboost4)

In [None]:
interpret_model(lightgbm4)

In [None]:
interpret_model(rf4)

#### Nitrogen oxides

In [None]:
# Sensor-off training set for nitrogen oxides: calendar features only (no
# weather or sensor readings), limited to season code 3 or 4.
nox_off_columns = ['day_of_week', 'hour', 'season', 'working_hours', 'target_nitrogen_oxides']
train5 = train.query('season >= 3')[nox_off_columns]
train5

In [None]:
pred5, catboost5, lightgbm5, rf5 = do_pycaret('target_nitrogen_oxides', train5, test, ['season'])
pred5

In [None]:
interpret_model(catboost5)

In [None]:
interpret_model(lightgbm5)

In [None]:
interpret_model(rf5)

In [None]:
# Sensor-off submission: carbon monoxide and nitrogen oxides come from the
# calendar-only models (pred4, pred5); benzene reuses pred2, the same
# prediction used for the sensor-on submission.
sub2 = pd.read_csv('../input/tabular-playground-series-jul-2021/sample_submission.csv').assign(
    target_carbon_monoxide=pred4['Label'],
    target_benzene=pred2['Label'],
    target_nitrogen_oxides=pred5['Label'],
)
sub2

### Merge predictions

In [None]:
# Stitch the final submission together from consecutive date_time windows,
# each taken from either the sensor-on (sub1) or sensor-off (sub2) frame.
# The pieces are concatenated in the order listed.
# NOTE(review): date_time is presumably still a string column here (the
# sample submission is read without date parsing), so these are string
# comparisons -- confirm.
# NOTE(review): the 2011-01-28 17:00 .. 2011-01-29 01:00 window is carved
# out on its own yet still drawn from sub1, like its neighbours -- confirm
# whether sub2 was intended there.
segments = [
    (sub1, "date_time < '2011-01-02 21:00:00'"),
    (sub2, "date_time >= '2011-01-02 21:00:00' & date_time <= '2011-01-05 00:00:00'"),
    (sub1, "date_time > '2011-01-05 00:00:00' & date_time < '2011-01-28 17:00:00'"),
    (sub1, "date_time >= '2011-01-28 17:00:00' & date_time <= '2011-01-29 01:00:00'"),
    (sub1, "date_time > '2011-01-29 01:00:00' & date_time < '2011-02-08 17:00:00'"),
    (sub2, "date_time >= '2011-02-08 17:00:00' & date_time <= '2011-02-11 20:00:00'"),
    (sub1, "date_time > '2011-02-11 20:00:00'"),
]

submission = pd.concat(
    [frame.query(window) for frame, window in segments],
    sort=False,
    axis=0,
)

In [None]:
submission

In [None]:
submission.to_csv('pycaret_prophet_submission.csv',index=False)