# Tabular Playground Series - Jul 2021
Continued from [last time](https://www.kaggle.com/astashiro/tps-jul2021-06rethink-features).

## LightAutoML

Let's try predicting with LightAutoML, using the useful features that we identified with PyCaret.

The results were almost identical to those from PyCaret. When the features are the same, the results do not seem to change much.

In [None]:
# Install LightAutoML into the notebook environment, upgrading it if already present (-U).
# NOTE(review): the version is not pinned, so results may vary between runs as new releases appear.
!pip install -U lightautoml

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.tasks import Task

In [None]:
# Load the competition train/test CSVs from the Kaggle input directory.
data_dir = '../input/tabular-playground-series-jul-2021'
df_train = pd.read_csv(f'{data_dir}/train.csv')
df_test = pd.read_csv(f'{data_dir}/test.csv')

In [None]:
# Tag each frame with its origin so the two can be stacked, given the same
# engineered features, and then split apart again.
df_train['IsTrain'] = 1
df_test['IsTrain'] = 0
df = pd.concat([df_train, df_test], axis=0, sort=False)

df['date_time'] = pd.to_datetime(df['date_time'])
timestamps = df['date_time'].dt

# Calendar features derived from the timestamp.
df['day_of_week'] = timestamps.dayofweek
df['hour'] = timestamps.hour
# 1 for hours 8..20 inclusive, 0 otherwise.
df['working_hours'] = df['hour'].between(8, 20).astype("int")

# Season code by calendar month: 1 = Mar-May, 2 = Jun-Aug, 3 = Sep-Nov, 4 = Dec-Feb.
# Values are floats so the column keeps a float dtype.
season_by_month = {
    3: 1.0, 4: 1.0, 5: 1.0,
    6: 2.0, 7: 2.0, 8: 2.0,
    9: 3.0, 10: 3.0, 11: 3.0,
    12: 4.0, 1: 4.0, 2: 4.0,
}
df['season'] = timestamps.month.map(season_by_month)

# Split back into train and test; the test rows carry no targets, so the
# target columns are dropped there along with the origin flag.
is_train = df['IsTrain'] == 1
train = df[is_train].drop(columns=['IsTrain'])
test = df[~is_train].drop(
    columns=['IsTrain', 'target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']
)

In [None]:
# Preview the first five training rows, including the engineered features.
train.head(n=5)

### Predict with LightAutoML

In [None]:
def do_lightautoml(target, train, *, timeout=3600, cpu_limit=4, n_folds=5, random_state=42):
    """Fit a LightAutoML tabular regression model and return the fitted object.

    The model is trained with the RMSLE loss and metric, using tuned
    LightGBM/CatBoost models in the first ``use_algos`` group and untuned
    LightGBM/CatBoost models in the second.

    Args:
        target: Name of the target column in ``train``.
        train: Training data containing the feature columns and ``target``.
        timeout: Time budget passed to ``TabularAutoML`` (default 3600).
        cpu_limit: CPU limit passed to ``TabularAutoML``; also used as the
            reader's ``n_jobs`` (default 4).
        n_folds: Number of cross-validation folds for the reader (default 5).
        random_state: Seed passed to the reader (default 42).

    Returns:
        The fitted ``TabularAutoML`` instance. The predictions returned by
        ``fit_predict`` are discarded.
    """
    task = Task('reg', loss='rmsle', metric='rmsle')
    laml = TabularAutoML(
        task=task,
        timeout=timeout,
        cpu_limit=cpu_limit,
        reader_params={'n_jobs': cpu_limit, 'cv': n_folds, 'random_state': random_state},
        general_params={'use_algos': [['lgb_tuned', 'cb_tuned'], ['lgb', 'cb']]},
    )
    laml.fit_predict(train_data=train, roles={'target': target})
    return laml

### Prediction when the sensor is on
#### Carbon monoxide
I use sensors 1, 2, and 5, which are highly correlated with the target.

In [None]:
# Training set for the sensor-on carbon monoxide model: rows with
# absolute_humidity >= 0.24, restricted to the chosen features plus the target.
# NOTE(review): the 0.24 cut-off presumably removes anomalous rows — confirm against the earlier notebook.
co_on_columns = [
    'deg_C', 'relative_humidity', 'absolute_humidity',
    'sensor_1', 'sensor_2', 'sensor_5',
    'season', 'working_hours',
    'target_carbon_monoxide',
]
train1 = train.loc[train['absolute_humidity'] >= 0.24, co_on_columns]
train1

In [None]:
# Fit the sensor-on carbon monoxide model.
laml1 = do_lightautoml(target='target_carbon_monoxide', train=train1)

In [None]:
# Predict carbon monoxide for the test set and flatten the result to 1-D.
pred1 = np.ravel(laml1.predict(test).data)
pred1

#### Benzene
I use only sensor 2, which is highly correlated with the target.

In [None]:
# Training set for the benzene model: all rows, sensor_2 plus the target.
benzene_columns = ['sensor_2', 'target_benzene']
train2 = train[benzene_columns]
train2.head(n=5)

In [None]:
# Fit the benzene model.
laml2 = do_lightautoml(target='target_benzene', train=train2)

In [None]:
# Predict benzene for the test set and flatten the result to 1-D.
pred2 = np.ravel(laml2.predict(test).data)
pred2

#### Nitrogen oxides
I tried excluding the sensors from the feature set one by one, and the score was better when sensor_1 was excluded.

In [None]:
# Training set for the sensor-on nitrogen oxides model: rows with
# absolute_humidity >= 0.24 and season >= 3 (season codes 3 and 4), using
# every sensor except sensor_1 plus the target.
nox_on_columns = [
    'deg_C', 'relative_humidity', 'absolute_humidity',
    'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5',
    'working_hours',
    'target_nitrogen_oxides',
]
nox_on_rows = (train['absolute_humidity'] >= 0.24) & (train['season'] >= 3)
train3 = train.loc[nox_on_rows, nox_on_columns]
train3

In [None]:
# Fit the sensor-on nitrogen oxides model.
laml3 = do_lightautoml(target='target_nitrogen_oxides', train=train3)

In [None]:
# Predict nitrogen oxides for the test set and flatten the result to 1-D.
pred3 = np.ravel(laml3.predict(test).data)
pred3

In [None]:
# Candidate submission built from the sensor-on models (pred1, pred2, pred3).
sub1 = pd.DataFrame(
    dict(
        date_time=test['date_time'],
        target_carbon_monoxide=pred1,
        target_benzene=pred2,
        target_nitrogen_oxides=pred3,
    )
)

sub1.head(n=5)

### Prediction when the sensor is off
#### Carbon monoxide

In [None]:
# Training set for the sensor-off carbon monoxide model: rows with
# season >= 3, calendar features only (no sensor readings) plus the target.
co_off_columns = ['day_of_week', 'hour', 'season', 'working_hours', 'target_carbon_monoxide']
train4 = train.loc[train['season'] >= 3, co_off_columns]
train4

In [None]:
# Fit the sensor-off carbon monoxide model.
laml4 = do_lightautoml(target='target_carbon_monoxide', train=train4)

In [None]:
# Predict carbon monoxide with the sensor-off model and flatten the result to 1-D.
pred4 = np.ravel(laml4.predict(test).data)
pred4

#### Nitrogen oxides

In [None]:
# Training set for the sensor-off nitrogen oxides model: rows with
# season >= 3, calendar features only (no sensor readings) plus the target.
nox_off_columns = ['day_of_week', 'hour', 'season', 'working_hours', 'target_nitrogen_oxides']
train5 = train.loc[train['season'] >= 3, nox_off_columns]
train5

In [None]:
# Fit the sensor-off nitrogen oxides model.
laml5 = do_lightautoml(target='target_nitrogen_oxides', train=train5)

In [None]:
# Predict nitrogen oxides with the sensor-off model and flatten the result to 1-D.
pred5 = np.ravel(laml5.predict(test).data)
pred5

In [None]:
# Candidate submission for the sensor-off case: carbon monoxide and nitrogen
# oxides come from the sensor-off models (pred4, pred5); benzene reuses pred2.
sub2 = pd.DataFrame(
    dict(
        date_time=test['date_time'],
        target_carbon_monoxide=pred4,
        target_benzene=pred2,
        target_nitrogen_oxides=pred5,
    )
)

sub2

### Merge predictions

In [None]:
# Stitch the final submission together from consecutive time windows: sub2
# (sensor-off models) inside the two windows below, sub1 everywhere else.
off1_start = pd.Timestamp('2011-01-02 21:00:00')
off1_end = pd.Timestamp('2011-01-05 00:00:00')
mid_start = pd.Timestamp('2011-01-28 17:00:00')
mid_end = pd.Timestamp('2011-01-29 01:00:00')
off2_start = pd.Timestamp('2011-02-08 17:00:00')
off2_end = pd.Timestamp('2011-02-11 20:00:00')

ts_on = sub1['date_time']
ts_off = sub2['date_time']

sub_temp1 = sub1[ts_on < off1_start]
sub_temp2 = sub2[(ts_off >= off1_start) & (ts_off <= off1_end)]
sub_temp3 = sub1[(ts_on > off1_end) & (ts_on < mid_start)]
# NOTE(review): this window is split out on its own yet still taken from sub1,
# like its neighbours — confirm whether sub2 was intended here.
sub_temp4 = sub1[(ts_on >= mid_start) & (ts_on <= mid_end)]
sub_temp5 = sub1[(ts_on > mid_end) & (ts_on < off2_start)]
sub_temp6 = sub2[(ts_off >= off2_start) & (ts_off <= off2_end)]
sub_temp7 = sub1[ts_on > off2_end]

submission = pd.concat(
    [sub_temp1, sub_temp2, sub_temp3, sub_temp4, sub_temp5, sub_temp6, sub_temp7],
    axis=0,
    sort=False,
)

In [None]:
# Display the merged submission frame for a visual check before writing it out.
submission

In [None]:
# Write the merged predictions to CSV without the DataFrame index.
output_path = 'LightAutoML_submission.csv'
submission.to_csv(output_path, index=False)