# Tabular Playground
## Data loading and preprocessing

Following the same steps as in the other notebook, I will standardize the data.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

# Names of the three pollutant columns that are to be predicted.
target_names = ['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']

# Training data: parse `date_time` as datetimes and use it as the row index.
train_path = '../input/tabular-playground-series-jul-2021/train.csv'
train = pd.read_csv(train_path, parse_dates=["date_time"]).set_index('date_time')

# Separate the targets from the features; `train` keeps only the feature columns.
target = train[target_names]
train = train.drop(columns=target_names)

# Test data: read and indexed the same way as the training data.
test_path = '../input/tabular-playground-series-jul-2021/test.csv'
test = pd.read_csv(test_path, parse_dates=["date_time"]).set_index('date_time')

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation. The split is seeded (like every model
# below) so that the printed scores are reproducible on "Restart & Run All".
# NOTE(review): the rows are indexed by timestamp and shuffled here, so validation
# rows are interleaved in time with training rows — confirm a random split is intended.
X_train, X_val, y_train, y_val = train_test_split(train, target, test_size=0.2, random_state=0)

In [None]:
from sklearn.preprocessing import PowerTransformer

# Fit the Box-Cox power transform on the training features only, then reuse the
# fitted transform on the validation features.
pt = PowerTransformer(method='box-cox')
train_values = pt.fit_transform(X_train)
val_values = pt.transform(X_val)

# Re-wrap the transformed arrays so they keep the original column names and index.
X_train_sc = pd.DataFrame(data=train_values, index=X_train.index, columns=X_train.columns)
X_val_sc = pd.DataFrame(data=val_values, index=X_val.index, columns=X_val.columns)

# One histogram per transformed feature; the subplot titles are enlarged to stay
# readable on the very large canvas.
fig = X_train_sc.hist(figsize=(100, 100), bins=30)
for axis in fig.ravel():
    axis.title.set_size(80)
plt.show()

## Model selection

As always, we will try linear models as a baseline, then random forest, XGBoost, LightGBM, CatBoost and finally an ensemble.

In [None]:
from sklearn.metrics import mean_squared_log_error

def score_model(model, tr, y_train, val, y_val, fitted=False):
    """Score a single-output model on a multi-target problem, one target at a time.

    For every target column the model is fitted on ``tr`` against that column of
    ``y_train`` and then used to predict ``val``. The per-target predictions are
    stacked column-wise and compared with ``y_val`` using the mean squared log error.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is refitted in place
        for each target, so after the call it holds the fit for the last target.
    tr : DataFrame
        Training features.
    y_train : DataFrame
        Training targets, one column per target (accessed with ``.iloc[:, i]``).
    val : DataFrame
        Validation features.
    y_val : DataFrame or 2-D array
        Validation targets; its number of columns determines how many targets
        are scored.
    fitted : bool, default False
        When True the fit step is skipped and the model's current state is used
        to predict ``val`` for every target column.

    Returns
    -------
    The value returned by ``mean_squared_log_error(y_val, predictions)``.
    """
    # Derive the number of targets from the data instead of hard-coding 3.
    n_targets = np.shape(y_val)[1]
    preds = []
    for i in range(n_targets):
        if not fitted:
            model.fit(tr, y_train.iloc[:, i])
        preds.append(model.predict(val))
    # preds is (n_targets, n_rows); transpose to match y_val's (n_rows, n_targets).
    return mean_squared_log_error(y_val, np.array(preds).T)

### Linear model

In [None]:
from sklearn.linear_model import GammaRegressor

# Baseline: a Gamma regressor scored first on the Box-Cox-scaled features and then
# on the raw features (score_model refits the same instance for each call).
model = GammaRegressor()
scaled_score = score_model(model, X_train_sc, y_train, X_val_sc, y_val)
raw_score = score_model(model, X_train, y_train, X_val, y_val)
print('Baseline error (scaled):', "{0:.4f}".format(scaled_score),
      '\nWithout scaling:', "{0:.4f}".format(raw_score))

## Random forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with 100 trees, scored on scaled and then on raw features.
model = RandomForestRegressor(n_estimators=100, random_state=0)
scaled_score = score_model(model, X_train_sc, y_train, X_val_sc, y_val)
raw_score = score_model(model, X_train, y_train, X_val, y_val)
print('Random Forest (scaled):', "{0:.4f}".format(scaled_score),
      '\nWithout scaling:', "{0:.4f}".format(raw_score))

## XGBoost

In [None]:
from xgboost import XGBRegressor

# XGBoost with the gamma regression objective, scored on scaled and then on raw features.
model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=0, objective='reg:gamma')
scaled_score = score_model(model, X_train_sc, y_train, X_val_sc, y_val)
raw_score = score_model(model, X_train, y_train, X_val, y_val)
print('XGBoost (scaled):', "{0:.4f}".format(scaled_score),
      '\nWithout scaling:', "{0:.4f}".format(raw_score))

## LightGBM

In [None]:
from lightgbm import LGBMRegressor

# LightGBM with the gamma objective (more trees, smaller learning rate than the
# other boosters), scored on scaled and then on raw features.
model = LGBMRegressor(n_estimators=500, learning_rate=0.01, random_state=0, objective='gamma')
scaled_score = score_model(model, X_train_sc, y_train, X_val_sc, y_val)
raw_score = score_model(model, X_train, y_train, X_val, y_val)
print('LightGBM (scaled):', "{0:.4f}".format(scaled_score),
      '\nWithout scaling:', "{0:.4f}".format(raw_score))

## CatBoost

In [None]:
from catboost import CatBoostRegressor

# CatBoost with a Tweedie objective at variance power 1.5. Note this is not a pure
# Gamma regression: in the Tweedie family, power 1 is Poisson and power 2 is Gamma,
# so 1.5 sits between the two.
model = CatBoostRegressor(
    n_estimators=100,
    learning_rate=0.1,
    random_state=0,
    verbose=0,
    objective='Tweedie:variance_power=1.5',
)
scaled_score = score_model(model, X_train_sc, y_train, X_val_sc, y_val)
raw_score = score_model(model, X_train, y_train, X_val, y_val)
print('CatBoost (scaled):', "{0:.4f}".format(scaled_score),
      '\nWithout scaling:', "{0:.4f}".format(raw_score))

## Ensemble

In [None]:
from random import sample

# Randomly partition the training rows into four disjoint subsets, one per base
# model. The shuffle is seeded so the printed ensemble error is reproducible, and
# np.array_split tolerates a row count that is not a multiple of four (the previous
# reshape((4, -1)) raised a ValueError in that case).
shuffled_positions = np.random.RandomState(0).permutation(len(X_train))
sampling = [X_train.index[chunk] for chunk in np.array_split(shuffled_positions, 4)]

# Features and targets share the same index, so the same labels select both.
X_train0 = X_train[X_train.index.isin(sampling[0])]
y_train0 = y_train[y_train.index.isin(sampling[0])]

X_train1 = X_train[X_train.index.isin(sampling[1])]
y_train1 = y_train[y_train.index.isin(sampling[1])]

X_train2 = X_train[X_train.index.isin(sampling[2])]
y_train2 = y_train[y_train.index.isin(sampling[2])]

X_train3 = X_train[X_train.index.isin(sampling[3])]
y_train3 = y_train[y_train.index.isin(sampling[3])]

# For each of the three targets, fit one base model per training subset and gather
# the four sets of validation predictions side by side.
per_target_preds = []
for i in range(3):
    members = [
        ('Catboost',
         CatBoostRegressor(n_estimators=100, learning_rate=0.1, random_state=0, verbose=0,
                           objective='Tweedie:variance_power=1.5'),
         X_train0, y_train0),
        ('RandomForest',
         RandomForestRegressor(n_estimators=100, random_state=0),
         X_train1, y_train1),
        ('XGBoost',
         XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=0, objective='reg:gamma'),
         X_train2, y_train2),
        ('LGBM',
         LGBMRegressor(n_estimators=500, learning_rate=0.01, random_state=0, objective='gamma'),
         X_train3, y_train3),
    ]
    for _, estimator, X_part, y_part in members:
        estimator.fit(X_part, y_part.iloc[:, i])

    per_target_preds.append(
        pd.DataFrame({label: estimator.predict(X_val) for label, estimator, _, _ in members})
    )

# The stacked array is (targets, validation rows, models): average over the models,
# then transpose so that each target becomes a column.
model_mean = np.array(per_target_preds).mean(axis=2)
preds = pd.DataFrame(model_mean.T, columns=y_val.columns, index=y_val.index)
print('Ensemble error:',
     mean_squared_log_error(y_val, preds))

In [None]:
from random import sample

# Randomly partition the training rows into four disjoint subsets, one per base
# model, this time selecting from the scaled features. The shuffle is seeded so the
# printed ensemble error is reproducible, and np.array_split tolerates a row count
# that is not a multiple of four (the previous reshape((4, -1)) raised a ValueError
# in that case).
shuffled_positions = np.random.RandomState(0).permutation(len(X_train))
sampling = [X_train.index[chunk] for chunk in np.array_split(shuffled_positions, 4)]

# X_train_sc was built with X_train's index, so the same labels select features and targets.
X_train_sc0 = X_train_sc[X_train_sc.index.isin(sampling[0])]
y_train0 = y_train[y_train.index.isin(sampling[0])]

X_train_sc1 = X_train_sc[X_train_sc.index.isin(sampling[1])]
y_train1 = y_train[y_train.index.isin(sampling[1])]

X_train_sc2 = X_train_sc[X_train_sc.index.isin(sampling[2])]
y_train2 = y_train[y_train.index.isin(sampling[2])]

X_train_sc3 = X_train_sc[X_train_sc.index.isin(sampling[3])]
y_train3 = y_train[y_train.index.isin(sampling[3])]

# Same split-ensemble as for the raw features, but trained and evaluated on the
# Box-Cox-scaled features: one base model per training subset, for each target.
per_target_preds = []
for i in range(3):
    members = [
        ('Catboost',
         CatBoostRegressor(n_estimators=100, learning_rate=0.1, random_state=0, verbose=0,
                           objective='Tweedie:variance_power=1.5'),
         X_train_sc0, y_train0),
        ('RandomForest',
         RandomForestRegressor(n_estimators=100, random_state=0),
         X_train_sc1, y_train1),
        ('XGBoost',
         XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=0, objective='reg:gamma'),
         X_train_sc2, y_train2),
        ('LGBM',
         LGBMRegressor(n_estimators=500, learning_rate=0.01, random_state=0, objective='gamma'),
         X_train_sc3, y_train3),
    ]
    for _, estimator, X_part, y_part in members:
        estimator.fit(X_part, y_part.iloc[:, i])

    per_target_preds.append(
        pd.DataFrame({label: estimator.predict(X_val_sc) for label, estimator, _, _ in members})
    )

# The stacked array is (targets, validation rows, models): average over the models,
# then transpose so that each target becomes a column.
model_mean = np.array(per_target_preds).mean(axis=2)
preds = pd.DataFrame(model_mean.T, columns=y_val.columns, index=y_val.index)
print('Ensemble error (scaled):',
     mean_squared_log_error(y_val, preds))

In [None]:
from sklearn.ensemble import StackingRegressor

# Stack the four tree-based regressors under a Gamma regressor as final estimator.
base_learners = [
    ('RandomForest', RandomForestRegressor(n_estimators=100, random_state=0)),
    ('XGBoost', XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=0, objective='reg:gamma')),
    ('LGBM', LGBMRegressor(n_estimators=500, learning_rate=0.01, random_state=0, objective='gamma')),
    ('CatBoost', CatBoostRegressor(n_estimators=100, learning_rate=0.1, random_state=0, verbose=0,
                                   objective='Tweedie:variance_power=1.5')),
]
reg = StackingRegressor(estimators=base_learners, final_estimator=GammaRegressor())

# Scored on the scaled features first, then on the raw features.
scaled_score = score_model(reg, X_train_sc, y_train, X_val_sc, y_val)
raw_score = score_model(reg, X_train, y_train, X_val, y_val)
print('Stacking (scaled):', "{0:.4f}".format(scaled_score),
      '\nWithout scaling:', "{0:.4f}".format(raw_score))

In [None]:
from sklearn.ensemble import VotingRegressor

# Voting ensemble over the same four tree-based regressors.
base_learners = [
    ('RandomForest', RandomForestRegressor(n_estimators=100, random_state=0)),
    ('XGBoost', XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=0, objective='reg:gamma')),
    ('LGBM', LGBMRegressor(n_estimators=500, learning_rate=0.01, random_state=0, objective='gamma')),
    ('CatBoost', CatBoostRegressor(n_estimators=100, learning_rate=0.1, random_state=0, verbose=0,
                                   objective='Tweedie:variance_power=1.5')),
]
reg = VotingRegressor(estimators=base_learners)

# Scored on the scaled features first, then on the raw features.
scaled_score = score_model(reg, X_train_sc, y_train, X_val_sc, y_val)
raw_score = score_model(reg, X_train, y_train, X_val, y_val)
print('Voting (scaled):', "{0:.4f}".format(scaled_score),
      '\nWithout scaling:', "{0:.4f}".format(raw_score))

## Prediction

It seems the voting ensemble of the four models performs best of all.

In [None]:
# Final model: the voting ensemble, trained on the full (unscaled) training set.
final_estimators = [
    ('RandomForest', RandomForestRegressor(n_estimators=100, random_state=0)),
    ('XGBoost', XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=0, objective='reg:gamma')),
    ('LGBM', LGBMRegressor(n_estimators=500, learning_rate=0.01, random_state=0, objective='gamma')),
    ('CatBoost', CatBoostRegressor(n_estimators=100, learning_rate=0.1, random_state=0, verbose=0,
                                   objective='Tweedie:variance_power=1.5')),
]
final_model = VotingRegressor(estimators=final_estimators)

# Refit the same ensemble once per target column and keep each test-set prediction,
# in the column order of `target`.
preds = []
for column in target.columns:
    final_model.fit(train, target[column])
    preds.append(final_model.predict(test))

In [None]:
# `preds` holds one prediction array per target; transpose so that targets become
# columns, and label the rows with the test timestamps.
stacked_preds = np.asarray(preds).T
preds = pd.DataFrame(data=stacked_preds, index=test.index, columns=target.columns)

In [None]:
# Overlay the training targets and the test-set predictions, one panel per pollutant.
# Each panel gets a title and a legend so the two series can be told apart.
fig, ax = plt.subplots(3, 1, figsize=(50, 20))
panel_columns = ['target_benzene', 'target_carbon_monoxide', 'target_nitrogen_oxides']
for axis, column in zip(ax, panel_columns):
    target[column].plot(ax=axis, label='train')
    preds[column].plot(ax=axis, label='prediction')
    axis.set_title(column)
    axis.legend()
plt.show()

In [None]:
# Move the `date_time` index back into a regular column and write the submission file.
submission = preds.reset_index()
submission.to_csv('submission.csv', index=False)