In [None]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

This notebook borrows ideas and code from the notebooks listed below.
Thanks to their authors for sharing them.

- https://www.kaggle.com/maksymshkliarevskyi/tps-july-eda-baseline-analysis-xgbregressor
- https://www.kaggle.com/dwin183287/tps-july-2021-eda
- https://www.kaggle.com/tetsuya777/tps-july-first-model-lightgbm-ipynb


For the XGBoost version of this baseline, see the notebook below.
- https://www.kaggle.com/hirazawahiroshi/jul-2021-simple-baseline-xgboost

In [None]:
# Load the competition files: training data, test data and the submission template
train_df = pd.read_csv('../input/tabular-playground-series-jul-2021/train.csv')
test_df = pd.read_csv('../input/tabular-playground-series-jul-2021/test.csv')
submission = pd.read_csv('../input/tabular-playground-series-jul-2021/sample_submission.csv')

# Convert the "date_time" column of both frames to pandas datetimes
# so that it can be handled as time series data
for frame in (train_df, test_df):
    frame['date_time'] = pd.to_datetime(frame['date_time'])


In [None]:
# Print pandas' summary of the training frame (DataFrame.info)
train_df.info()

In [None]:
# Predictor columns used by every model: three weather readings
# followed by the five sensor channels (sensor_1 ... sensor_5).
features_list = ['deg_C', 'relative_humidity', 'absolute_humidity']
features_list += [f'sensor_{channel}' for channel in range(1, 6)]

# Columns to predict; one model is trained per target.
targets_list = [
    'target_carbon_monoxide',
    'target_benzene',
    'target_nitrogen_oxides',
]

In [None]:
# Split into training (80%) and validation (20%) parts.
# shuffle=False: rows are not shuffled before splitting.
# X_* keep every column of train_df; y_* hold only the target columns.
split_parts = train_test_split(
    train_df,
    train_df[targets_list],
    test_size=0.2,
    random_state=123,
    shuffle=False,
)
X_train, X_valid, y_train, y_valid = split_parts

In [None]:
# The basic model (LightGBM)
#
# Each target y is trained in log space as t = np.log1p(y) with the plain
# RMSE objective; RMSE on t is RMSLE on the original scale.


def _log_target_dataset(frame, targets, target_name):
    """Build an lgb.Dataset from the feature columns of `frame`.

    The label is np.log1p of the `target_name` column of `targets`.
    """
    return lgb.Dataset(
        frame[features_list],
        np.log1p(targets.loc[:, target_name]),
    )


# One training / validation Dataset pair per target
train_CO = _log_target_dataset(X_train, y_train, 'target_carbon_monoxide')
valied_CO = _log_target_dataset(X_valid, y_valid, 'target_carbon_monoxide')

train_C6H6 = _log_target_dataset(X_train, y_train, 'target_benzene')
valied_C6H6 = _log_target_dataset(X_valid, y_valid, 'target_benzene')

train_NOx = _log_target_dataset(X_train, y_train, 'target_nitrogen_oxides')
valied_NOx = _log_target_dataset(X_valid, y_valid, 'target_nitrogen_oxides')

# Shared training parameters; 'learning_rate' is not set here
# (a value of 0.01 was tried and left disabled).
params = dict(objective='regression', metric='rmse')

In [None]:
# Model learning('target_carbon_monoxide')
#
# Early stopping, periodic logging and metric recording are passed as
# callbacks: the `early_stopping_rounds`, `verbose_eval` and `evals_result`
# keyword arguments were removed from lgb.train() in LightGBM 4.0.
# (The callbacks used here require LightGBM >= 3.3.)
result_data = {}  # filled in place: result_data[<set name>]['rmse'] per iteration
model_CO = lgb.train(
    params = params,
    train_set = train_CO,
    # The training set is reported as 'training', the validation set as 'valid_1'
    valid_sets = [train_CO, valied_CO],
    num_boost_round = 300,
    callbacks = [
        lgb.early_stopping(stopping_rounds = 5),  # stop after 5 rounds without improvement
        lgb.log_evaluation(period = 20),          # log the metrics every 20 rounds
        lgb.record_evaluation(result_data),       # store the metric history for plotting
    ],
)

In [None]:
# Learning curves (RMSE per boosting round) for the carbon monoxide model
fig, ax = plt.subplots()
curve_specs = (
    ('training', 'Orange', 'train'),
    ('valid_1', 'blue', 'valid'),
)
for set_name, line_color, line_label in curve_specs:
    ax.plot(result_data[set_name]['rmse'], color = line_color, label = line_label)
ax.legend()
ax.set_title('target_carbon_monoxide')
plt.show()

In [None]:
# Model learning('target_benzene')
#
# Early stopping, periodic logging and metric recording are passed as
# callbacks: the `early_stopping_rounds`, `verbose_eval` and `evals_result`
# keyword arguments were removed from lgb.train() in LightGBM 4.0.
# (The callbacks used here require LightGBM >= 3.3.)
result_data = {}  # filled in place: result_data[<set name>]['rmse'] per iteration
model_C6H6 = lgb.train(
    params = params,
    train_set = train_C6H6,
    # The training set is reported as 'training', the validation set as 'valid_1'
    valid_sets = [train_C6H6, valied_C6H6],
    num_boost_round = 300,
    callbacks = [
        lgb.early_stopping(stopping_rounds = 5),  # stop after 5 rounds without improvement
        lgb.log_evaluation(period = 20),          # log the metrics every 20 rounds
        lgb.record_evaluation(result_data),       # store the metric history for plotting
    ],
)

In [None]:
# Learning curves (RMSE per boosting round) for the benzene model
fig, ax = plt.subplots()
curve_specs = (
    ('training', 'Orange', 'train'),
    ('valid_1', 'blue', 'valid'),
)
for set_name, line_color, line_label in curve_specs:
    ax.plot(result_data[set_name]['rmse'], color = line_color, label = line_label)
ax.legend()
ax.set_title('target_benzene')
plt.show()

In [None]:
# Model learning('target_nitrogen_oxides')
#
# Early stopping, periodic logging and metric recording are passed as
# callbacks: the `early_stopping_rounds`, `verbose_eval` and `evals_result`
# keyword arguments were removed from lgb.train() in LightGBM 4.0.
# (The callbacks used here require LightGBM >= 3.3.)
result_data = {}  # filled in place: result_data[<set name>]['rmse'] per iteration
model_NOx = lgb.train(
    params = params,
    train_set = train_NOx,
    # The training set is reported as 'training', the validation set as 'valid_1'
    valid_sets = [train_NOx, valied_NOx],
    num_boost_round = 300,
    callbacks = [
        lgb.early_stopping(stopping_rounds = 5),  # stop after 5 rounds without improvement
        lgb.log_evaluation(period = 20),          # log the metrics every 20 rounds
        lgb.record_evaluation(result_data),       # store the metric history for plotting
    ],
)


In [None]:
# Learning curves (RMSE per boosting round) for the nitrogen oxides model
fig, ax = plt.subplots()
curve_specs = (
    ('training', 'Orange', 'train'),
    ('valid_1', 'blue', 'valid'),
)
for set_name, line_color, line_label in curve_specs:
    ax.plot(result_data[set_name]['rmse'], color = line_color, label = line_label)
ax.legend()
ax.set_title('target_nitrogen_oxides')
plt.show()

In [None]:
# Predict the three targets for the test data.
# The models were fitted on log1p-transformed targets, so these values
# are still in log space (t), not on the original scale.
test_features = test_df[features_list]
t_pred_CO, t_pred_C6H6, t_pred_NOx = [
    fitted_model.predict(test_features)
    for fitted_model in (model_CO, model_C6H6, model_NOx)
]

In [None]:
# Convert the log-space predictions back to the original scale
# t -> y (= np.expm1(t)), the inverse of the np.log1p applied to the labels
log_space_predictions = {
    'target_carbon_monoxide': t_pred_CO,
    'target_benzene': t_pred_C6H6,
    'target_nitrogen_oxides': t_pred_NOx,
}

# Write each converted prediction into its column of the submission frame
for target_name, t_pred in log_space_predictions.items():
    submission[target_name] = np.expm1(t_pred)

submission.head()

In [None]:
# Prediction visualization (submission file): daily mean of each predicted target

# Calendar day of every test row as a 'YYYY/MM/DD' string.
# The vectorized .dt.strftime accessor replaces a row-by-row Python lambda;
# reset_index(drop=True) keeps the positional 0..n-1 index used to align
# the dates with the submission columns below.
test_date = (
    pd.to_datetime(test_df['date_time'])
    .reset_index(drop = True)
    .dt.strftime('%Y/%m/%d')
)

test_preds = pd.DataFrame({'date': test_date,
                            'test_carbon_monoxide': submission['target_carbon_monoxide'],
                            'test_benzene': submission['target_benzene'],
                            'test_nitrogen_oxides': submission['target_nitrogen_oxides']})
# Average the predictions within each day
test_preds = test_preds.groupby('date').mean()

# One subplot per target
test_preds.plot(color = ['red', 'darkblue', 'green'], subplots = True, figsize = (15, 10))
plt.xlabel('')  # clear the x-axis label on the current axes
plt.show()

In [None]:
# Write the submission frame to a CSV file, without the index column
submission.to_csv('submission_LGBM_baseline.csv', index = False)