In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt


from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from xgboost import XGBRegressor

Parts of this notebook are adapted from the notebooks linked below.
Thanks to their authors for sharing them.

- https://www.kaggle.com/maksymshkliarevskyi/tps-july-eda-baseline-analysis-xgbregressor
- https://www.kaggle.com/dwin183287/tps-july-2021-eda


In [None]:
# Read the competition files: training set, test set and the submission template
train_df = pd.read_csv(
    '../input/tabular-playground-series-jul-2021/train.csv'
)
test_df = pd.read_csv(
    '../input/tabular-playground-series-jul-2021/test.csv'
)
submission = pd.read_csv(
    '../input/tabular-playground-series-jul-2021/sample_submission.csv'
)

# Convert the "date_time" column of both frames to pandas datetimes
for frame in (train_df, test_df):
    frame['date_time'] = pd.to_datetime(frame['date_time'])


In [None]:
# Show column names, dtypes and non-null counts of the training frame
train_df.info()

In [None]:
# Input columns passed to every model
features_list = [
    'deg_C',
    'relative_humidity',
    'absolute_humidity',
    'sensor_1',
    'sensor_2',
    'sensor_3',
    'sensor_4',
    'sensor_5',
]

# Columns to predict; one regressor is trained per entry
targets_list = [
    'target_carbon_monoxide',
    'target_benzene',
    'target_nitrogen_oxides',
]

In [None]:
# Split into training (80%) and validation (20%) parts.
# shuffle=False keeps the original row order, so the validation part is the
# final 20% of the rows rather than a random sample.
# X_train / X_valid keep every column of train_df; the feature columns are
# selected later, at fit / predict time.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_df,
    train_df[targets_list],
    shuffle=False,
    random_state=123,
    test_size=0.2,
)

In [None]:
# Baseline XGBoost hyper-parameters, shared by all three target models
params = dict(
    n_estimators=400,
    subsample=0.8,
    max_depth=8,
    learning_rate=0.05,
    n_jobs=-1,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=0.1,
    random_state=0,
)

# Fit one regressor per target on the training split.
# The feature frame is selected once and reused for every fit.
train_features = X_train[features_list]

model_to_CO = XGBRegressor(**params).fit(
    train_features, y_train['target_carbon_monoxide'])
model_to_C6H6 = XGBRegressor(**params).fit(
    train_features, y_train['target_benzene'])
model_to_NOx = XGBRegressor(**params).fit(
    train_features, y_train['target_nitrogen_oxides'])


In [None]:
# Predict on the validation split, then report the RMSLE of each target.
valid_features = X_valid[features_list]

y_pred_CO = model_to_CO.predict(valid_features)
y_pred_C6H6 = model_to_C6H6.predict(valid_features)
y_pred_NOx = model_to_NOx.predict(valid_features)

# Column k of y_valid corresponds to targets_list[k]
for col_idx, y_pred in enumerate((y_pred_CO, y_pred_C6H6, y_pred_NOx)):
    rmsle = np.sqrt(mean_squared_log_error(y_valid.iloc[:, col_idx], y_pred))
    print('RMSLE ({}): {}'.format(targets_list[col_idx], round(rmsle, 4)))

In [None]:
# Prediction visualization: daily means of actual vs. predicted targets
# on the validation split.

# Day label ('YYYY/MM/DD') for every validation row. The vectorized
# .dt.strftime accessor replaces a per-row Python lambda; pd.to_datetime is
# kept as a cheap safeguard in case the column is not datetime-typed yet.
# The index is reset so it lines up positionally with the prediction arrays.
date = (
    pd.to_datetime(X_valid['date_time'])
    .reset_index(drop=True)
    .dt.strftime('%Y/%m/%d')
)

# Targets are looked up by column name (not position) so each label is
# guaranteed to match the data it describes.
valid_preds = pd.DataFrame({
    'date': date,
    'target_carbon_monoxide': y_valid['target_carbon_monoxide'].to_numpy(),
    'target_benzene': y_valid['target_benzene'].to_numpy(),
    'target_nitrogen_oxides': y_valid['target_nitrogen_oxides'].to_numpy(),
    'preds_carbon_monoxide': y_pred_CO,
    'preds_benzene': y_pred_C6H6,
    'preds_nitrogen_oxides': y_pred_NOx,
})

# Collapse to one row per day (mean of all readings on that day)
valid_preds = valid_preds.groupby('date').mean()

In [None]:
# Daily mean carbon monoxide on the validation split: actual vs. predicted
fig, ax = plt.subplots(figsize=(15, 5))
valid_preds['target_carbon_monoxide'].plot(ax=ax, color='blue')
valid_preds['preds_carbon_monoxide'].plot(ax=ax, color='Orange')
ax.legend()
ax.set_xlabel('')  # drop the automatic 'date' axis label
plt.show()

In [None]:
# Daily mean benzene on the validation split: actual vs. predicted
fig, ax = plt.subplots(figsize=(15, 5))
valid_preds['target_benzene'].plot(ax=ax, color='blue')
valid_preds['preds_benzene'].plot(ax=ax, color='Orange')
ax.legend()
ax.set_xlabel('')  # drop the automatic 'date' axis label
plt.show()

In [None]:
# Daily mean nitrogen oxides on the validation split: actual vs. predicted
fig, ax = plt.subplots(figsize=(15, 5))
valid_preds['target_nitrogen_oxides'].plot(ax=ax, color='blue')
valid_preds['preds_nitrogen_oxides'].plot(ax=ax, color='Orange')
ax.legend()
ax.set_xlabel('')  # drop the automatic 'date' axis label
plt.show()

In [None]:
# Refit the three models on the complete training set (no hold-out),
# using the same hyper-parameters as the validation run.
params = dict(
    n_estimators=400,
    subsample=0.8,
    max_depth=8,
    learning_rate=0.05,
    n_jobs=-1,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=0.1,
    random_state=0,
)

# One regressor per target; these replace the validation-split models
all_features = train_df[features_list]

model_to_CO = XGBRegressor(**params).fit(
    all_features, train_df['target_carbon_monoxide'])
model_to_C6H6 = XGBRegressor(**params).fit(
    all_features, train_df['target_benzene'])
model_to_NOx = XGBRegressor(**params).fit(
    all_features, train_df['target_nitrogen_oxides'])

In [None]:
# Fill the submission template with test-set predictions, one column per target
test_features = test_df[features_list]

fitted_models = {
    'target_carbon_monoxide': model_to_CO,
    'target_benzene': model_to_C6H6,
    'target_nitrogen_oxides': model_to_NOx,
}
for target_col, fitted_model in fitted_models.items():
    submission[target_col] = fitted_model.predict(test_features)

submission.head()

In [None]:
# Prediction visualization (submission file): daily means of the test-set
# predictions.

# Day label ('YYYY/MM/DD') for every test row. The vectorized .dt.strftime
# accessor replaces a per-row Python lambda; pd.to_datetime is kept as a cheap
# safeguard in case the column is not datetime-typed yet.
test_date = (
    pd.to_datetime(test_df['date_time'])
    .reset_index(drop=True)
    .dt.strftime('%Y/%m/%d')
)

test_preds = pd.DataFrame({
    'date': test_date,
    'test_carbon_monoxide': submission['target_carbon_monoxide'],
    'test_benzene': submission['target_benzene'],
    'test_nitrogen_oxides': submission['target_nitrogen_oxides'],
})

# Collapse to one row per day (mean of all predictions on that day)
test_preds = test_preds.groupby('date').mean()

In [None]:
# Plot the daily mean test predictions, one stacked panel per target
panel_colors = ['red', 'darkblue', 'green']
test_preds.plot(subplots=True, figsize=(15, 10), color=panel_colors)
plt.xlabel('')  # clear the x-axis label on the current (last drawn) panel
plt.show()

In [None]:
# Write the filled-in submission frame to disk, without the row index column
submission.to_csv(
    path_or_buf='submission_xgboost_baseline.csv',
    index=False,
)