In [None]:
import pandas as pd
import numpy as np

import datetime as dt

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# Grab seaborn's current colour palette. This is read *before* the theme calls
# below, so it reflects whatever palette was active at import time.
# NOTE(review): `color` is not referenced in any other visible cell.
color = sns.color_palette()
sns.set_style('darkgrid')
# Default figure size (inches) for the plots that follow.
# NOTE(review): 11.7 x 8.27 in matches A4 landscape — presumably intentional.
sns.set(rc={'figure.figsize':(11.7,8.27)})

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error

from lightgbm import LGBMRegressor

In [None]:
# Read the competition's training split and preview the first rows.
train = pd.read_csv(
    filepath_or_buffer='../input/tabular-playground-series-jul-2021/train.csv'
)
train.head(n=5)

In [None]:
# Read the competition's test split and preview the first rows.
test = pd.read_csv(
    filepath_or_buffer='../input/tabular-playground-series-jul-2021/test.csv'
)
test.head(n=5)

In [None]:
# (rows, columns) of the train frame followed by the test frame.
tuple(frame.shape for frame in (train, test))

In [None]:
# Number of missing values in each training column.
train.isna().sum()

In [None]:
# Keep the timestamp columns aside; `test_date` is needed again when the
# submission frame is assembled.
train_date, test_date = train['date_time'], test['date_time']

In [None]:
# Copy the three pollutant target columns out of the training frame.
target_vars = train.loc[
    :,
    ['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides'],
]

In [None]:
# Histogram (with KDE overlay) of every target, one figure per target.
for target_name in target_vars.columns:
    sns.histplot(data=train, x=target_name, kde=True)
    plt.show()

In [None]:
# log1p-normalise the targets inside `train`.
# The transform reads from `target_vars` — the copy of the raw target columns
# taken from `train` before any transformation — instead of from `train`
# itself, so re-running this cell does not apply log1p a second time.
for target_var in target_vars.columns:
    train[target_var] = np.log1p(target_vars[target_var])

# Re-plot the distributions to see the effect of the transform.
for col in target_vars.columns:
    sns.histplot(x=col, data=train, kde=True)
    plt.show()

In [None]:
# Remember how many rows belong to train so the stacked frame can be split
# back into train / test later on.
ntrain = train.shape[0]

# Add the target columns to test, filled with NaN, so both frames share the
# same schema. `np.nan` is used because the `np.NaN` alias no longer exists
# in NumPy 2.0 and would raise AttributeError there.
for target_var in target_vars.columns:
    test[target_var] = np.nan

# Stack train on top of test. `ignore_index=True` gives the combined frame
# unique row labels instead of repeating 0..n-1 from each input, which avoids
# "duplicate labels" errors in any later index-aligned operation. Row order
# (train first, then test) is unchanged, so positional slicing by `ntrain`
# still separates the two parts.
all_data = pd.concat([train, test], ignore_index=True)

# make sure we haven't lost data
print('The size of a train dataset is:', train.shape)
print('The size of a test dataset is:', test.shape)
print('The size of both train and test datasets:', all_data.shape)

In [None]:
# Correlate numeric columns only: `date_time` is still a raw (string) column
# at this point and DataFrame.corr() raises on non-numeric data in
# pandas >= 2.0. Older pandas silently dropped such columns, so the result is
# the same there.
sns.heatmap(all_data.select_dtypes(include='number').corr())

In [None]:
# Correlation matrix over numeric columns only (the string `date_time` column
# would make DataFrame.corr() raise in pandas >= 2.0).
corrmat = all_data.select_dtypes(include='number').corr()
k = 10  # number of most-correlated columns shown per target

# The font scale does not change between iterations, so set it once.
sns.set(font_scale=1.25)

for col in target_vars.columns:
    # The k columns with the largest correlation to this target
    # (the target itself is included).
    cols = corrmat.nlargest(k, col)[col].index
    # Pairwise correlations among those columns, computed on the train rows.
    cm = np.corrcoef(train[cols].values.T)
    hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f',
                     annot_kws={'size': 10},
                     yticklabels=cols.values, xticklabels=cols.values)
    plt.title(col)
    plt.show()

In [None]:
# Histogram of every feature column (everything between the first column and
# the last three columns of `all_data`).
feature_cols = all_data.columns[1:-3]

# Derive the grid from the number of features instead of hard-coding 4x2, so
# the cell keeps working if the number of features changes.
n_plot_cols = 2
n_plot_rows = -(-len(feature_cols) // n_plot_cols)  # ceiling division

for i, col in enumerate(feature_cols):
    plt.subplot(n_plot_rows, n_plot_cols, i + 1)
    sns.histplot(x=col, data=all_data)
    plt.title(col)

# Prevent titles and axis labels of neighbouring panels from overlapping.
plt.tight_layout()
plt.show()

In [None]:
# Binary calendar flags derived from the timestamp column.
date = pd.to_datetime(all_data['date_time'])
hour_of_day = date.dt.hour

# 1 on Saturday/Sunday (dayofweek 5 and 6), 0 otherwise.
all_data['is_weekend'] = (date.dt.dayofweek >= 5) * 1
# 1 for hours 8..20 inclusive, 0 otherwise.
all_data['working_hours'] = hour_of_day.between(8, 20).astype('int')

In [None]:
def date_time_encoding(f_name, f_itself, max_val, df=None):
    """Add cyclical sin/cos/tan encodings of a periodic feature to a frame.

    Three columns named ``sin_<f_name>``, ``cos_<f_name>`` and
    ``tan_<f_name>`` are written in place, each holding the corresponding
    trigonometric function of ``2 * pi * f_itself / max_val``.

    Parameters
    ----------
    f_name : str
        Suffix used to build the three new column names.
    f_itself : Series or single-column DataFrame
        The periodic values to encode (e.g. hour of day).
    max_val : number
        Length of the period the values are divided by (e.g. 24 for hours).
    df : DataFrame, optional
        Frame that receives the new columns. When omitted, the notebook-level
        ``all_data`` frame is used, which is the original behaviour.

    Returns
    -------
    None
        The target frame is modified in place.

    Notes
    -----
    ``tan`` is unbounded near a quarter and three quarters of the period, so
    ``tan_<f_name>`` can contain very large magnitudes.
    """
    if df is None:
        # Backward-compatible default: operate on the notebook's global frame.
        df = all_data

    # Compute the angle once and reuse it for all three encodings.
    angle = 2 * np.pi * (f_itself / max_val)
    df['sin_' + f_name] = np.sin(angle)
    df['cos_' + f_name] = np.cos(angle)
    df['tan_' + f_name] = np.tan(angle)

In [None]:
# Split the parsed timestamp into single-column frames of its parts.
# Note the naming: `date` holds the month number, `day` the day of month and
# `time` the hour of day.
datetime = pd.to_datetime(all_data['date_time'])
date = datetime.dt.month.to_frame()
day = datetime.dt.day.to_frame()
time = datetime.dt.hour.to_frame()

In [None]:
# Add sin/cos/tan columns for hour (period 24), month (period 12) and
# day of month (period 31), in that order.
for feature_name, values, period in (
    ('time', time, 24),
    ('date', date, 12),
    ('day', day, 31),
):
    date_time_encoding(feature_name, values, period)

In [None]:
# The raw timestamp column is removed now that its derived features exist.
all_data.drop(columns='date_time', inplace=True)

In [None]:
# Cut the stacked frame back into its train / test parts by position and
# remove the target columns so only features remain.
X_train = all_data.iloc[:ntrain].drop(columns=target_vars.columns)
X_test = all_data.iloc[ntrain:].drop(columns=target_vars.columns)

In [None]:
# Sanity check: report the shape of each frame after the split.
for label, frame in (
    ('The size of a train dataset is:', X_train),
    ('The size of a test dataset is:', X_test),
    ('The size of both datasets:', all_data),
):
    print(label, frame.shape)

In [None]:
# One target vector per pollutant, restricted to the training rows.
y_1, y_2, y_3 = (
    all_data[target_name].iloc[:ntrain]
    for target_name in (
        'target_carbon_monoxide',
        'target_benzene',
        'target_nitrogen_oxides',
    )
)

In [None]:
# Sanity check: every target vector should be as long as X_train.
for label, n_rows in (
    ('The size of the target_carbon_monoxide vector is:', y_1.shape[0]),
    ('The size of the target_benzene vector is:', y_2.shape[0]),
    ('The size of the target_nitrogen_oxides vector is', y_3.shape[0]),
    ('The size of the X_train is:', X_train.shape[0]),
):
    print(label, n_rows)

In [None]:
# 70/30 hold-out split for the carbon monoxide target (fixed seed).
X_train_1, X_valid_1, y_train_1, y_valid_1 = train_test_split(
    X_train, y_1, test_size=0.3, random_state=17
)

In [None]:
# 70/30 hold-out split for the benzene target (fixed seed).
X_train_2, X_valid_2, y_train_2, y_valid_2 = train_test_split(
    X_train, y_2, test_size=0.3, random_state=17
)

In [None]:
# 70/30 hold-out split for the nitrogen oxides target (fixed seed).
X_train_3, X_valid_3, y_train_3, y_valid_3 = train_test_split(
    X_train, y_3, test_size=0.3, random_state=17
)

In [None]:
# Base LightGBM regressor (these params were found with an earlier
# GridSearchCV run).
# NOTE(review): LightGBM documents that `subsample` only takes effect when
# `subsample_freq` is > 0; it is left at its default here, so subsample=0.7
# is likely a no-op — confirm and add `subsample_freq` if bagging is wanted.
lgbm = LGBMRegressor(random_state=17,
                     max_depth=11,
                     n_estimators=500,
                     num_leaves=45,
                     subsample=0.7,
                     verbosity=-1)

# Learning-rate candidates: 10 values log-spaced between 1e-3 and 1.
lgbm_learning_rate = {'learning_rate': np.logspace(-3, 0, 10)}

# Select the learning rate on (negated) mean squared error so that the
# model-selection criterion matches the mean_squared_error that is reported
# on the hold-out split. Without `scoring`, GridSearchCV falls back to the
# estimator's own score, which is R^2 for regressors.
grid_search_lgbm = GridSearchCV(estimator=lgbm,
                                param_grid=lgbm_learning_rate,
                                scoring='neg_mean_squared_error',
                                cv=5,
                                verbose=1,
                                n_jobs=-1)

In [None]:
# Tune and fit on the carbon monoxide target, then score the hold-out split.
# The targets in `train` were log1p-transformed earlier, so this error is
# measured in log space.
grid_search_lgbm.fit(X_train_1, y_train_1)
best_lgbm_1 = grid_search_lgbm.best_estimator_
valid_pred_1 = best_lgbm_1.predict(X_valid_1)
print('Mean Squared Error for carbon_monoxide: ',
      mean_squared_error(y_valid_1, valid_pred_1))

In [None]:
# Tune and fit on the benzene target, then score the hold-out split.
# The targets in `train` were log1p-transformed earlier, so this error is
# measured in log space.
grid_search_lgbm.fit(X_train_2, y_train_2)
best_lgbm_2 = grid_search_lgbm.best_estimator_
valid_pred_2 = best_lgbm_2.predict(X_valid_2)
print('Mean Squared Error for benzene: ',
      mean_squared_error(y_valid_2, valid_pred_2))

In [None]:
# Tune and fit on the nitrogen oxides target, then score the hold-out split.
# The targets in `train` were log1p-transformed earlier, so this error is
# measured in log space.
grid_search_lgbm.fit(X_train_3, y_train_3)
best_lgbm_3 = grid_search_lgbm.best_estimator_
valid_pred_3 = best_lgbm_3.predict(X_valid_3)
print('Mean Squared Error for nitrogen_oxides: ',
      mean_squared_error(y_valid_3, valid_pred_3))

In [None]:
# Predict on the test features with each fitted model and undo the log1p
# transform (expm1) so predictions are back on the original target scale.
lgbm_pred_1, lgbm_pred_2, lgbm_pred_3 = (
    np.expm1(model.predict(X_test))
    for model in (best_lgbm_1, best_lgbm_2, best_lgbm_3)
)

In [None]:
# Assemble the submission frame (timestamp plus one column per target) and
# write it to disk without the row index.
sub = pd.DataFrame({
    'date_time': test_date,
    'target_carbon_monoxide': lgbm_pred_1,
    'target_benzene': lgbm_pred_2,
    'target_nitrogen_oxides': lgbm_pred_3,
})
sub.to_csv('submission_1.csv', index=False)

In [None]:
# Display the submission frame as the cell output for a final visual check.
sub