<h1 style="text-align:center;"> Tabular Playground Series - July 2021 </h1>
<h2 style="text-align:center;"> XGBoost Implementation </h2>
<h3 style="text-align:center;"> by Tariq Hussain </h3>

# Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Loading and validating the data

In [None]:
# Both CSV files are read the same way: `date_time` is parsed as datetimes
# and used as the row index.
read_options = dict(index_col='date_time', parse_dates=['date_time'])

train_filepath = '../input/tabular-playground-series-jul-2021/train.csv'
test_filepath = '../input/tabular-playground-series-jul-2021/test.csv'

train_data = pd.read_csv(train_filepath, **read_options)
test_data = pd.read_csv(test_filepath, **read_options)


In [None]:
# Preview the training frame (shown as the cell's last expression).
train_data

In [None]:
# DataFrame.info() writes its summary straight to stdout and returns None,
# so wrapping it in print() only appended a stray "None" line to the output.
train_data.info()

In [None]:
# Preview the test frame (shown as the cell's last expression).
test_data

In [None]:
# DataFrame.info() writes its summary straight to stdout and returns None,
# so wrapping it in print() only appended a stray "None" line to the output.
test_data.info()

In [None]:
# True if at least one cell anywhere in the training frame is missing.
train_data.isna().to_numpy().any()

In [None]:
# Keep every row except the final one.
# NOTE(review): presumably the last timestamp overlaps the test period — TODO confirm.
# Because the frame is reassigned to itself, re-running this cell removes
# one more row each time.
train_data = train_data.iloc[:-1]

train_data

In [None]:
# Disabled experiment kept as a string literal, so none of it executes.
# NOTE(review): it refers to `air_pol_data`, which is not defined in the
# visible cells — confirm whether this cell can be removed.
'''
plt.figure(figsize=(10,6))
air_pol_data['relative_humidity'].asfreq('M').plot()

plt.title('Relative humidity over time (by month)')
plt.show()
'''

# Exploratory Data Analysis

In [None]:
# Relative humidity over time, drawn on an explicitly created wide axes.
fig, ax = plt.subplots(figsize=(20,6))
sns.lineplot(data=train_data, x='date_time', y='relative_humidity', ax=ax)

In [None]:
# Absolute humidity over time, drawn on an explicitly created wide axes.
fig, ax = plt.subplots(figsize=(20,6))
sns.lineplot(data=train_data, x='date_time', y='absolute_humidity', ax=ax)

In [None]:
# Sensor 1 readings over time, drawn on an explicitly created wide axes.
fig, ax = plt.subplots(figsize=(20,6))
sns.lineplot(data=train_data, x='date_time', y='sensor_1', ax=ax)

In [None]:
# Sensor 2 readings over time, drawn on an explicitly created wide axes.
fig, ax = plt.subplots(figsize=(20,6))
sns.lineplot(data=train_data, x='date_time', y='sensor_2', ax=ax)

In [None]:
# Sensor 3 readings over time, drawn on an explicitly created wide axes.
fig, ax = plt.subplots(figsize=(20,6))
sns.lineplot(data=train_data, x='date_time', y='sensor_3', ax=ax)

In [None]:
# Sensor 4 readings over time, drawn on an explicitly created wide axes.
fig, ax = plt.subplots(figsize=(20,6))
sns.lineplot(data=train_data, x='date_time', y='sensor_4', ax=ax)

In [None]:
# Sensor 5 readings over time, drawn on an explicitly created wide axes.
fig, ax = plt.subplots(figsize=(20,6))
sns.lineplot(data=train_data, x='date_time', y='sensor_5', ax=ax)

In [None]:
# Carbon monoxide target over time, drawn on an explicitly created wide axes.
fig, ax = plt.subplots(figsize=(20,6))
sns.lineplot(data=train_data, x='date_time', y='target_carbon_monoxide', ax=ax)

In [None]:
# Benzene target over time, drawn on an explicitly created wide axes.
fig, ax = plt.subplots(figsize=(20,6))
sns.lineplot(data=train_data, x='date_time', y='target_benzene', ax=ax)

In [None]:
# Nitrogen oxides target over time, drawn on an explicitly created wide axes.
fig, ax = plt.subplots(figsize=(20,6))
sns.lineplot(data=train_data, x='date_time', y='target_nitrogen_oxides', ax=ax)

In [None]:
# Pairwise correlation matrix of every column in the training frame.
train_data_corr = train_data.corr()

# Boolean mask covering the upper triangle (diagonal included) so the heatmap
# shows each pair only once. The builtin `bool` is used as the dtype because
# the `np.bool` alias is deprecated and no longer exists in recent NumPy releases.
mask = np.triu(np.ones_like(train_data_corr, dtype=bool))

fig = plt.figure(figsize=(16,10))
sns.heatmap(train_data_corr, mask=mask)

## Preparing and pre-processing the data

In [None]:
# Work on independent copies so the loaded frames stay untouched.
# `date_time` is the index of both frames, so there is no such column to drop.
train, test = train_data.copy(), test_data.copy()

In [None]:
# The three columns the models are asked to predict.
cols = [
        'target_carbon_monoxide',
        'target_benzene',
        'target_nitrogen_oxides'
       ]

# Log-transform the targets. The values are read from the untouched
# `train_data` frame rather than from `train` itself, so re-running this cell
# gives the same result instead of taking the log of already-logged values.
for target in cols:
    train[target] = np.log(train_data[target])

y_co = train.target_carbon_monoxide
y_ben = train.target_benzene
y_no = train.target_nitrogen_oxides

# Feature matrix: every column except the three targets.
X = train.drop(columns=cols)


In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the per-feature scaling statistics from X and apply them in one step;
# the fitted scaler is kept so the same transform can be applied to other data.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
# Show the carbon monoxide target series.
display(y_co)

In [None]:
# Show the benzene target series.
display(y_ben)

In [None]:
# Show the nitrogen oxides target series.
display(y_no)

## The XGBoost Models

In [None]:
from sklearn.model_selection import train_test_split

# One 50/50 split per target. The same seed is used for every call, so all
# three splits are configured identically.
split_options = dict(test_size=0.5, random_state=0)

X_co_train, X_co_val, y_co_train, y_co_val = train_test_split(X_scaled, y_co, **split_options)
X_ben_train, X_ben_val, y_ben_train, y_ben_val = train_test_split(X_scaled, y_ben, **split_options)
X_no_train, X_no_val, y_no_train, y_no_val = train_test_split(X_scaled, y_no, **split_options)

In [None]:
#display(y_train)

In [None]:
from xgboost import XGBRegressor
# Disabled randomized hyper-parameter search, kept as a string literal so
# none of it executes; only the import above takes effect.
'''
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

xgbr = XGBRegressor(seed=20)

params = { "max_depth": [2, 3, 4],
           "learning_rate": [0.03, 0.04, 0.05],
           "n_estimators": np.arange(100, 1000, 100),
           "colsample_bytree": np.arange(0.2, 0.7, 0.1),
            "colsample_bylevel": np.arange(0.2, 0.7, 0.1),
            "colsample_bynode": np.arange(0.2, 0.7, 0.1)
            }

reg = RandomizedSearchCV(estimator=xgbr,
                  param_distributions=params,
                  scoring='neg_mean_squared_error',
                   n_iter=50,
                  verbose=1)
'''

In [None]:
#reg.fit(X_train, y_co_train)

#print("Best parameters for carbon monoxide:", reg.best_params_)

In [None]:
#reg.fit(X_train, y_ben_train)

#print("Best parameters for benzene:", reg.best_params_)

In [None]:
#reg.fit(X_train, y_no_train)

#print("Best parameters for nitrogen oxide:", reg.best_params_)

In [None]:

# Hyper-parameters for the carbon monoxide regressor.
params_1 = dict(
    n_estimators=900,
    learning_rate=0.01,
    max_depth=4,
    colsample_bytree=0.5000000000000001,
    colsample_bylevel=0.2,
)

# Fit on the training split while monitoring the validation split, with
# early stopping set to 10 rounds.
# NOTE(review): newer XGBoost releases expect `early_stopping_rounds` on the
# estimator constructor rather than on fit() — confirm against the installed version.
model_co = XGBRegressor(**params_1).fit(
    X_co_train,
    y_co_train,
    early_stopping_rounds=10,
    eval_set=[(X_co_val, y_co_val)],
)


In [None]:

# Hyper-parameters for the benzene regressor.
params_2 = dict(
    n_estimators=900,
    learning_rate=0.01,
    max_depth=4,
    colsample_bytree=0.6000000000000001,
    colsample_bylevel=0.5000000000000001,
)

# Fit on the training split while monitoring the validation split, with
# early stopping set to 10 rounds.
# NOTE(review): newer XGBoost releases expect `early_stopping_rounds` on the
# estimator constructor rather than on fit() — confirm against the installed version.
model_ben = XGBRegressor(**params_2).fit(
    X_ben_train,
    y_ben_train,
    early_stopping_rounds=10,
    eval_set=[(X_ben_val, y_ben_val)],
)


In [None]:

# Hyper-parameters for the nitrogen oxides regressor.
params_3 = dict(
    n_estimators=900,
    learning_rate=0.01,
    max_depth=4,
    colsample_bytree=0.5000000000000001,
    colsample_bylevel=0.2,
)

# Fit on the training split while monitoring the validation split, with
# early stopping set to 10 rounds.
# NOTE(review): newer XGBoost releases expect `early_stopping_rounds` on the
# estimator constructor rather than on fit() — confirm against the installed version.
model_no = XGBRegressor(**params_3).fit(
    X_no_train,
    y_no_train,
    early_stopping_rounds=10,
    eval_set=[(X_no_val, y_no_val)],
)


In [None]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error, r2_score

# Predict the held-out validation rows with each target's own model.
pred_co = model_co.predict(X_co_val)
pred_ben = model_ben.predict(X_ben_val)
pred_no = model_no.predict(X_no_val)

# Report MAE and R^2 per target. The scores are computed on the validation
# targets exactly as they were passed to the split, without inverting any transform.
print('target_carbon_monoxide\n')
mae_co = f"Mean absolute error: {mean_absolute_error(y_co_val, pred_co)}"
print(mae_co)
print(f"r2 score: {r2_score(y_co_val, pred_co)}\n")

print('target_benzene\n')
print(f"Mean absolute error: {mean_absolute_error(y_ben_val, pred_ben)}")
print(f"r2 score: {r2_score(y_ben_val, pred_ben)}\n")

print('target_nitrogen_oxide\n')
print(f"Mean absolute error: {mean_absolute_error(y_no_val, pred_no)}")
print(f"r2 score: {r2_score(y_no_val, pred_no)}\n")

In [None]:
#final_test = test.drop(['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides'], axis=0)
#display(final_test)

In [None]:
# Apply the already-fitted scaler to the test features (no re-fitting here).
test_scaled = scaler.transform(test)

In [None]:
# Predict each target for the scaled test features and apply np.exp to the
# model output in the same step.
final_preds_co = np.exp(model_co.predict(test_scaled))
final_preds_ben = np.exp(model_ben.predict(test_scaled))
final_preds_no = np.exp(model_no.predict(test_scaled))

In [None]:
# Score each target against its OWN prediction array, on a common scale:
# the `y_*` series are log-transformed, the `final_preds_*` arrays have had
# np.exp applied, so the targets are mapped back with np.exp before scoring.
# NOTE(review): `y_*` are training targets while `final_preds_*` are predictions
# for the test file, so the compared rows are not matched observations. Treat
# these numbers as a rough sanity check only — confirm this cell is still wanted.
sanity_checks = [
    ("'target_carbon_monoxide':", y_co, final_preds_co),
    ("'target_benzene':", y_ben, final_preds_ben),
    ("'target_nitrogen_oxides':", y_no, final_preds_no),
]

for heading, y_log, preds in sanity_checks:
    # Take as many leading target rows as there are predictions, instead of
    # relying on a hard-coded row count.
    y_ref = np.exp(y_log.iloc[:len(preds)])
    print(heading)
    print("Mean absolute error: {}".format(mean_absolute_error(y_ref, preds)))
    print("r2 score: {}\n".format(r2_score(y_ref, preds)))

In [None]:
sample_sub_fp = '../input/tabular-playground-series-jul-2021/sample_submission.csv'
sample_sub = pd.read_csv(sample_sub_fp)

In [None]:
# Preview the sample submission frame.
sample_sub

In [None]:
# Build the submission from the sample template: `assign` returns a new frame,
# leaving `sample_sub` untouched, with the three target columns filled from
# the first 2247 predictions of each model.
export = sample_sub.assign(
    target_carbon_monoxide=final_preds_co[:2247],
    target_benzene=final_preds_ben[:2247],
    target_nitrogen_oxides=final_preds_no[:2247],
)

In [None]:
# Preview the submission frame before writing it out.
export

In [None]:
# Write the submission to the working directory, without the row index column.
export.to_csv('submission.csv', index=False)