In [46]:
import pandas as pd
import numpy as np
import pickle
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit

In [18]:
df_train = pd.read_csv('../data/custom/df_train.csv')
df_train['DATETIME'] = pd.to_datetime(df_train['DATETIME'])

In [19]:
# Splitting the data
split_index = int(len(df_train) * 0.8)
train = df_train.iloc[:split_index]
test = df_train.iloc[split_index:]

# Separating the target variable
X_train = train.drop(['ND_TARGET'], axis=1)
y_train = train['ND_TARGET']
X_test = test.drop(['ND_TARGET'], axis=1)
y_test = test['ND_TARGET']

In [20]:
X_train['year'] = X_train['DATETIME'].dt.year
X_train['month'] = X_train['DATETIME'].dt.month
X_train['day'] = X_train['DATETIME'].dt.day
X_train['hour'] = X_train['DATETIME'].dt.hour
X_train = X_train.drop(['DATETIME'], axis=1)

X_test['year'] = X_test['DATETIME'].dt.year
X_test['month'] = X_test['DATETIME'].dt.month
X_test['day'] = X_test['DATETIME'].dt.day
X_test['hour'] = X_test['DATETIME'].dt.hour
X_test = X_test.drop(['DATETIME'], axis=1)

In [21]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [22]:
params = {
    'max_depth': 6,
    'min_child_weight': 1,
    'eta': .3,
    'subsample': 1,
    'colsample_bytree': 1,
    'objective':'reg:squarederror',
}

In [23]:
num_boost_round = 999

model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10
)

[0]	Test-rmse:4507.11759
[1]	Test-rmse:3245.65030
[2]	Test-rmse:2373.63926
[3]	Test-rmse:1768.07043
[4]	Test-rmse:1368.07757
[5]	Test-rmse:1095.72042
[6]	Test-rmse:921.23112
[7]	Test-rmse:814.95090
[8]	Test-rmse:745.04220
[9]	Test-rmse:701.04809
[10]	Test-rmse:672.08920
[11]	Test-rmse:653.06042
[12]	Test-rmse:641.56057
[13]	Test-rmse:628.83580
[14]	Test-rmse:624.23496
[15]	Test-rmse:616.40858
[16]	Test-rmse:614.74832
[17]	Test-rmse:613.63367
[18]	Test-rmse:611.67255
[19]	Test-rmse:607.91665
[20]	Test-rmse:607.09141
[21]	Test-rmse:606.56220
[22]	Test-rmse:603.03146
[23]	Test-rmse:599.94606
[24]	Test-rmse:599.42068
[25]	Test-rmse:599.89605
[26]	Test-rmse:599.77337
[27]	Test-rmse:599.46312
[28]	Test-rmse:596.23165
[29]	Test-rmse:596.43042
[30]	Test-rmse:594.81435
[31]	Test-rmse:592.51305
[32]	Test-rmse:591.53164
[33]	Test-rmse:591.25941
[34]	Test-rmse:590.18407
[35]	Test-rmse:588.08455
[36]	Test-rmse:589.17497
[37]	Test-rmse:587.98191
[38]	Test-rmse:585.15303
[39]	Test-rmse:585.72767
[40]

In [24]:
y_pred = model.predict(dtest)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R2 Score: {r2}")

Mean Squared Error: 339148.8548441722
R2 Score: 0.9907757789874675


In [39]:
## Grid Search CV
param_grid = {
    'max_depth': [5, 6, 7],
    'min_child_weight': [8, 10, 12],
    'n_estimators': [400, 500, 550],
    'colsample_bytree': [0.5, 0.7, 1.0],
    'learning_rate': [0.08, 0.1, 0.12]
}

In [40]:
tscv = TimeSeriesSplit(n_splits=5)

In [41]:
xgb_reg = XGBRegressor(objective='reg:squarederror')
grid_search = GridSearchCV(estimator=xgb_reg, param_grid=param_grid, 
                           scoring='neg_mean_squared_error', cv=tscv, verbose=1, n_jobs=-1)

In [42]:
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits


In [43]:
print("Best parameters:", grid_search.best_params_)
print("Best score:", -grid_search.best_score_)

Best parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.08, 'max_depth': 7, 'min_child_weight': 10, 'n_estimators': 550}
Best score: 408277.1981523767


In [44]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

In [45]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R2 Score: {r2}")

Mean Squared Error: 290220.8383213791
R2 Score: 0.9921065304603524


In [49]:
filename = 'XGB_grid.sav'
pickle.dump(best_model, open('models_sav/' + filename, 'wb'))

In [None]:
loaded_model = pickle.load(open('models_sav' + filename, 'rb'))