In [35]:
import numpy as np
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import warnings
import pickle
warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [32]:
# The directory for top level folder
dir_ = "/home/sugam/Work/20-29 Deep Learning/22 Projects/Optimization of Energy Using AIML"

processed_data_dir = dir_+'/data/Processed/'
model_dir = dir_+'/models/'

In [25]:
# Load your dataset
train_df_part_1 = pd.read_pickle(processed_data_dir + "train_df_part_1.pkl")

In [26]:
train_df_part_1.head()

Unnamed: 0,load,year,month,day,hour,minute,weekday,week_num,is_weekdend,load_min,load_max,load_mean,load_std,load_norm
0,65.8125,2018,1,1,1,0,0,1,0,38.53125,162.0,75.9375,17.234375,0.53295
1,65.6875,2018,1,1,1,15,0,1,0,38.53125,162.0,75.9375,17.234375,0.532059
2,66.0,2018,1,1,1,30,0,1,0,38.53125,162.0,75.9375,17.234375,0.53457
3,65.1875,2018,1,1,1,45,0,1,0,38.53125,162.0,75.9375,17.234375,0.52801
4,70.6875,2018,1,1,2,0,0,1,0,38.53125,162.0,75.9375,17.234375,0.572638


In [27]:
# Check for remaining NaN values
nan_count = train_df_part_1.isna().sum().sum()
print(f"Total NaN values: {nan_count}")

Total NaN values: 0


In [28]:
# Split the dataset using fixed split 80% into training and 20% into testing
train_size = int(len(train_df_part_1) * 0.8)
train_data, test_data = train_df_part_1[:train_size], train_df_part_1[train_size:]
target_columns = ["load"] 

X_train, y_train = train_data.drop(target_columns, axis=1), train_data[target_columns]
X_test, y_test = test_data.drop(target_columns, axis=1), test_data[target_columns]

In [29]:
print(f"Shape of Train data: {X_train.shape}")
print(f"Shape of Test data: {X_test.shape}")

Shape of Train data: (82438, 13)
Shape of Test data: (20610, 13)


In [30]:
xgb_model = XGBRegressor()
xgb_model.fit(X_train, y_train)
xgb_predictions = xgb_model.predict(X_valid)
xgb_rmse = mean_squared_error(y_valid, xgb_predictions, squared=False)
print(f'XGBoost RMSE: {xgb_rmse}')

XGBoost RMSE: 9.3037748336792


In [None]:
import optuna
from sklearn.model_selection import cross_val_score
optuna.logging.set_verbosity(optuna.logging.WARNING)
from warnings import simplefilter
simplefilter("ignore", category=RuntimeWarning)


In [None]:
def objective(trial):
    criterion = trial.suggest_categorical("criterion",["gini","entropy"])
    max_depth = trial.suggest_int("max_depth",2,32,log=True)
    n_estimators = trial.suggest_int("n_estimators",100,500)
    boosters = trial.suggest_categorical("boosters",["gbtree","gblinear","dart"])
    eta = trial.suggest_float("eta",0.1,1)
    learning_rate = trial.suggest_float("learning_rate",0.001,1,log=True)
    min_child_weight = trial.suggest_int("min_child_weight",1,10)
    gamma = trial.suggest_float("gamma",0.0,0.5)
    
    xgb_model = XGBRegressor(criterion = criterion,
                            max_depth = max_depth,
                            n_estimator = n_estimators,
                            verbosity=1,
                            boosters = boosters,
                            eta = eta,
                            learning_rate = learning_rate,
                            min_child_weight = min_child_weight,
                            gamma = gamma)
    
    xgb_model.fit(X_train,y_train)
    rmse_score = mean_squared_error(y_train,xgb_model.predict(X_train),squared=False)
    return rmse_score


study = optuna.create_study(direction="minimize",
                            study_name="xgb_study")
study.optimize(objective, n_trials=20)
trial = study.best_trial
print(f"RMSE: {trial.value}")
print(f"Params: {trial.params}")

In [17]:
xgb_model = XGBRegressor(criterion='entropy',
                         max_depth =  18,
                         n_estimators = 173,
                         boosters = 'gblinear',
                         eta = 0.8321535613093416, 
                         learning_rate = 0.9327551352534427,
                         min_child_weight = 5,
                         gamma = 0.0014464638207411923)
xgb_model.fit(X_train, y_train)
xgb_predictions = xgb_model.predict(X_test)
xgb_rmse = mean_squared_error(y_test, xgb_predictions, squared=False)
print(f'XGBoost RMSE: {xgb_rmse}')

XGBoost RMSE: 3.1189017295837402


In [37]:
model_name = model_dir+'xgb_model_train_df_part_1.bin'
pickle.dump(xgb_model, open(model_name, 'wb'))