In [20]:
from sklearn.model_selection import GridSearchCV, KFold
from xgboost import XGBRegressor
import pandas as pd

In [21]:
# Read the CSV into a DataFrame and preview its first two rows
DATA_PATH = "anomaly_removed.csv"
df = pd.read_csv(DATA_PATH)
df.head(2)

Unnamed: 0,Timestamp,OxEnRa,BlFuPeIn,EnOxFl,CoBlFl,BlMo,BlFuBoGaVo,BlFuBoGaIn,ThCoTe,ToGaPr,...,CoBlTe,HoBlTe,ToTe,BlHu,CoInSeVa,FoSI,HoBl,ToGasP,CoBF,SI
0,2021-05-12 17:56:25.000,3.691772,17.661523,16941.90435,36.47267,146.285825,7925.957227,81.975757,2210.712256,224.36648,...,212.936183,1087.730999,1.79588,13.118669,45.994059,0.481272,1075.254532,221.600735,36.796521,0.479
1,2021-05-12 17:56:25.000,3.744628,0.193811,17554.954071,34.087434,161.762571,8049.51886,81.637472,2268.780322,0.005692,...,213.288686,2.082851,1.79588,10.353929,1.580861,0.337968,2.110829,0.005584,34.238289,0.312456


In [22]:
# Dimensions of the loaded frame as (n_rows, n_columns)
df.shape

(5366, 25)

In [23]:
# Grid search over XGBoost hyperparameters, scored by 10-fold cross-validated MSE

# Features: every column after the first and before the last; target: the last column.
# The numeric-only filter is a no-op when the slice is already numeric, and it keeps
# a text column (e.g. a timestamp string) from reaching XGBoost if one falls inside
# the positional slice.
# NOTE(review): the df.head() preview lists 'Unnamed: 0' and 'Timestamp' first --
# confirm which of them are real columns and that neither is meant to be a feature.
X = df.iloc[:, 1:-1].select_dtypes(include="number").values
y = df.iloc[:, -1].values

# n_jobs=1: GridSearchCV below already spreads the fits over all cores (n_jobs=-1);
# letting each XGBoost fit open its own thread pool as well causes thread contention.
xgb = XGBRegressor(objective='reg:squarederror', random_state=42, n_jobs=1)

# 3 * 3 * 2 * 2 * 3 = 108 candidate combinations
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1],
    'subsample': [0.8, 1.0],
    'gamma': [0.3, 0.5, 1]
}

# Shuffled 10-fold split with a fixed seed so the folds are reproducible.
# NOTE(review): the rows carry timestamps; shuffled folds mix time order -- confirm
# that is acceptable for this data, otherwise consider a time-ordered splitter.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',  # scores are maximized, hence the negated MSE
    cv=kfold,
    verbose=3,
    n_jobs=-1  # run the cross-validation fits in parallel on all available cores
)

grid_search.fit(X, y)

Fitting 10 folds for each of 108 candidates, totalling 1080 fits


In [24]:
# Report the winning hyperparameter combination and its cross-validated error
best_params = grid_search.best_params_
best_mse = -grid_search.best_score_  # the score is a negated MSE, so flip the sign back
print("Best Parameters:", best_params)
print("Best Score (MSE):", best_mse)

Best Parameters: {'gamma': 0.3, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.8}
Best Score (MSE): 0.007281271716580715
