In [31]:
# importing the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels

In [32]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [33]:
from xgboost import XGBRegressor
from skopt import BayesSearchCV
from skopt.space import Real, Integer
from keras_tuner.tuners import BayesianOptimization

In [34]:
#import pandas as pd
#import numpy as np
from xgboost import XGBRegressor
#from sklearn.model_selection import train_test_split
#from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout,GRU
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import TimeSeriesSplit


In [35]:
# load the data into dataframe
df=pd.read_csv('feature_engineered_data.csv')
print(df.head())

   current_value  lights         T1       RH_1         T2       RH_2  \
0          430.0      30  20.133333  48.000000  19.566667  44.400000   
1          250.0      30  20.260000  52.726667  19.730000  45.100000   
2          100.0      10  20.426667  55.893333  19.856667  45.833333   
3          100.0      10  20.566667  53.893333  20.033333  46.756667   
4           90.0      10  20.730000  52.660000  20.166667  47.223333   

          T3       RH_3         T4       RH_4  ...  is_weekend    nsm  lag_1  \
0  19.890000  44.900000  19.000000  46.363333  ...           0  68400  576.6   
1  19.890000  45.493333  19.000000  47.223333  ...           0  69000  430.0   
2  20.033333  47.526667  19.000000  48.696667  ...           0  69600  250.0   
3  20.100000  48.466667  19.000000  48.490000  ...           0  70200  100.0   
4  20.200000  48.530000  18.926667  48.156667  ...           0  70800  100.0   

   lag_2  lag_6  lag_12  hour_sin  hour_cos  rolling_mean_12  rolling_std_12  
0  230.

In [36]:
def implement_model(df,selected_features,mins):
    step=mins//10

    df['target']= df['current_value'].shift(-step)
    df = df.dropna().reset_index(drop=True)
    
    df = df[selected_features + ['target']]

    X=df.drop(columns=['target'])
    y=df['target']

    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)

    # Train-test split (chronologically for time series)
    split_idx = int(len(X_scaled) * 0.8)
    X_train, X_test = X_scaled[:split_idx], X_scaled[split_idx:]
    y_train, y_test = y[:split_idx], y[split_idx:]

    xgb = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=4)
    xgb.fit(X_train, y_train)

    # Predict on training and test
    xgb_train_preds = xgb.predict(X_train)
    xgb_test_preds = xgb.predict(X_test)

    # Stack original inputs + xgb prediction
    X_train_lstm = np.hstack((X_train, xgb_train_preds.reshape(-1, 1)))
    X_test_lstm = np.hstack((X_test, xgb_test_preds.reshape(-1, 1)))

    # Reshape for LSTM: (samples, timesteps, features)
    X_train_lstm = X_train_lstm.reshape((X_train_lstm.shape[0], 1, X_train_lstm.shape[1]))
    X_test_lstm = X_test_lstm.reshape((X_test_lstm.shape[0], 1, X_test_lstm.shape[1]))

    model = Sequential()
    model.add(GRU(64, activation='relu', input_shape=(X_train_lstm.shape[1], X_train_lstm.shape[2])))
    model.add(Dropout(0.2))
    model.add(Dense(1))
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])

    model.fit(X_train_lstm, y_train, epochs=20, batch_size=64, validation_split=0.1, verbose=1,shuffle=False)

    y_pred = model.predict(X_test_lstm)

# Evaluation
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"Minutes:{mins} :- RMSE: {rmse:.2f}, MAE: {mae:.2f}, R²: {r2:.4f}")

    return


In [37]:
import pickle

# Load the selected features for 10-minute forecasting
with open("selected_features_10min.pkl", "rb") as f:
    selected_features_10 = pickle.load(f)

print("Selected Features:", selected_features_10)

implement_model(df,selected_features_10,10)

Selected Features: ['current_value', 'RH_5', 'T6', 'T8', 'RH_8', 'Press_mm_hg', 'nsm', 'lag_1', 'rolling_mean_12', 'rolling_std_12']
Epoch 1/20


  super().__init__(**kwargs)


[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 12616.1992 - mae: 66.3328 - val_loss: 2169.4253 - val_mae: 21.5339
Epoch 2/20
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 3288.4795 - mae: 29.1248 - val_loss: 2164.6760 - val_mae: 21.5669
Epoch 3/20
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 3262.6765 - mae: 29.0421 - val_loss: 2165.3875 - val_mae: 21.4673
Epoch 4/20
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 3277.0188 - mae: 29.1719 - val_loss: 2159.9885 - val_mae: 21.5159
Epoch 5/20
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 3228.5696 - mae: 28.7276 - val_loss: 2156.6033 - val_mae: 21.5890
Epoch 6/20
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 3273.5906 - mae: 29.0696 - val_loss: 2156.5088 - val_mae: 21.4781
Epoch 7/20
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━

In [38]:
with open("selected_features_60min.pkl", "rb") as f:
    selected_features_60 = pickle.load(f)

print("Selected Features:", selected_features_60)

implement_model(df,selected_features_60,60)

Selected Features: ['current_value', 'T3', 'RH_5', 'RH_8', 'T_out', 'Press_mm_hg', 'nsm', 'hour_cos', 'rolling_mean_12', 'rolling_std_12']
Epoch 1/20


  super().__init__(**kwargs)


[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 14268.6934 - mae: 70.0530 - val_loss: 4471.5249 - val_mae: 35.2350
Epoch 2/20
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 6087.2612 - mae: 45.2242 - val_loss: 4436.9380 - val_mae: 34.0956
Epoch 3/20
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 6025.2886 - mae: 44.4981 - val_loss: 4458.3867 - val_mae: 33.4747
Epoch 4/20
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 6006.0586 - mae: 44.2724 - val_loss: 4413.8379 - val_mae: 33.6329
Epoch 5/20
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 6019.4824 - mae: 44.3105 - val_loss: 4400.0312 - val_mae: 33.5870
Epoch 6/20
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 5966.8931 - mae: 43.7464 - val_loss: 4374.5293 - val_mae: 33.8256
Epoch 7/20
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━

In [39]:
def hyperparameter_tuning(df,selected_features,mins):
    step=mins//10

    df['target']= df['current_value'].shift(-step)
    df = df.dropna().reset_index(drop=True)
    
    df = df[selected_features + ['target']]

    X=df.drop(columns=['target'])
    y=df['target']

    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)

    target_scaler = MinMaxScaler()
    y_scaled = target_scaler.fit_transform(y.values.reshape(-1, 1))
    #y_test_scaled = target_scaler.transform(y_test.values.reshape(-1, 1))

    # Train-test split (chronologically for time series)
    split_idx = int(len(X_scaled) * 0.8)
    X_train, X_test = X_scaled[:split_idx], X_scaled[split_idx:]
    y_train, y_test = y_scaled[:split_idx], y_scaled[split_idx:]

    

    search_spaces = {
        'n_estimators': Integer(50, 200),
        'max_depth': Integer(3, 10),
        'learning_rate': Real(0.01, 0.3, 'log-uniform'),
        'subsample': Real(0.6, 1.0),
        'colsample_bytree': Real(0.6, 1.0)
    }
    tscv = TimeSeriesSplit(n_splits=10)

    opt = BayesSearchCV(
        XGBRegressor(),
        search_spaces,
        n_iter=20,
        scoring='r2',
        cv=tscv,
        verbose=1,
        random_state=42
    )

    opt.fit(X_train, y_train)
    best_xgb = opt.best_estimator_

    # Get XGB predictions
    xgb_train_preds = best_xgb.predict(X_train).reshape(-1, 1)
    xgb_test_preds = best_xgb.predict(X_test).reshape(-1, 1)

    # Combine XGB predictions with original features
    X_train_gru = np.hstack((X_train, xgb_train_preds))
    X_test_gru = np.hstack((X_test, xgb_test_preds))

    # Reshape for GRU (samples, timesteps=1, features)
    X_train_gru = X_train_gru.reshape((X_train_gru.shape[0], 1, X_train_gru.shape[1]))
    X_test_gru = X_test_gru.reshape((X_test_gru.shape[0], 1, X_test_gru.shape[1]))

    def build_gru_model(hp):
        model = Sequential()
        model.add(GRU(
            units=hp.Int('units', min_value=32, max_value=128, step=32),
            activation='tanh',
            input_shape=(X_train_gru.shape[1], X_train_gru.shape[2])
        ))
        model.add(Dropout(hp.Float('dropout', 0.1, 0.5, step=0.1)))
        model.add(Dense(1))
        model.compile(
            optimizer='adam',
            loss='mse',
            metrics=['mae']
        )
        return model
    
    tuner = BayesianOptimization(
        build_gru_model,
        objective='val_mae',
        max_trials=15,
        directory='xgboost_gru_tuning',
        project_name=f'xgb_gru{mins}'
    )

    tuner.search(X_train_gru, y_train, epochs=20, validation_split=0.2, batch_size=64, verbose=1,shuffle=False)

    # Get best model
    best_gru_model = tuner.get_best_models(num_models=1)[0]

    y_pred = best_gru_model.predict(X_test_gru)

    y_pred_actual = target_scaler.inverse_transform(y_pred)
    y_test_actual = target_scaler.inverse_transform(y_test)

    rmse = np.sqrt(mean_squared_error(y_test_actual, y_pred_actual))
    mae = mean_absolute_error(y_test_actual, y_pred_actual)
    r2 = r2_score(y_test_actual, y_pred_actual)

    print(f" XGB-GRU {mins} mins:- RMSE: {rmse:.2f}, MAE: {mae:.2f}, R²: {r2:.4f}")
    return

In [40]:
hyperparameter_tuning(df,selected_features_10,10)

Trial 15 Complete [00h 00m 12s]
val_mae: 0.046236272901296616

Best val_mae So Far: 0.042810313403606415
Total elapsed time: 00h 02m 44s
[1m 74/124[0m [32m━━━━━━━━━━━[0m[37m━━━━━━━━━[0m [1m0s[0m 767us/step

  saveable.load_own_variables(weights_store.get(inner_path))


[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
 XGB-GRU 10 mins:- RMSE: 55.43, MAE: 24.92, R²: 0.5933


In [41]:
hyperparameter_tuning(df,selected_features_60,60)

Trial 15 Complete [00h 00m 10s]
val_mae: 0.0764235407114029

Best val_mae So Far: 0.07605809718370438
Total elapsed time: 00h 02m 43s
[1m 67/124[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m0s[0m 780us/step

  saveable.load_own_variables(weights_store.get(inner_path))


[1m124/124[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
 XGB-GRU 60 mins:- RMSE: 76.32, MAE: 42.30, R²: 0.2281
