In [1]:
# importing the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels

In [2]:
# Read the feature-engineered dataset from disk and show its first rows
df = pd.read_csv(filepath_or_buffer='feature_engineered_data.csv')
print(df.head(n=5))

   current_value  lights         T1       RH_1         T2       RH_2  \
0          430.0      30  20.133333  48.000000  19.566667  44.400000   
1          250.0      30  20.260000  52.726667  19.730000  45.100000   
2          100.0      10  20.426667  55.893333  19.856667  45.833333   
3          100.0      10  20.566667  53.893333  20.033333  46.756667   
4           90.0      10  20.730000  52.660000  20.166667  47.223333   

          T3       RH_3         T4       RH_4  ...  is_weekend    nsm  lag_1  \
0  19.890000  44.900000  19.000000  46.363333  ...           0  68400  576.6   
1  19.890000  45.493333  19.000000  47.223333  ...           0  69000  430.0   
2  20.033333  47.526667  19.000000  48.696667  ...           0  69600  250.0   
3  20.100000  48.466667  19.000000  48.490000  ...           0  70200  100.0   
4  20.200000  48.530000  18.926667  48.156667  ...           0  70800  100.0   

   lag_2  lag_6  lag_12  hour_sin  hour_cos  rolling_mean_12  rolling_std_12  
0  230.

In [3]:
import pickle

# Restore the feature list chosen for the 10-minute forecasting task.
# NOTE(review): pickle.load executes arbitrary code embedded in the file;
# only load pickles that were produced by a trusted source.
with open("selected_features_10min.pkl", mode="rb") as f:
    selected_features = pickle.load(f)

print("Selected Features:", selected_features)

Selected Features: ['current_value', 'RH_5', 'T6', 'T8', 'RH_8', 'Press_mm_hg', 'nsm', 'lag_1', 'rolling_mean_12', 'rolling_std_12']


In [4]:
# Target = the next row's current_value (shifted up by one row);
# the final row has no successor and therefore gets NaN.
df['target_10min'] = df['current_value'].shift(periods=-1)

In [5]:
# Drop only rows that lack a value in a column that is actually modelled
# (the selected features plus the target). A bare dropna() would also discard
# rows because of NaNs in columns that are removed right after this step.
df = df.dropna(subset=selected_features + ['target_10min']).reset_index(drop=True)

In [6]:

# Keep just the selected feature columns followed by the target column
df = df[[*selected_features, 'target_10min']]

In [7]:
# Preview the reduced frame (first five rows)
df.head(n=5)

Unnamed: 0,current_value,RH_5,T6,T8,RH_8,Press_mm_hg,nsm,lag_1,rolling_mean_12,rolling_std_12,target_10min
0,430.0,55.09,6.123333,18.066667,48.56,734.5,68400,576.6,115.55,153.488504,250.0
1,250.0,55.163333,6.0675,18.033333,48.666667,734.616667,69000,430.0,146.383333,176.720271,100.0
2,100.0,55.5,5.9,18.1,49.193333,734.733333,69600,250.0,162.216667,176.788676,100.0
3,100.0,56.0425,5.8,18.15,49.2,734.85,70200,100.0,166.383333,174.477557,90.0
4,90.0,56.49,5.726667,18.23,49.633333,734.966667,70800,100.0,170.55,172.02535,70.0


In [8]:
# Feature matrix: every column except the target
X = df.drop('target_10min', axis=1)
# Target vector
y = df.loc[:, 'target_10min']

In [9]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [10]:
# Chronological 80/20 split: the first 80% of rows train the models and the
# remaining rows are held out, so no shuffling takes place.
train_size = int(0.8 * len(df))

X_train = X.iloc[:train_size]
X_test = X.iloc[train_size:]

y_train = y.iloc[:train_size]
y_test = y.iloc[train_size:]

In [11]:
# Learn the min/max of each feature from the training rows only, then apply
# the same transformation to both the training and the held-out rows.
scaler_X = MinMaxScaler().fit(X_train)
X_train_scaled = scaler_X.transform(X_train)
X_test_scaled = scaler_X.transform(X_test)

In [12]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [13]:
def run_model(model_name, model ,X_train_scaled,y_train,X_test_scaled, y_true=None):
    """Fit a model, predict on the test features and print R², MAE and RMSE.

    Parameters
    ----------
    model_name : label printed next to the metrics.
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X_train_scaled, y_train : training features and targets passed to ``fit``.
    X_test_scaled : features passed to ``predict``.
    y_true : optional ground-truth targets to score the predictions against.
        When omitted, the notebook-level ``y_test`` is used, which keeps the
        original five-argument calls working unchanged.

    Returns
    -------
    None. The metrics are only printed.
    """
    if y_true is None:
        # Backward-compatible fallback: the original implementation always
        # scored against the global ``y_test`` instead of an argument.
        y_true = y_test

    model.fit(X_train_scaled,y_train)

    y_pred=model.predict(X_test_scaled)

    r2 = r2_score(y_true, y_pred)
    mae  = mean_absolute_error(y_true, y_pred)
    # RMSE is the square root of the mean squared error
    rmse  = np.sqrt(mean_squared_error(y_true, y_pred))

    # Print evaluation metrics
    print("Model Name:",model_name)
    print("R² Score:", r2)
    print("MAE:", mae)
    print("RMSE:", rmse)

    return


In [14]:
import xgboost
from xgboost import XGBRegressor

In [15]:
# Baseline XGBoost regressor with fixed hyperparameters
xg = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42,
)
run_model(
    model_name='XgBoost',
    model=xg,
    X_train_scaled=X_train_scaled,
    y_train=y_train,
    X_test_scaled=X_test_scaled,
)

Model Name: XgBoost
R² Score: 0.5763691249658942
MAE: 27.05437581747656
RMSE: 56.57802841443363


In [16]:
from sklearn.ensemble import RandomForestRegressor
# Baseline random forest with fixed hyperparameters
rf = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    random_state=42,
)

run_model(
    model_name='Random Forest',
    model=rf,
    X_train_scaled=X_train_scaled,
    y_train=y_train,
    X_test_scaled=X_test_scaled,
)

Model Name: Random Forest
R² Score: 0.5290284470439616
MAE: 30.742385497795762
RMSE: 59.65561745268619


In [17]:
from sklearn.linear_model import LinearRegression

# Baseline ordinary least-squares model; it is fitted inside run_model
lr = LinearRegression()

run_model(
    model_name='Linear Regression',
    model=lr,
    X_train_scaled=X_train_scaled,
    y_train=y_train,
    X_test_scaled=X_test_scaled,
)

Model Name: Linear Regression
R² Score: 0.5679236799512523
MAE: 26.72774515191798
RMSE: 57.139211075673316


# Hyperparameter Tuning

In [18]:
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split, TimeSeriesSplit

In [19]:
def evaluate(model,y_pred,y_test):
    """Print R², MAE and RMSE of ``y_pred`` measured against ``y_test``.

    ``model`` is only used as the label printed on the first line; nothing is
    returned.
    """
    # Compute every metric before printing anything
    metric_lines = (
        ("R² Score:", r2_score(y_test, y_pred)),
        ("MAE:", mean_absolute_error(y_test, y_pred)),
        ("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred))),
    )

    print("Model Name:", model)
    for label, value in metric_lines:
        print(label, value)

In [None]:
# Five-fold time-series splitter, also reused by the grid searches below
tscv = TimeSeriesSplit(n_splits=5)

# Cross-validated R² of the linear model on the training portion only
cv_scores = cross_val_score(
    estimator=lr,
    X=X_train_scaled,
    y=y_train,
    scoring='r2',
    cv=tscv,
)
print("Linear Regression Cross-Validated R²:", cv_scores.mean())

# Refit on the full training portion and score on the held-out rows
y_pred_lr = lr.fit(X_train_scaled, y_train).predict(X_test_scaled)
evaluate(model="Linear Regression", y_pred=y_pred_lr, y_test=y_test)


Linear Regression Cross-Validated R²: 0.5854448644110908
Linear Regression Test R²: 0.5679236799512523


In [21]:
# Random forest: exhaustive grid search scored by R² on the time-series folds
rf = RandomForestRegressor(random_state=42)

rf_params = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
}

grid_rf = GridSearchCV(
    estimator=rf,
    param_grid=rf_params,
    scoring='r2',
    n_jobs=-1,
    cv=tscv,
)
grid_rf.fit(X_train_scaled, y_train)

print("Best RF Parameters:", grid_rf.best_params_)
print("Best RF CV R²:", grid_rf.best_score_)

# Predict on the held-out rows through the fitted search object
y_pred_rf = grid_rf.predict(X_test_scaled)
evaluate(model="Random Forest", y_pred=y_pred_rf, y_test=y_test)


Best RF Parameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 200}
Best RF CV R²: 0.585103739055069
Model Name: Random Forest
R² Score: 0.5689907067543907
MAE: 28.43080819416108
RMSE: 57.068613874748166


In [22]:
# XGBoost: exhaustive grid search scored by R² on the time-series folds
xgb = XGBRegressor(objective='reg:squarederror', random_state=42)

xgb_params = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1],
    'subsample': [0.8, 1],
    'colsample_bytree': [0.8, 1],
}

grid_xgb = GridSearchCV(
    estimator=xgb,
    param_grid=xgb_params,
    scoring='r2',
    n_jobs=-1,
    cv=tscv,
)
grid_xgb.fit(X_train_scaled, y_train)

print("Best XGB Parameters:", grid_xgb.best_params_)
print("Best XGB CV R²:", grid_xgb.best_score_)

# Predict on the held-out rows through the fitted search object
y_pred_xgb = grid_xgb.predict(X_test_scaled)

evaluate(model="XgBoost", y_pred=y_pred_xgb, y_test=y_test)


Best XGB Parameters: {'colsample_bytree': 1, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1}
Best XGB CV R²: 0.6117955431105627
Model Name: XgBoost
R² Score: 0.5899071761082513
MAE: 25.75774524819261
RMSE: 55.666650706342665
