In [1]:
# importing the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels

In [2]:
# Read the pre-engineered feature table from disk and preview its first five rows.
df = pd.read_csv('feature_engineered_data.csv')
print(df.head(5))

   current_value  lights         T1       RH_1         T2       RH_2  \
0          430.0      30  20.133333  48.000000  19.566667  44.400000   
1          250.0      30  20.260000  52.726667  19.730000  45.100000   
2          100.0      10  20.426667  55.893333  19.856667  45.833333   
3          100.0      10  20.566667  53.893333  20.033333  46.756667   
4           90.0      10  20.730000  52.660000  20.166667  47.223333   

          T3       RH_3         T4       RH_4  ...  is_weekend    nsm  lag_1  \
0  19.890000  44.900000  19.000000  46.363333  ...           0  68400  576.6   
1  19.890000  45.493333  19.000000  47.223333  ...           0  69000  430.0   
2  20.033333  47.526667  19.000000  48.696667  ...           0  69600  250.0   
3  20.100000  48.466667  19.000000  48.490000  ...           0  70200  100.0   
4  20.200000  48.530000  18.926667  48.156667  ...           0  70800  100.0   

   lag_2  lag_6  lag_12  hour_sin  hour_cos  rolling_mean_12  rolling_std_12  
0  230.

In [3]:
import pickle

# Load the selected features for 60-minute forecasting
# (the file read here is the "60min" feature list).
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only open this file if it comes from a trusted source.
with open("selected_features_60min.pkl", "rb") as f:
    selected_features = pickle.load(f)

print("Selected Features:", selected_features)

Selected Features: ['current_value', 'T3', 'RH_5', 'RH_8', 'T_out', 'Press_mm_hg', 'nsm', 'hour_cos', 'rolling_mean_12', 'rolling_std_12']


In [4]:
# Forecast target: the value of `current_value` six rows further down.
# Assumes rows are consecutive 10-minute samples (the `nsm` column advances by
# 600 between rows in the preview), so six rows correspond to 60 minutes.
# The last six rows receive NaN because no future value exists for them.
df['target_60min'] = df['current_value'].shift(periods=-6)

In [5]:
# Remove every row that holds a NaN in any column, then renumber the index
# from 0 so positional and label-based indexing agree again.
df = df.dropna(axis=0, how='any')
df = df.reset_index(drop=True)

In [6]:
# Narrow the frame to the selected input features followed by the target column.
df = df.loc[:, selected_features + ['target_60min']]

In [7]:
df.head()

Unnamed: 0,current_value,T3,RH_5,RH_8,T_out,Press_mm_hg,nsm,hour_cos,rolling_mean_12,rolling_std_12,target_60min
0,430.0,19.89,55.09,48.56,6.0,734.5,68400,0.258819,115.55,153.488504,80.0
1,250.0,19.89,55.163333,48.666667,6.0,734.616667,69000,0.258819,146.383333,176.720271,140.0
2,100.0,20.033333,55.5,49.193333,6.0,734.733333,69600,0.258819,162.216667,176.788676,120.0
3,100.0,20.1,56.0425,49.2,6.0,734.85,70200,0.258819,166.383333,174.477557,190.0
4,90.0,20.2,56.49,49.633333,6.0,734.966667,70800,0.258819,170.55,172.02535,110.0


In [8]:
# Separate the response column from the predictor columns.
y = df['target_60min']
X = df.drop('target_60min', axis=1)

In [9]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [10]:
# Ordered hold-out split with no shuffling: the first 80% of rows are used for
# training and the remaining rows for testing.
train_size = int(0.8 * len(df))

X_train = X.iloc[:train_size]
X_test = X.iloc[train_size:]

y_train = y.iloc[:train_size]
y_test = y.iloc[train_size:]

In [11]:
# Learn the per-feature min/max from the training rows only, then apply that
# same mapping to both splits so no test-set statistics reach the scaler.
scaler_X = MinMaxScaler().fit(X_train)
X_train_scaled = scaler_X.transform(X_train)
X_test_scaled = scaler_X.transform(X_test)

# The target is not scaled: y_train / y_test keep their original units.

In [12]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [13]:
def run_model(model_name, model, X_train_scaled, y_train, X_test_scaled, y_true=None):
    """Fit ``model`` on the training split and print hold-out R², MAE and RMSE.

    Parameters
    ----------
    model_name : str
        Label printed on the "Model Name:" line.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    X_train_scaled, y_train
        Training features and targets handed to ``model.fit``.
    X_test_scaled
        Features handed to ``model.predict``.
    y_true : optional
        Ground-truth targets aligned with ``X_test_scaled``. When omitted, the
        notebook-level ``y_test`` is used, which is what this function always
        did before the parameter existed.

    Returns
    -------
    None
        The metrics are only printed.
    """
    if y_true is None:
        # Backward-compatible fallback to the notebook global. Pass ``y_true``
        # explicitly to score against any other split.
        y_true = y_test

    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))

    # Print evaluation metrics
    print("Model Name:", model_name)
    print("R² Score:", r2)
    print("MAE:", mae)
    print("RMSE:", rmse)

In [14]:
import xgboost
from xgboost import XGBRegressor

# Gradient-boosted trees with fixed hyperparameters, scored on the hold-out split.
xg = XGBRegressor(
    max_depth=5,
    learning_rate=0.1,
    n_estimators=100,
    random_state=42,
)
run_model('XgBoost', xg, X_train_scaled, y_train, X_test_scaled)

Model Name: XgBoost
R² Score: 0.10726052553477805
MAE: 49.90632376858235
RMSE: 82.07373416220432


In [15]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with fixed hyperparameters, scored on the hold-out split.
rf = RandomForestRegressor(
    max_depth=10,
    n_estimators=100,
    random_state=42,
)
run_model('Random Forest', rf, X_train_scaled, y_train, X_test_scaled)

Model Name: Random Forest
R² Score: 0.10407520790830826
MAE: 49.86968407947341
RMSE: 82.22002439390138


In [16]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares model with default settings, scored on the hold-out split.
lr = LinearRegression()
run_model(
    'Linear Regression',
    lr,
    X_train_scaled,
    y_train,
    X_test_scaled,
)

Model Name: Linear Regression
R² Score: 0.21220335723041217
MAE: 41.669926115596965
RMSE: 77.09902515874069


In [17]:
def evaluate(model,y_pred,y_test):
    """Print R², MAE and RMSE for a set of predictions.

    Parameters
    ----------
    model
        Value shown after "Model Name:". Callers in this notebook pass a
        display string, not an estimator object.
    y_pred
        Predicted target values.
    y_test
        True target values aligned with ``y_pred``.

    Returns
    -------
    None
        The metrics are only printed.
    """
    squared_error = mean_squared_error(y_test, y_pred)

    # All metrics are computed before anything is printed.
    report = (
        ("Model Name:", model),
        ("R² Score:", r2_score(y_test, y_pred)),
        ("MAE:", mean_absolute_error(y_test, y_pred)),
        ("RMSE:", np.sqrt(squared_error)),
    )

    # Print evaluation metrics
    for label, value in report:
        print(label, value)

In [18]:
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split, TimeSeriesSplit

In [19]:
# Five time-ordered cross-validation splits over the training rows.
tscv = TimeSeriesSplit(n_splits=5)

# Mean R² of the linear model across the splits.
cv_scores = cross_val_score(
    estimator=lr,
    X=X_train_scaled,
    y=y_train,
    scoring='r2',
    cv=tscv,
)
print("Linear Regression Cross-Validated R²:", cv_scores.mean())

# Refit on the whole training split and score once on the hold-out rows.
y_pred_lr = lr.fit(X_train_scaled, y_train).predict(X_test_scaled)
evaluate("Linear Regression", y_pred_lr, y_test)

Linear Regression Cross-Validated R²: 0.2180890261435771
Model Name: Linear Regression
R² Score: 0.21220335723041217
MAE: 41.669926115596965
RMSE: 77.09902515874069


In [20]:
# Exhaustive search over a small random-forest grid, ranked by R² on the
# time-ordered CV splits held in `tscv`.
rf = RandomForestRegressor(random_state=42)

rf_params = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

grid_rf = GridSearchCV(
    estimator=rf,
    param_grid=rf_params,
    scoring='r2',
    cv=tscv,
    n_jobs=-1,  # use every available core
).fit(X_train_scaled, y_train)

print("Best RF Parameters:", grid_rf.best_params_)
print("Best RF CV R²:", grid_rf.best_score_)

# Predicting through the fitted search object uses its best refitted estimator.
y_pred_rf = grid_rf.predict(X_test_scaled)
evaluate("Random Forest", y_pred_rf, y_test)


Best RF Parameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
Best RF CV R²: 0.10064035191418086
Model Name: Random Forest
R² Score: 0.12200223582215952
MAE: 49.01798010784695
RMSE: 81.39327604296474


In [21]:
# Exhaustive search over a small XGBoost grid, ranked by R² on the
# time-ordered CV splits held in `tscv`.
xgb = XGBRegressor(objective='reg:squarederror', random_state=42)

xgb_params = dict(
    n_estimators=[100, 200],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1],
    subsample=[0.8, 1],
    colsample_bytree=[0.8, 1],
)

grid_xgb = GridSearchCV(
    estimator=xgb,
    param_grid=xgb_params,
    scoring='r2',
    cv=tscv,
    n_jobs=-1,  # use every available core
).fit(X_train_scaled, y_train)

print("Best XGB Parameters:", grid_xgb.best_params_)
print("Best XGB CV R²:", grid_xgb.best_score_)

# Predicting through the fitted search object uses its best refitted estimator.
y_pred_xgb = grid_xgb.predict(X_test_scaled)
evaluate("XgBoost", y_pred_xgb, y_test)


Best XGB Parameters: {'colsample_bytree': 1, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.8}
Best XGB CV R²: 0.23271440960940767
Model Name: XgBoost
R² Score: 0.2589999452240831
MAE: 40.49571940342746
RMSE: 74.77405688576935
