In [None]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold, train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression
import joblib
import seaborn as sns
import matplotlib.pyplot as plt

# Load sample dataset
file_path = 'Dataset.csv'
data = pd.read_csv(file_path)

In [2]:
# Split dataset into features and target
data = data.drop(columns=["Date", "Hour", "Real Estate Development Index"])
X = data.drop(columns=["Historical Demand (MW)"])
y = data["Historical Demand (MW)"]

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8761 entries, 0 to 8760
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Temperature (°C)        8761 non-null   int64  
 1   Humidity (%)            8761 non-null   int64  
 2   Wind Speed (km/h)       8761 non-null   int64  
 3   Rain (mm)               8761 non-null   float64
 4   Public Holiday          8761 non-null   object 
 5   Seasonal Factor         8761 non-null   object 
 6   Day of Week             8761 non-null   object 
 7   Historical Demand (MW)  8761 non-null   int64  
dtypes: float64(1), int64(4), object(3)
memory usage: 547.7+ KB


In [4]:
data.head()

Unnamed: 0,Temperature (°C),Humidity (%),Wind Speed (km/h),Rain (mm),Public Holiday,Seasonal Factor,Day of Week,Historical Demand (MW)
0,28,85,11,0.0,No,Winter,Monday,1124
1,21,49,5,0.0,No,Winter,Monday,1312
2,5,66,4,0.0,No,Winter,Monday,802
3,37,38,0,0.0,No,Winter,Monday,901
4,23,65,4,0.0,No,Winter,Monday,675


In [5]:
# One-hot encode categorical features
X = pd.get_dummies(X, columns=["Public Holiday", "Seasonal Factor", "Day of Week"], drop_first=True)

In [6]:
X.head()

Unnamed: 0,Temperature (°C),Humidity (%),Wind Speed (km/h),Rain (mm),Public Holiday_Yes,Seasonal Factor_Monsoon,Seasonal Factor_Summer,Seasonal Factor_Winter,Day of Week_Monday,Day of Week_Saturday,Day of Week_Sunday,Day of Week_Thursday,Day of Week_Tuesday,Day of Week_Wednesday
0,28,85,11,0.0,False,False,False,True,True,False,False,False,False,False
1,21,49,5,0.0,False,False,False,True,True,False,False,False,False,False
2,5,66,4,0.0,False,False,False,True,True,False,False,False,False,False
3,37,38,0,0.0,False,False,False,True,True,False,False,False,False,False
4,23,65,4,0.0,False,False,False,True,True,False,False,False,False,False


In [7]:
# Scale numerical features
scaler = StandardScaler()
numerical_features = ["Temperature (°C)", "Humidity (%)", "Wind Speed (km/h)", "Rain (mm)"]
X[numerical_features] = scaler.fit_transform(X[numerical_features])

In [8]:
X.head()

Unnamed: 0,Temperature (°C),Humidity (%),Wind Speed (km/h),Rain (mm),Public Holiday_Yes,Seasonal Factor_Monsoon,Seasonal Factor_Summer,Seasonal Factor_Winter,Day of Week_Monday,Day of Week_Saturday,Day of Week_Sunday,Day of Week_Thursday,Day of Week_Tuesday,Day of Week_Wednesday
0,0.305151,1.497193,0.928589,-0.539278,False,False,False,True,True,False,False,False,False,False
1,-0.303909,-0.601624,-0.460687,-0.539278,False,False,False,True,True,False,False,False,False,False
2,-1.696047,0.389484,-0.692233,-0.539278,False,False,False,True,True,False,False,False,False,False
3,1.088228,-1.242929,-1.618418,-0.539278,False,False,False,True,True,False,False,False,False,False
4,-0.129892,0.331183,-0.692233,-0.539278,False,False,False,True,True,False,False,False,False,False


In [9]:
# Encode the target variable
label_encoder = LabelEncoder()
y_encoded = y

In [10]:
# Save the scaler and label encoder
joblib.dump(scaler, "scaler.pkl")
joblib.dump(label_encoder, "label_encoder.pkl")

['label_encoder.pkl']

In [11]:
# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

In [12]:
# Initialize the model
# model = GradientBoostingClassifier(random_state=42)
# model = RandomForestRegressor(random_state=42) #Mean Squared Error (MSE): 307884.2463, Root Mean Squared Error (RMSE): 554.8732, Mean Absolute Error (MAE): 453.4872, R-squared (R²): 0.6762
model = LinearRegression()  #Mean Squared Error (MSE): 277053.0080, Root Mean Squared Error (RMSE): 526.3583, Mean Absolute Error (MAE): 434.8800, R-squared (R²): 0.7086
# model = CatBoostRegressor(random_state=42, verbose=0) #Mean Squared Error (MSE): 299312.0664, Root Mean Squared Error (RMSE): 547.0942, Mean Absolute Error (MAE): 444.1233, R-squared (R²): 0.6852


In [13]:
# Perform stratified k-fold cross-validation
cv = KFold(n_splits=2, shuffle=True, random_state=42)
cv_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring="neg_mean_squared_error")
print(f"Cross-Validation Accuracy: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")

Cross-Validation Accuracy: -535257.9512 ± 11300.9508


In [14]:
# Fit the model
model.fit(X_train, y_train)

In [15]:
# Predict on the test set
y_pred = model.predict(X_test)

# Calculate Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error (MSE): {mse:.4f}")

# Calculate Root Mean Squared Error
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")

# Calculate Mean Absolute Error
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error (MAE): {mae:.4f}")

# Calculate R-squared
r2 = r2_score(y_test, y_pred)
print(f"R-squared (R²): {r2:.4f}")

Mean Squared Error (MSE): 521942.5971
Root Mean Squared Error (RMSE): 722.4559
Mean Absolute Error (MAE): 595.9408
R-squared (R²): 0.7007


In [16]:
# Save the model
joblib.dump(model, "electricity_consumption_model.pkl")
print("Model saved to electricity_consumption_model.pkl")

Model saved to electricity_consumption_model.pkl
