In [30]:
# Department: ESTSOFT
# Class: AI Modelling
# Category: Machine learning
# Title: Electricity consumption prediction
# Contributors: 
# Last modified date: 31/03/25

### **Library**

In [31]:
# Library
# Time
from timeit import default_timer as timer
import time
from tqdm.auto import tqdm

# File
import os
import requests
import zipfile
from pathlib import Path
from PIL import Image
import random
import chardet

# Numerical & Data Handling
import numpy as np
import pandas as pd
import scipy as sp
import math

# Visualization
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib.lines import Line2D
%matplotlib inline
from sklearn.tree import plot_tree
from scipy.optimize import curve_fit

# Machine Learning Libraries
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (RandomForestClassifier, RandomForestRegressor, 
							  ExtraTreesClassifier, ExtraTreesRegressor, 
							  BaggingClassifier, BaggingRegressor, 
							  GradientBoostingClassifier, GradientBoostingRegressor, 
							  AdaBoostClassifier, AdaBoostRegressor, 
							  VotingClassifier, VotingRegressor,
							  StackingClassifier, StackingRegressor)
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC, SVR
from sklearn.multiclass import OneVsRestClassifier
from sklearn.cluster import KMeans
from sklearn.naive_bayes import GaussianNB
from mlxtend.frequent_patterns import apriori, association_rules
import xgboost as xgb

# Neural Network Libraries
import torch
from torch import nn as nn
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset, random_split
import torchvision
from torchvision import datasets, transforms, models
from torchvision.transforms import ToTensor
import huggingface
import keras
import tensorflow
from transformers import pipeline

# Feature Engineering
from sklearn.preprocessing import (StandardScaler, RobustScaler, MinMaxScaler, MaxAbsScaler, Normalizer, 
								   LabelEncoder, OneHotEncoder, OrdinalEncoder, LabelBinarizer)
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif, RFE, SequentialFeatureSelector, VarianceThreshold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from mlxtend.preprocessing import TransactionEncoder

# Dimensionality Reduction
from sklearn.decomposition import PCA, KernelPCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.manifold import TSNE

# Hyperparameter Tuning with Bayesian Optimization
from hyperopt import hp, tpe, fmin, Trials

# Evaluation
from sklearn.metrics import (classification_report, pairwise_distances, silhouette_score, 
							 roc_curve, auc, roc_auc_score, RocCurveDisplay, 
							 confusion_matrix, ConfusionMatrixDisplay, 
							 accuracy_score, recall_score, precision_score, f1_score,
							 log_loss, hinge_loss, mean_absolute_error, mean_squared_error)
from torchmetrics import Accuracy, Precision, Recall, F1Score, AUROC, ConfusionMatrix, MeanSquaredError, MeanAbsoluteError, R2Score, MetricCollection
import statsmodels.api as sm



### **Data Preparation**

**Data source**

**Electricity consumption**
- 840 samples in total
- Jan 2015 to Dec 2024
- 7 different locations: Seoul, Incheon, Daejeon, Daegu, Ulsan, Gwangju, Busan
- 9 Features
	- Year
	- Month
	- Province
	- Number of Households
	- Avg Power Consumption per Household (kWh)
	- Avg Electricity Bill per Household (KRW) — note: the CSV header spells this column `Avg Electricity Btill per Household (KRW)`
	- Number of Tropical Nights
	- Number of Heatwave Days
	- Number of Coldwave Days

In [32]:
# Read the monthly household electricity records, then print the schema
# (column names, non-null counts, dtypes)
electricity_csv = 'data/electricity_consumption_eng.csv'
df_electricity = pd.read_csv(electricity_csv, encoding='utf-8')
df_electricity.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 840 entries, 0 to 839
Data columns (total 9 columns):
 #   Column                                     Non-Null Count  Dtype 
---  ------                                     --------------  ----- 
 0   Year                                       840 non-null    int64 
 1   Month                                      840 non-null    int64 
 2   Province                                   840 non-null    object
 3   Number of Households                       840 non-null    int64 
 4   Avg Power Consumption per Household (kWh)  840 non-null    int64 
 5   Avg Electricity Btill per Household (KRW)  840 non-null    int64 
 6   Number of Tropical Nights                  840 non-null    int64 
 7   Number of Heatwave Days                    840 non-null    int64 
 8   Number of Coldwave Days                    840 non-null    int64 
dtypes: int64(8), object(1)
memory usage: 59.2+ KB


**Weather forecast**
- 861 samples in total
- Jan 2015 to Feb 2025
- 7 different locations: Seoul, Incheon, Daejeon, Daegu, Ulsan, Gwangju, Busan
- 21 Features
	- Station Name                  
	- Year                            
	- Month                          
	- Avg Temperature (Celsius) — note: the CSV header is `Avg Temperature (Celsius` (closing parenthesis missing)
	- Avg Max Temperature (Celsius) 
	- Avg Min Temperature (Celsius) 
	- Avg Local Pressure (hPa)      
	- Avg Sea Level Pressure (hPa)  
	- Avg Vapor Pressure (hPa)      
	- Avg Dew Point Temp (Celsius)  
	- Avg Relative Humidity (%)     
	- Monthly Precipitation (mm)    
	- Small Pan Evaporation (mm)    
	- Avg Wind Speed (m/s)          
	- Max Wind Speed (m/s)          
	- Avg Cloud Cover (1/10)        
	- Total Sunshine Hours (hr)     
	- Sunshine Rate (%)             
	- Total Solar Radiation (MJ/m^2)
	- Avg Min Surface Temp (Celsius)
	- Avg Ground Temp (Celsius) 

In [33]:
# Read the monthly weather records, then print the schema
# (column names, non-null counts, dtypes)
weather_csv = 'data/weather_forecast_eng.csv'
df_weather = pd.read_csv(weather_csv, encoding='utf-8')
df_weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 861 entries, 0 to 860
Data columns (total 21 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Station Name                    861 non-null    object 
 1   Year                            861 non-null    int64  
 2   Month                           861 non-null    int64  
 3   Avg Temperature (Celsius        861 non-null    float64
 4   Avg Max Temperature (Celsius)   861 non-null    float64
 5   Avg Min Temperature (Celsius)   861 non-null    float64
 6   Avg Local Pressure (hPa)        861 non-null    float64
 7   Avg Sea Level Pressure (hPa)    861 non-null    float64
 8   Avg Vapor Pressure (hPa)        861 non-null    float64
 9   Avg Dew Point Temp (Celsius)    861 non-null    float64
 10  Avg Relative Humidity (%)       861 non-null    int64  
 11  Monthly Precipitation (mm)      861 non-null    float64
 12  Small Pan Evaporation (mm)      728 

In [34]:
# Count the missing entries in every weather column
missing_per_column = df_weather.isna().sum()
print(missing_per_column)

Station Name                        0
Year                                0
Month                               0
Avg Temperature (Celsius            0
Avg Max Temperature (Celsius)       0
Avg Min Temperature (Celsius)       0
Avg Local Pressure (hPa)            0
Avg Sea Level Pressure (hPa)        0
Avg Vapor Pressure (hPa)            0
Avg Dew Point Temp (Celsius)        0
Avg Relative Humidity (%)           0
Monthly Precipitation (mm)          0
Small Pan Evaporation (mm)        133
Avg Wind Speed (m/s)                0
Max Wind Speed (m/s)                0
Avg Cloud Cover (1/10)              6
Total Sunshine Hours (hr)           0
Sunshine Rate (%)                   0
Total Solar Radiation (MJ/m^2)     97
Avg Min Surface Temp (Celsius)      0
Avg Ground Temp (Celsius)           0
dtype: int64


In [35]:
# Handle missing values w/ ffill, one station at a time.
# A frame-wide ffill() copies the value from the row above regardless of which
# station that row belongs to, so a gap could be filled with another city's
# reading. Filling inside each station keeps imputed values local.
station_col = 'Station Name'
fill_cols = [col for col in df_weather.columns if col not in (station_col, 'Year', 'Month')]

# Sort chronologically so "previous value" means "previous month". transform()
# returns rows labelled with the original index, so the assignment below aligns
# back onto df_weather without changing its row order.
filled_by_station = (
    df_weather.sort_values(['Year', 'Month'])
    .groupby(station_col)[fill_cols]
    # ffill covers gaps after the first observation; bfill covers leading gaps
    .transform(lambda series: series.ffill().bfill())
)
df_weather[fill_cols] = filled_by_station

# Last resort (the previous behaviour): a station that never reports a variable
# has no value of its own to copy, so fall back to the frame-wide forward fill.
df_weather.ffill(inplace=True)

In [36]:
# Count the missing entries per weather column again, after the fill
missing_after_fill = df_weather.isna().sum()
print(missing_after_fill)

Station Name                      0
Year                              0
Month                             0
Avg Temperature (Celsius          0
Avg Max Temperature (Celsius)     0
Avg Min Temperature (Celsius)     0
Avg Local Pressure (hPa)          0
Avg Sea Level Pressure (hPa)      0
Avg Vapor Pressure (hPa)          0
Avg Dew Point Temp (Celsius)      0
Avg Relative Humidity (%)         0
Monthly Precipitation (mm)        0
Small Pan Evaporation (mm)        0
Avg Wind Speed (m/s)              0
Max Wind Speed (m/s)              0
Avg Cloud Cover (1/10)            0
Total Sunshine Hours (hr)         0
Sunshine Rate (%)                 0
Total Solar Radiation (MJ/m^2)    0
Avg Min Surface Temp (Celsius)    0
Avg Ground Temp (Celsius)         0
dtype: int64


In [37]:
# Build one table: each monthly electricity record joined to the weather
# recorded for the same location and month.
def _with_date_and_location(df, location_col):
    """Return a copy of `df` keyed by `Date` and `Location`.

    `Date` is the first day of each row's Year/Month. The `Year` and `Month`
    columns are dropped and `location_col` is renamed to `Location`.
    """
    first_of_month = pd.to_datetime(dict(year=df['Year'], month=df['Month'], day=1))
    return (
        df.assign(Date=first_of_month)
        .drop(columns=['Year', 'Month'])
        .rename(columns={location_col: 'Location'})
    )


# Give both frames the same join keys (Date, Location)
df_electricity = _with_date_and_location(df_electricity, 'Province')
df_weather = _with_date_and_location(df_weather, 'Station Name')

# Merge df_weather and df_electricity based on the date and location.
# validate='one_to_one' raises if either side repeats a (Date, Location) key,
# which would otherwise silently duplicate rows in the result.
df_merged = pd.merge(
    df_electricity,
    df_weather,
    on=['Date', 'Location'],
    how='inner',
    validate='one_to_one',
)

# An inner join silently drops electricity rows that have no weather match
# (for example, a location spelled differently in the two files).
assert len(df_merged) == len(df_electricity), (
    f"{len(df_electricity) - len(df_merged)} electricity rows have no matching weather row"
)

# Put Date first, keep the remaining columns in their merged order, then save
df_merged = df_merged[['Date'] + [col for col in df_merged.columns if col != 'Date']]
df_merged.to_csv('data/merged_data.csv', index=False, encoding='utf-8')

In [39]:
# Print the merged frame's schema: column names, non-null counts and dtypes
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 840 entries, 0 to 839
Data columns (total 26 columns):
 #   Column                                     Non-Null Count  Dtype         
---  ------                                     --------------  -----         
 0   Date                                       840 non-null    datetime64[ns]
 1   Location                                   840 non-null    object        
 2   Number of Households                       840 non-null    int64         
 3   Avg Power Consumption per Household (kWh)  840 non-null    int64         
 4   Avg Electricity Btill per Household (KRW)  840 non-null    int64         
 5   Number of Tropical Nights                  840 non-null    int64         
 6   Number of Heatwave Days                    840 non-null    int64         
 7   Number of Coldwave Days                    840 non-null    int64         
 8   Avg Temperature (Celsius                   840 non-null    float64       
 9   Avg Max Temperature (

### **Data Preprocessing**

### **Visualisation**