In [None]:
# Library
# Time
from timeit import default_timer as timer
import time
from tqdm.auto import tqdm

# File
import os
import requests
import zipfile
from pathlib import Path
from PIL import Image
import random
import chardet

# Numerical & Data Handling
import numpy as np
import pandas as pd
import scipy as sp
import math

# Visualization
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib.lines import Line2D
%matplotlib inline
from sklearn.tree import plot_tree
from scipy.optimize import curve_fit

# Machine Learning Libraries
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.tree import DecisionTreeClassifier,DecisionTreeRegressor
from sklearn.ensemble import (RandomForestClassifier, RandomForestRegressor, 
							  ExtraTreesClassifier, ExtraTreesRegressor, 
							  BaggingClassifier, BaggingRegressor, 
							  GradientBoostingClassifier, GradientBoostingRegressor, 
							  AdaBoostClassifier, AdaBoostRegressor, 
							  VotingClassifier, VotingRegressor,
							  StackingClassifier, StackingRegressor)
from sklearn.linear_model import LinearRegression, LogisticRegression, ElasticNet, Lasso, Ridge
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC, SVR
from sklearn.multiclass import OneVsRestClassifier
from sklearn.cluster import KMeans
from sklearn.naive_bayes import GaussianNB
from mlxtend.frequent_patterns import apriori, association_rules
from xgboost import XGBClassifier, XGBRegressor

# Neural Network Libraries
import torch
from torch import nn as nn
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset, random_split
import torchvision
from torchvision import datasets, transforms, models
from torchvision.transforms import ToTensor
import huggingface
import keras
import tensorflow
from transformers import pipeline

# Feature Engineering
from sklearn.preprocessing import (StandardScaler, RobustScaler, MinMaxScaler, MaxAbsScaler, Normalizer, 
								   LabelEncoder, OneHotEncoder, OrdinalEncoder, LabelBinarizer)
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif, RFE, SequentialFeatureSelector, VarianceThreshold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
#from sklearn.impute import SimpleImputer
from mlxtend.preprocessing import TransactionEncoder
from hyperopt import hp, tpe, fmin, Trials

# Dimensionality Reduction
from sklearn.decomposition import PCA, KernelPCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.manifold import TSNE

# Time-Series Analysis
from statsmodels.tsa.seasonal import STL, seasonal_decompose
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Evaluation
from sklearn.metrics import (classification_report, pairwise_distances, silhouette_score, 
							 roc_curve, auc, roc_auc_score, RocCurveDisplay, 
							 confusion_matrix, ConfusionMatrixDisplay, 
							 accuracy_score, recall_score, precision_score, f1_score,
							 log_loss, hinge_loss, mean_absolute_error, mean_squared_error, r2_score)
from torchmetrics import Accuracy, Precision, Recall, F1Score, AUROC, ConfusionMatrix, MeanSquaredError, MeanAbsoluteError, R2Score, MetricCollection
import statsmodels.api as sm 
from statsmodels.stats.outliers_influence import variance_inflation_factor


  from .autonotebook import tqdm as notebook_tqdm
  Referenced from: <EB3FF92A-5EB1-3EE8-AF8B-5923C1265422> /opt/homebrew/anaconda3/envs/venv_3.11/lib/python3.11/site-packages/torchvision/image.so
  warn(
  import pkg_resources


In [9]:
# Read the merged dataset from disk and discard its 'Date' column in one step,
# so re-running this cell always rebuilds df_merged from the CSV.
df_merged = pd.read_csv('data/merged_data.csv').drop(columns=['Date'])

In [10]:
# Separate the regression target from the predictors.
# The target Series carries its column label as `.name`, which is reused below
# so the label is written only once.
y = df_merged['Avg Power Consumption per Household (kWh)']
X = df_merged.drop(columns=[y.name, 'Year', 'Month', 'Location'])

In [11]:
# Collect the feature column labels grouped by dtype family
# (numeric, object/category, datetime), plus the name of the target.
numerical_cols, categorical_cols, datetime_cols = (
    X.select_dtypes(include=dtype_family).columns
    for dtype_family in ('number', ['object', 'category'], 'datetime')
)
target_col = y.name

In [7]:
# Data split for train and test set (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Work on explicit copies: the frames returned by train_test_split are derived
# from X by indexing, and pandas can flag column assignment on them with
# SettingWithCopyWarning. Copying also guarantees X itself is never modified.
X_train = X_train.copy()
X_test = X_test.copy()

# Label-encode every categorical feature.
# Each column gets its own encoder, fitted on the training split only, and the
# fitted encoders are kept in `label_encoders` so a column's mapping can be
# inspected or inverted later. `label_encoder` stays bound as before (to the
# encoder of the last processed column, or an unfitted one if there are none).
# NOTE(review): LabelEncoder.transform raises ValueError for categories that
# appear in the test split but not in the training split.
label_encoders = {}
label_encoder = LabelEncoder()
for col in categorical_cols:
    label_encoder = LabelEncoder()
    X_train[col] = label_encoder.fit_transform(X_train[col])
    X_test[col] = label_encoder.transform(X_test[col])
    label_encoders[col] = label_encoder

# Scaling: fit the scaler on the training split only, then apply the same
# transformation to the test split to avoid leaking test statistics.
scaler = StandardScaler()
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])


In [8]:
# Check multicollinearity: compute the variance inflation factor (VIF) of every
# training feature and tabulate it next to the feature name.
df_vif = pd.DataFrame({
    'feature': X_train.columns,
    'VIF': [
        variance_inflation_factor(X_train, col_idx)
        for col_idx in range(X_train.shape[1])
    ],
})
df_vif

Unnamed: 0,feature,VIF
0,Date,9.178667
1,Year,3.853845
2,Month,2.440047
3,Location,2.512695
4,Number of Households,2.166488
5,Avg Electricity Btill per Household (KRW),1.610103
6,Number of Tropical Nights,5.334869
7,Number of Heatwave Days,3.982224
8,Number of Coldwave Days,1.45314
9,Avg Temperature (Celsius),5873.409223


In [None]:
# Drop columns w/ high VIF
# The list is defined once and applied to both splits so they cannot drift apart.
high_vif_cols = [
    'Year', 'Month', 'Location',
    'Avg Max Temperature (Celsius)',
    'Avg Min Temperature (Celsius)',
    'Avg Sea Level Pressure (hPa)',
    'Avg Vapor Pressure (hPa)',
    'Avg Dew Point Temp (Celsius)',
    'Max Wind Speed (m/s)',
    'Total Sunshine Hours (hr)',
    'Total Solar Radiation (MJ/m^2)',
    'Avg Min Surface Temp (Celsius)',
    'Avg Ground Temp (Celsius)',
]
# 'Year', 'Month' and 'Location' are already excluded when X is built, so a
# strict drop would raise KeyError on a fresh run. errors='ignore' skips labels
# that are not present (note: it also skips misspelled labels silently).
X_train_vif = X_train.drop(columns=high_vif_cols, errors='ignore')
X_test_vif = X_test.drop(columns=high_vif_cols, errors='ignore')

# Re-check multicollinearity on the reduced training feature set
df_vif_removed = pd.DataFrame({
    'feature': X_train_vif.columns,
    'VIF': [
        variance_inflation_factor(X_train_vif, col_idx)
        for col_idx in range(X_train_vif.shape[1])
    ],
})
df_vif_removed