In [185]:
# Library
# Time
from timeit import default_timer as timer
import time
from tqdm.auto import tqdm

# File
import os
import requests
import zipfile
from pathlib import Path
from PIL import Image
import random
import chardet

# Numerical & Data Handling
import numpy as np
import pandas as pd
import scipy as sp
import math

# Visualization
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib.lines import Line2D
%matplotlib inline
from sklearn.tree import plot_tree
from scipy.optimize import curve_fit

# Machine Learning Libraries
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.tree import DecisionTreeClassifier,DecisionTreeRegressor
from sklearn.ensemble import (RandomForestClassifier, RandomForestRegressor, 
							  ExtraTreesClassifier, ExtraTreesRegressor, 
							  BaggingClassifier, BaggingRegressor, 
							  GradientBoostingClassifier, GradientBoostingRegressor, 
							  AdaBoostClassifier, AdaBoostRegressor, 
							  VotingClassifier, VotingRegressor,
							  StackingClassifier, StackingRegressor)
from sklearn.linear_model import LinearRegression, LogisticRegression, ElasticNet, Lasso, Ridge
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC, SVR
from sklearn.multiclass import OneVsRestClassifier
from sklearn.cluster import KMeans
from sklearn.naive_bayes import GaussianNB
from mlxtend.frequent_patterns import apriori, association_rules
from xgboost import XGBClassifier, XGBRegressor

# Neural Network Libraries
import torch
from torch import nn as nn
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset, random_split
import torchvision
from torchvision import datasets, transforms, models
from torchvision.transforms import ToTensor
import huggingface
import keras
import tensorflow
from transformers import pipeline

# Feature Engineering
from sklearn.preprocessing import (StandardScaler, RobustScaler, MinMaxScaler, MaxAbsScaler, Normalizer, 
								   LabelEncoder, OneHotEncoder, OrdinalEncoder, LabelBinarizer)
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif, RFE, SequentialFeatureSelector, VarianceThreshold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
#from sklearn.impute import SimpleImputer
from mlxtend.preprocessing import TransactionEncoder
from hyperopt import hp, tpe, fmin, Trials

# Dimensionality Reduction
from sklearn.decomposition import PCA, KernelPCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.manifold import TSNE

# Time-Series Analysis
from statsmodels.tsa.seasonal import STL, seasonal_decompose
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Evaluation
from sklearn.metrics import (classification_report, pairwise_distances, silhouette_score, 
							 roc_curve, auc, roc_auc_score, RocCurveDisplay, 
							 confusion_matrix, ConfusionMatrixDisplay, 
							 accuracy_score, recall_score, precision_score, f1_score,
							 log_loss, hinge_loss, mean_absolute_error, mean_squared_error, r2_score)
from torchmetrics import Accuracy, Precision, Recall, F1Score, AUROC, ConfusionMatrix, MeanSquaredError, MeanAbsoluteError, R2Score, MetricCollection
import statsmodels.api as sm 
from statsmodels.stats.outliers_influence import variance_inflation_factor


In [190]:
# Data split for train and test set
# 20% of the rows are held out for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Work on explicit copies: the column assignments below would otherwise write into
# frames that pandas tracks as derived from X, which can raise SettingWithCopyWarning.
X_train, X_test = X_train.copy(), X_test.copy()

# Location label encoding
# The single encoder is refit on each column's training values, then the same
# mapping is applied to the matching test column. After the loop it only holds
# the mapping of the last column in categorical_cols.
label_encoder = LabelEncoder()
for col in categorical_cols:
    X_train[col] = label_encoder.fit_transform(X_train[col])
    # NOTE(review): LabelEncoder.transform raises on values it did not see during
    # fit -- confirm every category in the test split also occurs in the training split.
    X_test[col] = label_encoder.transform(X_test[col])

# Scaling
# The scaler is fit on the training split only and then reused for the test split,
# so no test-set statistics leak into the transformation.
scaler = StandardScaler()
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])


In [192]:
# Drop the columns with high VIF from both splits (same list for train and test)
high_vif_cols = [
    'Avg Temperature (Celsius)',
    'Avg Dew Point Temp (Celsius)',
    'Avg Min Surface Temp (Celsius)',
]
X_train_vif = X_train.drop(columns=high_vif_cols)
X_test_vif = X_test.drop(columns=high_vif_cols)

# Re-check multicollinearity: one VIF per remaining training column, by position
vif_values = [
    variance_inflation_factor(X_train_vif, col_idx)
    for col_idx in range(X_train_vif.shape[1])
]
df_vif_removed = pd.DataFrame({'feature': X_train_vif.columns, 'VIF': vif_values})
df_vif_removed

Unnamed: 0,feature,VIF
0,Number of Households,1.839737
1,Avg Electricity Btill per Household (KRW),1.440778
2,Number of Tropical Nights,5.066624
3,Number of Heatwave Days,3.670118
4,Number of Coldwave Days,1.42493
5,Avg Max Temperature (Celsius),139.316919
6,Avg Min Temperature (Celsius),164.700927
7,Avg Local Pressure (hPa),42.144325
8,Avg Sea Level Pressure (hPa),63.336904
9,Avg Vapor Pressure (hPa),61.816338


In [198]:
# Elastic net
# Hyperparameter tuning
# 5-fold cross-validated grid search over 3 x 3 = 9 (alpha, l1_ratio) combinations.
# No `scoring` is passed, so GridSearchCV ranks candidates with the estimator's
# own score method.
param_grid_elastic_net = {'alpha': [0.1, 1, 10],
                          'l1_ratio': [0.1, 0.5, 0.9]}
grid_elastic_net = GridSearchCV(estimator=ElasticNet(),
                                param_grid=param_grid_elastic_net,
                                cv=5)
# perf_counter is a monotonic, high-resolution clock meant for measuring short
# durations; time.time() is wall-clock time and can jump if the system clock changes.
start_time = time.perf_counter()
grid_elastic_net.fit(X_train_pca, y_train)
end_time = time.perf_counter()
# The space flag in the format spec reserves a sign column, hence the double
# space after the colon in the printed line.
print(f"Training time: {end_time - start_time: .4f}")

# Best model
elastic_net_best_alpha = grid_elastic_net.best_params_['alpha']
elastic_net_best_l1_ratio = grid_elastic_net.best_params_['l1_ratio']
elastic_net_best_model = grid_elastic_net.best_estimator_
print(f"Best alpha: {elastic_net_best_alpha} | Best l1_ratio: {elastic_net_best_l1_ratio}")

# Evaluation
# All metrics are computed on the held-out test split.
y_pred_elastic_net = elastic_net_best_model.predict(X_test_pca)
rmse_elastic_net = math.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred_elastic_net))
mae_elastic_net = mean_absolute_error(y_true=y_test, y_pred=y_pred_elastic_net)
r2_elastic_net = r2_score(y_true=y_test, y_pred=y_pred_elastic_net)
# NOTE(review): adj_r2_score is defined outside this cell and receives only
# y_true / y_pred, so the predictor count it uses must come from elsewhere --
# confirm it matches the number of columns in X_test_pca.
adj_r2_elastic_net = adj_r2_score(y_true=y_test, y_pred=y_pred_elastic_net)
print(f"RMSE of elastic net model: {rmse_elastic_net}")
print(f"MAE of elastic net model: {mae_elastic_net}")
print(f"R^2 of elastic net model: {r2_elastic_net}")
print(f"Adjusted R^2 of elastic net model: {adj_r2_elastic_net}")


Training time:  0.1942
Best alpha: 0.1 | Best l1_ratio: 0.9
RMSE of elastic net model: 22.717476397711106
MAE of elastic net model: 16.211542238188503
R^2 of elastic net model: 0.3262134490500179
Adjusted R^2 of elastic net model: 0.21859476382884013
