In [1]:
# PowerShell 관리자 모드 : Get-ExecutionPolicy실행 => Restricted
#                         Set-ExecutionPolicy RemoteSigned 
# app.py 생성 후 ctrl+j 터미널 창을 열기
# 가상환경 만들기 : python -m venv .venv
# 가상환경 들어가기 : .venv\Scripts\activate
# python -m pip install --upgrade pip
# pip install flask

# pip freeze > requirements.txt
# pip install -r requirements.txt(내일)
# ctrl+shift+p -> 인터프리터선택 -> .venv 가상환경 선택

# 전력 사용량 예측을 위한 머신러닝 하이퍼파라미터 튜닝 가이드

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# 데이터셋
df = pd.read_csv(r'C:\1st_Projekt\data\훈련데이터셋.csv')
df

Unnamed: 0,지역코드,최저기온(°C),3.0m 지중온도(°C),평균 현지기압(hPa),가조시간(hr),평균 상대습도(%),풍정합(100m),합계 소형증발량(mm),파워
0,1111010100,-5.2,18.4,1020.9,9.7,38.9,2386.0,1.7,146294.6135
1,1111010100,-7.0,18.4,1018.7,9.7,41.9,2042.0,2.1,175633.8270
2,1111010100,-3.9,18.3,1012.2,9.7,62.8,2104.0,1.9,156084.1910
3,1111010100,0.1,18.2,1015.0,9.7,57.5,1248.0,2.2,177018.8420
4,1111010100,0.9,18.1,1011.7,9.7,60.8,1689.0,1.8,158467.1690
...,...,...,...,...,...,...,...,...,...
181403,1174011000,-11.2,13.6,1015.1,10.1,52.0,2815.0,1.7,115124.5470
181404,1174011000,-9.7,13.5,1017.5,10.1,47.8,1777.0,2.0,99907.5000
181405,1174011000,-6.9,13.5,1017.9,10.1,62.5,1609.0,1.9,94818.5120
181406,1174011000,-4.6,13.4,1016.1,10.2,70.0,1597.0,2.1,86792.8280


In [4]:
X = df.iloc[:, :-1]
y = df.iloc[:, -1]
X.shape, y.shape

((181408, 8), (181408,))

In [5]:
from sklearn.model_selection import train_test_split
train_X, test_X, train_y, test_y = train_test_split(X, y,
                                                   test_size=0.3,
                                                   )
train_X.shape, train_y.shape, test_X.shape, test_y.shape

((126985, 8), (126985,), (54423, 8), (54423,))

In [6]:
town_code = train_X['지역코드'].to_numpy().reshape(-1, 1)
town_code.shape

(126985, 1)

In [7]:
from sklearn.preprocessing import OneHotEncoder
code_onehot = OneHotEncoder().fit_transform(town_code)
code_onehot


<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 126985 stored elements and shape (126985, 466)>

In [8]:
from sklearn.compose import ColumnTransformer

In [9]:
from sklearn.preprocessing import StandardScaler

In [10]:
preprocessor = ColumnTransformer(
    transformers=[('onehot', OneHotEncoder(), ['지역코드']), 
                ('standardScaler', StandardScaler(), [col for col in X.columns if col!='지역코드'])]
    )
X_train_preprocessed = preprocessor.fit_transform(train_X)

In [11]:
X_train_preprocessed

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 1015880 stored elements and shape (126985, 473)>

In [12]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

In [13]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest
from sklearn.svm import SVC

In [14]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest
from sklearn.svm import SVC

In [15]:
import lightgbm as lgb

model = lgb.LGBMRegressor(
    n_estimators=300,
    max_depth=5,
    learning_rate=0.1,
    num_leaves=31,
    min_child_samples=20,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=0.1,
    random_state=42,
    n_jobs=-1
)

In [16]:
model.fit(X_train_preprocessed, train_y)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002159 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2182
[LightGBM] [Info] Number of data points in the train set: 126985, number of used features: 473
[LightGBM] [Info] Start training from score 137339.105812


In [17]:
X_test_preprocessed = preprocessor.fit_transform(test_X)

In [18]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

def model_measure_reg(model, train_X, train_y, test_X, test_y):
    model.fit(train_X, train_y)
    pred = model.predict(test_X)

    mae  = mean_absolute_error(test_y, pred)
    rmse = np.sqrt(mean_squared_error(test_y, pred))  # deprecated 대응
    r2   = r2_score(test_y, pred)

    print(f"MAE : {mae:.2f}")
    print(f"RMSE: {rmse:.2f}")
    print(f"R²  : {r2:.4f}")

    return {
        "MAE": mae,
        "RMSE": rmse,
        "R2": r2
    }

model_measure_reg(model, X_train_preprocessed, train_y, X_test_preprocessed, test_y)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002872 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2182
[LightGBM] [Info] Number of data points in the train set: 126985, number of used features: 473
[LightGBM] [Info] Start training from score 137339.105812
MAE : 57712.06
RMSE: 174009.51
R²  : 0.7471


{'MAE': 57712.05929888537, 'RMSE': 174009.5137069502, 'R2': 0.7470950702735384}

In [19]:
%pip install xgboost
import xgboost as xgb

xgb_model = xgb.XGBRegressor(
    n_estimators=300,
    max_depth=5,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=3,
    gamma=0.1,
    reg_alpha=0.1,
    reg_lambda=1.5,
    random_state=42,
    n_jobs=-1
)

Note: you may need to restart the kernel to use updated packages.


In [20]:
xgb_model.fit(X_train_preprocessed, train_y)

In [21]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

def model_measure_xgb(model, train_X, train_y, test_X, test_y):
    model.fit(train_X, train_y)
    pred = model.predict(test_X)

    mae  = mean_absolute_error(test_y, pred)
    rmse = np.sqrt(mean_squared_error(test_y, pred))
    r2   = r2_score(test_y, pred)

    print(f"MAE : {mae:.2f}")
    print(f"RMSE: {rmse:.2f}")
    print(f"R²  : {r2:.4f}")

    return {
        "MAE": mae,
        "RMSE": rmse,
        "R2": r2
    }

model_measure_xgb(xgb_model, X_train_preprocessed, train_y, X_test_preprocessed, test_y)

MAE : 57739.23
RMSE: 172694.73
R²  : 0.7509


{'MAE': 57739.23246201096,
 'RMSE': 172694.72632921624,
 'R2': 0.7509024481957346}