LightGBM은 전력수요 데이터의 비선형성, 변수 간 상호작용,
시간대 및 극단값 패턴을 별도의 가정 없이 학습할 수 있어
전력 사용량 회귀 예측에 특히 적합

In [3]:
import pandas as pd
import numpy as np

In [5]:
# 데이터셋
df = pd.read_csv(r'C:\1st_Projekt\data\훈련데이터셋.csv')
df

Unnamed: 0,지역코드,최저기온(°C),3.0m 지중온도(°C),평균 현지기압(hPa),가조시간(hr),평균 상대습도(%),풍정합(100m),합계 소형증발량(mm),파워
0,1111010100,-5.2,18.4,1020.9,9.7,38.9,2386.0,1.7,146294.6135
1,1111010100,-7.0,18.4,1018.7,9.7,41.9,2042.0,2.1,175633.8270
2,1111010100,-3.9,18.3,1012.2,9.7,62.8,2104.0,1.9,156084.1910
3,1111010100,0.1,18.2,1015.0,9.7,57.5,1248.0,2.2,177018.8420
4,1111010100,0.9,18.1,1011.7,9.7,60.8,1689.0,1.8,158467.1690
...,...,...,...,...,...,...,...,...,...
181403,1174011000,-11.2,13.6,1015.1,10.1,52.0,2815.0,1.7,115124.5470
181404,1174011000,-9.7,13.5,1017.5,10.1,47.8,1777.0,2.0,99907.5000
181405,1174011000,-6.9,13.5,1017.9,10.1,62.5,1609.0,1.9,94818.5120
181406,1174011000,-4.6,13.4,1016.1,10.2,70.0,1597.0,2.1,86792.8280


In [6]:
X = df.iloc[:, :-1]
y = df.iloc[:, -1]
X.shape, y.shape

((181408, 8), (181408,))

In [7]:
from sklearn.model_selection import train_test_split
train_X, test_X, train_y, test_y = train_test_split(X, y,
                                                   test_size=0.3,
                                                   )
train_X.shape, train_y.shape, test_X.shape, test_y.shape

((126985, 8), (126985,), (54423, 8), (54423,))

In [23]:
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def model_measure_reg(model, train_X, train_y, test_X, test_y):
    model.fit(train_X, train_y)
    pred = model.predict(test_X)
    
    mae  = mean_absolute_error(test_y, pred)
    rmse = mean_squared_error(test_y, pred)
    r2   = r2_score(test_y, pred)
    
    print(f"MAE:{mae:.2f}, RMSE:{rmse:.2f}, R2:{r2:.4f}")

In [30]:
model = LGBMRegressor(
    n_estimators=500,      # 트리 개수 (많을수록 복잡한 패턴 학습)
    learning_rate=0.05,    # 작게 + 트리 많이 = 안정적
    max_depth=-1,          # 제한 없음 (LightGBM 장점)
    num_leaves=31,         # 리프 수 (복잡도 핵심 파라미터)
    
    subsample=0.8,         # 데이터 샘플링 (과적합 방지)
    colsample_bytree=0.8,  # 변수 샘플링
    n_jobs=-1
)

In [34]:
model = LGBMRegressor(verbose=-1)
model_measure_reg(model, train_X, train_y, test_X, test_y)

MAE:72188.38, RMSE:68822139458.45, R2:0.4007


In [38]:
from sklearn.ensemble import VotingRegressor
voting_model = VotingRegressor(estimators=[('lgb', model)]) # 다수결
voting_model.fit(train_X, train_y)

In [41]:
voting_model.named_estimators_

{'lgb': LGBMRegressor(verbose=-1)}

In [47]:
y_pred = voting_model.predict(test_X)

In [50]:
import matplotlib.pyplot as plt
# 한글 설정
plt.rc('font', family='Malgun Gothic') # Windows에서 시각화 시 한글 깨짐 방지
plt.rc('axes', unicode_minus=False) # 축의 -기호 깨짐 방지

In [52]:
%pip install statsmodels==0.13.5
import statsmodels.api as sm

Collecting statsmodels==0.13.5
  Using cached statsmodels-0.13.5-cp310-cp310-win_amd64.whl.metadata (9.5 kB)
Collecting patsy>=0.5.2 (from statsmodels==0.13.5)
  Using cached patsy-1.0.2-py2.py3-none-any.whl.metadata (3.6 kB)
Using cached statsmodels-0.13.5-cp310-cp310-win_amd64.whl (9.1 MB)
Using cached patsy-1.0.2-py2.py3-none-any.whl (233 kB)
Installing collected packages: patsy, statsmodels

   ---------------------------------------- 0/2 [patsy]
   ---------------------------------------- 0/2 [patsy]
   ---------------------------------------- 0/2 [patsy]
   -------------------- ------------------- 1/2 [statsmodels]
   -------------------- ------------------- 1/2 [statsmodels]
   -------------------- ------------------- 1/2 [statsmodels]
   -------------------- ------------------- 1/2 [statsmodels]
   -------------------- ------------------- 1/2 [statsmodels]
   -------------------- ------------------- 1/2 [statsmodels]
   -------------------- ------------------- 1/2 [statsmodels]

In [54]:
X = train_X
y = train_y

X = sm.add_constant(X)   # 절편 추가 (중요)
model = sm.OLS(y, X)
result = model.fit()

print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                     파워   R-squared:                       0.051
Model:                            OLS   Adj. R-squared:                  0.050
Method:                 Least Squares   F-statistic:                     844.4
Date:                Tue, 23 Dec 2025   Prob (F-statistic):               0.00
Time:                        17:42:12   Log-Likelihood:            -1.7957e+06
No. Observations:              126985   AIC:                         3.591e+06
Df Residuals:                  126976   BIC:                         3.591e+06
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
const         -3.893e+06    2.6e+05    -14.997

In [58]:
model.predict(test_X, test_y)

array([8.58384046e+18, 6.77001828e+10, 1.31567404e+11, 7.54312955e+12,
       8.83756026e+10, 5.04507821e+11, 1.48915199e+13, 2.60747344e+10])