# 데이터 전처리

In [3]:
#Library Imports
import numpy as np  # 넘파이
import pandas as pd  # 판다스
# import math
import os  # 디렉토리 변경
from sklearn.model_selection import GridSearchCV  # 파라미터 설정 고민을 줄여주는 고마운 친구
from sklearn.metrics import make_scorer  # loss function 커스터마이징

from lightgbm import LGBMRegressor  # 전 LGBMRegressor 만을 사용해서 돌려보았습니다.
from sklearn.model_selection import train_test_split  # train, valid set 제작

os.chdir('C:\\Users\\user\\Desktop\\energy') 

In [4]:
# 데이터 로드 (인코딩은 euc-kr)
train=pd.read_csv('train.csv', encoding='euc-kr')
test=pd.read_csv('test.csv', encoding='euc-kr')
submission=pd.read_csv('sample_submission.csv', encoding='euc-kr')

In [5]:
train.head(7)

Unnamed: 0,num,date_time,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유
0,1,2020-06-01 00,8179.056,17.6,2.5,92.0,0.8,0.0,0.0,0.0
1,1,2020-06-01 01,8135.64,17.7,2.9,91.0,0.3,0.0,0.0,0.0
2,1,2020-06-01 02,8107.128,17.5,3.2,91.0,0.0,0.0,0.0,0.0
3,1,2020-06-01 03,8048.808,17.1,3.2,91.0,0.0,0.0,0.0,0.0
4,1,2020-06-01 04,8043.624,17.0,3.3,92.0,0.0,0.0,0.0,0.0
5,1,2020-06-01 05,8010.576,16.9,3.4,93.0,0.0,0.0,0.0,0.0
6,1,2020-06-01 06,7978.176,16.7,3.4,90.0,0.1,0.0,0.0,0.0


In [6]:
test.head(7)  # 전력사용량 없음

Unnamed: 0,num,date_time,기온(°C),풍속(m/s),습도(%),"강수량(mm, 6시간)","일조(hr, 3시간)",비전기냉방설비운영,태양광보유
0,1,2020-08-25 00,27.8,1.5,74.0,0.0,0.0,,
1,1,2020-08-25 01,,,,,,,
2,1,2020-08-25 02,,,,,,,
3,1,2020-08-25 03,27.3,1.1,78.0,,0.0,,
4,1,2020-08-25 04,,,,,,,
5,1,2020-08-25 05,,,,,,,
6,1,2020-08-25 06,26.1,1.3,83.0,0.0,0.0,,


In [7]:
#건물별로 '비전기냉방설비운영'과 '태양광보유'를 판단해 test set의 결측치를 보간해줍니다
train[['num', '비전기냉방설비운영','태양광보유']]
ice={}
hot={}
count=0
for i in range(0, len(train), len(train)//60):
    count +=1
    ice[count]=train.loc[i,'비전기냉방설비운영']
    hot[count]=train.loc[i,'태양광보유']

for i in range(len(test)):
    test.loc[i, '비전기냉방설비운영']=ice[test['num'][i]]
    test.loc[i, '태양광보유']=hot[test['num'][i]]

In [8]:
# 시간, 요일, 주말여부(new!) 추가
def time(x):
    return int(x[-2:])
train['time']=train['date_time'].apply(lambda x: time(x))
test['time']=test['date_time'].apply(lambda x: time(x))

# 평일=0~4, 주말=5~6
def weekday(x):
    return pd.to_datetime(x[:10]).weekday()
train['weekday']=train['date_time'].apply(lambda x :weekday(x))
test['weekday']=test['date_time'].apply(lambda x :weekday(x))

# 평일=0, 주말=1
train['weekend']=train['weekday'].apply(lambda x: 0 if x < 4 else 1)
test['weekend']=test['weekday'].apply(lambda x: 0 if x < 4 else 1)

In [9]:
# 기온, 풍속, 습도 등, 기타 결측치를 적당히 1/3, 2/3 수치로 보간해줍니다.
test = test.interpolate(method='values')  
test.head(3)

Unnamed: 0,num,date_time,기온(°C),풍속(m/s),습도(%),"강수량(mm, 6시간)","일조(hr, 3시간)",비전기냉방설비운영,태양광보유,time,weekday,weekend
0,1,2020-08-25 00,27.8,1.5,74.0,0.0,0.0,0.0,0.0,0,1,0
1,1,2020-08-25 01,27.633333,1.366667,75.333333,0.0,0.0,0.0,0.0,1,1,0
2,1,2020-08-25 02,27.466667,1.233333,76.666667,0.0,0.0,0.0,0.0,2,1,0


In [10]:
train.head(3)

Unnamed: 0,num,date_time,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유,time,weekday,weekend
0,1,2020-06-01 00,8179.056,17.6,2.5,92.0,0.8,0.0,0.0,0.0,0,0,0
1,1,2020-06-01 01,8135.64,17.7,2.9,91.0,0.3,0.0,0.0,0.0,1,0,0
2,1,2020-06-01 02,8107.128,17.5,3.2,91.0,0.0,0.0,0.0,0.0,2,0,0


# 모델링

In [11]:
# 학습용set 생성
train.drop('date_time', axis=1, inplace=True)  # 학습에 불필요한 날짜 제거
train_x=train.drop('전력사용량(kWh)', axis=1)  # 문제
train_y=train[['전력사용량(kWh)']]  # 정답

X_train, X_valid, y_train, y_valid = train_test_split(train_x, train_y, test_size=0.2, random_state=156)

In [12]:
X_train.head(1)

Unnamed: 0,num,기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유,time,weekday,weekend
91868,46,21.0,5.2,80.0,0.0,0.0,1.0,0.0,20,2,0


In [13]:
y_train.head(1)

Unnamed: 0,전력사용량(kWh)
91868,1609.632


In [14]:
# loss function : SMAPE 정의
# from sklearn.metrics import mean_absolute_error
def smape(true, pred):
    true = np.array(true)  # np.array로 바꿔야 에러 없음
    pred = np.array(pred)
    return np.mean((np.abs(true-pred))/(np.abs(true) + np.abs(pred)))  # *2 , *100은 상수이므로 생략
SMAPE = make_scorer(smape, greater_is_better=False)  # smape 값이 작아져야하므로 False

In [15]:
# 파라미터 설정, 모델생성 함수
def get_best_params(model, params):
    grid_model = GridSearchCV(
        model,
        param_grid = params,  # 파라미터
        cv=5,  # Kfold : 5
        scoring= SMAPE)  #loss function

    grid_model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_valid, y_valid)], verbose=100)
    scr = grid_model.best_score_
    print(f'{model.__class__.__name__} 최적 score 값 {scr}')
    return grid_model.best_estimator_

In [16]:
# 파라미터 후보군 설정
# 어떤 파라미터로 하는게 좋을지 고민된다면 고민하는 것들을 리스트 안에 다 넣어보세요 알아서 골라줄겁니다.
# 저는 예시로 learning_rate만 0.1 or 0.01 중 더 좋은걸 골라달라고 했습니다.
params = {}
params['boosting_type'] = ['gbdt']
params['objective'] = ['regression']
params['n_estimators'] = [100]
params['learning_rate'] = [0.1, 0.01]  
params['subsample'] = [1]

In [17]:
# 모델정의
model=LGBMRegressor(params)

In [18]:
# 학습진행
best_lgbm = get_best_params(model, params)
best_lgbm  # learning_rate 0.1 or 0.01 중, 0.01이 더 좋았다고 하는군요. 

[100]	valid_0's l2: 110307	valid_1's l2: 119800
[100]	valid_0's l2: 108222	valid_1's l2: 115531
[100]	valid_0's l2: 109730	valid_1's l2: 117630
[100]	valid_0's l2: 107259	valid_1's l2: 115403
[100]	valid_0's l2: 108007	valid_1's l2: 116569
[100]	valid_0's l2: 1.14647e+06	valid_1's l2: 1.18634e+06
[100]	valid_0's l2: 1.13986e+06	valid_1's l2: 1.1809e+06
[100]	valid_0's l2: 1.14542e+06	valid_1's l2: 1.18557e+06
[100]	valid_0's l2: 1.1387e+06	valid_1's l2: 1.1782e+06
[100]	valid_0's l2: 1.13814e+06	valid_1's l2: 1.17912e+06
[100]	training's l2: 1.13759e+06	valid_1's l2: 1.17786e+06
LGBMRegressor 최적 score 값 -0.30167197091483605


LGBMRegressor(learning_rate=0.01, objective='regression', subsample=1)

In [19]:
# 기타 지표로 에러 측정
from sklearn.metrics import mean_squared_error, r2_score
y_pred = best_lgbm.predict(X_train)

mse_score = mean_squared_error(y_train, y_pred)
r2_score = r2_score(y_train, y_pred)
print('MSE:', mse_score)
print('R2 :', r2_score)

MSE: 1137593.8469961581
R2 : 0.7285708061170786


In [20]:
# 모델저장, 로드
import sklearn.externals 
import joblib
joblib.dump(best_lgbm, 'best_lgbm.pkl')
load_lgbm = joblib.load('best_lgbm.pkl')

# 제출용 데이터

In [21]:
# 모델에 넣기 위해 날짜칼럼 제거
test_x = test.drop('date_time', axis=1)

In [22]:
# 모델 예측
submission_y = best_lgbm.predict(test_x)

In [23]:
# submission.csv 생성
test['answer'] = submission_y
test['num_date_time'] = test.apply(lambda x: str(x['num']) +' '+ x['date_time'], axis=1)
submission = test[['num_date_time', 'answer']]
submission.to_csv('submission.csv', index=False)
submission

Unnamed: 0,num_date_time,answer
0,1 2020-08-25 00,6264.865912
1,1 2020-08-25 01,6264.865912
2,1 2020-08-25 02,6264.865912
3,1 2020-08-25 03,6264.865912
4,1 2020-08-25 04,6264.865912
...,...,...
10075,60 2020-08-31 19,2809.636990
10076,60 2020-08-31 20,2392.383243
10077,60 2020-08-31 21,2343.381185
10078,60 2020-08-31 22,2343.381185


In [24]:
%pip install shap
%pip install ngboost

Collecting shap
  Downloading shap-0.39.0-cp38-cp38-win_amd64.whl (414 kB)
Collecting slicer==0.0.7
  Downloading slicer-0.0.7-py3-none-any.whl (14 kB)
Installing collected packages: slicer, shap
Successfully installed shap-0.39.0 slicer-0.0.7
Note: you may need to restart the kernel to use updated packages.
Collecting scikit-learn<0.24,>=0.21
  Downloading scikit_learn-0.23.2-cp38-cp38-win_amd64.whl (6.8 MB)
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 0.24.2
    Uninstalling scikit-learn-0.24.2:
      Successfully uninstalled scikit-learn-0.24.2
Note: you may need to restart the kernel to use updated packages.


ERROR: Could not install packages due to an OSError: [WinError 5] 액세스가 거부되었습니다: 'C:\\Users\\user\\AppData\\Local\\Temp\\pip-uninstall-48i4y6x7\\_check_build.cp38-win_amd64.pyd'
Consider using the `--user` option or check the permissions.



In [25]:
import os
import tempfile
import matplotlib as mpl
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd

import sklearn
from sklearn.metrics import confusion_matrix, roc_auc_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, StratifiedKFold

import shap

import xgboost as xgb
from ngboost import NGBRegressor

from fbprophet import Prophet

import imageio
from datetime import datetime

import seaborn as sns
from statsmodels.graphics.tsaplots import plot_acf

import warnings
warnings.filterwarnings('ignore')

ImportError: cannot import name '_raise_dep_warning_if_not_pytest' from 'sklearn.utils.deprecation' (C:\Users\user\anaconda3\lib\site-packages\sklearn\utils\deprecation.py)