In [23]:
#Library Imports
import numpy as np
import pandas as pd
import math
import os
import matplotlib.pyplot as plt

from sklearn.metrics import mean_absolute_error

from lightgbm import LGBMRegressor
from sklearn.model_selection import KFold

In [24]:
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')
submission=pd.read_csv('sample_submission.csv')

In [25]:
#train.shape 122400 X 10
#60개의 건물 X 85일 24시간 =122400
train

Unnamed: 0,num,date_time,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유
0,1,2020-06-01 00,8179.056,17.6,2.5,92.0,0.8,0.0,0.0,0.0
1,1,2020-06-01 01,8135.640,17.7,2.9,91.0,0.3,0.0,0.0,0.0
2,1,2020-06-01 02,8107.128,17.5,3.2,91.0,0.0,0.0,0.0,0.0
3,1,2020-06-01 03,8048.808,17.1,3.2,91.0,0.0,0.0,0.0,0.0
4,1,2020-06-01 04,8043.624,17.0,3.3,92.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
122395,60,2020-08-24 19,4114.368,27.8,2.3,68.0,0.0,0.7,1.0,1.0
122396,60,2020-08-24 20,3975.696,27.3,1.2,71.0,0.0,0.0,1.0,1.0
122397,60,2020-08-24 21,3572.208,27.3,1.8,71.0,0.0,0.0,1.0,1.0
122398,60,2020-08-24 22,3299.184,27.1,1.8,74.0,0.0,0.0,1.0,1.0


In [26]:
#test.shape 10080 X 9
#60개의 건물 X 7일 24시간 =10080
test

Unnamed: 0,num,date_time,기온(°C),풍속(m/s),습도(%),"강수량(mm, 6시간)","일조(hr, 3시간)",비전기냉방설비운영,태양광보유
0,1,2020-08-25 00,27.8,1.5,74.0,0.0,0.0,,
1,1,2020-08-25 01,,,,,,,
2,1,2020-08-25 02,,,,,,,
3,1,2020-08-25 03,27.3,1.1,78.0,,0.0,,
4,1,2020-08-25 04,,,,,,,
...,...,...,...,...,...,...,...,...,...
10075,60,2020-08-31 19,,,,,,,
10076,60,2020-08-31 20,,,,,,,
10077,60,2020-08-31 21,27.9,4.1,68.0,,0.0,1.0,1.0
10078,60,2020-08-31 22,,,,,,,


# **데이터 전처리**

In [27]:
#건물별로 '비전기냉방설비운영'과 '태양광보유'를 판단해 test set의 결측치를 보간해줍니다
train[['num', '비전기냉방설비운영','태양광보유']]
ice={}
hot={}
count=0
for i in range(0, len(train), len(train)//60):
    count +=1
    ice[count]=train.loc[i,'비전기냉방설비운영']
    hot[count]=train.loc[i,'태양광보유']
    

In [28]:
for i in range(len(test)):
    test.loc[i, '비전기냉방설비운영']=ice[test['num'][i]]
    test.loc[i, '태양광보유']=hot[test['num'][i]]

In [29]:
#시간 변수와 요일 변수를 추가해봅니다.
def time(x):
    return int(x[-2:])
train['time']=train['date_time'].apply(lambda x: time(x))
test['time']=test['date_time'].apply(lambda x: time(x))

def weekday(x):
    return pd.to_datetime(x[:10]).weekday()
train['weekday']=train['date_time'].apply(lambda x :weekday(x))
test['weekday']=test['date_time'].apply(lambda x :weekday(x))

+ test 결측치 보간해줍니다.
+ test 데이터의 변수는 예보 데이터이며, 예보 데이터는 train 데이터의 기간에 생성된 것이기에 활용 가능합니다.

In [30]:
test.interpolate(method='values')

Unnamed: 0,num,date_time,기온(°C),풍속(m/s),습도(%),"강수량(mm, 6시간)","일조(hr, 3시간)",비전기냉방설비운영,태양광보유,time,weekday
0,1,2020-08-25 00,27.800000,1.500000,74.000000,0.0,0.000000,0.0,0.0,0,1
1,1,2020-08-25 01,27.633333,1.366667,75.333333,0.0,0.000000,0.0,0.0,1,1
2,1,2020-08-25 02,27.466667,1.233333,76.666667,0.0,0.000000,0.0,0.0,2,1
3,1,2020-08-25 03,27.300000,1.100000,78.000000,0.0,0.000000,0.0,0.0,3,1
4,1,2020-08-25 04,26.900000,1.166667,79.666667,0.0,0.000000,0.0,0.0,4,1
...,...,...,...,...,...,...,...,...,...,...,...
10075,60,2020-08-31 19,28.633333,3.566667,66.000000,0.0,0.533333,1.0,1.0,19,0
10076,60,2020-08-31 20,28.266667,3.833333,67.000000,0.0,0.266667,1.0,1.0,20,0
10077,60,2020-08-31 21,27.900000,4.100000,68.000000,0.0,0.000000,1.0,1.0,21,0
10078,60,2020-08-31 22,27.900000,4.100000,68.000000,0.0,0.000000,1.0,1.0,22,0


# **모델링**

In [31]:
train_x=train.drop('전력사용량(kWh)', axis=1)
train_y=train[['전력사용량(kWh)']]

In [32]:
train_x.drop('date_time', axis=1, inplace=True)
test.drop('date_time', axis=1, inplace=True)

In [33]:
cross=KFold(n_splits=5, shuffle=True, random_state=42)
folds=[]
for train_idx, valid_idx in cross.split(train_x, train_y):
    folds.append((train_idx, valid_idx))

아래 코드는 최정명님의 코드 구성 방식을 살며시 활용했습니다. 감사합니다

https://www.dacon.io/competitions/official/235713/codeshare/2476?page=1&dtype=recent


In [34]:
models={}
for fold in range(5):
    print(f'===================={fold+1}=======================')
    train_idx, valid_idx=folds[fold]
    X_train=train_x.iloc[train_idx, :]
    y_train=train_y.iloc[train_idx, :]
    X_valid=train_x.iloc[valid_idx, :]
    y_valid=train_y.iloc[valid_idx, :]
    
    model=LGBMRegressor(n_estimators=100)
    model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_valid, y_valid)], 
             early_stopping_rounds=30, verbose=100)
    models[fold]=model
    
    print(f'================================================\n\n')

Training until validation scores don't improve for 30 rounds
[100]	training's l2: 110589	valid_1's l2: 110225
Did not meet early stopping. Best iteration is:
[100]	training's l2: 110589	valid_1's l2: 110225


Training until validation scores don't improve for 30 rounds
[100]	training's l2: 105803	valid_1's l2: 118195
Did not meet early stopping. Best iteration is:
[100]	training's l2: 105803	valid_1's l2: 118195


Training until validation scores don't improve for 30 rounds
[100]	training's l2: 110710	valid_1's l2: 110163
Did not meet early stopping. Best iteration is:
[100]	training's l2: 110710	valid_1's l2: 110163


Training until validation scores don't improve for 30 rounds
[100]	training's l2: 110637	valid_1's l2: 112547
Did not meet early stopping. Best iteration is:
[100]	training's l2: 110637	valid_1's l2: 112547


Training until validation scores don't improve for 30 rounds
[100]	training's l2: 107505	valid_1's l2: 115813
Did not meet early stopping. Best iteration is:
[100]	

In [35]:
for i in range(5):
    submission['answer'] += models[i].predict(test)/5 

In [36]:
submission

Unnamed: 0,num_date_time,answer
0,1 2020-08-25 00,8498.811895
1,1 2020-08-25 01,8250.686717
2,1 2020-08-25 02,8233.221080
3,1 2020-08-25 03,8475.304267
4,1 2020-08-25 04,8229.168616
...,...,...
10075,60 2020-08-31 19,3132.833178
10076,60 2020-08-31 20,2717.140679
10077,60 2020-08-31 21,2987.995728
10078,60 2020-08-31 22,2528.727359


In [37]:
#제출
submission.to_csv('baseline_submission3.csv', index=False)