# 폰트 설정 및 라이브러리 설치

In [None]:
#@title
# 단계 1: 폰트 설치
import matplotlib.font_manager as fm

!apt-get -qq -y install fonts-nanum > /dev/null
fontpath = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'
font = fm.FontProperties(fname=fontpath, size=9)
fm._rebuild() 

In [1]:
#Library Imports
import numpy as np  # 넘파이
import pandas as pd  # 판다스
# import math # 단계 2: 런타임 재시작
import os  # 디렉토리 변경
from sklearn.model_selection import GridSearchCV, KFold, train_test_split  # 파라미터 설정 고민을 줄여주는 고마운 친구
from sklearn.metrics import make_scorer, mean_squared_error # loss function 커스터마이징
from sklearn.preprocessing import MinMaxScaler
from lightgbm import LGBMRegressor

In [None]:
#@title
# 단계 3: 한글 폰트 설정
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.font_manager as fm

# 마이너스 표시 문제
mpl.rcParams['axes.unicode_minus'] = False
	
# 한글 폰트 설정
path = '/usr/share/fonts/truetype/nanum/NanumGothicBold.ttf'
font_name = fm.FontProperties(fname=path, size=18).get_name()
plt.rc('font', family=font_name)
fm._rebuild() 

# 데이터 Load 및 결측값 확인

In [1]:
#train.csv 가져오기
from google.colab import files
uploaded = files.upload()

Saving train.csv to train.csv


In [2]:
#test.csv 가져오기
from google.colab import files
uploaded = files.upload()

Saving test.csv to test.csv


In [3]:
#sample_submission.csv 가져오기
from google.colab import files
uploaded = files.upload()

Saving sample_submission.csv to sample_submission.csv


In [2]:
# 데이터 로드 (인코딩은 euc-kr) encoding='CP949'
train=pd.read_csv('train.csv', encoding='euc-kr')
test=pd.read_csv('test.csv', encoding='euc-kr')
submission=pd.read_csv('sample_submission.csv', encoding='euc-kr')

In [3]:
train.head(7)

Unnamed: 0,num,date_time,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유
0,1,2020-06-01 00,8179.056,17.6,2.5,92.0,0.8,0.0,0.0,0.0
1,1,2020-06-01 01,8135.64,17.7,2.9,91.0,0.3,0.0,0.0,0.0
2,1,2020-06-01 02,8107.128,17.5,3.2,91.0,0.0,0.0,0.0,0.0
3,1,2020-06-01 03,8048.808,17.1,3.2,91.0,0.0,0.0,0.0,0.0
4,1,2020-06-01 04,8043.624,17.0,3.3,92.0,0.0,0.0,0.0,0.0
5,1,2020-06-01 05,8010.576,16.9,3.4,93.0,0.0,0.0,0.0,0.0
6,1,2020-06-01 06,7978.176,16.7,3.4,90.0,0.1,0.0,0.0,0.0


In [4]:
test.head(7)  # 전력사용량 없음

Unnamed: 0,num,date_time,기온(°C),풍속(m/s),습도(%),"강수량(mm, 6시간)","일조(hr, 3시간)",비전기냉방설비운영,태양광보유
0,1,2020-08-25 00,27.8,1.5,74.0,0.0,0.0,,
1,1,2020-08-25 01,,,,,,,
2,1,2020-08-25 02,,,,,,,
3,1,2020-08-25 03,27.3,1.1,78.0,,0.0,,
4,1,2020-08-25 04,,,,,,,
5,1,2020-08-25 05,,,,,,,
6,1,2020-08-25 06,26.1,1.3,83.0,0.0,0.0,,


In [5]:
num_to_kind = {
    1:3,
    2:4,
    3:3,
    4:1,
    5:5,
    6:2,
    7:2,
    8:2,
    9:3,
    10:1,
    11:1,
    12:1,
    13:2,
    14:4,
    15:4,
    16:5,
    17:2,
    18:2,
    19:5,
    20:5,
    21:5,
    22:4,
    23:4,
    24:5,
    25:2,
    26:2,
    27:2,
    28:5,
    29:4,
    30:5,
    31:3,
    32:3,
    33:3,
    34:1,
    35:2,
    36:5,
    37:4,
    38:4,
    39:4,
    40:1,
    41:1,
    42:1,
    43:4,
    44:4,
    45:4,
    46:2,
    47:2,
    48:2,
    49:5,
    50:5,
    51:5,
    52:4,
    53:2,
    54:4,
    55:2,
    56:2,
    57:2,
    58:4,
    59:5,
    60:5,
}

In [6]:
train["build_kind"] = train['num'].apply(lambda x: num_to_kind[x])
test["build_kind"] = test['num'].apply(lambda x: num_to_kind[x])

In [7]:
train

Unnamed: 0,num,date_time,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유,build_kind
0,1,2020-06-01 00,8179.056,17.6,2.5,92.0,0.8,0.0,0.0,0.0,3
1,1,2020-06-01 01,8135.640,17.7,2.9,91.0,0.3,0.0,0.0,0.0,3
2,1,2020-06-01 02,8107.128,17.5,3.2,91.0,0.0,0.0,0.0,0.0,3
3,1,2020-06-01 03,8048.808,17.1,3.2,91.0,0.0,0.0,0.0,0.0,3
4,1,2020-06-01 04,8043.624,17.0,3.3,92.0,0.0,0.0,0.0,0.0,3
...,...,...,...,...,...,...,...,...,...,...,...
122395,60,2020-08-24 19,4114.368,27.8,2.3,68.0,0.0,0.7,1.0,1.0,5
122396,60,2020-08-24 20,3975.696,27.3,1.2,71.0,0.0,0.0,1.0,1.0,5
122397,60,2020-08-24 21,3572.208,27.3,1.8,71.0,0.0,0.0,1.0,1.0,5
122398,60,2020-08-24 22,3299.184,27.1,1.8,74.0,0.0,0.0,1.0,1.0,5


In [8]:
submission.head(7)

Unnamed: 0,num_date_time,answer
0,1 2020-08-25 00,0
1,1 2020-08-25 01,0
2,1 2020-08-25 02,0
3,1 2020-08-25 03,0
4,1 2020-08-25 04,0
5,1 2020-08-25 05,0
6,1 2020-08-25 06,0


In [9]:
print('train 결측값 확인\n', train.isnull().sum())
print('\ntest 결측값 확인\n', test.isnull().sum())

train 결측값 확인
 num           0
date_time     0
전력사용량(kWh)    0
기온(°C)        0
풍속(m/s)       0
습도(%)         0
강수량(mm)       0
일조(hr)        0
비전기냉방설비운영     0
태양광보유         0
build_kind    0
dtype: int64

test 결측값 확인
 num                0
date_time          0
기온(°C)          6720
풍속(m/s)         6720
습도(%)           6720
강수량(mm, 6시간)    8400
일조(hr, 3시간)     6720
비전기냉방설비운영       7784
태양광보유           8456
build_kind         0
dtype: int64


# 결측값 대체

## 비전기냉방설비운영 및 태양광 보유

In [10]:
#건물별로 '비전기냉방설비운영'과 '태양광보유'를 판단해 test set의 결측치를 보간해줍니다
train[['num', '비전기냉방설비운영','태양광보유']]
ice={}
hot={}
count=0
for i in range(0, len(train), len(train)//60):
    count +=1
    ice[count]=train.loc[i,'비전기냉방설비운영']
    hot[count]=train.loc[i,'태양광보유']

for i in range(len(test)):
    test.loc[i, '비전기냉방설비운영']=ice[test['num'][i]]
    test.loc[i, '태양광보유']=hot[test['num'][i]]

In [11]:
train.head(7)

Unnamed: 0,num,date_time,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유,build_kind
0,1,2020-06-01 00,8179.056,17.6,2.5,92.0,0.8,0.0,0.0,0.0,3
1,1,2020-06-01 01,8135.64,17.7,2.9,91.0,0.3,0.0,0.0,0.0,3
2,1,2020-06-01 02,8107.128,17.5,3.2,91.0,0.0,0.0,0.0,0.0,3
3,1,2020-06-01 03,8048.808,17.1,3.2,91.0,0.0,0.0,0.0,0.0,3
4,1,2020-06-01 04,8043.624,17.0,3.3,92.0,0.0,0.0,0.0,0.0,3
5,1,2020-06-01 05,8010.576,16.9,3.4,93.0,0.0,0.0,0.0,0.0,3
6,1,2020-06-01 06,7978.176,16.7,3.4,90.0,0.1,0.0,0.0,0.0,3


In [12]:
test.head(7)  # 전력사용량 없음

Unnamed: 0,num,date_time,기온(°C),풍속(m/s),습도(%),"강수량(mm, 6시간)","일조(hr, 3시간)",비전기냉방설비운영,태양광보유,build_kind
0,1,2020-08-25 00,27.8,1.5,74.0,0.0,0.0,0.0,0.0,3
1,1,2020-08-25 01,,,,,,0.0,0.0,3
2,1,2020-08-25 02,,,,,,0.0,0.0,3
3,1,2020-08-25 03,27.3,1.1,78.0,,0.0,0.0,0.0,3
4,1,2020-08-25 04,,,,,,0.0,0.0,3
5,1,2020-08-25 05,,,,,,0.0,0.0,3
6,1,2020-08-25 06,26.1,1.3,83.0,0.0,0.0,0.0,0.0,3


In [13]:
print('train 결측값 확인\n', train.isnull().sum())
print('\ntest 결측값 확인\n', test.isnull().sum())

train 결측값 확인
 num           0
date_time     0
전력사용량(kWh)    0
기온(°C)        0
풍속(m/s)       0
습도(%)         0
강수량(mm)       0
일조(hr)        0
비전기냉방설비운영     0
태양광보유         0
build_kind    0
dtype: int64

test 결측값 확인
 num                0
date_time          0
기온(°C)          6720
풍속(m/s)         6720
습도(%)           6720
강수량(mm, 6시간)    8400
일조(hr, 3시간)     6720
비전기냉방설비운영          0
태양광보유              0
build_kind         0
dtype: int64


## 시간, 요일, 주말여부 추가

In [14]:
# 시간, 요일, 주말여부(new!) 추가
def time(x):
    return int(x[-2:])
train['time']=train['date_time'].apply(lambda x: time(x))
test['time']=test['date_time'].apply(lambda x: time(x))

# 평일=0~4, 주말=5~6
def weekday(x):
    return pd.to_datetime(x[:10]).weekday()
train['weekday']=train['date_time'].apply(lambda x :weekday(x))
test['weekday']=test['date_time'].apply(lambda x :weekday(x))

# 평일=0, 주말=1
train['weekend']=train['weekday'].apply(lambda x: 0 if x < 4 else 1)
test['weekend']=test['weekday'].apply(lambda x: 0 if x < 4 else 1)

In [15]:
train.head(7)

Unnamed: 0,num,date_time,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유,build_kind,time,weekday,weekend
0,1,2020-06-01 00,8179.056,17.6,2.5,92.0,0.8,0.0,0.0,0.0,3,0,0,0
1,1,2020-06-01 01,8135.64,17.7,2.9,91.0,0.3,0.0,0.0,0.0,3,1,0,0
2,1,2020-06-01 02,8107.128,17.5,3.2,91.0,0.0,0.0,0.0,0.0,3,2,0,0
3,1,2020-06-01 03,8048.808,17.1,3.2,91.0,0.0,0.0,0.0,0.0,3,3,0,0
4,1,2020-06-01 04,8043.624,17.0,3.3,92.0,0.0,0.0,0.0,0.0,3,4,0,0
5,1,2020-06-01 05,8010.576,16.9,3.4,93.0,0.0,0.0,0.0,0.0,3,5,0,0
6,1,2020-06-01 06,7978.176,16.7,3.4,90.0,0.1,0.0,0.0,0.0,3,6,0,0


In [16]:
test.head(7)  # 전력사용량 없음

Unnamed: 0,num,date_time,기온(°C),풍속(m/s),습도(%),"강수량(mm, 6시간)","일조(hr, 3시간)",비전기냉방설비운영,태양광보유,build_kind,time,weekday,weekend
0,1,2020-08-25 00,27.8,1.5,74.0,0.0,0.0,0.0,0.0,3,0,1,0
1,1,2020-08-25 01,,,,,,0.0,0.0,3,1,1,0
2,1,2020-08-25 02,,,,,,0.0,0.0,3,2,1,0
3,1,2020-08-25 03,27.3,1.1,78.0,,0.0,0.0,0.0,3,3,1,0
4,1,2020-08-25 04,,,,,,0.0,0.0,3,4,1,0
5,1,2020-08-25 05,,,,,,0.0,0.0,3,5,1,0
6,1,2020-08-25 06,26.1,1.3,83.0,0.0,0.0,0.0,0.0,3,6,1,0


## 기온, 풍속, 습도 등, 기타 결측치 보간

In [17]:
# 기온, 풍속, 습도 등, 기타 결측치를 적당히 1/3, 2/3 수치로 보간해줍니다.
test = test.interpolate(method='values')  
test.head(3)

Unnamed: 0,num,date_time,기온(°C),풍속(m/s),습도(%),"강수량(mm, 6시간)","일조(hr, 3시간)",비전기냉방설비운영,태양광보유,build_kind,time,weekday,weekend
0,1,2020-08-25 00,27.8,1.5,74.0,0.0,0.0,0.0,0.0,3,0,1,0
1,1,2020-08-25 01,27.633333,1.366667,75.333333,0.0,0.0,0.0,0.0,3,1,1,0
2,1,2020-08-25 02,27.466667,1.233333,76.666667,0.0,0.0,0.0,0.0,3,2,1,0


## 건물별 더미변수화 (수정 -> 클러스터링)

In [18]:
# train = pd.get_dummies(train, columns = ['build_kind'])

In [19]:
# test = pd.get_dummies(test, columns = ['build_kind'])

In [20]:
train

Unnamed: 0,num,date_time,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유,build_kind,time,weekday,weekend
0,1,2020-06-01 00,8179.056,17.6,2.5,92.0,0.8,0.0,0.0,0.0,3,0,0,0
1,1,2020-06-01 01,8135.640,17.7,2.9,91.0,0.3,0.0,0.0,0.0,3,1,0,0
2,1,2020-06-01 02,8107.128,17.5,3.2,91.0,0.0,0.0,0.0,0.0,3,2,0,0
3,1,2020-06-01 03,8048.808,17.1,3.2,91.0,0.0,0.0,0.0,0.0,3,3,0,0
4,1,2020-06-01 04,8043.624,17.0,3.3,92.0,0.0,0.0,0.0,0.0,3,4,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122395,60,2020-08-24 19,4114.368,27.8,2.3,68.0,0.0,0.7,1.0,1.0,5,19,0,0
122396,60,2020-08-24 20,3975.696,27.3,1.2,71.0,0.0,0.0,1.0,1.0,5,20,0,0
122397,60,2020-08-24 21,3572.208,27.3,1.8,71.0,0.0,0.0,1.0,1.0,5,21,0,0
122398,60,2020-08-24 22,3299.184,27.1,1.8,74.0,0.0,0.0,1.0,1.0,5,22,0,0


# EDA

# 모델링

## 타깃값 정의 

In [21]:
train_x=train.drop('전력사용량(kWh)', axis=1)
train_y=train[['전력사용량(kWh)']]

In [22]:
train_x.head(7)

Unnamed: 0,num,date_time,기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유,build_kind,time,weekday,weekend
0,1,2020-06-01 00,17.6,2.5,92.0,0.8,0.0,0.0,0.0,3,0,0,0
1,1,2020-06-01 01,17.7,2.9,91.0,0.3,0.0,0.0,0.0,3,1,0,0
2,1,2020-06-01 02,17.5,3.2,91.0,0.0,0.0,0.0,0.0,3,2,0,0
3,1,2020-06-01 03,17.1,3.2,91.0,0.0,0.0,0.0,0.0,3,3,0,0
4,1,2020-06-01 04,17.0,3.3,92.0,0.0,0.0,0.0,0.0,3,4,0,0
5,1,2020-06-01 05,16.9,3.4,93.0,0.0,0.0,0.0,0.0,3,5,0,0
6,1,2020-06-01 06,16.7,3.4,90.0,0.1,0.0,0.0,0.0,3,6,0,0


In [23]:
train_y.head(7) 

Unnamed: 0,전력사용량(kWh)
0,8179.056
1,8135.64
2,8107.128
3,8048.808
4,8043.624
5,8010.576
6,7978.176


## data_time 제거

In [24]:
train_x.drop('date_time', axis=1, inplace=True)
test.drop('date_time', axis=1, inplace=True)

In [25]:
train_x.head()

Unnamed: 0,num,기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유,build_kind,time,weekday,weekend
0,1,17.6,2.5,92.0,0.8,0.0,0.0,0.0,3,0,0,0
1,1,17.7,2.9,91.0,0.3,0.0,0.0,0.0,3,1,0,0
2,1,17.5,3.2,91.0,0.0,0.0,0.0,0.0,3,2,0,0
3,1,17.1,3.2,91.0,0.0,0.0,0.0,0.0,3,3,0,0
4,1,17.0,3.3,92.0,0.0,0.0,0.0,0.0,3,4,0,0


In [26]:
test.head()

Unnamed: 0,num,기온(°C),풍속(m/s),습도(%),"강수량(mm, 6시간)","일조(hr, 3시간)",비전기냉방설비운영,태양광보유,build_kind,time,weekday,weekend
0,1,27.8,1.5,74.0,0.0,0.0,0.0,0.0,3,0,1,0
1,1,27.633333,1.366667,75.333333,0.0,0.0,0.0,0.0,3,1,1,0
2,1,27.466667,1.233333,76.666667,0.0,0.0,0.0,0.0,3,2,1,0
3,1,27.3,1.1,78.0,0.0,0.0,0.0,0.0,3,3,1,0
4,1,26.9,1.166667,79.666667,0.0,0.0,0.0,0.0,3,4,1,0


In [27]:
# train_x.drop('num', axis=1, inplace=True)
# test.drop('num', axis=1, inplace=True)

In [28]:
train_x.head()

Unnamed: 0,num,기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유,build_kind,time,weekday,weekend
0,1,17.6,2.5,92.0,0.8,0.0,0.0,0.0,3,0,0,0
1,1,17.7,2.9,91.0,0.3,0.0,0.0,0.0,3,1,0,0
2,1,17.5,3.2,91.0,0.0,0.0,0.0,0.0,3,2,0,0
3,1,17.1,3.2,91.0,0.0,0.0,0.0,0.0,3,3,0,0
4,1,17.0,3.3,92.0,0.0,0.0,0.0,0.0,3,4,0,0


## MinMaxScaler

In [29]:
# Scaler = MinMaxScaler()
# Scaler.fit(train_x)

## KFold

In [30]:
cross=KFold(n_splits=5, shuffle=True, random_state=42)
folds=[]
for train_idx, valid_idx in cross.split(train_x, train_y):
    folds.append((train_idx, valid_idx))

In [31]:
from sklearn.ensemble import RandomForestRegressor

In [32]:
models={}
models2={}
for fold in range(5):
    print(f'===================={fold+1}=======================')
    train_idx, valid_idx=folds[fold]
    X_train=train_x.iloc[train_idx, :]
    y_train=train_y.iloc[train_idx, :]
    X_valid=train_x.iloc[valid_idx, :]
    y_valid=train_y.iloc[valid_idx, :]

    # # Light GBM
    # model=LGBMRegressor(n_estimators=100) # 1000은 과적합
    # model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_valid, y_valid)], 
    #          early_stopping_rounds=30, verbose=100)
    # models[fold]=model

    # RandomForest
    model2=RandomForestRegressor(n_estimators=100) # 1000은 과적합
    model2.fit(X_train, y_train)
    models2[fold] = model2


    # todolist
    # Y_preds = model.predict(X_valid)
    Y_preds2 = model2.predict(X_valid)
    # test_MSE = mean_squared_error(Y_preds, y_valid)
    test_MSE2 = mean_squared_error(Y_preds2, y_valid)
    # print('Test MSE', test_MSE)
    print('Test MSE', test_MSE2)
    # print('Test MSE', (test_MSE+test_MSE2)/2)


    print()
    print(f'================================================\n\n')





Test MSE 41427.41842115654







Test MSE 46827.67361738715







Test MSE 40326.11694503825







Test MSE 40812.010040237285







Test MSE 47992.181036885384





In [33]:
for i in range(5):
    submission['answer'] += models2[i].predict(test)/5

In [34]:
submission

Unnamed: 0,num_date_time,answer
0,1 2020-08-25 00,8673.538320
1,1 2020-08-25 01,8667.902016
2,1 2020-08-25 02,8665.750656
3,1 2020-08-25 03,8657.597520
4,1 2020-08-25 04,8641.197936
...,...,...
10075,60 2020-08-31 19,3945.601152
10076,60 2020-08-31 20,3969.504576
10077,60 2020-08-31 21,3893.918400
10078,60 2020-08-31 22,3703.235328


In [37]:
#제출
submission.to_csv('baseline_submission10.csv', index=False)

In [36]:

# 2 건물 번호 catalog  o
#1 kfold에 다양한 방법 넣기 ex ngboost o randomforest x
# minmaxscaler o
# 건물 분류해서 넣기