In [1]:
import numpy as np 
import pandas as pd 
import lightgbm
from lightgbm import LGBMRegressor 
import catboost
from catboost import CatBoostRegressor
import sklearn
from sklearn.model_selection import KFold
import IPython
from IPython.display import clear_output
import copy

In [2]:
print('python:{}'.format('3.7.3'))
print('numpy:{}'.format(np.__version__))
print('pandas:{}'.format(pd.__version__))
print('lightgbm:{}'.format(lightgbm.__version__))
print('catboost:{}'.format(catboost.__version__))
print('sklearn:{}'.format(sklearn.__version__))
print('IPython:{}'.format(IPython.__version__))
print('copy:{}'.format('기본모듈'))
print('os:{}'.format('window10'))
print('cpu:{}'.format('i7-8750h'))
print('gpu:{}'.format('gtx1050ti'))
print('ram:{}'.format('ddr4 2667hz 8GB * 2'))

python:3.7.3
numpy:1.19.5
pandas:1.2.4
lightgbm:3.2.1
catboost:0.25.1
sklearn:0.23.2
IPython:7.15.0
copy:기본모듈
os:window10
cpu:i7-8750h
gpu:gtx1050ti
ram:ddr4 2667hz 8GB * 2


In [3]:
FILE_PATH = '.\\data'

cat_mae_params = {
    'objective': 'MAE',
    'n_estimators': 10000,
    'early_stopping_rounds': 4, 
} #catboost hyper parameter

lgbm_mae_params = {
    'objective': 'MAE',
    'boosting_type': 'goss',
    'n_estimators': 10000,
    'early_stopping_round': 15, 
    'num_leaves':39,
} #lightgbm hyper parameter

In [4]:
def CDH(xs): #cooling degree hour를 구현
    ys = []
    for i in range(len(xs)):
        if i < 11:
            ys.append(np.sum(xs[:(i+1)]-26))
        else:
            ys.append(np.sum(xs[(i-11):(i+1)]-26))
    return np.array(ys) 

def detect_outliers(df,ratio): #iqr 이상치제거 
    outlier_indices = [] 
    Q1 = np.percentile(df, 25) 
    Q3 = np.percentile(df, 75) 
    IQR = Q3 - Q1 
    outlier_step = ratio * IQR 
    return ~(df < Q1 - outlier_step) | (df > Q3 + outlier_step)

In [5]:
train_df = pd.read_csv(FILE_PATH+'\\train.csv', encoding = "cp949") #train_csv

train_df['date_time'] = pd.to_datetime(train_df['date_time'])
train_df['dayofyear'] = train_df['date_time'].dt.dayofyear
train_df['hour'] = train_df['date_time'].dt.hour
train_df['weekday'] = train_df['date_time'].dt.weekday #time feature

train_df['hour_te'] = np.sin(2*np.pi*(train_df['hour'])/23)
train_df['hour_te1'] = np.cos(2*np.pi*(train_df['hour'])/23) #time encoding hour

t = 9/5*train_df['기온(°C)']
train_df['불쾌지수'] = t - 0.55*(1-train_df['습도(%)']/100)*(t-26)+32
train_df['불쾌지수'] = pd.cut(train_df['불쾌지수'], bins = [0, 68, 75, 80, 200], labels = [1,2,3,4]) #불쾌지수는 카테고리로 나누는게 성능상승에 도움이 됨                                 

train_dfs  = []
for i in range(1,61):
    train_dfs.append(train_df[train_df['num']==i]) 
    
    
for i in range(len(train_dfs)):
    train_dfs[i] = train_dfs[i].drop(columns=['풍속(m/s)','강수량(mm)','일조(hr)','num',
                                              'date_time','비전기냉방설비운영','태양광보유']) #쓸모없는 특징 drop
    

test_df = pd.read_csv(FILE_PATH+'\\test.csv', encoding = "cp949")
test_df['date_time'] = pd.to_datetime(test_df['date_time'])

for i in range(1,61):
    test_df[test_df['num']==i] = test_df[test_df['num']==i].interpolate(method='values') #기상예보값 interpolate 
    
test_df['dayofyear'] = test_df['date_time'].dt.dayofyear
test_df['hour'] = test_df['date_time'].dt.hour
test_df['weekday'] = test_df['date_time'].dt.weekday #time feature

test_df['hour_te'] = np.sin(2*np.pi*(test_df['hour'])/23)
test_df['hour_te1'] = np.cos(2*np.pi*(test_df['hour'])/23) #time encoding hour

t = 9/5*test_df['기온(°C)']
test_df['불쾌지수'] = t - 0.55*(1-test_df['습도(%)']/100)*(t-26)+32
test_df['불쾌지수'] = pd.cut(test_df['불쾌지수'], bins = [0, 68, 75, 80, 200], labels = [1,2,3,4]) #불쾌지수는 카테고리로 나누는게 성능상승에 도움이 됨
    
test_dfs  = []
for i in range(1,61):
    test_dfs.append(test_df[test_df['num']==i])
    
for i in range(len(test_dfs)):
    test_dfs[i] = test_dfs[i].drop(columns=['풍속(m/s)','강수량(mm, 6시간)','일조(hr, 3시간)','num',
                                            'date_time','비전기냉방설비운영','태양광보유']) #쓸모없는 특징 drop

    
for i in range(60): #cdh 특징 추가
    train_dfs[i]['cdh'] = CDH(np.concatenate([train_dfs[i]['기온(°C)'].values,test_dfs[i]['기온(°C)'].values]))[:-len(test_dfs[i])]
    test_dfs[i]['cdh'] = CDH(np.concatenate([train_dfs[i]['기온(°C)'].values,test_dfs[i]['기온(°C)'].values]))[-len(test_dfs[i]):]

    
#train_x와 train_y로 나눔    
train_x = [] 
train_y = []
for i in range(len(train_dfs)):
    train_x.append(copy.deepcopy(train_dfs[i][train_dfs[i].columns[1:]])) 
    train_y.append(copy.deepcopy(train_dfs[i][train_dfs[i].columns[0]]))
#이상치 제거 iqr은 1.25
for i in range(60):    
    idx = detect_outliers(train_y[i],1.25)
    train_y[i] = train_y[i][idx]
    train_x[i] = train_x[i][idx]

In [6]:
#과적합 방지를 위해 여러 k_fold로 반복하도록 설정
#특징중 몇개를 뺄경우 성능 향상을 기대할수 있고 과적합 또한 방지 가능하다
random_seed = 0
dcs = [[],['기온(°C)'], ['습도(%)'], ['hour_te','hour_te1'], ['불쾌지수'], ['cdh']]
ks = [2,3,4,5,6,7,8,9,10,4]

In [7]:
train_dfs[0]

Unnamed: 0,전력사용량(kWh),기온(°C),습도(%),dayofyear,hour,weekday,hour_te,hour_te1,불쾌지수,cdh
0,8179.056,17.6,92.0,153,0,0,0.000000e+00,1.000000,1,-8.4
1,8135.640,17.7,91.0,153,1,0,2.697968e-01,0.962917,1,-16.7
2,8107.128,17.5,91.0,153,2,0,5.195840e-01,0.854419,1,-25.2
3,8048.808,17.1,91.0,153,3,0,7.308360e-01,0.682553,1,-34.1
4,8043.624,17.0,92.0,153,4,0,8.878852e-01,0.460065,1,-43.1
...,...,...,...,...,...,...,...,...,...,...
2035,8714.952,29.4,66.0,237,19,0,-8.878852e-01,0.460065,3,43.7
2036,8740.224,28.7,69.0,237,20,0,-7.308360e-01,0.682553,3,47.3
2037,8730.504,28.3,71.0,237,21,0,-5.195840e-01,0.854419,3,48.7
2038,8725.968,28.3,72.0,237,22,0,-2.697968e-01,0.962917,3,48.4


In [8]:
answer_df = pd.read_csv(FILE_PATH+'\\sample_submission.csv', encoding = "cp949")

for dc in dcs:#d특정 feature dc를 drop 시킴
    for k in ks:#kfold 의 nspilt 의 값 k
        folds = []
        for i in range(len(train_dfs)):
            cross=KFold(n_splits=k,shuffle=True,random_state=random_seed)
            fold=[]
            for train_idx, valid_idx in cross.split(train_x[i], train_y[i]):
                fold.append((train_idx, valid_idx))
            folds.append(fold)
            
        for i in range(len(train_dfs)):
            for fold in range(k):
                print(dc,random_seed,k,i)
                train_idx, valid_idx = folds[i][fold]
                X_train=np.array(train_x[i].drop(columns=dc).iloc[train_idx])
                y_train=np.array(train_y[i].iloc[train_idx])
                X_valid=np.array(train_x[i].drop(columns=dc).iloc[valid_idx])
                y_valid=np.array(train_y[i].iloc[valid_idx])
                #catboost 학습 
                model=CatBoostRegressor(**cat_mae_params)
                model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=100)
                v = model.predict(np.array(test_dfs[i][train_x[i].drop(columns=dc).columns])) * 0.3
                #lgbm 학습 
                model=LGBMRegressor(**lgbm_mae_params)
                model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=100)        
                v += model.predict(np.array(test_dfs[i][train_x[i].drop(columns=dc).columns])) * 0.7
                
                answer_df['answer'].iloc[(i)*168:(i+1)*168] += v/(len(ks)*k*len(dcs))
                clear_output(True) 
                
        random_seed += 1
answer_df.to_csv(FILE_PATH+'\\answer.csv', index=False) #파일 저장

['cdh'] 59 4 59
0:	learn: 325.8970393	test: 329.6017457	best: 329.6017457 (0)	total: 9.83ms	remaining: 1m 38s
100:	learn: 104.6315428	test: 114.0862015	best: 114.0862015 (100)	total: 137ms	remaining: 13.5s
200:	learn: 86.8133323	test: 98.8330558	best: 98.8330558 (200)	total: 265ms	remaining: 12.9s
300:	learn: 77.4305400	test: 92.4628681	best: 92.4628681 (300)	total: 391ms	remaining: 12.6s
Stopped by overfitting detector  (4 iterations wait)

bestTest = 89.84457727
bestIteration = 385

Shrink model to first 386 iterations.
Training until validation scores don't improve for 15 rounds
[100]	valid_0's l1: 81.473




[200]	valid_0's l1: 79.4796
[300]	valid_0's l1: 78.39
[400]	valid_0's l1: 77.7507
Early stopping, best iteration is:
[453]	valid_0's l1: 77.4105


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
