In [1]:
pip install catboost

Collecting catboost
  Downloading catboost-1.0.0-cp37-none-manylinux1_x86_64.whl (76.4 MB)
[K     |████████████████████████████████| 76.4 MB 37 kB/s 
Installing collected packages: catboost
Successfully installed catboost-1.0.0


In [2]:
import warnings
warnings.filterwarnings(action='ignore')
import os, sys
from tqdm import tqdm

import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import lightgbm as lgbm
from lightgbm import LGBMRegressor
import catboost as catb
from catboost import CatBoostRegressor

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

In [3]:
# Read the training data and the sample-submission template (CSV files under
# the Dacon_Solar directory).
train = pd.read_csv('/content/drive/MyDrive/Dacon_Solar/train/train.csv')
submission = pd.read_csv('/content/drive/MyDrive/Dacon_Solar/sample_submission.csv')

In [4]:
def sum_hour_minute(train):
    """
    Merge the ``Hour`` and ``Minute`` columns into one fractional ``Hour``.

    input: df with numeric ``Hour`` and ``Minute`` columns
    output: a new df (the input frame is left untouched) in which
        * ``Minute`` is expressed as a fraction of an hour (30 -> 0.5)
        * ``Hour`` is ``Hour + Minute`` (e.g. 7h30m -> 7.5)
    """
    # Work on a copy so the caller's frame is not modified behind its back.
    result = train.copy()
    # Exact minutes -> hours conversion. The former `round(Minute * 0.017, 1)`
    # was only correct for the values 0 and 30.
    result['Minute'] = result['Minute'] / 60
    # Add the fractional part to the hour.
    result['Hour'] = result['Hour'] + result['Minute']

    return result

In [5]:
# Fold the Minute column into Hour (e.g. 7h30m becomes 7.5).
train = sum_hour_minute(train)

In [6]:
# Build the targets: TARGET_1 is TARGET 48 rows ahead, TARGET_2 is TARGET 96 rows ahead.
train['TARGET_1'] = train['TARGET'].shift(-48)
train['TARGET_2'] = train['TARGET'].shift(-96)

In [7]:
# Drop the columns that are not used for training.
train = train.drop(columns=['Day', 'Minute'])

In [8]:
# Display the remaining column names.
train.columns

Index(['Hour', 'DHI', 'DNI', 'WS', 'RH', 'T', 'TARGET', 'TARGET_1',
       'TARGET_2'],
      dtype='object')

In [9]:
# Columns for which lagged copies are created.
shifted_columns = ['DHI', 'DNI', 'WS', 'RH', 'T', 'TARGET']

# How many days of history to use.
periods = 6 # feed n days of data (day t included) # 6 days gave the best performance
# Multiples of 48 rows, from 48 up to (periods - 1) * 48.
lags = list(np.arange(48, periods*48, 48))

In [10]:
def shift_columns(data, shifted_columns, lags):
    """
    Return a copy of ``data`` extended with lagged versions of some columns.

    For every lag (outer order) and every column in ``shifted_columns`` (inner
    order) a column named ``<col>_lag_<lag>`` is appended that holds
    ``<col>`` shifted down by ``lag`` rows. ``data`` itself is not modified.
    """
    lagged = data.copy()
    pairs = [(step, name) for step in lags for name in shifted_columns]
    for step, name in pairs:
        lagged[name + '_lag_' + str(step)] = lagged[name].shift(step)

    return lagged

In [11]:
# The un-shifted train frame is still needed to build the test set,
# so the lagged version is stored in a new DataFrame.
shifted_train = shift_columns(train, shifted_columns, lags)
shifted_train.shape

(52560, 39)

In [12]:
# Shifting leaves NaN values behind, so the affected rows have to be dropped.
shifted_train = shifted_train.dropna()
shifted_train.shape

(52224, 39)

In [13]:
# 48 rows per day x 7 days x 52 weeks x 2 years + 48 rows (cut off at the front) = 34992.
# The first 34992 rows are used for training, the remainder for validation.
X = shifted_train.iloc[:34992]
val = shifted_train.iloc[34992:]

In [14]:
def make_train_test_val(X, val):
    """
    Split the training and validation frames into features and the two targets.

    The features are every column except ``TARGET_1`` and ``TARGET_2``. The
    shapes of the six resulting objects are printed before they are returned.

    returns: x_train, y_train_1, y_train_2, x_val, y_val_1, y_val_2
    """
    target_names = ['TARGET_1', 'TARGET_2']

    def _split(frame):
        # Everything that is not a target column is a feature.
        features = frame.loc[:, ~frame.columns.isin(target_names)]
        return features, frame[target_names[0]], frame[target_names[1]]

    x_train, y_train_1, y_train_2 = _split(X)
    x_val, y_val_1, y_val_2 = _split(val)

    pieces = (x_train, y_train_1, y_train_2, x_val, y_val_1, y_val_2)
    for piece in pieces:
        print(piece.shape)

    return pieces

In [15]:
x_train, y_train_1, y_train_2, x_val, y_val_1, y_val_2 = make_train_test_val(X, val)

(34992, 37)
(34992,)
(34992,)
(17232, 37)
(17232,)
(17232,)


In [16]:
# Load the test set.

df_test = []

for i in range(81):
    file_path = '/content/drive/MyDrive/Dacon_Solar/test/' + str(i) + '.csv'
    # Preprocessing: merge hour and minute, then add the lagged columns.
    temp = shift_columns(sum_hour_minute(pd.read_csv(file_path)), shifted_columns, lags)
    # Drop the columns that are not used as features.
    temp = temp.drop(['Day', 'Minute'], axis=1)

    # Keep only the final 48 rows, i.e. the last day of the file.
    df_test.append(temp.iloc[-48:])

X_test = pd.concat(df_test)
print(X_test.shape)

(3888, 37)


In [17]:
quantiles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

# Get the model and the predictions in (a) - (b)
def LGBM(q, X_train, Y_train, X_valid, Y_valid, X_test):
    """
    Fit one LightGBM quantile regressor and predict ``X_test``.

    q: quantile level, handed to LightGBM as ``alpha``
    X_train, Y_train: training features and target
    X_valid, Y_valid: validation set that drives early stopping
    X_test: features to predict

    returns: (pred, model) where ``pred`` is a pd.Series with the test
        predictions rounded to 2 decimals and ``model`` is the fitted regressor
    """
    # (a) Modeling
    model = LGBMRegressor(objective='quantile',  # required for quantile regression
                          alpha=q,  # the quantile to fit
                          n_estimators=10000,
                          learning_rate=0.027,
                          # `bagging_fraction=0.7` used to be passed as well. It is an
                          # alias of `subsample`, so the same setting was given twice
                          # with the same value; only one spelling is kept.
                          subsample=0.7)
    # NOTE(review): LightGBM appears to apply row subsampling only when a bagging
    # frequency (`subsample_freq`) > 0 is set, so `subsample` may currently have
    # no effect - confirm before relying on it.

    # Stop when the validation quantile loss has not improved for 300 rounds;
    # progress is logged every 500 iterations.
    model.fit(X_train,
              Y_train,
              eval_metric = ['quantile'],
              eval_set=[(X_valid, Y_valid)],
              early_stopping_rounds=300, verbose=500)

    # (b) Predictions
    pred = pd.Series(model.predict(X_test).round(2))
    return pred, model

# Predict the target

def train_data(X_train, Y_train, X_valid, Y_valid, X_test):
    """
    Fit one LightGBM model per entry of the module-level ``quantiles`` list.

    X_train, Y_train: training features and target
    X_valid, Y_valid: validation set passed on to ``LGBM`` for early stopping
    X_test: features to predict

    returns: (LGBM_models, LGBM_actual_pred)
        LGBM_models: list with the fitted model of every quantile, in order
        LGBM_actual_pred: DataFrame with one column per quantile (columns are
            the quantile levels) holding the predictions for ``X_test``
    """
    LGBM_models = []
    preds = []

    for q in quantiles:  # one fit per quantile
        print(q)
        pred, model = LGBM(q, X_train, Y_train, X_valid, Y_valid, X_test)
        LGBM_models.append(model)
        preds.append(pred)

    # Assemble the frame once instead of growing it with a concat per iteration
    # (which also started from an empty DataFrame).
    LGBM_actual_pred = pd.concat(preds, axis=1) if preds else pd.DataFrame()
    LGBM_actual_pred.columns = quantiles  # label the columns with the quantile levels

    return LGBM_models, LGBM_actual_pred

In [18]:
# TARGET_1: nine quantile models and their predictions for X_test.
lgbm_models_1, results_1 = train_data(x_train, y_train_1, x_val, y_val_1, X_test)

# TARGET_2: same procedure with the second target.
lgbm_models_2, results_2 = train_data(x_train, y_train_2, x_val, y_val_2, X_test)

0.1
Training until validation scores don't improve for 300 rounds.
Early stopping, best iteration is:
[114]	valid_0's quantile: 1.45124
0.2
Training until validation scores don't improve for 300 rounds.
Early stopping, best iteration is:
[140]	valid_0's quantile: 2.33869
0.3
Training until validation scores don't improve for 300 rounds.
[500]	valid_0's quantile: 2.78474
Early stopping, best iteration is:
[311]	valid_0's quantile: 2.776
0.4
Training until validation scores don't improve for 300 rounds.
[500]	valid_0's quantile: 2.88855
Early stopping, best iteration is:
[243]	valid_0's quantile: 2.88262
0.5
Training until validation scores don't improve for 300 rounds.
[500]	valid_0's quantile: 2.75777
Early stopping, best iteration is:
[367]	valid_0's quantile: 2.75405
0.6
Training until validation scores don't improve for 300 rounds.
[500]	valid_0's quantile: 2.45842
Early stopping, best iteration is:
[374]	valid_0's quantile: 2.45314
0.7
Training until validation scores don't improve

In [19]:
# Rows whose id contains "Day7" receive the TARGET_1 predictions, rows whose id
# contains "Day8" the TARGET_2 predictions; every column from "q_0.1" onwards is filled.
submission.loc[submission.id.str.contains("Day7"), "q_0.1":] = results_1.sort_index().values
submission.loc[submission.id.str.contains("Day8"), "q_0.1":] = results_2.sort_index().values
# Show a few rows as a sanity check.
submission[14:33]

Unnamed: 0,id,q_0.1,q_0.2,q_0.3,q_0.4,q_0.5,q_0.6,q_0.7,q_0.8,q_0.9
14,0.csv_Day7_7h00m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15,0.csv_Day7_7h30m,0.37,0.35,0.48,0.59,1.11,1.52,2.06,2.94,4.88
16,0.csv_Day7_8h00m,2.24,2.92,3.54,3.88,4.97,5.85,6.57,7.53,7.99
17,0.csv_Day7_8h30m,3.22,6.01,6.74,6.16,9.25,10.18,12.53,12.24,15.03
18,0.csv_Day7_9h00m,4.84,7.76,10.29,12.02,15.25,16.48,20.25,21.97,20.01
19,0.csv_Day7_9h30m,6.39,9.97,12.12,15.83,19.97,17.66,25.95,27.58,29.79
20,0.csv_Day7_10h00m,8.31,12.16,15.48,18.22,24.76,20.79,29.08,28.32,32.04
21,0.csv_Day7_10h30m,10.45,16.67,20.98,23.5,28.33,28.5,36.74,36.43,34.78
22,0.csv_Day7_11h00m,13.53,19.56,26.08,31.54,34.03,37.09,40.58,42.35,42.27
23,0.csv_Day7_11h30m,13.74,19.03,23.7,30.78,33.0,37.25,42.48,44.48,41.95


In [20]:
# Keep a copy of the LightGBM submission; `submission` is overwritten below.
submission_lgbm_shifted = submission.copy()

In [21]:
# base model
def CATB(X_train, Y_train, X_valid, Y_valid, X_test):
    """
    Fit a single CatBoost regressor and derive nine "quantile" columns from it.

    Only one model is trained (objective ``'Quantile'`` with no explicit
    alpha). The column for quantile ``q`` is the rounded prediction multiplied
    by ``q + 0.5``, i.e. by 0.6 for q=0.1 up to 1.4 for q=0.9.

    NOTE(review): the columns are a rescaling of one prediction, not separately
    fitted quantiles. For a negative prediction the scaling reverses the order
    (the q=0.9 column becomes the smallest) - confirm this is acceptable.

    X_train, Y_train: training features and target
    X_valid, Y_valid: validation set used for early stopping
    X_test: features to predict

    returns: (model, CATB_actual_pred)
        model: the fitted CatBoostRegressor
        CATB_actual_pred: DataFrame with one column per quantile level
    """
    quantiles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

    # (a) Modeling
    model = CatBoostRegressor(objective='Quantile',
                              iterations=100000,
                              learning_rate=0.027)

    # Early stopping ends training once the validation loss has not improved
    # for 300 iterations; progress is logged every 500 iterations.
    model.fit(X_train,
              Y_train,
              eval_set=[(X_valid, Y_valid)],
              early_stopping_rounds=300,
              verbose=500)

    # (b) Predictions
    pred = pd.Series(model.predict(X_test).round(2))

    # Build all nine columns in one go instead of concatenating onto an
    # initially empty frame inside a loop; the column labels are the quantiles.
    CATB_actual_pred = pd.DataFrame({q: pred * (q + 0.5) for q in quantiles})

    return model, CATB_actual_pred

In [22]:
# TARGET_1 (note: results_1 / results_2 from the LightGBM run are overwritten here).
models_1, results_1 = CATB(x_train, y_train_1, x_val, y_val_1, X_test)
# TARGET_2
models_2, results_2 = CATB(x_train, y_train_2, x_val, y_val_2, X_test)

0:	learn: 8.6976866	test: 8.7241896	best: 8.7241896 (0)	total: 59.9ms	remaining: 1h 39m 54s
500:	learn: 2.3221834	test: 2.7292416	best: 2.7212339 (397)	total: 4.91s	remaining: 16m 14s
Stopped by overfitting detector  (300 iterations wait)

bestTest = 2.721233861
bestIteration = 397

Shrink model to first 398 iterations.
0:	learn: 8.7029060	test: 8.7194443	best: 8.7194443 (0)	total: 15.8ms	remaining: 26m 19s
500:	learn: 2.3835048	test: 2.8171050	best: 2.8139327 (447)	total: 4.89s	remaining: 16m 10s
Stopped by overfitting detector  (300 iterations wait)

bestTest = 2.81393267
bestIteration = 447

Shrink model to first 448 iterations.


In [23]:
# Write the CatBoost-based predictions into the submission: "Day7" rows get the
# TARGET_1 results, "Day8" rows the TARGET_2 results.
submission.loc[submission.id.str.contains("Day7"), "q_0.1":] = results_1.sort_index().values
submission.loc[submission.id.str.contains("Day8"), "q_0.1":] = results_2.sort_index().values
# Show a few rows as a sanity check.
submission[14:33]

Unnamed: 0,id,q_0.1,q_0.2,q_0.3,q_0.4,q_0.5,q_0.6,q_0.7,q_0.8,q_0.9
14,0.csv_Day7_7h00m,-0.012,-0.014,-0.016,-0.018,-0.02,-0.022,-0.024,-0.026,-0.028
15,0.csv_Day7_7h30m,1.032,1.204,1.376,1.548,1.72,1.892,2.064,2.236,2.408
16,0.csv_Day7_8h00m,2.478,2.891,3.304,3.717,4.13,4.543,4.956,5.369,5.782
17,0.csv_Day7_8h30m,5.448,6.356,7.264,8.172,9.08,9.988,10.896,11.804,12.712
18,0.csv_Day7_9h00m,8.994,10.493,11.992,13.491,14.99,16.489,17.988,19.487,20.986
19,0.csv_Day7_9h30m,12.474,14.553,16.632,18.711,20.79,22.869,24.948,27.027,29.106
20,0.csv_Day7_10h00m,15.048,17.556,20.064,22.572,25.08,27.588,30.096,32.604,35.112
21,0.csv_Day7_10h30m,18.276,21.322,24.368,27.414,30.46,33.506,36.552,39.598,42.644
22,0.csv_Day7_11h00m,20.382,23.779,27.176,30.573,33.97,37.367,40.764,44.161,47.558
23,0.csv_Day7_11h30m,20.058,23.401,26.744,30.087,33.43,36.773,40.116,43.459,46.802


In [24]:
# Keep a copy of the CatBoost submission; `submission` is overwritten again below.
submission_catb_shifted = submission.copy()

In [25]:
# Start over from the raw training file for the per-time-slot models and
# merge Hour and Minute again.
train = pd.read_csv('/content/drive/MyDrive/Dacon_Solar/train/train.csv')
train = sum_hour_minute(train)

In [26]:
# Columns for which lagged copies are created.
shifted_columns = ['DHI', 'DNI', 'WS', 'RH', 'T', 'TARGET']

# How many days of history to use.
periods = 3 # feed n days of data (day t included) # copied note: "6 days gave the best performance" - this cell uses 3
# Multiples of 48 rows, from 48 up to (periods - 1) * 48.
lags = list(np.arange(48, periods*48, 48))

In [27]:
# Add the lagged columns for the new lag configuration.
train = shift_columns(train, shifted_columns, lags)
train.shape

(52560, 21)

In [28]:
# Split the training data by its 48 distinct Hour values into 48 separate train sets.

trains = []

for i in range(48):
    idx = i*0.5
    # Explicit copy, so the target columns added below land in an independent
    # frame instead of a filtered slice of `train`.
    hourly = train[train.Hour == idx].copy()
    # Targets are the TARGET value one and two rows further down within this slot.
    # assumes the slot holds one row per day in chronological order - TODO confirm
    hourly['TARGET_1'] = hourly['TARGET'].shift(-1)
    hourly['TARGET_2'] = hourly['TARGET'].shift(-2)
    # Remove rows made incomplete by the shifts, then the unused columns.
    hourly = hourly.dropna().drop(['Day', 'Minute'], axis=1)

    # Also published as the global `train_<i>` because other cells inspect
    # e.g. `train_0` directly.
    globals()['train_{}'.format(i)] = hourly
    trains.append(hourly)

In [29]:
# Spot-check the shapes of three of the per-slot train sets.
train_0.shape, train_15.shape, train_45.shape

((1091, 21), (1091, 21), (1091, 21))

In [30]:
# The lag configuration changed, so the test set is loaded again.

df_test = []

for i in range(81):
    file_path = '/content/drive/MyDrive/Dacon_Solar/test/' + str(i) + '.csv'
    # Preprocessing: merge hour and minute, then add the lagged columns.
    temp = shift_columns(sum_hour_minute(pd.read_csv(file_path)), shifted_columns, lags)
    # Keep only the final 48 rows, i.e. the last day of the file.
    df_test.append(temp.iloc[-48:])

X_test = pd.concat(df_test)
print(X_test.shape)

(3888, 21)


In [31]:
# Likewise, build 48 test sets, one per Hour value.

tests = []

for i in range(48):
    idx = i*0.5
    # Filter and drop in one expression: this yields a fresh frame, so nothing
    # is modified in place on a filtered slice of X_test.
    hourly_test = X_test[X_test.Hour == idx].drop(['Day', 'Minute'], axis=1)

    # Also published as the global `test_<i>` because other cells inspect
    # e.g. `test_0` directly.
    globals()['test_{}'.format(i)] = hourly_test
    tests.append(hourly_test)

In [32]:
# Compare the shape of the first per-slot train set with its test counterpart.
train_0.shape, test_0.shape

((1091, 21), (81, 19))

In [34]:
# Lists that collect the predictions of every time slot.
results_1 = []
results_2 = []

for tr, te in zip(trains, tests):
    # Hour has a single value inside a slot, so it is removed.
    # `errors='ignore'` replaces the former bare `except: pass`: a missing Hour
    # column is tolerated, while any other error is no longer swallowed. The
    # frames stored in `trains` / `tests` are left unmodified.
    tr = tr.drop(columns=['Hour'], errors='ignore')
    te = te.drop(columns=['Hour'], errors='ignore')

    # Split tr into a training part (first 730 rows) and a validation part (the rest).
    X = tr[:730]
    val = tr[730:]

    # Select the features by name rather than by position, so the split does
    # not depend on the targets being the last two columns.
    feature_cols = [c for c in tr.columns if c not in ('TARGET_1', 'TARGET_2')]

    x_train = X[feature_cols]
    y_train_1 = X['TARGET_1']
    y_train_2 = X['TARGET_2']

    x_val = val[feature_cols]
    y_val_1 = val['TARGET_1']
    y_val_2 = val['TARGET_2']

    # Hand the test frame over with exactly the training columns, in the same order.
    x_test = te[feature_cols]

    _, result_1 = train_data(x_train, y_train_1, x_val, y_val_1, x_test)
    _, result_2 = train_data(x_train, y_train_2, x_val, y_val_2, x_test)

    results_1.append(result_1)
    results_2.append(result_2)

0.1
Training until validation scores don't improve for 300 rounds.
Early stopping, best iteration is:
[1]	valid_0's quantile: 0
0.2
Training until validation scores don't improve for 300 rounds.
Early stopping, best iteration is:
[1]	valid_0's quantile: 0
0.3
Training until validation scores don't improve for 300 rounds.
Early stopping, best iteration is:
[1]	valid_0's quantile: 0
0.4
Training until validation scores don't improve for 300 rounds.
Early stopping, best iteration is:
[1]	valid_0's quantile: 0
0.5
Training until validation scores don't improve for 300 rounds.
Early stopping, best iteration is:
[1]	valid_0's quantile: 0
0.6
Training until validation scores don't improve for 300 rounds.
Early stopping, best iteration is:
[1]	valid_0's quantile: 0
0.7
Training until validation scores don't improve for 300 rounds.
Early stopping, best iteration is:
[1]	valid_0's quantile: 0
0.8
Training until validation scores don't improve for 300 rounds.
Early stopping, best iteration is:
[1

In [35]:
# Row offsets 0, 96, 192, ... (step 96, below 7776).
rg = np.arange(0, 7776, 96)

# In the submission, rows 0-47 of each block of 96 are Day7 and rows 48-95 are Day8.
# The i-th entry of a results list belongs to time slot i, so it is written to
# rows i, 96+i, 192+i, ...; the Day8 results use an extra offset of 48.
for offset, slot_results in ((0, results_1), (48, results_2)):
    for i, res in enumerate(slot_results):  # 48 iterations, one per time slot
        submission.iloc[rg + offset + i, 1:] = res.values

In [36]:
# Keep a copy of the submission produced by the 48 per-slot models.
submission_48models = submission.copy()

In [37]:
# Extract the values of the three saved submissions, with `id` moved to the
# index so that only the quantile columns remain.
multi_res = submission_48models.set_index('id').values
catb_res = submission_catb_shifted.set_index('id').values
lgbm_res = submission_lgbm_shifted.set_index('id').values

In [38]:
# Ensemble: element-wise mean of the three prediction arrays.
ens_res = (multi_res + lgbm_res + catb_res) / 3

In [39]:
# Many values lie between 0 and 0.1 without being exactly 0.
# In case they cost points in the score, all of them are set to 0.
# A boolean mask replaces the former Python loop over every cell; as before,
# every value below 0.1 (negative ones included) becomes 0.
ens_res[ens_res < 0.1] = 0

In [40]:
# Put the ensembled values into every column from "q_0.1" onwards.
submission.loc[:, "q_0.1":] = ens_res

In [None]:
 # Write the final ensemble submission to a CSV file without the index column.
 submission.to_csv('/content/drive/MyDrive/Dacon_Solar/submission_210126-3_ensemble-best-trio-finalcode.csv', index=False)