In [None]:
pip install catboost

In [62]:
import warnings
warnings.filterwarnings(action='ignore')
import os, sys
from tqdm import tqdm

import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import lightgbm as lgbm
from lightgbm import LGBMRegressor
import catboost as catb
from catboost import CatBoostRegressor

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

In [63]:
# Read the training table and the sample submission from the Dacon_Solar folder.
data_root = '/content/drive/MyDrive/Dacon_Solar/'
train = pd.read_csv(data_root + 'train/train.csv')
submission = pd.read_csv(data_root + 'sample_submission.csv')

In [64]:
def sum_hour_minute(train):
    """
    Fold the Minute column into Hour so each row carries a fractional hour.

    input: df with numeric 'Hour' and 'Minute' columns
    output: the same df object (it is modified in place and also returned)
    summary:
    Minute is converted from minutes to a fraction of an hour
    (0 -> 0.0, 30 -> 0.5) and then added to Hour.

    The conversion divides by 60 exactly; the previous `* 0.017` followed by
    rounding to one decimal was only correct for the values 0 and 30.
    Because the frame is mutated, call this once per freshly loaded frame:
    a second call on the same frame would add the fraction again.
    """
    # Minutes -> fraction of an hour.
    train['Minute'] = train['Minute'] / 60
    # Add the fractional part onto the hour.
    train['Hour'] = train['Hour'] + train['Minute']

    return train

In [65]:
# Merge Hour and Minute into a single fractional Hour column.
train = sum_hour_minute(train=train)

In [66]:
# Build the targets: TARGET shifted back by 48 rows and by 96 rows.
train['TARGET_1'] = train['TARGET'].shift(-48)
train['TARGET_2'] = train['TARGET'].shift(-96)

In [67]:
# Discard the columns that are not used for training.
train = train.drop(columns=['Day', 'Minute'])

In [68]:
# Display the columns that remain after dropping Day and Minute.
train.columns

Index(['Hour', 'DHI', 'DNI', 'WS', 'RH', 'T', 'TARGET', 'TARGET_1',
       'TARGET_2'],
      dtype='object')

In [69]:
# Columns for which lagged copies are created.
shifted_columns = ['DHI', 'DNI', 'WS', 'RH', 'T', 'TARGET']

# Number of days fed to the model, counting the current day.
# Author's note: 6 days gave the best score.
periods = 6
# One lag per previous day, in steps of 48 rows: 48, 96, ..., 48 * (periods - 1).
lags = list(np.arange(1, periods) * 48)

In [70]:
def shift_columns(data, shifted_columns, lags):
    """
    Return a copy of `data` with lagged versions of selected columns appended.

    For every lag in `lags` (outer order) and every column in
    `shifted_columns` (inner order) a column named '<col>_lag_<lag>' is added,
    holding that column shifted down by `lag` rows. `data` itself is left
    untouched.
    """
    lagged = {
        col + '_lag_%s' % lag: data[col].shift(lag)
        for lag in lags
        for col in shifted_columns
    }
    return data.assign(**lagged)

In [71]:
# The un-shifted `train` is still needed later to build the test set,
# so the lagged version goes into a new frame.
shifted_train = shift_columns(data=train, shifted_columns=shifted_columns, lags=lags)
shifted_train.shape

(52560, 39)

In [72]:
# Shifting leaves NaN rows at the edges; remove them before training.
shifted_train = shifted_train.dropna()
shifted_train.shape

(52224, 39)

In [73]:
# 48 rows per day x 7 days x 52 weeks x 2 years + 48 rows (cut off at the front) = 34992
n_train_rows = 34992
X = shifted_train.iloc[:n_train_rows]
val = shifted_train.iloc[n_train_rows:]

In [74]:
def make_train_test_val(X, val):
    """
    Split a training frame and a validation frame into features and targets.

    Features are every column except 'TARGET_1' and 'TARGET_2'; the two
    targets are returned as separate Series. The shape of each of the six
    pieces is printed in return order.

    Returns: x_train, y_train_1, y_train_2, x_val, y_val_1, y_val_2
    """
    target_names = ['TARGET_1', 'TARGET_2']

    def _split(frame):
        # Keep every column that is not one of the two targets.
        features = frame.loc[:, ~frame.columns.isin(target_names)]
        return features, frame['TARGET_1'], frame['TARGET_2']

    x_train, y_train_1, y_train_2 = _split(X)
    x_val, y_val_1, y_val_2 = _split(val)

    for part in (x_train, y_train_1, y_train_2, x_val, y_val_1, y_val_2):
        print(part.shape)

    return x_train, y_train_1, y_train_2, x_val, y_val_1, y_val_2

In [75]:
# Separate features and the two targets for both the training and validation parts.
x_train, y_train_1, y_train_2, x_val, y_val_1, y_val_2 = make_train_test_val(X=X, val=val)

(34992, 37)
(34992,)
(34992,)
(17232, 37)
(17232,)
(17232,)


In [76]:
# test set load

def _last_day_features(path):
    """Read one test file, apply the training preprocessing and keep its last 48 rows."""
    frame = sum_hour_minute(pd.read_csv(path))  # merge Hour and Minute
    frame = shift_columns(frame, shifted_columns, lags)  # add lagged columns
    frame = frame.drop(columns=['Day', 'Minute'])
    return frame.iloc[-48:]  # last 48 rows, i.e. only the final day


df_test = [
    _last_day_features('/content/drive/MyDrive/Dacon_Solar/test/' + str(i) + '.csv')
    for i in range(81)
]

X_test = pd.concat(df_test)
print(X_test.shape)

(3888, 37)


In [77]:
# Quantile levels 0.1 ... 0.9, one model / one submission column per level.
quantiles = [k / 10 for k in range(1, 10)]

# Get the model and the predictions in (a) - (b)
def LGBM(q, X_train, Y_train, X_valid, Y_valid, X_test):
    """
    Fit one LightGBM quantile-regression model and predict on `X_test`.

    q: quantile level passed to LightGBM as `alpha`.
    X_train, Y_train: training features and target.
    X_valid, Y_valid: evaluation set used for early stopping.
    X_test: features to predict on.

    Returns (pred, model): `pred` is a pd.Series of predictions rounded to
    two decimals, with a fresh 0..n-1 index; `model` is the fitted regressor.
    """
    # (a) Modeling
    # objective='quantile' is what makes this a quantile regression.
    # `bagging_fraction` was dropped because it is an alias of `subsample`;
    # both were set to 0.7.
    # NOTE(review): LightGBM only applies row subsampling when
    # subsample_freq > 0. It is left unset here so results do not shift;
    # confirm whether bagging was intended.
    model = LGBMRegressor(objective='quantile',
                          alpha=q,
                          n_estimators=10000,
                          learning_rate=0.027,
                          subsample=0.7)

    # Early stopping and periodic logging go through callbacks: the
    # `early_stopping_rounds=` / `verbose=` arguments of fit() were removed in
    # LightGBM 4.0. Older releases name the logging callback `print_evaluation`.
    log_evaluation = getattr(lgbm, 'log_evaluation', None) or lgbm.print_evaluation
    model.fit(X_train,
              Y_train,
              eval_metric=['quantile'],
              eval_set=[(X_valid, Y_valid)],
              callbacks=[lgbm.early_stopping(300), log_evaluation(500)])

    # (b) Predictions
    pred = pd.Series(model.predict(X_test).round(2))
    return pred, model

# Predict the target for every quantile level.

def train_data(X_train, Y_train, X_valid, Y_valid, X_test):
    """
    Train one LGBM model per level in the module-level `quantiles` list.

    Each level is printed as its model is trained. Returns
    (LGBM_models, LGBM_actual_pred): the list of fitted models in quantile
    order, and a DataFrame with one prediction column per quantile, labelled
    with the quantile values.
    """
    LGBM_models = []
    preds = []

    for q in quantiles:  # one model per quantile level
        print(q)
        pred, model = LGBM(q, X_train, Y_train, X_valid, Y_valid, X_test)
        LGBM_models.append(model)
        preds.append(pred)

    # Concatenate once instead of growing a frame from an empty DataFrame
    # inside the loop.
    LGBM_actual_pred = pd.concat(preds, axis=1) if preds else pd.DataFrame()
    LGBM_actual_pred.columns = quantiles  # label the columns by quantile

    return LGBM_models, LGBM_actual_pred

In [None]:
# Target1
lgbm_models_1, results_1 = train_data(
    X_train=x_train, Y_train=y_train_1, X_valid=x_val, Y_valid=y_val_1, X_test=X_test)

# Target2
lgbm_models_2, results_2 = train_data(
    X_train=x_train, Y_train=y_train_2, X_valid=x_val, Y_valid=y_val_2, X_test=X_test)

In [None]:
# Rows whose id mentions Day7 take the TARGET_1 predictions, Day8 rows take TARGET_2.
day7_rows = submission.id.str.contains("Day7")
day8_rows = submission.id.str.contains("Day8")
submission.loc[day7_rows, "q_0.1":] = results_1.sort_index().values
submission.loc[day8_rows, "q_0.1":] = results_2.sort_index().values
submission[14:33]

In [80]:
# Keep an independent snapshot of the LightGBM submission; `submission` is overwritten later.
submission_lgbm_shifted = submission.copy(deep=True)

In [81]:
# base model
def CATB(X_train, Y_train, X_valid, Y_valid, X_test):
    """
    Fit a single CatBoost model and derive nine columns from its prediction.

    Only ONE model is trained (objective 'Quantile', no explicit alpha).
    NOTE(review): with no alpha given, CatBoost's default quantile level
    applies (0.5 per the CatBoost docs; confirm).
    The per-quantile columns are NOT separate quantile fits: each one is the
    same prediction multiplied by (q + 0.5), i.e. by 0.6 for q=0.1 up to 1.4
    for q=0.9.

    Returns (model, CATB_actual_pred): the fitted model and a DataFrame with
    one column per quantile level.
    """
    quantiles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

    # (a) Modeling
    model = CatBoostRegressor(objective='Quantile',
                              iterations=100000,
                              learning_rate=0.027)

    # Early stopping on the validation set keeps the large iteration cap practical.
    model.fit(X_train,
              Y_train,
              eval_set=[(X_valid, Y_valid)],
              early_stopping_rounds=300,
              verbose=500)

    # (b) Predictions
    pred = pd.Series(model.predict(X_test).round(2))

    # Concatenate once instead of growing a frame from an empty DataFrame
    # inside a loop.
    CATB_actual_pred = pd.concat([pred * (q + 0.5) for q in quantiles], axis=1)
    CATB_actual_pred.columns = quantiles  # label the columns by quantile

    return model, CATB_actual_pred

In [None]:
# Target1
models_1, results_1 = CATB(
    X_train=x_train, Y_train=y_train_1, X_valid=x_val, Y_valid=y_val_1, X_test=X_test)
# Target2
models_2, results_2 = CATB(
    X_train=x_train, Y_train=y_train_2, X_valid=x_val, Y_valid=y_val_2, X_test=X_test)

In [None]:
# Overwrite the Day7 / Day8 rows with the CatBoost-based columns.
for day_tag, day_result in (("Day7", results_1), ("Day8", results_2)):
    submission.loc[submission.id.str.contains(day_tag), "q_0.1":] = day_result.sort_index().values
submission[14:33]

In [84]:
# Keep an independent snapshot of the CatBoost submission; `submission` is overwritten later.
submission_catb_shifted = submission.copy(deep=True)

In [85]:
# Start again from the raw training file and merge Hour and Minute.
train = sum_hour_minute(pd.read_csv('/content/drive/MyDrive/Dacon_Solar/train/train.csv'))

In [86]:
# Columns for which lagged copies are created.
shifted_columns = ['DHI', 'DNI', 'WS', 'RH', 'T', 'TARGET']

# Number of days fed to the model, counting the current day.
# NOTE(review): the original comment here repeated "6 days gave the best score",
# but this cell sets 3.
periods = 3
# One lag per previous day, in steps of 48 rows: 48, 96, ..., 48 * (periods - 1).
lags = list(np.arange(1, periods) * 48)

In [87]:
# Add the lagged columns to the reloaded training frame.
train = shift_columns(data=train, shifted_columns=shifted_columns, lags=lags)
train.shape

(52560, 21)

In [88]:
# Split the training data by Hour value into 48 separate training sets,
# one per half-hour slot.

trains = []

for i in range(48):
    idx = i * 0.5
    hour_rows = train.loc[train.Hour == idx]
    # Build the frame without mutating a filtered slice: within one slot,
    # consecutive rows are consecutive occurrences of that slot, so shift(-1)
    # and shift(-2) take the value one and two rows ahead as targets.
    # The targets are appended last; later code relies on them being the
    # final two columns.
    hour_train = (
        hour_rows
        .assign(TARGET_1=hour_rows['TARGET'].shift(-1),
                TARGET_2=hour_rows['TARGET'].shift(-2))
        .dropna()
        .drop(['Day', 'Minute'], axis=1)
    )

    # Still exposed as train_0 ... train_47 because later cells reference
    # those names; the list entry is the same object.
    globals()['train_{}'.format(i)] = hour_train
    trains.append(hour_train)

In [89]:
# Display the shapes of three of the per-slot training frames.
train_0.shape, train_15.shape, train_45.shape

((1091, 21), (1091, 21), (1091, 21))

In [92]:
# The shift settings changed, so the test set is rebuilt.

def _last_day_with_lags(path):
    """Read one test file, apply the preprocessing and keep its last 48 rows."""
    frame = sum_hour_minute(pd.read_csv(path))  # merge Hour and Minute
    frame = shift_columns(frame, shifted_columns, lags)  # add lagged columns
    return frame[-48:]  # last 48 rows, i.e. only the final day


df_test = [
    _last_day_with_lags('/content/drive/MyDrive/Dacon_Solar/test/' + str(i) + '.csv')
    for i in range(81)
]

X_test = pd.concat(df_test)
print(X_test.shape)

(3888, 21)


In [93]:
# Likewise, build 48 test sets, one per half-hour slot.

tests = []

for i in range(48):
    idx = i * 0.5
    # Filter and drop in one non-mutating chain instead of dropping in place
    # on a filtered slice.
    hour_test = X_test.loc[X_test.Hour == idx].drop(['Day', 'Minute'], axis=1)

    # Still exposed as test_0 ... test_47 because later cells reference
    # those names; the list entry is the same object.
    globals()['test_{}'.format(i)] = hour_test
    tests.append(hour_test)

In [94]:
# Display the shapes of the first per-slot training and test frames.
train_0.shape, test_0.shape

((1091, 21), (81, 19))

In [None]:
# Lists that collect the per-slot prediction frames.
results_1 = []
results_2 = []

for tr, te in zip(trains, tests):
    # Hour is a single value within each per-slot frame, so it is removed.
    # This drops onto local copies and tolerates an already-missing column,
    # so the cell can be re-run safely. The frames held in `trains` and
    # `tests` are no longer modified.
    tr = tr.drop(columns=['Hour'], errors='ignore')
    te = te.drop(columns=['Hour'], errors='ignore')

    # Split tr into a training part and a validation part.
    # NOTE(review): 730 presumably stands for two years of daily rows; confirm.
    X = tr[:730]
    val = tr[730:]

    # Separate features and targets; the last two columns are TARGET_1 and TARGET_2.
    feature_cols = X.columns[:-2]
    x_train = X[feature_cols]
    y_train_1 = X['TARGET_1']
    y_train_2 = X['TARGET_2']

    x_val = val[feature_cols]
    y_val_1 = val['TARGET_1']
    y_val_2 = val['TARGET_2']

    _, result_1 = train_data(x_train, y_train_1, x_val, y_val_1, te)
    _, result_2 = train_data(x_train, y_train_2, x_val, y_val_2, te)

    results_1.append(result_1)
    results_2.append(result_2)

In [96]:
# Start positions spaced 96 rows apart.
rg = np.arange(0, 7776, 96)

# In the submission, rows 0-47 are day 7 and rows 48-95 are day 8, after which
# the next table starts. So the i-th entry of results_1 goes to rows
# i, 96 + i, 192 + i, ... (81 rows in total), and the i-th entry of results_2
# goes to the same rows offset by 48.
for day_offset, slot_results in ((0, results_1), (48, results_2)):
    for i, res in enumerate(slot_results):  # 48 iterations, one per time slot
        submission.iloc[rg + i + day_offset, 1:] = res.values

In [97]:
# Keep an independent snapshot of the per-slot-model submission.
submission_48models = submission.copy(deep=True)

In [98]:
# Value arrays of the three saved submissions, with `id` moved out of the data.
multi_res = submission_48models.set_index('id').to_numpy()
catb_res = submission_catb_shifted.set_index('id').to_numpy()
lgbm_res = submission_lgbm_shifted.set_index('id').to_numpy()

In [99]:
# Equal-weight average of the three submissions.
n_models = 3
ens_res = (multi_res + lgbm_res + catb_res) / n_models

In [100]:
# There are many non-zero values between 0 and 0.1.
# In case they cost points on the score, every entry below 0.1 is set to 0.
ens_res[ens_res < 0.1] = 0

In [101]:
# Write the ensembled values into every column from "q_0.1" onward.
submission.loc[:, "q_0.1":] = ens_res

In [102]:
# Save the final ensemble without the index column.
out_path = '/content/drive/MyDrive/Dacon_Solar/submission_210126-3_ensemble-best-trio-finalcode.csv'
submission.to_csv(out_path, index=False)