In [1]:
import numpy as np
import pandas as pd
import math as ma
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

In [2]:
train_days = 246 # 学習データの日数
valid_days  = 40   # 検証データの日数
test_days  = 31   # テストデータの日数
all_features = ['year', 'month', 'day', 'sin_month', 'cos_month', 'sin_day', 'cos_day','sin_hour', 'cos_hour', 'hour', 'humidity', 'windspeed', 'temp','cloudcover', 'rain', 'generation']
features = ['sin_month', 'cos_month', 'sin_day', 'cos_day','sin_hour','cos_hour','humidity', 'windspeed', 'temp','cloudcover', 'rain', 'generation']
#features = ['sin_hour', 'cos_hour', 'sin_month','cos_month','humidity','windspeed','temp','cloudcover','generation']
_features = [elem for elem in all_features if elem not in features]

In [3]:
##データ読み込み
missing_rate = '20'
PV_data = pd.read_csv(f"_MCAR_{missing_rate}%.csv")

daytime_tr_va = PV_data.loc[:48*(train_days+valid_days)-1].query('hour >= 7 and hour <=18')
night_tr_va = PV_data.loc[:48*(train_days+valid_days)-1].query('hour < 7 or hour > 18')
test = PV_data.loc[48*(train_days+valid_days):]

In [4]:
complete_data = daytime_tr_va.dropna().reset_index(drop = True)
missing_data = daytime_tr_va[daytime_tr_va['generation'].isnull()]

In [5]:
tr_CompData = complete_data.sample(frac = 0.8)
tr_va_CompData = pd.concat([complete_data,tr_CompData],axis = 0)
va_CompData = tr_va_CompData.drop_duplicates(keep = False)

tr_CompData = complete_data
X_train = tr_CompData.loc[:,features[:-1]]
y_train = tr_CompData.loc[:,features[-1]]

X_valid = va_CompData.loc[:,features[:-1]]
y_valid = va_CompData.loc[:,features[-1]]

X_test = missing_data.loc[:,features[:-1]]
_X_test =  missing_data[_features].reset_index(drop = True)

In [6]:
%%time
fit_params = {'verbose': 0,  # 学習中のコマンドライン出力
              'early_stopping_rounds': 100,  # 学習時、評価指標がこの回数連続で改善しなくなった時点でストップ
              'eval_metric': 'rmse',  # early_stopping_roundsの評価指標
              'eval_set': [((X_valid, y_valid))]  # early_stopping_roundsの評価指標算出用データ
              }
# 最終的なパラメータ範囲
cv_params = {'learning_rate': [0.05, 0.1],
             'n_estimators':[10000],
             'max_depth': [5,10,15,20],
             'colsample_bytree': [0.5,0.7,0.9,1.0],
             'colsample_bylevel': [0.5],
             'subsample': [0.2, 0.5, 0.8, 1.0],
             'gamma': [0.0],
             'alpha': [0.0],
             'lambda': [1.0],
             'min_child_weight': [1,2,4,8,16,32],
             }


model = XGBRegressor()
# グリッドサーチのインスタンス作成
gridcv = GridSearchCV(model, cv_params, cv=5,
                      scoring='neg_mean_squared_error', n_jobs=-1)
# グリッドサーチ実行（学習実行）
gridcv.fit(X_train, y_train, **fit_params)
# 最適パラメータの表示と保持
best_params = gridcv.best_params_
best_score = gridcv.best_score_
print(f'最適パラメータ {best_params}\nスコア {best_score}')



最適パラメータ {'alpha': 0.0, 'colsample_bylevel': 0.5, 'colsample_bytree': 1.0, 'gamma': 0.0, 'lambda': 1.0, 'learning_rate': 0.05, 'max_depth': 15, 'min_child_weight': 8, 'n_estimators': 10000, 'subsample': 0.8}
スコア -2.667340677279751
Wall time: 33min 22s


In [7]:
best = gridcv.best_estimator_
y_test = best.predict(X_test)

In [8]:
y_test = pd.DataFrame(y_test,columns = ['generation'])
X_test = X_test.reset_index(drop=True)
test_CompMissData = pd.concat([_X_test,X_test,y_test],axis =1)
imputed_PV_data = pd.concat([complete_data,test_CompMissData],axis=0)
sorted_result = pd.concat([imputed_PV_data,night_tr_va,test],axis=0).sort_values(['year','month','day','hour']).reset_index(drop = True)

In [9]:
sorted_result.to_csv(f"__XGB_{missing_rate}%.csv",index = False)

In [10]:
##memo
"""
5%: 34min 5s
10%:30min 37s
15%:30min 1s
20%:30min 02s
40%:22min
60%:12min 15s
80%:5min 19s
"""

'\n5%: 34min 5s\n10%:30min 37s\n15%:30min 1s\n20%:30min 02s\n40%:22min\n60%:12min 15s\n80%:5min 19s\n'