In [1]:
import numpy as np
import pandas as pd
import math as ma
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

In [2]:
train_days = 246  # number of days of training data
test_days  = 40   # number of days of test data
pred_days  = 31   # number of days of prediction data

# Every column referenced by this notebook.
all_features = ['year', 'month', 'day', 'sin_month', 'cos_month', 'sin_day', 'cos_day','sin_hour', 'cos_hour', 'hour', 'humidity', 'windspeed', 'temp','cloudcover', 'rain', 'generation']

# Columns used by the imputation model. The LAST entry is the target: later
# cells slice `features[:-1]` for the inputs and `features[-1]` for the target.
# Each name must appear only once: a repeated name yields a duplicated column
# in the model inputs and in the frames that are concatenated afterwards.
# The hour encoding uses both 'sin_hour' and 'cos_hour' (previously 'sin_hour'
# was listed twice and 'cos_hour' was missing).
features = ['sin_month', 'cos_month', 'sin_day','sin_hour','cos_day','cos_hour','humidity', 'windspeed','cloudcover', 'rain', 'generation']

# Columns that are not model features; they are carried along unchanged so the
# imputed rows can be merged back with all columns present.
_features = [elem for elem in all_features if elem not in features]

In [3]:
## Load data
# Missing rate in percent; it is interpolated into the input and output file names.
missing_rate = "20"

# Read the data set once. No loop is needed here: `missing_rate` is a string,
# so `range(missing_rate)` would raise a TypeError, and nothing below iterates.
PV_data = pd.read_csv(f"_MCAR_{missing_rate}%.csv")

# Number of rows belonging to the train + test days.
# NOTE(review): the factor 48 presumably means 48 half-hourly rows per day — confirm.
n_tr_va_rows = 48 * (train_days + test_days)

# `.loc` slices by label and includes the end label, hence the `- 1`.
# This relies on the default 0..N-1 index produced by `read_csv`.
tr_va = PV_data.loc[:n_tr_va_rows - 1]

# Split the train/test span into daytime (7 <= hour <= 18) and night rows.
daytime_tr_va = tr_va.query('hour >= 7 and hour <=18')
night_tr_va = tr_va.query('hour < 7 or hour > 18')

# Everything after the train/test span.
test = PV_data.loc[n_tr_va_rows:]

In [4]:
# Daytime rows without a missing value in any column, re-indexed from 0.
complete_data = daytime_tr_va.dropna().reset_index(drop=True)

# Daytime rows whose 'generation' value is missing (original index labels kept).
generation_is_missing = daytime_tr_va['generation'].isna()
missing_data = daytime_tr_va.loc[generation_is_missing]

In [5]:
# Column roles: every entry of `features` except the last is a model input,
# the last entry is the target.
input_cols = features[:-1]
target_col = features[-1]

# Training set: the complete rows.
tr_CompData = complete_data
X_train = tr_CompData[input_cols]
y_train = tr_CompData[target_col]

# Rows to impute: model inputs (original index kept at this point) ...
X_test = missing_data[input_cols]
# ... and their non-feature columns, re-indexed from 0 for the later column-wise concat.
_X_test = missing_data[_features].reset_index(drop=True)

In [6]:
%%time
from sklearn.model_selection import GridSearchCV

# 検証したいパラメータの指定
search_gs = {
"max_depth": [5,10,20,30,40],
"n_estimators":[50,100,150,200,250],
"min_samples_split": [4,8,12,16],
"max_leaf_nodes": [None, 10,20,30],
}

model_gs = RandomForestRegressor()
# グリットサーチの設定
gs = GridSearchCV(model_gs,
                  search_gs,
                  cv = 5,
                  )
# 学習
gs.fit(X_train, y_train)
# 最適なパラメータの表示

Wall time: 16min 1s


GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid={'max_depth': [5, 10, 20, 30, 40],
                         'max_leaf_nodes': [None, 10, 20, 30],
                         'min_samples_split': [4, 8, 12, 16],
                         'n_estimators': [50, 100, 150, 200, 250]})

In [7]:
# Show the best hyper-parameter combination found by the grid search.
best_params = gs.best_params_
print(best_params)

{'max_depth': 40, 'max_leaf_nodes': None, 'min_samples_split': 8, 'n_estimators': 50}


In [8]:
# Estimator selected by the grid search.
best = gs.best_estimator_

# Predicted 'generation' values for the rows to impute.
y_test = best.predict(X=X_test)

In [9]:
# Wrap the predictions in a single-column frame named after the target.
y_test = pd.DataFrame(data=y_test, columns=['generation'])

# Re-index the inputs from 0 so all three pieces align row by row.
X_test = X_test.reset_index(drop=True)

# Imputed rows: non-feature columns, model inputs and predicted target side by side.
imputed_parts = [_X_test, X_test, y_test]
test_CompMissData = pd.concat(imputed_parts, axis='columns')

In [10]:
# Daytime rows: the complete ones followed by the imputed ones.
imputed_PV_data = pd.concat([complete_data, test_CompMissData])

# Add the night rows and the rows after the train/test span, then restore
# chronological order and renumber the index from 0.
# NOTE(review): rows that share the same (year, month, day, hour) have no
# defined relative order after this sort — confirm 'hour' is unique within a day.
all_rows = pd.concat([imputed_PV_data, night_tr_va, test])
sort_keys = ['year', 'month', 'day', 'hour']
sorted_result = all_rows.sort_values(by=sort_keys).reset_index(drop=True)

In [11]:
# Write the imputed data set; the index is not written to the file.
output_path = f"__RF_{missing_rate}%.csv"
sorted_result.to_csv(output_path, index=False)