In [1]:
####### ライブラリのインポート ######
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
import math as ma
from sklearn.model_selection import GridSearchCV

In [2]:
####### Training / validation / test period settings ######
# Length of each period, in days.
train_days = 246  # number of days of training data
valid_days = 40   # number of days of "test" data (wording of the original comment)
test_days = 31    # number of days of "prediction" data (wording of the original comment)

# Every column this notebook refers to by name.
all_features = [
    'year', 'month', 'day',
    'sin_month', 'cos_month',
    'sin_day', 'cos_day',
    'sin_hour', 'cos_hour',
    'hour', 'humidity', 'windspeed', 'temp',
    'cloudcover', 'rain', 'generation',
]

# Columns used by the model: the inputs first, the target ('generation') last.
features = [
    'cos_month',
    'sin_hour', 'cos_hour',
    'humidity', 'temp', 'cloudcover', 'rain',
    'generation',
]

# Columns of all_features that are not model columns, in all_features order.
_features = [column for column in all_features if column not in features]

In [3]:
# Load the data set for the chosen missing rate.
missing_rate = "20"  # missing rate; part of the input and output file names
PV_data = pd.read_csv(f"_MCAR_{missing_rate}%.csv")

# Row label at which the held-out tail starts; 48 rows are counted per day
# (presumably 30-minute records -- TODO confirm against the CSV).
# NOTE(review): the head is named "tr_va" yet its length is built from
# test_days, and valid_days is not used in the visible cells -- confirm
# which of the two is intended here.
split_row = 48 * (train_days + test_days)

# Head of the file, split by hour: 7..18 inclusive is daytime, the rest is night.
head_rows = PV_data.loc[:split_row - 1]
hour_of_day = head_rows['hour']
daytime_tr_va = head_rows[(hour_of_day >= 7) & (hour_of_day <= 18)]
night_tr_va = head_rows[(hour_of_day < 7) | (hour_of_day > 18)]

# Everything from split_row onwards is kept aside.
test = PV_data.loc[split_row:]

In [4]:
# Split the daytime rows into those usable for training and those to impute.
complete_data = daytime_tr_va.dropna().reset_index(drop=True)  # rows with no NaN in any column
missing_data = daytime_tr_va.loc[daytime_tr_va['generation'].isna()]  # rows whose 'generation' is NaN

# The last entry of `features` is the target; all earlier entries are inputs.
input_cols = features[:-1]
target_col = features[-1]

# Separate scalers so the target can be inverse-transformed on its own later.
sc_x = StandardScaler()
sc_y = StandardScaler()

# Fit each scaler on the complete rows and standardize them.
# The target is passed as a single-column 2-D array.
X_train = sc_x.fit_transform(complete_data[input_cols].to_numpy())
y_train = sc_y.fit_transform(complete_data[[target_col]].to_numpy())

# Inputs of the rows to impute; not scaled here and still carrying the original index.
X_test = missing_data[input_cols]
# Columns of the rows to impute that are in all_features but not in features.
# Kept so the full rows can be re-assembled and sorted later.
_X_test = missing_data[_features].reset_index(drop=True)

In [5]:
####### Run KNN ######
# Candidate neighbourhood sizes explored by the grid search.
search_gs = {"n_neighbors": [5, 10, 20, 40, 80, 120, 130, 150, 170, 200]}

model_gs = KNeighborsRegressor()

# Grid-search setup: 5-fold cross-validation over the candidates above.
gs = GridSearchCV(estimator=model_gs, param_grid=search_gs, cv=5)

# Fit on the standardized training data.
gs.fit(X_train, y_train)



#imputer = KNNImputer(n_neighbors=50, weights='uniform')
#imputed = imputer.fit_transform(std_daytime_tr_va)
#imputed = pd.DataFrame(imputed)
#imputed.columns = features

GridSearchCV(cv=5, estimator=KNeighborsRegressor(),
             param_grid={'n_neighbors': [5, 10, 20, 40, 80, 120, 130, 150, 170,
                                         200]})

In [6]:
# Show the best parameters found by the grid search
print(gs.best_params_)

{'n_neighbors': 10}


In [7]:
# Predict the missing 'generation' values with the best model from the grid search.
best = gs.best_estimator_
# The model was fitted on inputs standardized by sc_x, so the rows to impute
# must go through the same fitted scaler before predicting; feeding the raw
# values would compute neighbour distances in the wrong feature space.
# X_test itself stays unscaled because its raw values are written to the
# output later. The prediction is in standardized target units.
y_test = best.predict(sc_x.transform(X_test.values))



In [8]:
y_test = sc_y.inverse_transform(y_test)
y_test = pd.DataFrame(y_test,columns = ['generation'])
X_test = X_test.reset_index(drop=True)
test = pd.concat([_X_test,X_test,y_test],axis =1)

In [9]:
imputed_PV_data = pd.concat([complete_data,test],axis=0)
sorted_result = pd.concat([imputed_PV_data,night_tr_va,test],axis=0).sort_values(['year','month','day','hour']).reset_index(drop = True)

In [10]:
# Write the result to a CSV named after the missing rate, without the index column.
output_csv = f"KNN_{missing_rate}%.csv"
sorted_result.to_csv(output_csv, index=False)