# 超參數及集成學習權重最佳化

我們的目的是要以前一天的氣象預報來預測當天的電力資訊，但是我們手上的氣象預報歷史資料是從2024年七月開始蒐集，直接拿來預測電力資訊天數不夠，  
所以除了 Power_predict.ipynb 裡面敘述的從氣象觀測歷史資料預測電力資料的模型之外，我們還需要建立從氣象預報資料來預測氣象觀測資料的模型。  

這裡的氣象資料預測的模型建立方式跟電力資料的模型大同小異，主要的不同點是氣象資料預測可以把每天每站的數據當成一個樣本，這樣我們就可以在相對短時間之內累積足夠的樣本數。

同時這個筆記本也要處理超參數的最佳化，我們使用 optuna 這個第三方套件來達成這個任務。  
另外我們也嘗試最佳化集成學習時各模型的權重，具體方法為計算出模型之間的誤差相關矩陣，再從這個矩陣解出最佳權重組合。

整個預測系統在真實世界資料中運行的情形可以到<a href='http://ec2-54-206-30-159.ap-southeast-2.compute.amazonaws.com:8501'> 這個網站 </a> 查看

## 初始化

### 匯入模組與套件

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
#這兩行讓 matplotlib 的圖可以顯示中文，同時正常顯示負號
matplotlib.rc('font', family='Microsoft JhengHei')
plt.rcParams['axes.unicode_minus'] = False
import datetime
from copy import deepcopy
import os
import joblib
import json
from tqdm import tqdm
import optuna

# 設置Optuna日誌級別為 WARNING，僅顯示警告及以上級別的信息
optuna.logging.set_verbosity(optuna.logging.WARNING)

pd.set_option('future.no_silent_downcasting', True)

In [2]:
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.svm import SVR, NuSVR

In [3]:
from sklearn.svm import SVC, NuSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from scipy.optimize import minimize
from sklearn.metrics import f1_score

In [5]:
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [6]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim

In [7]:
from Pytorch_models.metrics import Array_Metrics
from Pytorch_models import models as pytorch_models
from Pytorch_models import api
MAE = Array_Metrics.mae
R2_score = Array_Metrics.r2

In [8]:
from utils.prepare_data import prepare_forecast_observation_df, prepare_data

In [9]:
def FCN_model(input_f, output_f, feature_counts, dropout_factor=0, L2_factor=1e-15, mode='regressor'):
    """Build a fully connected network wrapped in the project's ``Model_API``.

    Parameters
    ----------
    input_f, output_f, feature_counts, dropout_factor
        Forwarded unchanged to the network constructor in ``pytorch_models``.
    L2_factor
        Forwarded to ``api.Model_API``.
    mode : {'regressor', 'classifier'}
        ``'regressor'`` builds ``pytorch_models.SimpleNN``;
        ``'classifier'`` builds ``pytorch_models.SimpleNN_classifier``.

    Returns
    -------
    The ``api.Model_API`` instance wrapping the network.

    Raises
    ------
    ValueError
        If ``mode`` is neither ``'regressor'`` nor ``'classifier'``.
    """
    # Validate first: previously an unknown mode left ``model`` unbound and
    # surfaced as an UnboundLocalError further down.
    if mode not in ('regressor', 'classifier'):
        raise ValueError(f"mode must be 'regressor' or 'classifier', got {mode!r}")

    if mode == 'regressor':
        model = pytorch_models.SimpleNN(input_f, output_f, feature_counts, dropout_factor)
    else:
        model = pytorch_models.SimpleNN_classifier(input_f, output_f, feature_counts, dropout_factor)
    return api.Model_API(model, L2_factor=L2_factor, classifier=(mode == 'classifier'))

### 初始參數

In [10]:
# Directory passed to the data-preparation helpers when loading the data.
data_path = './historical/data/'

# Start and end dates of the data
start_date = '2023-01-01'
end_date = '2024-09-20'

# Root folder for the per-target metadata; the end date is part of the folder name.
train_model_path = f'./trained_model_parameters/model_meta_{end_date}/'

In [11]:
# Whether to enable the quick-test mode
speed_test = False

#---------------------------------------------------------------------------------
# Global switches: recompute every hyperparameter and weight, or recompute nothing.
# If both are False, the per-target settings below are used.
# If both are True, everything is recomputed.
#
# Note: if the corresponding meta.json is missing from the configured path,
#       these settings are ignored and the calculation is carried out anyway.
#---------------------------------------------------------------------------------
rerun_all_calculation = False
dont_run_any_calculation = False


# Per target: False means recompute, True means load from the saved file.
optuna_has_done = {
    '日照率': True,
    '最高氣溫': True,
    '最低氣溫': True,
    '氣溫': True,
    '風速': False,
    '風力': True,
    '太陽能': True,
    '尖峰負載': True,
    '夜尖峰': True,
    '午後平均風速': False,
    '午後平均氣溫': True,
    '下午平均風速': False,
    '下午平均氣溫': True,
    '傍晚平均風速': False,
    '傍晚平均氣溫': True,
}

weights_has_determined = {
    '日照率': True,
    '最高氣溫': True,
    '最低氣溫': True,
    '氣溫': True,
    '風速': False,
    '風力': True,
    '太陽能': True,
    '尖峰負載': True,
    '夜尖峰': True,
    '午後平均風速': False,
    '午後平均氣溫': True,
    '下午平均風速': False,
    '下午平均氣溫': True,
    '傍晚平均風速': False,
    '傍晚平均氣溫': True,
}

# Apply the global switches by overwriting both tables in bulk.
# The "rerun" switch is applied last, so it wins when both switches are on.
if dont_run_any_calculation:
    optuna_has_done = dict.fromkeys(optuna_has_done, True)
    weights_has_determined = dict.fromkeys(weights_has_determined, True)

if rerun_all_calculation:
    optuna_has_done = dict.fromkeys(optuna_has_done, False)
    weights_has_determined = dict.fromkeys(weights_has_determined, False)

In [12]:
# Lookup table: mode ('regressor' / 'classifier') -> model_label -> constructor.
# 'FCN' maps to the FCN_model factory in both modes.
model_class_dict = {
    'regressor': {
        'LinearRegression': LinearRegression,
        'RandomForest': RandomForestRegressor,
        'XGBoost': XGBRegressor,
        'LightGBM': LGBMRegressor,
        'SVR': SVR,
        'NuSVR': NuSVR,
        'FCN': FCN_model,
    },
    'classifier': {
        'RandomForest': RandomForestClassifier,
        'XGBoost': XGBClassifier,
        'LightGBM': LGBMClassifier,
        'SVC': SVC,
        'NuSVC': NuSVC,
        'LogisticRegression': LogisticRegression,
        'FCN': FCN_model,
    },
}

### 讀取資料

讀取先前經由爬蟲定時抓取的預報與觀測資料

In [13]:
# Load the two tables used by the rest of the notebook for the configured date range.
# forecast_obs_df is the table passed to flow_control for the weather targets below.
forecast_obs_df = prepare_forecast_observation_df(data_path, start_date=start_date, end_date=end_date)
# NOTE(review): presumably the weather-observation / power table used for the power targets — confirm against utils.prepare_data.
weather_power_df = prepare_data(data_path, start_date=start_date, end_date=end_date)

## 函數

### 超參數最佳化

這部分的函數有：  
1. get_XY: 從 DataFrame 中提取需要的 X 與 Y 兩個 numpy array。
2. five_fold_test: 對已經由 get_XY 取得的 X 與 Y 執行一次 K-fold（預設為 5-fold）測試。
3. assign_model: 根據 model_label 與超參數字典建立一個模型。
4. hyperparameter_tuning: 針對特定的模型與超參數組合，呼叫 five_fold_test 執行多次 5-fold 測試，並回傳評估指標（迴歸為 R2，分類為 F1）的平均值減去標準差。
5. optuna_operation: 利用第三方套件 optuna 執行超參數調整，會呼叫 get_XY 與 hyperparameter_tuning。

流程控制函數 flow_control 會呼叫 optuna_operation，而主程式只會直接呼叫 flow_control。

In [14]:
def get_XY(data_df, Y_feature, X_features=None, hours=tuple(str(i) for i in range(0, 24, 3))):
    """Extract the feature matrix and target vector for one prediction target.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Table holding both the candidate feature columns and the ``Y_feature`` column.
    Y_feature : str
        Target column. Weather targets (the listed observation names, or any name
        containing '平均') are handled as ``'obs'``; the listed power targets as ``'pwd'``.
    X_features : list of str, optional
        Feature name prefixes to keep. When ``None``, every column whose name contains
        ``'_'`` is used, plus the date-related columns for ``'pwd'`` targets.
    hours : sequence of str
        For ``'obs'`` targets only: the text after ``'預報_'`` in a column name must be
        one of these values for the column to be kept.

    Returns
    -------
    Xs : numpy.ndarray
        Selected feature columns, restricted to rows whose target is not NaN.
    Ys : numpy.ndarray
        Target values with NaN rows removed.
    X_cols : list of str
        Names of the selected feature columns, in ``data_df`` column order.

    Raises
    ------
    ValueError
        If ``Y_feature`` is not a recognised weather or power target.
    """
    date_related_cols = ['日期數字', '假日', '週六', '週日', '補班', '1~3月', '11~12月', '白日長度']
    observation_targets = ('最高氣溫', '最低氣溫', '氣溫', '風速', '日照率', '全天空日射量')
    power_targets = ('風力', '太陽能', '尖峰負載', '夜尖峰')

    if Y_feature in observation_targets or '平均' in Y_feature:
        target = 'obs'
    elif Y_feature in power_targets:
        target = 'pwd'
    else:
        # Previously this fell through and failed later with an UnboundLocalError.
        raise ValueError(f'Unrecognised Y_feature: {Y_feature!r}')

    if X_features is None:
        X_cols = [col for col in data_df.columns if '_' in col]
        if target == 'pwd':
            X_cols += date_related_cols
    else:
        # Weather columns are split on '預報_', power columns on '_'.
        separator = '預報_' if target == 'obs' else '_'
        X_cols = []
        for col in data_df.columns:
            parts = col.split(separator)
            if len(parts) >= 2:
                # The hour filter only applies to weather ('obs') targets.
                if parts[0] in X_features and (target == 'pwd' or parts[1] in hours):
                    X_cols.append(col)
            elif col in date_related_cols and col in X_features:
                X_cols.append(col)

    Xs = np.array(data_df[X_cols])
    Ys = np.array(data_df[Y_feature])

    # Drop samples whose target is missing.
    valid = ~np.isnan(Ys)
    return Xs[valid, :], Ys[valid], X_cols

In [15]:
def five_fold_test(Xs, Ys, model=None, mode='regressor',
                   deep_learning=False, fold_n=5, standard_scale=True, always_test_last_chunk=False, fit_square=False):
    """Run one K-fold evaluation and return the mean test and train scores.

    Parameters
    ----------
    Xs, Ys : numpy.ndarray
        Feature matrix and target vector, indexed by the fold indices.
    model
        Estimator exposing ``fit`` / ``predict``. Defaults to a fresh ``XGBRegressor()``
        (created per call instead of once at definition time).
    mode : {'regressor', 'classifier'}
        Selects the score: ``1 - MSE / var(y)`` for regressors, ``f1_score`` for classifiers.
    deep_learning : bool
        When True, the model is rebuilt with ``FCN_model`` for every fold from the
        parameters stored on the passed-in wrapper, and 20% of the training rows are
        held out and passed to ``fit`` as validation data.
    fold_n : int
        Number of folds.
    standard_scale : bool
        Fit a ``StandardScaler`` on the training rows and apply it to every split.
    always_test_last_chunk : bool
        When True the folds are not shuffled and only the last fold is evaluated.
    fit_square : bool
        Fit on the squared target and map predictions back with ``sqrt(abs(.))``.

    Returns
    -------
    (metric_test, metric_train) : tuple of float
        Scores averaged over the evaluated folds.
    """
    if model is None:
        model = XGBRegressor()

    def metric(Y_true, Y_pred):
        if mode == 'regressor':
            return 1 - np.mean((Y_true - Y_pred)**2) / np.var(Y_true)
        elif mode == 'classifier':
            return f1_score(Y_true, Y_pred)

    kf = KFold(n_splits=fold_n, shuffle=not always_test_last_chunk)
    folds = list(kf.split(Xs))
    fold_ids = [fold_n - 1] if always_test_last_chunk else range(fold_n)

    metric_test_list, metric_train_list = [], []
    for i in fold_ids:
        if deep_learning:
            # Start every fold from a freshly initialised network with the same settings.
            params = model.model.params
            model = FCN_model(input_f=params['input_f'], output_f=params['output_f'],
                              feature_counts=params['feature_counts'],
                              dropout_factor=params['dropout_factor'],
                              L2_factor=model.L2_factor, mode=mode)

        train_index, test_index = folds[i]
        X_train, X_test = Xs[train_index], Xs[test_index]
        Y_train, Y_test = Ys[train_index], Ys[test_index]

        # Keep Y_train on its original scale for scoring; only the fit target is squared.
        Y_fit = Y_train ** 2 if fit_square else Y_train

        if standard_scale:
            scaler = StandardScaler()
            scaler.fit(X_train)
            X_train = scaler.transform(X_train)
            X_test = scaler.transform(X_test)

        if deep_learning:
            # Split after scaling so the fit and validation rows share one feature scale.
            X_train_DL, X_val, Y_train_DL, Y_val = train_test_split(X_train, Y_fit, test_size=0.20)
            _ = model.fit(X_train_DL, Y_train_DL, X_val, Y_val)
        else:
            _ = model.fit(X_train, Y_fit)

        Y_pred = model.predict(X_test)
        if fit_square:
            Y_pred = np.sqrt(np.abs(Y_pred))
        metric_test_list.append(metric(Y_test, Y_pred))

        Y_pred = model.predict(X_train)
        if fit_square:
            Y_pred = np.sqrt(np.abs(Y_pred))
        # Score against the un-squared training target so both sides are on the same scale.
        metric_train_list.append(metric(Y_train, Y_pred))

    metric_test = np.mean(metric_test_list)
    metric_train = np.mean(metric_train_list)
    return metric_test, metric_train

In [16]:
def assign_model(model_label, Xs, cfg, mode):
    """Instantiate the model registered under ``model_class_dict[mode][model_label]``.

    ``cfg`` is expanded as keyword arguments. 'LightGBM' additionally receives
    ``force_col_wise=True, verbose=-1``; 'FCN' additionally receives the input width
    taken from ``Xs.shape[1]``, a single output, fixed layer sizes and the ``mode``.
    """
    constructor = model_class_dict[mode][model_label]

    if model_label == 'FCN':
        return constructor(input_f=Xs.shape[1], output_f=1, feature_counts=[16, 16, 16, 8], mode=mode, **cfg)
    if model_label == 'LightGBM':
        return constructor(force_col_wise=True, verbose=-1, **cfg)
    return constructor(**cfg)

In [17]:
def hyperparameter_tuning(trial, Xs, Ys, mode='regressor',
                          model_label='RandomForest', n_iters=50, always_test_last_chunk=False, fit_square=False):
    """Optuna objective: sample one configuration and score it with repeated K-fold tests.

    The search space depends on ``model_label``. The sampled configuration is turned
    into a model by ``assign_model`` and evaluated ``n_iters`` times by ``five_fold_test``.

    Returns the mean of the test scores minus their standard deviation.
    """
    is_fcn = model_label == 'FCN'

    if model_label in ('RandomForest', 'XGBoost', 'LightGBM'):
        cfg = {}
        cfg['max_depth'] = trial.suggest_int('max_depth', 2, 15)
        cfg['n_estimators'] = trial.suggest_int('n_estimators', 10, 200)
    elif model_label in ('SVR', 'SVC', 'NuSVR', 'NuSVC'):
        cfg = {}
        cfg['C'] = trial.suggest_float('C', 1e-3, 2e2, log=True)
        cfg['kernel'] = trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf', 'sigmoid'])
        if model_label in ('NuSVR', 'NuSVC'):
            cfg['nu'] = trial.suggest_float('nu', 0.1, 0.9)
    elif is_fcn:
        cfg = {}
        cfg['L2_factor'] = trial.suggest_float('L2_factor', 1e-3, 1, log=True)
        cfg['dropout_factor'] = trial.suggest_float('dropout_factor', 0, 0.5)
    elif model_label == 'LogisticRegression':
        cfg = {}
        cfg['C'] = trial.suggest_float('C', 1e-3, 2e2, log=True)
        cfg['solver'] = trial.suggest_categorical('solver', ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'])
    elif model_label == 'LinearRegression':
        cfg = {}

    model = assign_model(model_label, Xs, cfg, mode)

    # The FCN path skips standard scaling; every other model uses it.
    scores = [
        five_fold_test(Xs, Ys, model, mode=mode,
                       deep_learning=is_fcn, standard_scale=not is_fcn,
                       always_test_last_chunk=always_test_last_chunk, fit_square=fit_square)[0]
        for _ in range(n_iters)
    ]

    return np.mean(scores) - np.std(scores)

In [18]:
def optuna_operation(model_xcols, Y_feature, data_df, mode='regressor', speed_test=False,
                     optuna_n_trials=20, n_iters=30, always_test_last_chunk=False, fit_wind_square=False):
    """Tune every model listed in ``model_xcols`` with Optuna.

    For each model label the features are extracted with ``get_XY`` and a TPE study
    maximising ``hyperparameter_tuning`` is run one trial at a time (exceptions raised
    by a trial are caught by Optuna). The best parameters and best value are printed.

    Returns
    -------
    (model_hyperparameters_dict, model_r2_dict)
        Two dicts keyed by model label: the best parameters and the best objective value.
    """
    if mode == 'regressor':
        metric_name = 'R2'
    elif mode == 'classifier':
        metric_name = 'F1'

    # The squared-target fit is only used for targets whose name contains '風速'.
    fit_square = fit_wind_square and '風速' in Y_feature

    if always_test_last_chunk:
        n_iters = 1

    best_params_by_model = {}
    best_value_by_model = {}

    for model_label, X_features in list(model_xcols.items()):
        Xs, Ys, _ = get_XY(data_df, Y_feature, X_features)

        trial_budget = optuna_n_trials
        repeat_budget = n_iters

        if model_label == 'FCN':
            repeat_budget = min(repeat_budget, 1)
            if speed_test:
                trial_budget = 4

        if model_label == 'LinearRegression':
            # Nothing to tune: a single trial with ten repetitions.
            trial_budget = 1
            repeat_budget = 10

        def objective(trial, _label=model_label, _Xs=Xs, _Ys=Ys, _repeats=repeat_budget):
            return hyperparameter_tuning(trial, model_label=_label, Xs=_Xs, Ys=_Ys, mode=mode,
                                         n_iters=_repeats, always_test_last_chunk=always_test_last_chunk,
                                         fit_square=fit_square)

        study = optuna.create_study(sampler=optuna.samplers.TPESampler(), direction='maximize')
        with tqdm(total=trial_budget) as progress:
            for _ in range(trial_budget):
                study.optimize(objective, n_trials=1, catch=(Exception,))
                progress.update(1)

        print(model_label)
        for key, v in study.best_params.items():
            print(f"Best {key} = {v}")
        print(f"Best {metric_name} = {study.best_value}")

        best_params_by_model[model_label] = study.best_params
        best_value_by_model[model_label] = study.best_value

    return best_params_by_model, best_value_by_model

### Ensemble

這部分的函數有：
1. cross_correlation_matrix: 由不同模型的預測誤差產生相關矩陣，僅由 get_residual_corr_matrix 呼叫。
2. sovle_optimal_weights: 由誤差相關矩陣解出最佳權重，僅由 find_optimal_weights 呼叫。
3. predict: 訓練模型並取得預測值，僅由 get_residual_corr_matrix 呼叫。
4. get_residual_corr_matrix: 計算並取得所有模型多次取樣的 Y_truth, Y_pred, 與誤差相關矩陣，僅由 find_optimal_weights 呼叫。
5. get_weighted_ensemble_metric: 輸入 get_residual_corr_matrix 的計算結果之後，得出模型評估表格，僅由 find_optimal_weights 呼叫。
6. find_optimal_weights: 統整以上函數，解出最佳權重，並印出模型評估表格。
7. save_model_metadata: 儲存這份筆記本得到的每個被預測值所採用的模型組合，以及每個模型採用的特徵、超參數與權重。

流程控制函數中會呼叫 find_optimal_weights 與 save_model_metadata

In [19]:
def cross_correlation_matrix(residuals):
    """Return the matrix of mean pairwise residual products.

    Entry ``[i, j]`` is ``mean(residuals[i] * residuals[j])`` (no mean is subtracted),
    so the result is symmetric with the mean squared residual of each model on the
    diagonal.

    Parameters
    ----------
    residuals : array-like
        One row of residuals per model; all rows must have the same length.

    Returns
    -------
    numpy.ndarray of shape (N, N), where N is the number of rows.
    """
    rows = np.asarray(residuals, dtype=float)
    n_models = len(rows)
    if rows.size == 0:
        # No samples to average: an empty matrix for N == 0, NaN entries otherwise.
        return np.full((n_models, n_models), np.nan)

    # One row per model; this also turns a flat list of scalars into N rows of length 1.
    rows = rows.reshape(n_models, -1)

    # A single matrix product replaces the Python double loop and the mirror fill.
    return (rows @ rows.T) / rows.shape[1]

In [20]:
def sovle_optimal_weights(matrix):
    """Find the ensemble weights minimising ``w.T @ matrix @ w``.

    The search starts from uniform weights and uses SLSQP with every weight bounded
    to [0, 1] and the weights constrained to sum to 1.

    Returns the solver's final weight vector (``result.x``); the solver's success
    flag is not inspected.
    """
    n_models = matrix.shape[0]
    start = np.full(n_models, 1 / n_models)
    sum_to_one = {'type': 'eq', 'fun': lambda w: np.sum(w) - 1}

    solution = minimize(
        lambda w: w.T @ matrix @ w,
        start,
        method='SLSQP',
        bounds=[(0, 1) for _ in range(n_models)],
        constraints=sum_to_one,
    )
    return solution.x

In [21]:
def predict(model_label, Y_train, train_ind, test_ind, mode,
            model_hyperparameters_dict, model_xcols, data_df, Y_feature, fit_wind_square=False):
    """Train one model on the training rows and return its predictions for the test rows.

    Features come from ``get_XY`` using the columns registered for ``model_label``;
    the model is built by ``assign_model`` with the stored hyperparameters. 'FCN' is
    fit on unscaled features with a 20% validation hold-out; every other model is fit
    on standard-scaled features. When ``fit_wind_square`` is set and the target name
    contains '風速', the model is fit on the squared target and the predictions are
    mapped back with ``sqrt(abs(.))``.
    """
    fit_square = fit_wind_square and '風速' in Y_feature
    Y_fit = Y_train**2 if fit_square else Y_train

    Xs, _, _ = get_XY(data_df, Y_feature=Y_feature, X_features=model_xcols[model_label])
    model = assign_model(model_label, Xs, cfg=model_hyperparameters_dict[model_label], mode=mode)

    X_train, X_test = Xs[train_ind], Xs[test_ind]

    if model_label == 'FCN':
        X_fit, X_val, Y_fit_part, Y_val = train_test_split(X_train, Y_fit, test_size=0.20)
        model.fit(X_fit, Y_fit_part, X_val, Y_val)
    else:
        fitted_scaler = StandardScaler().fit(X_train)
        X_train = fitted_scaler.transform(X_train)
        X_test = fitted_scaler.transform(X_test)
        model.fit(X_train, Y_fit)

    prediction = model.predict(X_test)
    if fit_square:
        prediction = np.sqrt(np.abs(prediction))
    return prediction

In [22]:
def get_residual_corr_matrix(model_hyperparameters_dict, ensemble_models, model_xcols,
                             data_df, Ys, Y_feature, mode,
                             n_iters, n_samples, fit_wind_square=False):
    """Repeatedly resample an 80/20 split, train every ensemble model and collect residuals.

    For each of ``n_iters`` random splits, every model in ``ensemble_models`` is trained
    and evaluated through ``predict``. Regressors are scored with ``MAE``; classifier
    outputs are thresholded at 0.5 in place and scored with ``f1_score``.

    Returns
    -------
    matrix : numpy.ndarray
        Residual product matrix from ``cross_correlation_matrix``, averaged over iterations.
    model_metric : list of list
        Per-iteration scores, one entry per model.
    Y_pred_iters : list of list
        Per-iteration predictions, one array per model.
    Y_test_iters : list
        Per-iteration test targets.
    """
    n_models = len(ensemble_models)
    matrix = np.zeros((n_models, n_models))
    Y_pred_iters, Y_test_iters, model_metric = [], [], []

    for _iteration in tqdm(range(n_iters)):
        train_ind, test_ind, _unused_a, _unused_b = train_test_split(
            np.arange(n_samples), np.arange(n_samples), test_size=0.2)

        Y_train, Y_test = Ys[train_ind], Ys[test_ind]

        Y_preds, scores = [], []
        for model_label in ensemble_models:
            YP = predict(model_label, Y_train, train_ind, test_ind, mode,
                         model_hyperparameters_dict, model_xcols, data_df, Y_feature,
                         fit_wind_square=fit_wind_square)
            if mode == 'regressor':
                scores.append(MAE(Y_test, YP))
            elif mode == 'classifier':
                # Threshold in place so the stored predictions are hard 0/1 labels.
                YP[YP < 0.5] = 0
                YP[YP >= 0.5] = 1
                scores.append(f1_score(Y_test, YP))
            Y_preds.append(YP)

        # Repeat the truth once per model so it lines up with the stacked predictions.
        truth = np.array([Y_test] * len(Y_preds)).reshape(len(Y_preds), -1)
        matrix += cross_correlation_matrix(Y_preds - truth)

        model_metric.append(scores)
        Y_pred_iters.append(Y_preds)
        Y_test_iters.append(Y_test)

    return matrix / n_iters, model_metric, Y_pred_iters, Y_test_iters

In [23]:
def get_weighted_ensemble_metric(Y_pred_iters, Y_test_iters, weights, mode):
    """Score the weighted ensemble once per resampling iteration.

    Parameters
    ----------
    Y_pred_iters : list
        Per iteration, one prediction array per model.
    Y_test_iters : list
        Per iteration, the matching test targets.
    weights : array-like
        One weight per model, in the same order as the per-model predictions.
    mode : {'regressor', 'classifier'}
        Regressors are scored with ``MAE``; classifier ensembles are thresholded at 0.5
        and scored with ``f1_score``.

    Returns
    -------
    numpy.ndarray of shape (n_iters, 1) with one score per iteration.
    """
    # Column vector so it broadcasts over each iteration's (n_models, n_test) predictions.
    # This no longer assumes every iteration has the first iteration's test-set size,
    # and accepts any array-like weights rather than only ndarrays.
    weight_column = np.asarray(weights, dtype=float).reshape(-1, 1)

    weighted_metric = []
    for i in range(len(Y_pred_iters)):
        weighted_YP = np.sum(np.asarray(Y_pred_iters[i]) * weight_column, axis=0)
        if mode == 'regressor':
            weighted_metric.append(MAE(Y_test_iters[i], weighted_YP))
        elif mode == 'classifier':
            weighted_YP[np.where(weighted_YP < 0.5)] = 0
            weighted_YP[np.where(weighted_YP >= 0.5)] = 1
            weighted_metric.append(f1_score(Y_test_iters[i], weighted_YP))
    return np.array(weighted_metric).reshape(-1, 1)

In [24]:
def find_optimal_weights(model_hyperparameters_dict, model_xcols,
                         data_df, Y_feature, mode='regressor', fit_wind_square=False,
                         n_iters=200, weights=None):
    """Evaluate the ensemble and, unless weights are supplied, solve for the best weights.

    Parameters
    ----------
    model_hyperparameters_dict : dict
        Hyperparameters per model label.
    model_xcols : dict
        Feature names per model label.
    data_df, Y_feature
        Passed to ``get_XY`` to obtain the samples.
    mode : {'regressor', 'classifier'}
        Selects the reported metric name ('MAE' or 'F1') and the table sort order.
    fit_wind_square : bool
        Forwarded to ``get_residual_corr_matrix``.
    n_iters : int
        Number of random resampling iterations.
    weights : dict, optional
        Fixed weights per model label. When given, only these models are evaluated
        and no optimisation is performed.

    Returns
    -------
    df : pandas.DataFrame
        Summary table for each model, the uniform ensemble ('Ensemble') and the
        weighted ensemble ('Weighted_Ensemble'). Returned alone when ``weights`` was given.
    (df, optimal_weights_dict)
        When ``weights`` is None: the table plus the solved weight per model label.

    Raises
    ------
    ValueError
        If ``mode`` is neither 'regressor' nor 'classifier'.
    """
    if mode == 'regressor':
        metric_name = 'MAE'
    elif mode == 'classifier':
        metric_name = 'F1'
    else:
        # Fail before the expensive resampling instead of after it.
        raise ValueError(f"mode must be 'regressor' or 'classifier', got {mode!r}")

    if weights is None:
        ensemble_models = list(model_hyperparameters_dict.keys())
    else:
        ensemble_models = list(weights.keys())

    n_models = len(ensemble_models)

    # Any model's feature set will do here: only Ys and the sample count are used.
    X_features = model_xcols[ensemble_models[0]]
    Xs, Ys, _ = get_XY(data_df, Y_feature=Y_feature, X_features=X_features)
    n_samples = Xs.shape[0]

    matrix, model_metric, Y_pred_iters, Y_test_iters = get_residual_corr_matrix(
        model_hyperparameters_dict=model_hyperparameters_dict,
        model_xcols=model_xcols, ensemble_models=ensemble_models,
        data_df=data_df, Ys=Ys, Y_feature=Y_feature, mode=mode,
        n_iters=n_iters, n_samples=n_samples, fit_wind_square=fit_wind_square)

    if weights is None:
        optimal_weights = sovle_optimal_weights(matrix)
    else:
        # Supplied weights arrive as a dict; order them like ensemble_models so they
        # can be used as a weight vector (previously the dict itself was passed on).
        optimal_weights = np.array([weights[label] for label in ensemble_models], dtype=float)

    uniform_weights = np.array([1/n_models] * n_models)
    uniform_metric = get_weighted_ensemble_metric(Y_pred_iters, Y_test_iters, uniform_weights, mode)
    optimal_metric = get_weighted_ensemble_metric(Y_pred_iters, Y_test_iters, optimal_weights, mode)

    array_metric = np.concatenate([model_metric, uniform_metric, optimal_metric], axis=1)

    # np.percentile avoids the index int(n * 0.1) - 1, which becomes -1 (the maximum)
    # whenever fewer than ten iterations were run.
    metric_dict = {
        'Model': ensemble_models + ['Ensemble', 'Weighted_Ensemble'],
        f'Avg {metric_name}': list(np.mean(array_metric, axis=0)),
        f'Std {metric_name}': list(np.std(array_metric, axis=0)),
        '90th percentile': list(np.percentile(array_metric, 90, axis=0)),
        '10th percentile': list(np.percentile(array_metric, 10, axis=0)),
    }

    df = pd.DataFrame(metric_dict)
    if mode == 'regressor':
        # Lower error is better: rank by the pessimistic (90th percentile) error.
        df = df.sort_values('90th percentile').reset_index(drop=True)
    else:
        # Higher F1 is better: rank by the pessimistic (10th percentile) score.
        df = df.sort_values('10th percentile', ascending=False).reset_index(drop=True)

    if weights is not None:
        return df

    optimal_weights_dict = {label: float(w) for label, w in zip(ensemble_models, optimal_weights)}

    return df, optimal_weights_dict

In [25]:
def save_model_metadata(file_path, model_xcols, model_hyperparameters_dict, optimal_weights, fit_wind_square=False):
    """Write the selected models' features, hyperparameters and weights to a JSON file.

    Only models whose weight exceeds 0.0005 are stored. ``fit_wind_square`` is always
    written, even when no model passes the threshold (previously the key was missing
    in that case).

    Parameters
    ----------
    file_path : str
        Destination of the JSON file.
    model_xcols : dict
        Feature names per model label.
    model_hyperparameters_dict : dict
        Hyperparameters per model label; its keys define which models are considered.
    optimal_weights : dict
        Ensemble weight per model label.
    fit_wind_square : bool
        Stored as-is under the 'fit_wind_square' key.
    """
    output_dict = {
        'X_feature_dict': {},
        'hyperparameters_dict': {},
        'weights': {},
        'fit_wind_square': fit_wind_square,
    }
    for model_label in model_hyperparameters_dict:
        weight = optimal_weights[model_label]
        # Models with a negligible weight are left out of the saved ensemble.
        if weight > 0.0005:
            output_dict['X_feature_dict'][model_label] = model_xcols[model_label]
            output_dict['hyperparameters_dict'][model_label] = model_hyperparameters_dict[model_label]
            output_dict['weights'][model_label] = float(weight)

    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(output_dict, f)

### 流程控制

主要被主程式呼叫的函數  
負責管理超參數及權重的計算與存取

In [26]:
def flow_control(Y_feature, model_xcols, data_df, mode='regressor', speed_test=False, fit_wind_square=False,
                 train_model_path=train_model_path, optuna_has_done=optuna_has_done, weights_has_determined=weights_has_determined, run=False):
    """Load or compute the hyperparameters and ensemble weights for one target.

    Saved values are read from ``{train_model_path}{Y_feature}/meta.json``. They are
    reused only when that file exists, ``run`` is False and the corresponding flag for
    ``Y_feature`` is True; otherwise the hyperparameters are tuned with
    ``optuna_operation`` and/or the weights are solved with ``find_optimal_weights``.
    Whenever anything was recomputed, a copy-and-paste summary is printed and the
    metadata file is rewritten.

    Parameters
    ----------
    Y_feature : str
        Target to process; also the name of its sub-folder.
    model_xcols : dict
        Feature names per model label (replaced by the saved ones when hyperparameters
        are loaded from the metadata file).
    data_df : pandas.DataFrame
        Data passed on to the tuning and weighting routines.
    mode : {'regressor', 'classifier'}
    speed_test : bool
        Use much smaller iteration counts.
    fit_wind_square : bool
        Forwarded to the tuning, weighting and saving routines.
    train_model_path : str
        Root folder of the saved metadata.
    optuna_has_done, weights_has_determined : dict
        Per-target reuse flags. They are only read: a target missing from a table is
        treated as "not done", and the tables are no longer modified in place.
    run : bool
        Force recomputation even if a metadata file exists.
    """
    if speed_test:
        n_iter_dict = {'hyper_parameter': 1, 'ensemble_weight': 20}
    else:
        n_iter_dict = {'hyper_parameter': 15, 'ensemble_weight': 200}

    this_model_path = f'{train_model_path}{Y_feature}/'
    os.makedirs(this_model_path, exist_ok=True)
    meta_path = f'{this_model_path}meta.json'

    # Saved values can only be reused if the metadata file exists and no rerun is forced.
    meta = None
    if os.path.exists(meta_path) and not run:
        with open(meta_path, 'r') as f:
            meta = json.load(f)

    # Local decisions instead of writing back into the caller's flag tables.
    reuse_hyperparameters = meta is not None and optuna_has_done.get(Y_feature, False)
    reuse_weights = meta is not None and weights_has_determined.get(Y_feature, False)

    # Hyperparameters
    model_r2_dict = None
    if reuse_hyperparameters:
        model_xcols = meta['X_feature_dict']
        model_hyperparameters_dict = meta['hyperparameters_dict']
    else:
        print('Start to tune hyperparameters')
        model_hyperparameters_dict, model_r2_dict = optuna_operation(model_xcols, Y_feature, data_df, mode=mode,
                                                                     n_iters=n_iter_dict['hyper_parameter'],
                                                                     speed_test=speed_test, fit_wind_square=fit_wind_square)

    # The saved file only keeps models with a non-negligible weight, so freshly tuned
    # models may have no saved weight; in that case the weights must be recomputed.
    if reuse_weights and not set(model_hyperparameters_dict) <= set(meta['weights']):
        print('Saved weights do not cover every tuned model; recomputing Ensemble weights.')
        reuse_weights = False

    # Ensemble weights
    if reuse_weights:
        optimal_weights = meta['weights']
        df = pd.read_csv(f'{this_model_path}predict_MAE.df')
        display(df)
    else:
        print('Start to determine Ensemble weights.')
        # Use a quarter of the iterations when the FCN model is part of the ensemble.
        if 'FCN' in model_hyperparameters_dict:
            n_iters = int(n_iter_dict['ensemble_weight']/4)
        else:
            n_iters = n_iter_dict['ensemble_weight']
        df, optimal_weights = find_optimal_weights(model_hyperparameters_dict, model_xcols, data_df,
                                                   Y_feature=Y_feature, mode=mode, n_iters=n_iters, fit_wind_square=fit_wind_square)
        print(Y_feature)
        display(df)
        df.to_csv(f'{this_model_path}predict_MAE.df', index=False, encoding='utf-8-sig')

    print('Weights:')
    for k in model_hyperparameters_dict:
        print(f'{k}: {optimal_weights[k]:.3f}')

    if not (reuse_weights and reuse_hyperparameters):
        print(' ')
        print(' ')
        print('**Copy and Paste following lines into the next cell.**')
        for model_label in model_hyperparameters_dict:
            print('##### ' + model_label)
            for key, v in model_hyperparameters_dict[model_label].items():
                print(f"Best {key} = {v}  ")
            # Only available when the hyperparameters were tuned in this call.
            if model_r2_dict is not None:
                print(f"Best R2 = {model_r2_dict[model_label]}  ")
            print(f'Weight = {optimal_weights[model_label]:.3f}')
        print(' ')
        print(' ')
        save_model_metadata(this_model_path + 'meta.json', model_xcols, model_hyperparameters_dict, optimal_weights, fit_wind_square=fit_wind_square)

## 預測氣象數值

在先前對於如何用氣象資料預測電力資料的探索，我們發現中央氣象署提供的歷史氣象資料中，以每天的最高氣溫、最低氣溫、平均氣溫、風速與全天空日射量等五個數值比較重要  
同時從中央氣象署網站下載的氣象預報當中，可以找到每個鄉鎮市區每三個小時的天氣狀況、氣溫、風速、風向、相對溼度等資訊  
這邊的天氣狀況就是晴、多雲、短暫陣雨之類的文字敘述，根據我的觀察，這樣的文字敘述可以歸類為七種：  
晴、多雲、陰、短暫陣雨、短暫陣雨或雷雨、午後短暫雷陣雨、陣雨或雷雨  
在我的資料表中，已預先將這七種預報文字進行 one-hot encoding 處理。

我判斷這樣的文字敘述跟日照率，也就是實測日照時數與天文日照時數之比率比較相關，所以這邊預測的氣象數值鎖定在以下五個值：  
日照率、最高氣溫、最低氣溫、(平均)氣溫、風速  

而預測的樣本單位則是每天每氣象站算是一個樣本，而考慮的氣象站為：  
臺北站、高雄站、嘉義站、東吉島站、臺中電廠站、臺西站等六站

另外在每個預測標的下方的結果表格中，為了瞭解每個模型以及每種集成學習方式分別的預測誤差，以及誤差值的穩定性  
所以我將 MAE 的平均值、標準差、第 10 與第 90 百分位都列出來  
表格是按誤差的第 90 百分位由小到大排序的，以觀察各預測方式的穩定性。  
唯一的例外是夜尖峰：它預測的是 Yes or No 問題，屬於分類問題而非迴歸問題，  
所以表格呈現的是 f1-score，也相應地改成按照第 10 百分位由大到小排序。

表格中的 Ensemble 代表每個模型權重一致的集成學習  
而 Weighted_Ensemble 則代表使用從相關矩陣中解出的最佳權重的集成學習

### 時段風速與氣溫

這邊的時段風速與氣溫，指的是三個時段：午後 (12-15點)、下午 (15-18點)、傍晚(18-21點) 的平均氣溫與風速  
由於用電尖峰通常發生在這三個時段，所以後面的風力發電與夜尖峰的預測會用到這些數據  
氣溫單位為攝氏度，風速單位則為公尺每秒  

In [27]:
# Every model considered for the per-period targets, in the order they are tuned.
_period_model_labels = ['LinearRegression', 'SVR', 'RandomForest', 'XGBoost', 'LightGBM', 'NuSVR', 'FCN']

for des in ['午後', '下午', '傍晚']:
    # Period-average temperature: every model uses the '溫度' forecast columns.
    model_xcols = {label: ['溫度'] for label in _period_model_labels}
    Y_feature = f'{des}平均氣溫'
    flow_control(Y_feature, model_xcols, forecast_obs_df, speed_test=speed_test)

    # Period-average wind speed: every model uses the '風速' forecast columns,
    # and the squared-target fit is enabled.
    model_xcols = {label: ['風速'] for label in _period_model_labels}
    Y_feature = f'{des}平均風速'
    flow_control(Y_feature, model_xcols, forecast_obs_df, speed_test=speed_test, fit_wind_square=True)
         

Unnamed: 0,Model,Avg MAE,Std MAE,90th percentile,10th percentile
0,Weighted_Ensemble,1.078018,0.094974,1.178233,0.968758
1,Ensemble,1.083475,0.095161,1.187678,0.968621
2,NuSVR,1.078986,0.099214,1.201246,0.965935
3,LinearRegression,1.102121,0.092887,1.20664,0.989686
4,LightGBM,1.113256,0.094178,1.20847,0.9957
5,SVR,1.089891,0.103722,1.218282,0.974176
6,FCN,1.123264,0.095492,1.227442,1.013245
7,RandomForest,1.127773,0.097628,1.233911,0.994727
8,XGBoost,1.112958,0.096221,1.25199,1.003066


Weights:
LinearRegression: 0.227
XGBoost: 0.066
LightGBM: 0.119
NuSVR: 0.433
FCN: 0.156
Start to tune hyperparameters


100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 20.80it/s]


LinearRegression
Best R2 = -0.02520508564552247


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:18<00:00,  1.09it/s]


SVR
Best C = 5.20736765872185
Best kernel = rbf
Best R2 = 0.26257639290392437


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [01:56<00:00,  5.82s/it]


RandomForest
Best max_depth = 5
Best n_estimators = 74
Best R2 = 0.1802528276045749


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [02:09<00:00,  6.46s/it]


XGBoost
Best max_depth = 2
Best n_estimators = 12
Best R2 = 0.1634249992803321


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [01:20<00:00,  4.04s/it]


LightGBM
Best max_depth = 8
Best n_estimators = 11
Best R2 = 0.16348641804977146


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:12<00:00,  1.60it/s]


NuSVR
Best C = 12.471360260522362
Best kernel = rbf
Best nu = 0.7165833704330286
Best R2 = 0.2745168431162034


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [21:58<00:00, 65.91s/it]


FCN
Best L2_factor = 0.004955102808359177
Best dropout_factor = 0.24288830910267165
Best R2 = 0.32184147559100007
Start to determine Ensemble weights.


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [10:56<00:00, 13.14s/it]

午後平均風速





Unnamed: 0,Model,Avg MAE,Std MAE,90th percentile,10th percentile
0,Weighted_Ensemble,1.002182,0.092588,1.098517,0.843339
1,Ensemble,1.020497,0.092475,1.126411,0.878914
2,NuSVR,1.009729,0.102317,1.141623,0.874753
3,SVR,1.010553,0.107925,1.150765,0.867648
4,XGBoost,1.071829,0.099298,1.160839,0.916862
5,RandomForest,1.056845,0.111469,1.173682,0.882493
6,LightGBM,1.126711,0.088933,1.248177,1.001529
7,FCN,1.107631,0.113104,1.252971,0.955211
8,LinearRegression,1.196483,0.118975,1.339432,1.055407


Weights:
LinearRegression: 0.033
SVR: 0.483
RandomForest: 0.484
XGBoost: 0.000
LightGBM: 0.000
NuSVR: 0.000
FCN: 0.000
 
 
**Copy and Paste following lines into the next cell.**
##### LinearRegression
Best R2 = -0.02520508564552247  
Weight = 0.033
##### SVR
Best C = 5.20736765872185  
Best kernel = rbf  
Best R2 = 0.26257639290392437  
Weight = 0.483
##### RandomForest
Best max_depth = 5  
Best n_estimators = 74  
Best R2 = 0.1802528276045749  
Weight = 0.484
##### XGBoost
Best max_depth = 2  
Best n_estimators = 12  
Best R2 = 0.1634249992803321  
Weight = 0.000
##### LightGBM
Best max_depth = 8  
Best n_estimators = 11  
Best R2 = 0.16348641804977146  
Weight = 0.000
##### NuSVR
Best C = 12.471360260522362  
Best kernel = rbf  
Best nu = 0.7165833704330286  
Best R2 = 0.2745168431162034  
Weight = 0.000
##### FCN
Best L2_factor = 0.004955102808359177  
Best dropout_factor = 0.24288830910267165  
Best R2 = 0.32184147559100007  
Weight = 0.000
 
 


Unnamed: 0,Model,Avg MAE,Std MAE,90th percentile,10th percentile
0,NuSVR,1.319703,0.111506,1.464719,1.179274
1,Ensemble,1.318578,0.110094,1.472572,1.170328
2,SVR,1.31664,0.112722,1.47298,1.160374
3,Weighted_Ensemble,1.32602,0.111329,1.473505,1.178444
4,RandomForest,1.343418,0.11453,1.48669,1.174172
5,LinearRegression,1.348328,0.112095,1.487398,1.191166
6,FCN,1.367762,0.109083,1.48982,1.215688
7,LightGBM,1.34579,0.115132,1.502017,1.183165
8,XGBoost,1.375264,0.121037,1.541133,1.230657


Weights:
LinearRegression: 0.352
SVR: 0.107
LightGBM: 0.493
FCN: 0.048
Start to tune hyperparameters


100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 18.13it/s]


LinearRegression
Best R2 = -0.0882267530453936


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:13<00:00,  1.53it/s]


SVR
Best C = 0.12467067855675402
Best kernel = linear
Best R2 = 0.24957382545393883


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [02:18<00:00,  6.92s/it]


RandomForest
Best max_depth = 3
Best n_estimators = 135
Best R2 = 0.13818528294977458


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [02:04<00:00,  6.24s/it]


XGBoost
Best max_depth = 4
Best n_estimators = 13
Best R2 = 0.0813880483059969


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [01:24<00:00,  4.23s/it]


LightGBM
Best max_depth = 2
Best n_estimators = 10
Best R2 = 0.13042419702529384


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:20<00:00,  1.01s/it]


NuSVR
Best C = 0.6225845929786307
Best kernel = linear
Best nu = 0.6133620347889677
Best R2 = 0.24747549848371875


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [20:51<00:00, 62.58s/it]


FCN
Best L2_factor = 0.002133285120167835
Best dropout_factor = 0.007216057518136032
Best R2 = 0.2687619225901626
Start to determine Ensemble weights.


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [10:09<00:00, 12.19s/it]

下午平均風速





Unnamed: 0,Model,Avg MAE,Std MAE,90th percentile,10th percentile
0,Weighted_Ensemble,1.014088,0.102731,1.157188,0.888793
1,Ensemble,1.033694,0.098433,1.172044,0.903196
2,SVR,1.035964,0.118454,1.201574,0.904009
3,NuSVR,1.036417,0.12202,1.214014,0.897305
4,XGBoost,1.080253,0.132894,1.25029,0.920299
5,RandomForest,1.114282,0.105871,1.259177,0.986002
6,LightGBM,1.189418,0.089899,1.308725,1.070308
7,FCN,1.228553,0.149985,1.464592,1.042529
8,LinearRegression,1.326177,0.11975,1.488832,1.162493


Weights:
LinearRegression: 0.000
SVR: 0.055
RandomForest: 0.047
XGBoost: 0.336
LightGBM: 0.000
NuSVR: 0.442
FCN: 0.120
 
 
**Copy and Paste following lines into the next cell.**
##### LinearRegression
Best R2 = -0.0882267530453936  
Weight = 0.000
##### SVR
Best C = 0.12467067855675402  
Best kernel = linear  
Best R2 = 0.24957382545393883  
Weight = 0.055
##### RandomForest
Best max_depth = 3  
Best n_estimators = 135  
Best R2 = 0.13818528294977458  
Weight = 0.047
##### XGBoost
Best max_depth = 4  
Best n_estimators = 13  
Best R2 = 0.0813880483059969  
Weight = 0.336
##### LightGBM
Best max_depth = 2  
Best n_estimators = 10  
Best R2 = 0.13042419702529384  
Weight = 0.000
##### NuSVR
Best C = 0.6225845929786307  
Best kernel = linear  
Best nu = 0.6133620347889677  
Best R2 = 0.24747549848371875  
Weight = 0.442
##### FCN
Best L2_factor = 0.002133285120167835  
Best dropout_factor = 0.007216057518136032  
Best R2 = 0.2687619225901626  
Weight = 0.120
 
 


Unnamed: 0,Model,Avg MAE,Std MAE,90th percentile,10th percentile
0,SVR,1.132275,0.095506,1.243309,1.00863
1,Weighted_Ensemble,1.145617,0.08777,1.276634,1.030431
2,Ensemble,1.146985,0.088408,1.281582,1.027256
3,XGBoost,1.153788,0.091155,1.290694,1.036758
4,RandomForest,1.157758,0.088535,1.293512,1.04359
5,LinearRegression,1.168403,0.087938,1.299662,1.058633
6,LightGBM,1.167679,0.091318,1.306343,1.043081
7,NuSVR,1.173529,0.08871,1.306912,1.07196
8,FCN,1.194605,0.092358,1.316559,1.06282


Weights:
LinearRegression: 0.328
SVR: 0.076
RandomForest: 0.181
XGBoost: 0.236
LightGBM: 0.179
Start to tune hyperparameters


100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 20.81it/s]


LinearRegression
Best R2 = -0.033036349212044215


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:25<00:00,  1.29s/it]


SVR
Best C = 7.691606146773484
Best kernel = rbf
Best R2 = 0.32820719025427636


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [01:49<00:00,  5.49s/it]


RandomForest
Best max_depth = 4
Best n_estimators = 130
Best R2 = 0.18767562178740438


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [02:04<00:00,  6.24s/it]


XGBoost
Best max_depth = 2
Best n_estimators = 39
Best R2 = 0.07998334437238264


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [01:18<00:00,  3.93s/it]


LightGBM
Best max_depth = 5
Best n_estimators = 12
Best R2 = 0.11732587768263822


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:15<00:00,  1.26it/s]


NuSVR
Best C = 1.3421357736528097
Best kernel = linear
Best nu = 0.4542505339782846
Best R2 = 0.3170529513360416


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [21:56<00:00, 65.81s/it]


FCN
Best L2_factor = 0.3754961292967383
Best dropout_factor = 0.2341101160155981
Best R2 = 0.30201548420693436
Start to determine Ensemble weights.


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [10:06<00:00, 12.13s/it]

傍晚平均風速





Unnamed: 0,Model,Avg MAE,Std MAE,90th percentile,10th percentile
0,SVR,1.009598,0.116084,1.137198,0.848649
1,Weighted_Ensemble,1.021515,0.104349,1.138751,0.871781
2,Ensemble,1.056754,0.103811,1.183309,0.926811
3,NuSVR,1.065303,0.119279,1.214924,0.889943
4,RandomForest,1.109989,0.108553,1.254347,0.962726
5,XGBoost,1.147017,0.128575,1.3199,0.971724
6,LightGBM,1.263525,0.096467,1.392267,1.142869
7,LinearRegression,1.270554,0.117307,1.417564,1.116969
8,FCN,1.20823,0.177608,1.432881,1.028768


Weights:
LinearRegression: 0.109
SVR: 0.422
RandomForest: 0.419
XGBoost: 0.012
LightGBM: 0.000
NuSVR: 0.000
FCN: 0.038
 
 
**Copy and Paste following lines into the next cell.**
##### LinearRegression
Best R2 = -0.033036349212044215  
Weight = 0.109
##### SVR
Best C = 7.691606146773484  
Best kernel = rbf  
Best R2 = 0.32820719025427636  
Weight = 0.422
##### RandomForest
Best max_depth = 4  
Best n_estimators = 130  
Best R2 = 0.18767562178740438  
Weight = 0.419
##### XGBoost
Best max_depth = 2  
Best n_estimators = 39  
Best R2 = 0.07998334437238264  
Weight = 0.012
##### LightGBM
Best max_depth = 5  
Best n_estimators = 12  
Best R2 = 0.11732587768263822  
Weight = 0.000
##### NuSVR
Best C = 1.3421357736528097  
Best kernel = linear  
Best nu = 0.4542505339782846  
Best R2 = 0.3170529513360416  
Weight = 0.000
##### FCN
Best L2_factor = 0.3754961292967383  
Best dropout_factor = 0.2341101160155981  
Best R2 = 0.30201548420693436  
Weight = 0.038
 
 


### 日照率

雖然實際上跟太陽能發電量比較相關的是全天空日射量  
但是在 EDA 環節我們可以看到，日照率乘上天文日射量之後，跟全天空日射量有 r=0.9 左右的極高相關性  
而天文日射量是給定日期就可以確切計算出來的  
所以這邊選擇日照率為預測標的  
單位為百分點  

In [28]:
# Target variable to be predicted
Y_feature = '日照率'
# Models used by the ensemble, each mapped to the X features it is trained on.
# Every model here uses the same feature list; the list literal is evaluated
# once per model, so each key owns an independent list object.
model_xcols = {
    model_name: ['晴', '多雲', '陰', '短暫陣雨', '短暫陣雨或雷雨', '午後短暫雷陣雨', '陣雨或雷雨', '相對溼度']
    for model_name in ('RandomForest', 'XGBoost', 'LightGBM', 'SVR', 'NuSVR', 'FCN')
}

flow_control(Y_feature, model_xcols, forecast_obs_df, speed_test=speed_test)

Unnamed: 0,Model,Avg MAE,Std MAE,90th percentile,10th percentile
0,Weighted_Ensemble,13.724232,1.265057,15.823561,12.367681
1,NuSVR,14.010632,1.251222,15.866179,12.504193
2,Ensemble,13.825605,1.331217,15.868444,12.235621
3,SVR,13.982383,1.348907,16.190432,12.579805
4,RandomForest,14.425971,1.420879,16.437925,12.650702
5,XGBoost,14.651025,1.3205,16.505014,12.892847
6,LightGBM,14.908139,1.274033,16.591049,13.356433
7,FCN,15.189024,1.961579,17.664378,13.50608


Weights:
RandomForest: 0.023
XGBoost: 0.048
LightGBM: 0.022
SVR: 0.355
NuSVR: 0.535
FCN: 0.017


### 高溫

In [29]:
# Target variable to be predicted
Y_feature = '最高氣溫'
# Every model is fed the single forecast feature '溫度'; the list literal is
# re-evaluated per model so the lists are not shared between keys.
model_xcols = {
    model_name: ['溫度']
    for model_name in (
        'LinearRegression',
        'FCN',
        'RandomForest',
        'XGBoost',
        'SVR',
        'NuSVR',
        'LightGBM',
    )
}

flow_control(Y_feature, model_xcols, forecast_obs_df, speed_test=speed_test)

Unnamed: 0,Model,Avg MAE,Std MAE,90th percentile,10th percentile
0,Weighted_Ensemble,0.930084,0.073084,1.036,0.844932
1,LightGBM,0.943561,0.074739,1.043709,0.847259
2,Ensemble,0.943078,0.071127,1.047541,0.854023
3,RandomForest,0.945324,0.082488,1.060299,0.841518
4,NuSVR,0.98976,0.071677,1.078746,0.869444
5,FCN,0.974109,0.073453,1.08133,0.872478
6,XGBoost,0.972966,0.071131,1.08283,0.888023
7,LinearRegression,0.990408,0.070896,1.085608,0.870938
8,SVR,0.981885,0.07583,1.086924,0.882946


Weights:
FCN: 0.136
RandomForest: 0.193
SVR: 0.113
NuSVR: 0.150
LightGBM: 0.408


### 低溫

In [30]:
# Target variable to be predicted
Y_feature = '最低氣溫'
# Every model is fed the single forecast feature '溫度'; the list literal is
# re-evaluated per model so the lists are not shared between keys.
model_xcols = {
    model_name: ['溫度']
    for model_name in (
        'LinearRegression',
        'FCN',
        'RandomForest',
        'XGBoost',
        'SVR',
        'NuSVR',
        'LightGBM',
    )
}

flow_control(Y_feature, model_xcols, forecast_obs_df, speed_test=speed_test)

Unnamed: 0,Model,Avg MAE,Std MAE,90th percentile,10th percentile
0,Weighted_Ensemble,0.865184,0.069045,0.944697,0.757778
1,Ensemble,0.868761,0.07091,0.949468,0.768341
2,LightGBM,0.874402,0.067795,0.953516,0.77193
3,SVR,0.879758,0.070074,0.957287,0.783502
4,RandomForest,0.878051,0.070032,0.96456,0.791404
5,NuSVR,0.883149,0.067862,0.971904,0.797249
6,LinearRegression,0.887116,0.068727,0.977431,0.805291
7,FCN,0.898014,0.079212,0.998602,0.795967
8,XGBoost,0.921519,0.080335,1.035659,0.807275


Weights:
RandomForest: 0.353
SVR: 0.180
NuSVR: 0.120
LightGBM: 0.347


### 平均溫

In [31]:
# Target variable to be predicted
Y_feature = '氣溫'
# Every model is fed the single forecast feature '溫度'; the list literal is
# re-evaluated per model so the lists are not shared between keys.
model_xcols = {
    model_name: ['溫度']
    for model_name in (
        'LinearRegression',
        'FCN',
        'RandomForest',
        'XGBoost',
        'SVR',
        'NuSVR',
        'LightGBM',
    )
}

flow_control(Y_feature, model_xcols, forecast_obs_df, speed_test=speed_test)

Unnamed: 0,Model,Avg MAE,Std MAE,90th percentile,10th percentile
0,RandomForest,0.711521,0.050407,0.770361,0.642632
1,Ensemble,0.711179,0.050515,0.776704,0.644002
2,SVR,0.720519,0.053813,0.777985,0.657972
3,Weighted_Ensemble,0.7063,0.049497,0.778695,0.634442
4,NuSVR,0.728022,0.048299,0.782948,0.651726
5,LinearRegression,0.730712,0.049582,0.783847,0.662053
6,LightGBM,0.718991,0.052838,0.786109,0.644412
7,XGBoost,0.726659,0.054398,0.800574,0.657318
8,FCN,0.757449,0.055351,0.816248,0.680629


Weights:
RandomForest: 0.308
XGBoost: 0.176
SVR: 0.042
NuSVR: 0.328
LightGBM: 0.147


### 風速

In [32]:
# Target variable to be predicted
Y_feature = '風速'
# All models start from the same four wind/temperature features ...
model_xcols = {
    model_name: ['風速', '東西風', '南北風', '溫度']
    for model_name in ('FCN', 'RandomForest', 'XGBoost', 'LightGBM', 'SVR', 'NuSVR')
}
# ... except RandomForest, which additionally receives the weather-type columns
# and relative humidity. Re-assigning an existing key keeps its position in the dict.
model_xcols['RandomForest'] = [
    '風速', '東西風', '南北風',
    '晴', '多雲', '陰', '短暫陣雨', '短暫陣雨或雷雨', '午後短暫雷陣雨', '陣雨或雷雨',
    '相對溼度', '溫度',
]

flow_control(
    Y_feature,
    model_xcols,
    forecast_obs_df,
    speed_test=speed_test,
    fit_wind_square=True,
)

Start to tune hyperparameters


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [22:29<00:00, 67.45s/it]


FCN
Best L2_factor = 0.012867906181328041
Best dropout_factor = 0.010144684642082125
Best R2 = 0.3546731054062476


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [07:50<00:00, 23.52s/it]


RandomForest
Best max_depth = 10
Best n_estimators = 85
Best R2 = 0.3786591592946854


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [01:48<00:00,  5.40s/it]


XGBoost
Best max_depth = 4
Best n_estimators = 149
Best R2 = 0.31422596898636684


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [01:26<00:00,  4.34s/it]


LightGBM
Best max_depth = 3
Best n_estimators = 21
Best R2 = 0.3483545440894781


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [01:33<00:00,  4.65s/it]


SVR
Best C = 0.610471421488569
Best kernel = linear
Best R2 = 0.4862990720822355


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [03:35<00:00, 10.77s/it]


NuSVR
Best C = 0.033558469259646846
Best kernel = linear
Best nu = 0.4716071317344285
Best R2 = 0.47560151954235624
Start to determine Ensemble weights.


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [10:46<00:00, 12.94s/it]

風速





Unnamed: 0,Model,Avg MAE,Std MAE,90th percentile,10th percentile
0,Weighted_Ensemble,0.638626,0.083386,0.741486,0.516752
1,Ensemble,0.659187,0.080599,0.754434,0.538765
2,SVR,0.69879,0.092461,0.811232,0.579664
3,NuSVR,0.699086,0.09288,0.816622,0.572632
4,RandomForest,0.717221,0.089266,0.826204,0.568993
5,XGBoost,0.726111,0.101141,0.831756,0.597543
6,LightGBM,0.800113,0.081724,0.907073,0.672286
7,FCN,0.937772,0.123395,1.089972,0.753825


Weights:
FCN: 0.055
RandomForest: 0.214
XGBoost: 0.239
LightGBM: 0.000
SVR: 0.472
NuSVR: 0.019
 
 
**Copy and Paste following lines into the next cell.**
##### FCN
Best L2_factor = 0.012867906181328041  
Best dropout_factor = 0.010144684642082125  
Best R2 = 0.3546731054062476  
Weight = 0.055
##### RandomForest
Best max_depth = 10  
Best n_estimators = 85  
Best R2 = 0.3786591592946854  
Weight = 0.214
##### XGBoost
Best max_depth = 4  
Best n_estimators = 149  
Best R2 = 0.31422596898636684  
Weight = 0.239
##### LightGBM
Best max_depth = 3  
Best n_estimators = 21  
Best R2 = 0.3483545440894781  
Weight = 0.000
##### SVR
Best C = 0.610471421488569  
Best kernel = linear  
Best R2 = 0.4862990720822355  
Weight = 0.472
##### NuSVR
Best C = 0.033558469259646846  
Best kernel = linear  
Best nu = 0.4716071317344285  
Best R2 = 0.47560151954235624  
Weight = 0.019
 
 


## 預測電力資料

台電官網上可以抓到的歷史電力資料，據我後來觀察，應該是每天用電負載尖峰的那一刻，每個機組的發電功率  
所以我們在這邊的預測標的也會是這個數值

以下所有電力相關數字單位皆為萬瓩，1萬瓩 = 10MW = 10,000,000 W

Note: 這邊是以實時氣象觀測資料預測電力資料，但是這兩組數據基本上是同時得知的  
所以實務上我們是用氣象預報預測第二天的氣象觀測，再用這個預測值預測第二天的電力資料  
因此實際上的預測誤差會比這邊顯示的數值高一點  
實際的預測情形可以到 <a href='http://ec2-54-206-30-159.ap-southeast-2.compute.amazonaws.com:8501'> 這個網站 </a> 查看

### 風力

在我蒐集的資料範圍中，風力發電數值的原始標準差約為 68 萬瓩

In [33]:
# Target variable to be predicted
Y_feature = '風力'

# Every model uses the same wind-speed and calendar feature columns; the list
# literal is re-evaluated per model so each key owns an independent list.
model_xcols = {
    model_name: [
        '風速', '日期數字', '假日', '週六', '週日', '補班', '1~3月', '11~12月',
        '午後平均風速', '下午平均風速', '傍晚平均風速',
    ]
    for model_name in (
        'LinearRegression',
        'FCN',
        'RandomForest',
        'XGBoost',
        'LightGBM',
        'SVR',
        'NuSVR',
    )
}

flow_control(Y_feature, model_xcols, weather_power_df, speed_test=speed_test)

Unnamed: 0,Model,Avg MAE,Std MAE,90th percentile,10th percentile
0,Weighted_Ensemble,16.025044,1.522812,17.696174,13.77879
1,Ensemble,16.685019,1.57909,18.257676,14.495469
2,SVR,17.446896,1.524793,19.250265,15.569405
3,RandomForest,17.569226,1.635519,19.285915,15.069997
4,NuSVR,17.224148,1.638798,19.500622,15.043274
5,LightGBM,17.76584,1.592495,19.580437,15.648588
6,FCN,18.473377,1.573663,20.312377,16.457719
7,XGBoost,18.585139,1.71906,20.544798,16.497026
8,LinearRegression,27.662117,1.714322,29.360577,25.205529


Weights:
FCN: 0.168
RandomForest: 0.174
XGBoost: 0.042
LightGBM: 0.114
SVR: 0.014
NuSVR: 0.488


### 太陽能


太陽能發電數值的原始標準差約為 260 萬瓩

In [34]:
# Target variable to be predicted
Y_feature = '太陽能'

# Every model uses the same temperature, irradiance and calendar feature columns;
# the list literal is re-evaluated per model so each key owns an independent list.
model_xcols = {
    model_name: [
        '氣溫', '最高氣溫', '最低氣溫', '全天空日射量', '日期數字',
        '假日', '週六', '週日', '補班', '白日長度',
    ]
    for model_name in (
        'LinearRegression',
        'FCN',
        'RandomForest',
        'XGBoost',
        'LightGBM',
        'SVR',
        'NuSVR',
    )
}

flow_control(Y_feature, model_xcols, weather_power_df, speed_test=speed_test)

Unnamed: 0,Model,Avg MAE,Std MAE,90th percentile,10th percentile
0,SVR,75.967777,7.978471,87.152456,64.103092
1,Weighted_Ensemble,77.3543,7.460019,88.348932,65.936388
2,FCN,81.932174,8.845113,93.303322,69.606839
3,Ensemble,83.678832,7.059528,94.344141,73.871629
4,LightGBM,84.685251,7.984838,95.744737,73.955321
5,RandomForest,85.956744,8.844872,96.939551,75.385648
6,XGBoost,89.84949,8.811306,102.494019,78.882599
7,LinearRegression,109.148886,6.876302,116.186085,99.00781
8,NuSVR,108.324865,6.878451,116.968541,97.762416


Weights:
LinearRegression: 0.091
FCN: 0.016
LightGBM: 0.310
SVR: 0.583


### 尖峰負載

尖峰負載的原始標準差約為 410 萬瓩

In [35]:
# Target variable to be predicted
Y_feature = '尖峰負載'

# Every model uses the same temperature and calendar feature columns; the list
# literal is re-evaluated per model so each key owns an independent list.
model_xcols = {
    model_name: [
        '氣溫', '最高氣溫', '最低氣溫', '日期數字',
        '假日', '週六', '週日', '補班', '1~3月', '11~12月',
    ]
    for model_name in (
        'LinearRegression',
        'FCN',
        'RandomForest',
        'XGBoost',
        'LightGBM',
        'SVR',
        'NuSVR',
    )
}

flow_control(Y_feature, model_xcols, weather_power_df, speed_test=speed_test)

Unnamed: 0,Model,Avg MAE,Std MAE,90th percentile,10th percentile
0,Weighted_Ensemble,51.613468,4.522789,58.05896,45.48096
1,SVR,53.926753,4.81241,60.417246,47.869511
2,Ensemble,56.148145,4.721541,62.963689,50.784719
3,FCN,59.239709,4.939652,65.247551,52.732091
4,NuSVR,59.773023,5.360881,66.0234,52.727752
5,XGBoost,67.961988,5.491587,74.600339,60.275153
6,LightGBM,67.657861,7.114427,74.836064,60.245436
7,RandomForest,67.001229,5.805623,75.217072,60.743785
8,LinearRegression,108.148079,6.405007,116.757913,97.420778


Weights:
LinearRegression: 0.002
FCN: 0.260
XGBoost: 0.206
LightGBM: 0.062
SVR: 0.471


### 夜尖峰

通常台灣全天用電的峰值會發生在下午 1 到 2 點，但是在非工作日或是氣溫較低的時候，有時用電峰值會發生在傍晚 5~7 點左右  
這種狀況之下，台電的歷史資料中太陽能的部分就會變成 0 或者很接近 0，因為取樣時太陽快要或已經下山  
為了處理「夜尖峰」狀況對於太陽能數值預測的影響，我也嘗試預測了夜尖峰的發生與否  
這個問題跟前面預測數值的回歸問題不同，基本上是個分類問題，所以衡量指標由 MAE 改為 F1-score

In [36]:
# Target variable to be predicted (handled in classifier mode below)
Y_feature = '夜尖峰'
# Every classifier uses the same temperature, calendar and day-length columns;
# the list literal is re-evaluated per model so each key owns an independent list.
model_xcols = {
    model_name: [
        '氣溫', '最高氣溫', '最低氣溫', '日期數字', '假日', '週六', '週日', '補班',
        '白日長度', '午後平均氣溫', '下午平均氣溫', '傍晚平均氣溫',
    ]
    for model_name in (
        'FCN',
        'LogisticRegression',
        'RandomForest',
        'XGBoost',
        'LightGBM',
        'SVC',
    )
}

flow_control(
    Y_feature,
    model_xcols,
    weather_power_df,
    mode='classifier',
    speed_test=speed_test,
)

Unnamed: 0,Model,Avg F1,Std F1,90th percentile,10th percentile
0,LogisticRegression,0.886865,0.03141,0.930233,0.842105
1,Weighted_Ensemble,0.884288,0.030969,0.926829,0.83871
2,LightGBM,0.883726,0.031469,0.916667,0.837209
3,Ensemble,0.882409,0.030291,0.924731,0.833333
4,SVC,0.87934,0.035119,0.917647,0.831169
5,XGBoost,0.869786,0.032109,0.911111,0.825
6,FCN,0.873834,0.033887,0.911111,0.823529
7,RandomForest,0.821553,0.047912,0.883721,0.756757


Weights:
FCN: 0.199
LogisticRegression: 0.219
RandomForest: 0.176
XGBoost: 0.009
LightGBM: 0.238
SVC: 0.159
