# 超參數及集成學習權重最佳化

我們的目的是要以前一天的氣象預報來預測當天的電力資訊，但是我們手上的氣象預報歷史資料是從2024年七月開始蒐集，直接拿來預測電力資訊天數不夠，  
所以除了 Power_predict.ipynb 裡面敘述的從氣象觀測歷史資料預測電力資料的模型之外，我們還需要建立從氣象預報資料來預測氣象觀測資料的模型。  

這裡的氣象資料預測的模型建立方式跟電力資料的模型大同小異，主要的不同點是氣象資料預測可以把每天每站的數據當成一個樣本，這樣我們就可以在相對短時間之內累積足夠的樣本數。

同時這個筆記本也要處理超參數的最佳化，我們使用 optuna 這個第三方套件來達成這個任務。  
另外我們也嘗試最佳化集成學習時各模型的權重，具體方法為計算出模型之間的誤差相關矩陣，再從這個矩陣解出最佳權重組合。

整個預測系統在真實世界資料中運行的情形可以到<a href='http://ec2-54-206-30-159.ap-southeast-2.compute.amazonaws.com:8501'> 這個網站 </a> 查看

## 初始化

### 匯入模組與套件

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
#這兩行讓 matplotlib 的圖可以顯示中文，同時正常顯示負號
matplotlib.rc('font', family='Microsoft JhengHei')
plt.rcParams['axes.unicode_minus'] = False
import datetime
from copy import deepcopy
import os
import joblib
import json
from tqdm import tqdm
import optuna

# 設置Optuna日誌級別為 WARNING，僅顯示警告及以上級別的信息
optuna.logging.set_verbosity(optuna.logging.WARNING)

pd.set_option('future.no_silent_downcasting', True)

In [2]:
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.svm import SVR, NuSVR

In [3]:
from sklearn.svm import SVC, NuSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from scipy.optimize import minimize
from sklearn.metrics import f1_score

In [5]:
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [6]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim

In [7]:
from Pytorch_models.metrics import Array_Metrics
from Pytorch_models import models as pytorch_models
from Pytorch_models import api
MAE = Array_Metrics.mae
R2_score = Array_Metrics.r2

In [8]:
from utils.prepare_data import prepare_forecast_observation_df, prepare_data

In [9]:
def FCN_model(input_f, output_f, feature_counts, dropout_factor=0, L2_factor=1e-15, mode='regressor'):
    """Build a fully connected network wrapped in the project's ``Model_API``.

    Args:
        input_f: Forwarded to the network constructor as its first argument
            (callers in this notebook pass the number of input columns).
        output_f: Forwarded to the network constructor as its second argument.
        feature_counts: Forwarded to the network constructor as its third
            argument (presumably the hidden-layer widths -- confirm against
            ``Pytorch_models.models``).
        dropout_factor: Forwarded to the network constructor.
        L2_factor: Forwarded to ``api.Model_API`` as ``L2_factor``.
        mode: ``'regressor'`` builds ``SimpleNN``; ``'classifier'`` builds
            ``SimpleNN_classifier``.

    Returns:
        An ``api.Model_API`` instance wrapping the freshly built network.

    Raises:
        ValueError: If ``mode`` is neither ``'regressor'`` nor ``'classifier'``.
    """
    if mode == 'regressor':
        network_class = pytorch_models.SimpleNN
    elif mode == 'classifier':
        network_class = pytorch_models.SimpleNN_classifier
    else:
        # Previously an unknown mode fell through and failed later with an
        # UnboundLocalError on ``model``.
        raise ValueError(f"mode must be 'regressor' or 'classifier', got {mode!r}")

    model = network_class(input_f, output_f, feature_counts, dropout_factor)
    return api.Model_API(model, L2_factor=L2_factor, classifier=(mode == 'classifier'))

### 初始參數

In [10]:
# Directory passed to the data-preparation helpers as the data location
data_path = './historical/data/'

# Start and end dates of the data
start_date = '2023-08-01'
end_date = '2024-09-10'

# Root directory (keyed by end date) under which flow_control stores each
# target's meta.json and predict_MAE.df
train_model_path = f'./trained_model_parameters/model_meta_{end_date}/'

In [11]:
# 設定要不要開啟快速測試模式
# Whether to enable the quick-test mode
speed_test = False

#---------------------------------------------------------------------------------
# Choose whether to recompute every hyperparameter and weight, or to recompute nothing.
# If both are False, the per-target settings below are used.
# If both are True, everything is recomputed.
#
# Note: if the matching meta.json file does not exist under the configured path,
#       the program ignores these settings and runs the computation anyway.
#---------------------------------------------------------------------------------
rerun_all_calculation = True
dont_run_any_calculation = False


# False means recompute; True means load from the saved file
optuna_has_done = {
    '日照率': True,
    '最高氣溫': True,
    '最低氣溫': True,
    '氣溫': True,
    '風速': True,
    '風力': False,
    '太陽能': True,
    '尖峰負載': True,
    '夜尖峰': True,
    '午後平均風速': True,
    '午後平均氣溫': True,
    '下午平均風速': True,
    '下午平均氣溫': True,
    '傍晚平均風速': True,
    '傍晚平均氣溫': True,
}

# Same convention as above, but for the ensemble weights
weights_has_determined = {
    '日照率': True,
    '最高氣溫': True,
    '最低氣溫': True,
    '氣溫': True,
    '風速': True,
    '風力': False,
    '太陽能': True,
    '尖峰負載': True,
    '夜尖峰': True,
    '午後平均風速': True,
    '午後平均氣溫': True,
    '下午平均風速': True,
    '下午平均氣溫': True,
    '傍晚平均風速': True,
    '傍晚平均氣溫': True,
}

# If "recompute all" or "recompute nothing" was requested above, reset both dictionaries.
# The "recompute all" block runs last, so it wins when both switches are True.
if dont_run_any_calculation:
    optuna_has_done = {k: True for k in optuna_has_done.keys()}
    weights_has_determined = {k: True for k in weights_has_determined.keys()}

if rerun_all_calculation:
    optuna_has_done = {k: False for k in optuna_has_done.keys()}
    weights_has_determined = {k: False for k in weights_has_determined.keys()}

In [12]:
# 定義每個 model_label 對應的 model
# Lookup table: mode -> model_label -> class (or factory) used to build that model.
model_class_dict = {
    'regressor': {
        'LinearRegression': LinearRegression,
        'RandomForest': RandomForestRegressor,
        'XGBoost': XGBRegressor,
        'LightGBM': LGBMRegressor,
        'SVR': SVR,
        'NuSVR': NuSVR,
        'FCN': FCN_model,
    },
    'classifier': {
        'RandomForest': RandomForestClassifier,
        'XGBoost': XGBClassifier,
        'LightGBM': LGBMClassifier,
        'SVC': SVC,
        'NuSVC': NuSVC,
        'LogisticRegression': LogisticRegression,
        'FCN': FCN_model,
    },
}

### 讀取資料

讀取先前經由爬蟲定時抓取的預報與觀測資料

In [13]:
# Load the two tables used below for the configured date range.
# NOTE(review): presumably forecast_obs_df pairs forecasts with observations and
# weather_power_df pairs weather with power data -- confirm in utils.prepare_data.
forecast_obs_df = prepare_forecast_observation_df(data_path, start_date=start_date, end_date=end_date)
weather_power_df = prepare_data(data_path, start_date=start_date, end_date=end_date)

## 函數

### 超參數最佳化

這部分的函數有：  
1. get_XY: 從 DataFrame 中提取需要的 X 與 Y 兩個 numpy array。
2. five_fold_test: 對輸入的 X 與 Y 執行一次 5-fold 測試。
3. assign_model: 根據 model_label 與超參數字典建立一個模型。
4. hyperparameter_tuning: 針對特定的模型與超參數組合，呼叫 five_fold_test 執行多次 5-fold 測試，並回傳評估指標 (R2 或 F1) 的平均值減去標準差。
5. optuna_operation: 利用第三方套件 optuna 執行超參數調整，會呼叫 get_XY 與 hyperparameter_tuning。

流程控制函數 flow_control 會呼叫 optuna_operation，而主程式只會直接呼叫 flow_control。

In [14]:
def get_XY(data_df, Y_feature, X_features=None, hours=None):
    """Extract the feature matrix and target vector for one prediction target.

    Args:
        data_df: DataFrame holding both the candidate feature columns and the
            ``Y_feature`` column.
        Y_feature: Name of the target column. Weather targets (and any name
            containing '平均') are treated as observation targets; '風力',
            '太陽能', '尖峰負載' and '夜尖峰' are treated as power targets.
        X_features: Feature-name prefixes to keep. ``None`` keeps every column
            whose name contains '_' (plus the date-related columns for power
            targets).
        hours: Hour suffixes accepted for observation targets. Defaults to
            every third hour, '0' through '21'.

    Returns:
        ``(Xs, Ys, X_cols)`` where rows whose target is NaN have been dropped
        and ``X_cols`` lists the selected column names in DataFrame order.

    Raises:
        ValueError: If ``Y_feature`` is not a recognised target.
    """
    date_related_cols = ['日期數字', '假日', '週六', '週日', '補班', '1~3月', '11~12月', '白日長度']

    # Build the default per call instead of sharing one mutable list object.
    if hours is None:
        hours = [str(i) for i in range(0, 24, 3)]

    if Y_feature in ['最高氣溫', '最低氣溫', '氣溫', '風速', '日照率', '全天空日射量'] or '平均' in Y_feature:
        target = 'obs'
    elif Y_feature in ['風力', '太陽能', '尖峰負載', '夜尖峰']:
        target = 'pwd'
    else:
        # Previously this fell through and failed later with an UnboundLocalError.
        raise ValueError(f'Unrecognised Y_feature: {Y_feature!r}')

    if X_features is None:
        X_cols = [col for col in data_df.columns if '_' in col]
        if target == 'pwd':
            X_cols += date_related_cols
    else:
        # Observation targets split column names on '預報_'; power targets on '_'.
        separator = '預報_' if target == 'obs' else '_'
        X_cols = []
        for col in data_df.columns:
            parts = col.split(separator)
            if len(parts) >= 2:
                # The hour filter only applies to observation targets.
                if parts[0] in X_features and (target == 'pwd' or parts[1] in hours):
                    X_cols.append(col)
            elif col in date_related_cols and col in X_features:
                X_cols.append(col)

    Xs = np.array(data_df[X_cols])
    Ys = np.array(data_df[Y_feature])

    # Drop samples whose target value is missing.
    keep = ~np.isnan(Ys)
    return Xs[keep, :], Ys[keep], X_cols

In [15]:
def five_fold_test(Xs, Ys, model=None, mode='regressor',
                   deep_learning=False, fold_n=5, standard_scale=True, always_test_last_chunk=False):
    """Run one round of K-fold evaluation and return the mean test/train metric.

    Args:
        Xs: Feature array, indexed by sample along the first axis.
        Ys: Target array aligned with ``Xs``.
        model: Estimator exposing ``fit``/``predict``. ``None`` creates a fresh
            ``XGBRegressor()`` for this call. When ``deep_learning`` is True it
            must be a ``Model_API`` produced by ``FCN_model``; a new network
            with the same settings is rebuilt for every fold.
        mode: ``'regressor'`` scores with 1 - MSE / Var(Y_test) (R2);
            ``'classifier'`` scores with ``f1_score``.
        deep_learning: If True, 20% of each training fold is held out and
            passed to ``fit`` as the validation set.
        fold_n: Number of folds.
        standard_scale: If True, standardise features with a scaler fitted on
            the training fold only.
        always_test_last_chunk: If True, folds are not shuffled and only the
            last fold is used as the test set.

    Returns:
        ``(metric_test, metric_train)``: the metric averaged over the evaluated
        folds, on the test folds and on the training folds respectively.
    """
    # Create the default estimator per call; a default instance built in the
    # signature would be created once and shared by every call.
    if model is None:
        model = XGBRegressor()

    def metric(Y_test, Y_pred, mode=mode):
        if mode == 'regressor':
            return 1 - np.mean((Y_test - Y_pred)**2) / np.var(Y_test)
        elif mode == 'classifier':
            return f1_score(Y_test, Y_pred)

    kf = KFold(n_splits=fold_n, shuffle=not always_test_last_chunk)
    folds = list(kf.split(Xs))

    fold_ids = [fold_n - 1] if always_test_last_chunk else range(fold_n)

    metric_test_list, metric_train_list = [], []
    for fold_id in fold_ids:
        if deep_learning:
            # Rebuild the network so weights learned on one fold never leak
            # into the next.
            params = model.model.params
            model = FCN_model(input_f=params['input_f'], output_f=params['output_f'],
                              feature_counts=params['feature_counts'],
                              dropout_factor=params['dropout_factor'],
                              L2_factor=model.L2_factor, mode=mode)

        train_index, test_index = folds[fold_id]
        X_train, X_test = Xs[train_index], Xs[test_index]
        Y_train, Y_test = Ys[train_index], Ys[test_index]

        if deep_learning:
            X_train_DL, X_val, Y_train_DL, Y_val = train_test_split(X_train, Y_train, test_size=0.20)

        if standard_scale:
            scaler = StandardScaler()
            scaler.fit(X_train)
            X_train = scaler.transform(X_train)
            X_test = scaler.transform(X_test)
            if deep_learning:
                # Scale the fitting subset as well; previously only X_val was
                # scaled, so the network trained on raw features but was
                # validated and tested on standardised ones.
                X_train_DL = scaler.transform(X_train_DL)
                X_val = scaler.transform(X_val)

        if deep_learning:
            _ = model.fit(X_train_DL, Y_train_DL, X_val, Y_val)
        else:
            _ = model.fit(X_train, Y_train)

        metric_test_list.append(metric(Y_test, model.predict(X_test)))
        metric_train_list.append(metric(Y_train, model.predict(X_train)))

    return np.mean(metric_test_list), np.mean(metric_train_list)

In [16]:
def assign_model(model_label, Xs, cfg, mode):
    """Instantiate the model registered under ``model_label`` for ``mode``.

    ``cfg`` is expanded into keyword arguments. LightGBM additionally gets
    ``force_col_wise=True, verbose=-1``; the FCN gets its input width from
    ``Xs.shape[1]``, a single output and fixed ``feature_counts``.
    """
    factory = model_class_dict[mode][model_label]

    if model_label == 'LightGBM':
        return factory(force_col_wise=True, verbose=-1, **cfg)

    if model_label == 'FCN':
        return factory(input_f=Xs.shape[1], output_f=1, feature_counts=[16, 16, 16, 8], mode=mode, **cfg)

    return factory(**cfg)

In [17]:
def hyperparameter_tuning(trial, Xs, Ys, mode='regressor',
                          model_label='RandomForest', n_iters=50, always_test_last_chunk=False):
    """Optuna objective: score one sampled hyperparameter set for one model.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        Xs, Ys: Feature and target arrays handed to ``five_fold_test``.
        mode: ``'regressor'`` or ``'classifier'``.
        model_label: Key of the model in ``model_class_dict[mode]``.
        n_iters: Number of repeated ``five_fold_test`` runs.
        always_test_last_chunk: Forwarded to ``five_fold_test``.

    Returns:
        Mean minus standard deviation of the test metric over ``n_iters``
        runs, so unstable configurations are penalised.

    Raises:
        ValueError: If ``model_label`` has no search space defined here.
    """
    deep_learning = model_label in ['FCN']
    # The FCN path is evaluated without standard scaling.
    standard_scale = not deep_learning

    if model_label in ['RandomForest', 'XGBoost', 'LightGBM']:
        cfg = {'max_depth': trial.suggest_int('max_depth', 2, 15),
               'n_estimators': trial.suggest_int('n_estimators', 10, 200)}
    elif model_label in ['SVR', 'SVC']:
        cfg = {'C': trial.suggest_float('C', 1e-3, 2e2, log=True),
               'kernel': trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf', 'sigmoid'])}
    elif model_label == 'NuSVR':
        cfg = {'C': trial.suggest_float('C', 1e-3, 2e2, log=True),
               'kernel': trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf', 'sigmoid']),
               'nu': trial.suggest_float('nu', 0.1, 0.9)}
    elif model_label == 'NuSVC':
        # sklearn's NuSVC has no ``C`` parameter (only NuSVR does); passing it
        # made every trial fail with a TypeError.
        cfg = {'kernel': trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf', 'sigmoid']),
               'nu': trial.suggest_float('nu', 0.1, 0.9)}
    elif model_label == 'FCN':
        cfg = {'L2_factor': trial.suggest_float('L2_factor', 1e-3, 1, log=True),
               'dropout_factor': trial.suggest_float('dropout_factor', 0, 0.5)}
    elif model_label == 'LogisticRegression':
        cfg = {'C': trial.suggest_float('C', 1e-3, 2e2, log=True),
               'solver': trial.suggest_categorical('solver', ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'])}
    elif model_label == 'LinearRegression':
        cfg = {}
    else:
        # Previously an unknown label left ``cfg`` unbound.
        raise ValueError(f'No hyperparameter search space defined for model_label {model_label!r}')

    model = assign_model(model_label, Xs, cfg, mode)

    metric_list = []
    for _ in range(n_iters):
        metric, _train_metric = five_fold_test(Xs, Ys, model, mode=mode,
                                               deep_learning=deep_learning, standard_scale=standard_scale,
                                               always_test_last_chunk=always_test_last_chunk)
        metric_list.append(metric)

    return np.mean(metric_list) - np.std(metric_list)

In [18]:
def optuna_operation(model_xcols, Y_feature, data_df, mode='regressor', speed_test=False,
                     optuna_n_trials=20, n_iters=30, always_test_last_chunk=False):
    """Tune every model listed in ``model_xcols`` with Optuna.

    For each model label, the features named in ``model_xcols[label]`` are
    extracted with ``get_XY`` and a TPE study maximises the value returned by
    ``hyperparameter_tuning``. Trials that raise are caught by the study.

    Returns:
        ``(model_hyperparameters_dict, model_r2_dict)``: best parameters and
        best objective value per model label.
    """
    if mode == 'regressor':
        metric_name = 'R2'
    elif mode == 'classifier':
        metric_name = 'F1'

    best_params_by_model = {}
    best_value_by_model = {}

    # Only one evaluation per trial is run when the last chunk is always the test set.
    if always_test_last_chunk:
        n_iters = 1

    for model_label in list(model_xcols.keys()):
        Xs, Ys, _ = get_XY(data_df, Y_feature, model_xcols[model_label])

        trials_for_model = optuna_n_trials
        iters_for_model = n_iters

        if model_label == 'FCN':
            # The network gets at most one evaluation per trial.
            iters_for_model = min(iters_for_model, 1)
            if speed_test:
                trials_for_model = 4

        if model_label == 'LinearRegression':
            # Nothing to tune: a single trial with ten repeated evaluations.
            trials_for_model = 1
            iters_for_model = 10

        def objective(trial, _label=model_label, _Xs=Xs, _Ys=Ys, _mode=mode,
                      _n_iters=iters_for_model, _last_chunk=always_test_last_chunk):
            return hyperparameter_tuning(trial, model_label=_label, Xs=_Xs, Ys=_Ys, mode=_mode,
                                         n_iters=_n_iters, always_test_last_chunk=_last_chunk)

        study = optuna.create_study(sampler=optuna.samplers.TPESampler(), direction='maximize')
        with tqdm(total=trials_for_model) as progress:
            for _ in range(trials_for_model):
                # One trial per call so the progress bar advances per trial.
                study.optimize(objective, n_trials=1, catch=(Exception,))
                progress.update(1)

        print(model_label)
        for name, value in study.best_params.items():
            print(f"Best {name} = {value}")
        print(f"Best {metric_name} = {study.best_value}")

        best_params_by_model[model_label] = study.best_params
        best_value_by_model[model_label] = study.best_value

    return best_params_by_model, best_value_by_model

### Ensemble

這部分的函數有：
1. cross_correlation_matrix: 由不同模型的預測誤差產生相關矩陣，僅由 get_residual_corr_matrix 呼叫。
2. sovle_optimal_weights: 由誤差相關矩陣解出最佳權重，僅由 find_optimal_weights 呼叫。
3. predict: 訓練模型並取得預測值，僅由 get_residual_corr_matrix 呼叫。
4. get_residual_corr_matrix: 計算並取得所有模型多次取樣的 Y_truth, Y_pred, 與誤差相關矩陣，僅由 find_optimal_weights 呼叫。
5. get_weighted_ensemble_metric: 輸入 get_residual_corr_matrix 的計算結果之後，得出模型評估表格，僅由 find_optimal_weights 呼叫。
6. find_optimal_weights: 統整以上函數，解出最佳權重，並印出模型評估表格。
7. save_model_metadata: 儲存這份筆記本得到的每個被預測值所採用的模型組合，以及每個模型採用的特徵、超參數與權重。

流程控制函數中會呼叫 find_optimal_weights 與 save_model_metadata

In [19]:
def cross_correlation_matrix(residuals):
    """Return the symmetric matrix of mean pairwise residual products.

    Entry ``[i][j]`` is ``mean(residuals[i] * residuals[j])``, where each
    element of ``residuals`` is one model's residual vector.
    """
    rows = [np.array(row) for row in residuals]
    size = len(rows)
    matrix = np.zeros((size, size))

    for i, row_i in enumerate(rows):
        for j in range(i, size):
            # Fill both triangles at once; the matrix is symmetric.
            value = np.mean(row_i * rows[j])
            matrix[i][j] = value
            matrix[j][i] = value

    return matrix

In [20]:
def sovle_optimal_weights(matrix):
    """Solve for ensemble weights minimising ``w.T @ matrix @ w``.

    The weights are constrained to sum to 1 and each lies in [0, 1]. The
    search starts from uniform weights and uses SLSQP.
    """
    n_models = matrix.shape[0]

    def weighted_error(w):
        return w.T @ matrix @ w

    def sum_minus_one(w):
        return np.sum(w) - 1

    uniform_start = np.full(n_models, 1 / n_models)
    solution = minimize(
        weighted_error,
        uniform_start,
        method='SLSQP',
        bounds=[(0, 1)] * n_models,
        constraints={'type': 'eq', 'fun': sum_minus_one},
    )
    return solution.x

In [21]:
def predict(model_label, Y_train, train_ind, test_ind, mode,
            model_hyperparameters_dict, model_xcols, data_df, Y_feature):
    """Train one model on the given split and return its test predictions.

    The model's own feature columns are extracted with ``get_XY``. The FCN is
    fitted with a 20% validation hold-out and no scaling; every other model is
    fitted on features standardised with a scaler fitted on the training rows.
    """
    Xs, _, _ = get_XY(data_df, Y_feature=Y_feature, X_features=model_xcols[model_label])
    model = assign_model(model_label, Xs, cfg=model_hyperparameters_dict[model_label], mode=mode)

    X_train, X_test = Xs[train_ind], Xs[test_ind]

    if model_label == 'FCN':
        X_fit, X_val, Y_fit, Y_val = train_test_split(X_train, Y_train, test_size=0.20)
        model.fit(X_fit, Y_fit, X_val, Y_val)
    else:
        scaler = StandardScaler().fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)
        model.fit(X_train, Y_train)

    return model.predict(X_test)

In [22]:
def get_residual_corr_matrix(model_hyperparameters_dict, ensemble_models, model_xcols,
                             data_df, Ys, Y_feature, mode,
                             n_iters, n_samples):
    """Resample train/test splits and accumulate the models' residual matrix.

    Each iteration draws a random 80/20 split of the sample indices, trains
    every model in ``ensemble_models`` with ``predict`` and records its metric
    (MAE for regressors, F1 on 0.5-thresholded predictions for classifiers).

    Returns:
        ``(matrix, model_metric, Y_pred_iters, Y_test_iters)``: the residual
        product matrix averaged over iterations, the per-iteration list of
        per-model metrics, and the per-iteration predictions and targets.
    """
    n_models = len(ensemble_models)
    matrix = np.zeros((n_models, n_models))
    Y_pred_iters, Y_test_iters, model_metric = [], [], []

    for _ in tqdm(range(n_iters)):
        train_ind, test_ind = train_test_split(np.arange(n_samples), test_size=0.2)
        Y_train, Y_test = Ys[train_ind], Ys[test_ind]

        Y_preds, this_metric = [], []
        for model_label in ensemble_models:
            YP = predict(model_label, Y_train, train_ind, test_ind, mode,
                         model_hyperparameters_dict, model_xcols, data_df, Y_feature)
            if mode == 'regressor':
                this_metric.append(MAE(Y_test, YP))
            elif mode == 'classifier':
                # Threshold in place so the stored predictions are hard labels.
                YP[YP < 0.5] = 0
                YP[YP >= 0.5] = 1
                this_metric.append(f1_score(Y_test, YP))
            Y_preds.append(YP)

        # One copy of the targets per model, so rows line up with Y_preds.
        stacked_truth = np.array([Y_test] * len(Y_preds)).reshape(len(Y_preds), -1)
        residuals = Y_preds - stacked_truth
        matrix += cross_correlation_matrix(residuals)

        model_metric.append(this_metric)
        Y_pred_iters.append(Y_preds)
        Y_test_iters.append(Y_test)

    matrix = matrix / n_iters
    return matrix, model_metric, Y_pred_iters, Y_test_iters

In [23]:
def get_weighted_ensemble_metric(Y_pred_iters, Y_test_iters, weights, mode):
    """Score the weighted ensemble on every resampling iteration.

    For each iteration the per-model predictions are combined with
    ``weights`` and scored with MAE (regressor) or with F1 after thresholding
    at 0.5 (classifier).

    Returns:
        Column array of shape ``(n_iters, 1)`` holding one score per iteration.
    """
    scores = []
    for i, model_preds in enumerate(Y_pred_iters):
        # Repeat the weight column once per test sample (width taken from the
        # first iteration) so it multiplies the predictions elementwise.
        weight_column = weights.reshape(-1, 1)
        weight_grid = np.concatenate([weight_column] * Y_test_iters[0].shape[0], axis=1)
        blended = np.sum(model_preds * weight_grid, axis=0)

        if mode == 'regressor':
            scores.append(MAE(Y_test_iters[i], blended))
        elif mode == 'classifier':
            blended[blended < 0.5] = 0
            blended[blended >= 0.5] = 1
            scores.append(f1_score(Y_test_iters[i], blended))

    return np.array(scores).reshape(-1, 1)

In [24]:
def find_optimal_weights(model_hyperparameters_dict, model_xcols,
                         data_df, Y_feature, mode='regressor',
                         n_iters=200, weights=None):
    """Evaluate the ensemble and, unless weights are given, solve for them.

    Args:
        model_hyperparameters_dict: Hyperparameters per model label.
        model_xcols: Feature-name list per model label.
        data_df: DataFrame passed on to ``get_XY``.
        Y_feature: Target column name.
        mode: ``'regressor'`` (MAE table) or ``'classifier'`` (F1 table).
        n_iters: Number of random train/test resamplings.
        weights: Optional mapping ``model_label -> weight``. When given, these
            weights are evaluated instead of being solved for.

    Returns:
        ``(df, optimal_weights_dict)`` when ``weights`` is None, otherwise only
        ``df``. ``df`` lists each model, the uniform ensemble ('Ensemble') and
        the weighted ensemble with mean, std and 90th/10th percentile metric.

    Raises:
        ValueError: If ``mode`` is not supported.
    """
    if mode == 'regressor':
        metric_name = 'MAE'
    elif mode == 'classifier':
        metric_name = 'F1'
    else:
        # Fail before the expensive resampling rather than after it.
        raise ValueError(f"mode must be 'regressor' or 'classifier', got {mode!r}")

    if weights is None:
        ensemble_models = list(model_hyperparameters_dict.keys())
    else:
        ensemble_models = list(weights.keys())

    n_models = len(ensemble_models)

    # Only the sample count is needed here; every model shares the same rows.
    Xs, Ys, _ = get_XY(data_df, Y_feature=Y_feature, X_features=model_xcols[ensemble_models[0]])
    n_samples = Xs.shape[0]

    matrix, model_metric, Y_pred_iters, Y_test_iters = get_residual_corr_matrix(
        model_hyperparameters_dict=model_hyperparameters_dict,
        model_xcols=model_xcols, ensemble_models=ensemble_models,
        data_df=data_df, Ys=Ys, Y_feature=Y_feature, mode=mode,
        n_iters=n_iters, n_samples=n_samples)

    if weights is None:
        optimal_weights = sovle_optimal_weights(matrix)
    else:
        # The caller supplies a mapping; convert it to an array ordered like
        # ensemble_models (the mapping itself has no ``reshape``).
        optimal_weights = np.array([weights[label] for label in ensemble_models], dtype=float)

    uniform_weights = np.array([1/n_models] * n_models)
    uniform_metric = get_weighted_ensemble_metric(Y_pred_iters, Y_test_iters, uniform_weights, mode)
    optimal_metric = get_weighted_ensemble_metric(Y_pred_iters, Y_test_iters, optimal_weights, mode)

    array_metric = np.concatenate([model_metric, uniform_metric, optimal_metric], axis=1)

    sorted_metric = np.sort(array_metric, axis=0)
    n_rows = array_metric.shape[0]
    # Clamp at 0: with fewer than 10 iterations the raw index is -1, which
    # would select the largest value as the "10th percentile".
    index_90 = max(int(n_rows * 0.9) - 1, 0)
    index_10 = max(int(n_rows * 0.1) - 1, 0)

    metric_dict = {
        'Model': ensemble_models + ['Ensemble', 'Weighted_Ensemble'],
        f'Avg {metric_name}': list(np.mean(array_metric, axis=0)),
        f'Std {metric_name}': list(np.std(array_metric, axis=0)),
        '90th percentile': list(sorted_metric[index_90]),
        '10th percentile': list(sorted_metric[index_10])
        }

    df = pd.DataFrame(metric_dict)
    if mode == 'regressor':
        # Lower MAE is better: rank by the pessimistic (90th percentile) error.
        df = df.sort_values('90th percentile').reset_index(drop=True)
    else:
        # Higher F1 is better: rank by the pessimistic (10th percentile) score.
        df = df.sort_values('10th percentile', ascending=False).reset_index(drop=True)

    if weights is not None:
        return df

    optimal_weights_dict = {ensemble_models[i]: w for i, w in enumerate(optimal_weights)}

    return df, optimal_weights_dict

In [25]:
def save_model_metadata(file_path, model_xcols, model_hyperparameters_dict, optimal_weights):
    """Write the retained models' features, hyperparameters and weights as JSON.

    Only models whose weight exceeds 0.0005 are written. The JSON object has
    the keys 'X_feature_dict', 'hyperparameters_dict' and 'weights', each
    mapping model label to the corresponding value.
    """
    kept_labels = [label for label in model_hyperparameters_dict
                   if optimal_weights[label] > 0.0005]

    output_dict = {
        'X_feature_dict': {label: model_xcols[label] for label in kept_labels},
        'hyperparameters_dict': {label: model_hyperparameters_dict[label] for label in kept_labels},
        'weights': {label: optimal_weights[label] for label in kept_labels},
    }

    with open(file_path, 'w') as f:
        json.dump(output_dict, f)

### 流程控制

主要被主程式呼叫的函數  
負責管理超參數及權重的計算與存取

In [26]:
def flow_control(Y_feature, model_xcols, data_df, mode='regressor', speed_test=False,
                 train_model_path=train_model_path, optuna_has_done=optuna_has_done, weights_has_determined=weights_has_determined, run=False):
    """Tune, weight and persist the ensemble for one target, reusing saved results.

    Args:
        Y_feature: Target column name; also the sub-directory name under
            ``train_model_path``.
        model_xcols: Feature-name list per model label.
        data_df: DataFrame passed on to the tuning/weighting functions.
        mode: ``'regressor'`` or ``'classifier'``.
        speed_test: If True, use much smaller iteration counts.
        train_model_path: Root directory for saved metadata.
        optuna_has_done: Mapping target -> True if saved hyperparameters
            should be reused. Read only; a missing target means recompute.
        weights_has_determined: Same convention for the ensemble weights.
        run: If True, ignore any saved ``meta.json`` and recompute everything.

    Side effects:
        Creates the target directory, prints progress, displays the metric
        table, and writes ``predict_MAE.df`` / ``meta.json`` whenever
        something was recomputed.
    """
    if speed_test:
        n_iter_dict = {'hyper_parameter': 1, 'ensemble_weight': 20}
    else:
        n_iter_dict = {'hyper_parameter': 15, 'ensemble_weight': 200}

    this_model_path = f'{train_model_path}{Y_feature}/'
    os.makedirs(this_model_path, exist_ok=True)
    meta_path = f'{this_model_path}meta.json'

    # Reuse saved values only if the meta file exists and a rerun is not forced.
    meta = None
    if os.path.exists(meta_path) and not run:
        with open(meta_path, 'r') as f:
            meta = json.load(f)

    # Local booleans instead of writing False back into the shared flag dicts:
    # that mutation made every later call for the same target recompute even
    # after its meta file had been written.
    reuse_hyperparameters = meta is not None and optuna_has_done.get(Y_feature, False)
    reuse_weights = meta is not None and weights_has_determined.get(Y_feature, False)

    # Hyperparameters
    model_r2_dict = None
    if reuse_hyperparameters:
        model_xcols = meta['X_feature_dict']
        model_hyperparameters_dict = meta['hyperparameters_dict']
    else:
        print('Start to tune hyperparameters')
        model_hyperparameters_dict, model_r2_dict = optuna_operation(model_xcols, Y_feature, data_df, mode=mode,
                                                                     n_iters=n_iter_dict['hyper_parameter'],
                                                                     speed_test=speed_test)

    # Saved weights omit models that were pruned when the file was written;
    # if they do not cover every current model they cannot be reused.
    if reuse_weights and not all(label in meta['weights'] for label in model_hyperparameters_dict):
        print('Stored weights do not cover every model; recomputing Ensemble weights.')
        reuse_weights = False

    # Ensemble weights
    if reuse_weights:
        optimal_weights = meta['weights']
        df = pd.read_csv(f'{this_model_path}predict_MAE.df')
        display(df)
        print('Weights:')
        for k in model_hyperparameters_dict.keys():
            print(f'{k}: {optimal_weights[k]:.3f}')
    else:
        print('Start to determine Ensemble weights.')
        # The FCN is slow to train, so use a quarter of the resamplings.
        if 'FCN' in model_hyperparameters_dict.keys():
            n_iters = int(n_iter_dict['ensemble_weight']/4)
        else:
            n_iters = n_iter_dict['ensemble_weight']
        df, optimal_weights = find_optimal_weights(model_hyperparameters_dict, model_xcols, data_df,
                                                   Y_feature=Y_feature, mode=mode, n_iters=n_iters)
        print(Y_feature)
        display(df)
        df.to_csv(f'{this_model_path}predict_MAE.df', index=False, encoding='utf-8-sig')
        print('Weights:')
        for k in model_hyperparameters_dict.keys():
            print(f'{k}: {optimal_weights[k]:.3f}')

    if not (reuse_weights and reuse_hyperparameters):
        # Label the tuning score with the metric optuna_operation actually used.
        tuned_metric_name = 'F1' if mode == 'classifier' else 'R2'
        print(' ')
        print(' ')
        print('**Copy and Paste following lines into the next cell.**')
        for model_label in model_hyperparameters_dict.keys():
            print('##### ' + model_label)
            for key, v in model_hyperparameters_dict[model_label].items():
                print(f"Best {key} = {v}  ")
            if model_r2_dict is not None:
                print(f"Best {tuned_metric_name} = {model_r2_dict[model_label]}  ")
            print(f'Weight = {optimal_weights[model_label]:.3f}')
        print(' ')
        print(' ')
        save_model_metadata(this_model_path + 'meta.json', model_xcols, model_hyperparameters_dict, optimal_weights)

## 預測氣象數值

在先前對於如何用氣象資料預測電力資料的探索，我們發現中央氣象署提供的歷史氣象資料中，以每天的最高氣溫、最低氣溫、平均氣溫、風速與全天空日射量等五個數值比較重要  
同時從中央氣象署網站下載的氣象預報當中，可以找到每個鄉鎮市區每三個小時的天氣狀況、氣溫、風速、風向、相對溼度等資訊  
這邊的天氣狀況就是晴、多雲、短暫陣雨之類的文字敘述，根據我的觀察，這樣的文字敘述可以歸類為七種：  
晴、多雲、陰、短暫陣雨、短暫陣雨或雷雨、午後短暫雷陣雨、陣雨或雷雨
在我的資料表中預先將這七種預報文字進行 one-hot encoding 處理

我判斷這樣的文字敘述跟日照率，也就是實測日照時數與天文日照時數之比率比較相關，所以這邊預測的氣象數值鎖定在以下五個值：  
日照率、最高氣溫、最低氣溫、(平均)氣溫、風速  

而預測的樣本單位則是每天每氣象站算是一個樣本，而考慮的氣象站為：  
臺北站、高雄站、嘉義站、東吉島站、臺中電廠站、臺西站等六站

另外在每個預測標的下方的結果表格中，為了瞭解每個模型以及每種集成學習方式分別的預測誤差，以及誤差值的穩定性  
所以我將 MAE 的平均值、標準差、第 10 與第 90 百分位都列出來  
表格是按誤差的第 90 百分位排序的，以觀察各預測方式的穩定性
除了夜尖峰是預測 Yes or No 問題，屬於分類問題而非回歸問題  
所以表格呈現的是 f1-score，也相應的改成按照第 10 百分位排序

表格中的 Ensemble 代表每個模型權重一致的集成學習  
而 Weighted_Ensemble 則代表使用從相關矩陣中解出的最佳權重的集成學習

### 時段風速與氣溫

這邊的時段風速與氣溫，指的是三個時段：午後 (12-15點)、下午 (15-18點)、傍晚(18-21點) 的平均氣溫與風速  
由於用電尖峰通常發生在這三個時段，所以後面的風力發電與夜尖峰的預測會用到這些數據  
氣溫單位為攝氏度，風速單位則為公尺每秒  

In [29]:
# For each time slot, fit the same three-model ensemble twice: the slot's mean
# temperature from the '溫度' features, then its mean wind speed from the
# '風速' features.
for des in ['午後', '下午', '傍晚']:
    for target_suffix, feature_name in (('平均氣溫', '溫度'), ('平均風速', '風速')):
        model_xcols = {
            'LinearRegression': [feature_name],
            'SVR': [feature_name],
            'RandomForest': [feature_name],
        }
        Y_feature = f'{des}{target_suffix}'
        flow_control(Y_feature, model_xcols, forecast_obs_df, speed_test=speed_test)
         

Start to tune hyperparameters


100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 20.99it/s]


LinearRegression
Best R2 = 0.4856681771569115


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:25<00:00,  1.28s/it]


SVR
Best C = 0.14475271112397323
Best kernel = linear
Best R2 = 0.48767119516158086


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [02:04<00:00,  6.21s/it]


RandomForest
Best max_depth = 2
Best n_estimators = 96
Best R2 = 0.4526509534260649
Start to determine Ensemble weights.


100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [00:13<00:00, 15.04it/s]

午後平均氣溫





Unnamed: 0,Model,Avg MAE,Std MAE,90th percentile,10th percentile
0,SVR,1.087336,0.099858,1.204653,0.958545
1,Ensemble,1.088455,0.099405,1.209257,0.963739
2,Weighted_Ensemble,1.089281,0.098667,1.216924,0.958406
3,LinearRegression,1.111165,0.098316,1.237111,0.990754
4,RandomForest,1.134363,0.103326,1.268496,1.001701


Weights:
LinearRegression: 0.428
SVR: 0.396
RandomForest: 0.177
 
 
**Copy and Paste following lines into the next cell.**
##### LinearRegression
Best R2 = 0.4856681771569115  
Weight = 0.428
##### SVR
Best C = 0.14475271112397323  
Best kernel = linear  
Best R2 = 0.48767119516158086  
Weight = 0.396
##### RandomForest
Best max_depth = 2  
Best n_estimators = 96  
Best R2 = 0.4526509534260649  
Weight = 0.177
 
 
Start to tune hyperparameters


100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 20.68it/s]


LinearRegression
Best R2 = 0.28692954395289955


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:26<00:00,  1.34s/it]


SVR
Best C = 0.1404377386980103
Best kernel = linear
Best R2 = 0.3098996529463813


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [02:19<00:00,  6.95s/it]


RandomForest
Best max_depth = 6
Best n_estimators = 114
Best R2 = 0.22335596277961356
Start to determine Ensemble weights.


100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [00:17<00:00, 11.46it/s]

午後平均風速





Unnamed: 0,Model,Avg MAE,Std MAE,90th percentile,10th percentile
0,Weighted_Ensemble,0.957206,0.107783,1.091745,0.824968
1,Ensemble,0.956559,0.107237,1.095268,0.826275
2,SVR,0.956607,0.116653,1.117057,0.815267
3,RandomForest,0.985434,0.117136,1.121597,0.833633
4,LinearRegression,0.996743,0.102707,1.125517,0.863632


Weights:
LinearRegression: 0.330
SVR: 0.240
RandomForest: 0.431
 
 
**Copy and Paste following lines into the next cell.**
##### LinearRegression
Best R2 = 0.28692954395289955  
Weight = 0.330
##### SVR
Best C = 0.1404377386980103  
Best kernel = linear  
Best R2 = 0.3098996529463813  
Weight = 0.240
##### RandomForest
Best max_depth = 6  
Best n_estimators = 114  
Best R2 = 0.22335596277961356  
Weight = 0.431
 
 
Start to tune hyperparameters


100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 22.17it/s]


LinearRegression
Best R2 = 0.3059975028820128


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:30<00:00,  1.50s/it]


SVR
Best C = 0.009987591721365137
Best kernel = linear
Best R2 = 0.29289588961890667


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [01:54<00:00,  5.74s/it]


RandomForest
Best max_depth = 2
Best n_estimators = 99
Best R2 = 0.2860908295924146
Start to determine Ensemble weights.


100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [00:13<00:00, 15.32it/s]

下午平均氣溫





Unnamed: 0,Model,Avg MAE,Std MAE,90th percentile,10th percentile
0,SVR,1.317248,0.124565,1.456302,1.145347
1,Weighted_Ensemble,1.328727,0.116336,1.460891,1.168489
2,Ensemble,1.329202,0.117175,1.477507,1.169199
3,LinearRegression,1.345105,0.114694,1.478509,1.197537
4,RandomForest,1.382858,0.118358,1.52541,1.228821


Weights:
LinearRegression: 0.583
SVR: 0.219
RandomForest: 0.198
 
 
**Copy and Paste following lines into the next cell.**
##### LinearRegression
Best R2 = 0.3059975028820128  
Weight = 0.583
##### SVR
Best C = 0.009987591721365137  
Best kernel = linear  
Best R2 = 0.29289588961890667  
Weight = 0.219
##### RandomForest
Best max_depth = 2  
Best n_estimators = 99  
Best R2 = 0.2860908295924146  
Weight = 0.198
 
 
Start to tune hyperparameters


100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 20.90it/s]


LinearRegression
Best R2 = 0.18443384779923216


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:13<00:00,  1.47it/s]


SVR
Best C = 2.0806111557222104
Best kernel = linear
Best R2 = 0.28487280913948104


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [01:51<00:00,  5.56s/it]


RandomForest
Best max_depth = 2
Best n_estimators = 78
Best R2 = 0.2669861453973298
Start to determine Ensemble weights.


100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [00:11<00:00, 17.81it/s]

下午平均風速





Unnamed: 0,Model,Avg MAE,Std MAE,90th percentile,10th percentile
0,Weighted_Ensemble,0.978756,0.113956,1.142485,0.837598
1,Ensemble,0.980616,0.113914,1.146238,0.846208
2,SVR,0.987522,0.121783,1.167617,0.831367
3,LinearRegression,1.031789,0.11455,1.179676,0.890462
4,RandomForest,1.018296,0.129026,1.194967,0.859078


Weights:
LinearRegression: 0.172
SVR: 0.333
RandomForest: 0.494
 
 
**Copy and Paste following lines into the next cell.**
##### LinearRegression
Best R2 = 0.18443384779923216  
Weight = 0.172
##### SVR
Best C = 2.0806111557222104  
Best kernel = linear  
Best R2 = 0.28487280913948104  
Weight = 0.333
##### RandomForest
Best max_depth = 2  
Best n_estimators = 78  
Best R2 = 0.2669861453973298  
Weight = 0.494
 
 
Start to tune hyperparameters


100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 21.14it/s]


LinearRegression
Best R2 = 0.22536370773087872


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:40<00:00,  2.04s/it]


SVR
Best C = 1.2883711096761679
Best kernel = linear
Best R2 = 0.17661158315962003


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [02:05<00:00,  6.30s/it]


RandomForest
Best max_depth = 3
Best n_estimators = 141
Best R2 = 0.21608294045328683
Start to determine Ensemble weights.


100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [00:19<00:00, 10.08it/s]

傍晚平均氣溫





Unnamed: 0,Model,Avg MAE,Std MAE,90th percentile,10th percentile
0,Ensemble,1.175232,0.096924,1.296427,1.046049
1,Weighted_Ensemble,1.186663,0.091642,1.297223,1.061384
2,RandomForest,1.201869,0.094439,1.311252,1.076924
3,LinearRegression,1.195502,0.093698,1.313689,1.070394
4,SVR,1.185231,0.108588,1.33132,1.037651


Weights:
LinearRegression: 0.607
SVR: 0.000
RandomForest: 0.393
 
 
**Copy and Paste following lines into the next cell.**
##### LinearRegression
Best R2 = 0.22536370773087872  
Weight = 0.607
##### SVR
Best C = 1.2883711096761679  
Best kernel = linear  
Best R2 = 0.17661158315962003  
Weight = 0.000
##### RandomForest
Best max_depth = 3  
Best n_estimators = 141  
Best R2 = 0.21608294045328683  
Weight = 0.393
 
 
Start to tune hyperparameters


100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 20.95it/s]


LinearRegression
Best R2 = 0.2475641811226616


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:22<00:00,  1.14s/it]


SVR
Best C = 0.2541235706156377
Best kernel = linear
Best R2 = 0.39709073342820383


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [02:11<00:00,  6.57s/it]


RandomForest
Best max_depth = 5
Best n_estimators = 162
Best R2 = 0.30109801234849204
Start to determine Ensemble weights.


100%|████████████████████████████████████████████████████████████████████████████████| 200/200 [00:23<00:00,  8.43it/s]

傍晚平均風速





Unnamed: 0,Model,Avg MAE,Std MAE,90th percentile,10th percentile
0,Weighted_Ensemble,0.9385,0.137138,1.094391,0.776757
1,Ensemble,0.943257,0.136629,1.106926,0.777508
2,SVR,0.954928,0.145028,1.132348,0.777832
3,RandomForest,0.979168,0.14187,1.153382,0.814049
4,LinearRegression,0.987318,0.133723,1.158572,0.815457


Weights:
LinearRegression: 0.183
SVR: 0.393
RandomForest: 0.424
 
 
**Copy and Paste following lines into the next cell.**
##### LinearRegression
Best R2 = 0.2475641811226616  
Weight = 0.183
##### SVR
Best C = 0.2541235706156377  
Best kernel = linear  
Best R2 = 0.39709073342820383  
Weight = 0.393
##### RandomForest
Best max_depth = 5  
Best n_estimators = 162  
Best R2 = 0.30109801234849204  
Weight = 0.424
 
 


### 日照率

雖然實際上跟太陽能發電量比較相關的是全天空日射量，  
但是在 EDA 環節我們可以看到，日照率乘上天文日射量之後，跟全天空日射量有 r=0.9 左右的極高相關性，  
而天文日射量是給定日期就可以確切計算出來的，  
所以這邊選擇日照率為預測標的，  
單位為百分點。  

In [30]:
# Target variable to predict.
Y_feature = '日照率'
# Models taking part in the ensemble, each mapped to the X features it uses.
# Every model currently shares the same feature set; the list literal lives
# inside the comprehension so each model still gets its own list object.
model_xcols = {
    model_name: ['晴', '多雲', '陰', '短暫陣雨', '短暫陣雨或雷雨', '午後短暫雷陣雨', '陣雨或雷雨', '相對溼度']
    for model_name in ('RandomForest', 'XGBoost', 'LightGBM', 'SVR', 'NuSVR', 'FCN')
}

# NOTE(review): judging by the cell output, flow_control tunes each model's
# hyperparameters and then determines the ensemble weights - confirm against
# its definition earlier in the notebook.
flow_control(Y_feature, model_xcols, forecast_obs_df, speed_test=speed_test)

Start to tune hyperparameters


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [03:51<00:00, 11.55s/it]


RandomForest
Best max_depth = 11
Best n_estimators = 185
Best R2 = 0.571757864711497


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [01:36<00:00,  4.83s/it]


XGBoost
Best max_depth = 2
Best n_estimators = 38
Best R2 = 0.5766016833496657


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [01:23<00:00,  4.16s/it]


LightGBM
Best max_depth = 14
Best n_estimators = 40
Best R2 = 0.5209517010767505


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:21<00:00,  1.09s/it]


SVR
Best C = 6.559776781960609
Best kernel = sigmoid
Best R2 = 0.6092787338561055


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:54<00:00,  2.74s/it]


NuSVR
Best C = 8.523419300615599
Best kernel = sigmoid
Best nu = 0.7008060234343356
Best R2 = 0.6237898280911507


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [15:09<00:00, 45.47s/it]


FCN
Best L2_factor = 0.0026807168682426823
Best dropout_factor = 0.3089577919587767
Best R2 = 0.5843179587259031
Start to determine Ensemble weights.


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [08:06<00:00,  9.73s/it]

日照率





Unnamed: 0,Model,Avg MAE,Std MAE,90th percentile,10th percentile
0,Ensemble,13.939838,1.197868,15.341926,12.583453
1,Weighted_Ensemble,13.639628,1.27315,15.430986,11.974807
2,XGBoost,14.050832,1.269437,15.756019,12.116423
3,NuSVR,13.776933,1.269813,15.796761,12.278018
4,SVR,13.956617,1.267728,15.864361,12.542185
5,RandomForest,14.700984,1.338766,16.24509,12.889741
6,LightGBM,15.3332,1.300595,16.876716,13.829395
7,FCN,15.520745,1.545206,17.453591,13.075344


Weights:
RandomForest: 0.000
XGBoost: 0.343
LightGBM: 0.000
SVR: 0.000
NuSVR: 0.657
FCN: 0.000
 
 
**Copy and Paste following lines into the next cell.**
##### RandomForest
Best max_depth = 11  
Best n_estimators = 185  
Best R2 = 0.571757864711497  
Weight = 0.000
##### XGBoost
Best max_depth = 2  
Best n_estimators = 38  
Best R2 = 0.5766016833496657  
Weight = 0.343
##### LightGBM
Best max_depth = 14  
Best n_estimators = 40  
Best R2 = 0.5209517010767505  
Weight = 0.000
##### SVR
Best C = 6.559776781960609  
Best kernel = sigmoid  
Best R2 = 0.6092787338561055  
Weight = 0.000
##### NuSVR
Best C = 8.523419300615599  
Best kernel = sigmoid  
Best nu = 0.7008060234343356  
Best R2 = 0.6237898280911507  
Weight = 0.657
##### FCN
Best L2_factor = 0.0026807168682426823  
Best dropout_factor = 0.3089577919587767  
Best R2 = 0.5843179587259031  
Weight = 0.000
 
 


#### 最佳超參數與權重

##### RandomForest
Best max_depth = 11  
Best n_estimators = 146  
Best R2 = 0.5870948515049399  
Weight = 0.038
##### XGBoost
Best max_depth = 2  
Best n_estimators = 23  
Best R2 = 0.5890760918959569  
Weight = 0.142
##### LightGBM
Best max_depth = 6  
Best n_estimators = 34  
Best R2 = 0.501255556359024  
Weight = 0.000
##### SVR
Best C = 10.510584166812476  
Best kernel = sigmoid  
Best R2 = 0.6175865641668637  
Weight = 0.330
##### NuSVR
Best C = 9.024585024739107  
Best kernel = sigmoid  
Best nu = 0.7512492841501225  
Best R2 = 0.6253677869498243  
Weight = 0.343
##### FCN
Best L2_factor = 0.021517425264349053  
Best dropout_factor = 0.36979837303627927  
Best R2 = 0.5866018727468167  
Weight = 0.147

### 高溫

In [31]:
# Target variable to predict.
Y_feature = '最高氣溫'
# Every model in the ensemble is fed the single forecast feature '溫度';
# the comprehension evaluates the list literal once per model, so no list
# object is shared between models.
model_xcols = {
    model_name: ['溫度']
    for model_name in (
        'LinearRegression',
        'FCN',
        'RandomForest',
        'XGBoost',
        'SVR',
        'NuSVR',
        'LightGBM',
    )
}

flow_control(Y_feature, model_xcols, forecast_obs_df, speed_test=speed_test)

Start to tune hyperparameters


100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 19.58it/s]


LinearRegression
Best R2 = 0.5208473903723145


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [21:41<00:00, 65.09s/it]


FCN
Best L2_factor = 0.003939613562228334
Best dropout_factor = 0.13803212221837197
Best R2 = 0.5743030260387081


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [02:13<00:00,  6.67s/it]


RandomForest
Best max_depth = 5
Best n_estimators = 93
Best R2 = 0.5305792453344224


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [01:27<00:00,  4.39s/it]


XGBoost
Best max_depth = 2
Best n_estimators = 14
Best R2 = 0.5335713388575904


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [01:49<00:00,  5.47s/it]


SVR
Best C = 192.51684898769778
Best kernel = linear
Best R2 = 0.5295654310428849


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:42<00:00,  2.12s/it]


NuSVR
Best C = 4.91644015680394
Best kernel = linear
Best nu = 0.38216796028895883
Best R2 = 0.5346219332910935


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [01:24<00:00,  4.23s/it]


LightGBM
Best max_depth = 14
Best n_estimators = 32
Best R2 = 0.530144108413089
Start to determine Ensemble weights.


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [10:37<00:00, 12.76s/it]

最高氣溫





Unnamed: 0,Model,Avg MAE,Std MAE,90th percentile,10th percentile
0,Ensemble,0.930953,0.08428,1.026577,0.823139
1,Weighted_Ensemble,0.926324,0.083972,1.029584,0.816302
2,SVR,0.954239,0.083162,1.04156,0.824766
3,RandomForest,0.94225,0.086392,1.046302,0.818848
4,NuSVR,0.962395,0.082015,1.051814,0.846132
5,XGBoost,0.956549,0.090398,1.062729,0.821423
6,LinearRegression,0.962666,0.082114,1.063223,0.837421
7,LightGBM,0.956766,0.087682,1.073938,0.834152
8,FCN,0.975322,0.087399,1.088512,0.856804


Weights:
LinearRegression: 0.000
FCN: 0.063
RandomForest: 0.275
XGBoost: 0.000
SVR: 0.114
NuSVR: 0.294
LightGBM: 0.254
 
 
**Copy and Paste following lines into the next cell.**
##### LinearRegression
Best R2 = 0.5208473903723145  
Weight = 0.000
##### FCN
Best L2_factor = 0.003939613562228334  
Best dropout_factor = 0.13803212221837197  
Best R2 = 0.5743030260387081  
Weight = 0.063
##### RandomForest
Best max_depth = 5  
Best n_estimators = 93  
Best R2 = 0.5305792453344224  
Weight = 0.275
##### XGBoost
Best max_depth = 2  
Best n_estimators = 14  
Best R2 = 0.5335713388575904  
Weight = 0.000
##### SVR
Best C = 192.51684898769778  
Best kernel = linear  
Best R2 = 0.5295654310428849  
Weight = 0.114
##### NuSVR
Best C = 4.91644015680394  
Best kernel = linear  
Best nu = 0.38216796028895883  
Best R2 = 0.5346219332910935  
Weight = 0.294
##### LightGBM
Best max_depth = 14  
Best n_estimators = 32  
Best R2 = 0.530144108413089  
Weight = 0.254
 
 


#### 最佳超參數與權重

##### LinearRegression
Best R2 = 0.5534044274964922  
Weight = 0.000
##### FCN
Best L2_factor = 0.0012580849117066048  
Best dropout_factor = 0.17731620058252304  
Best R2 = 0.5839900603099359  
Weight = 0.229
##### RandomForest
Best max_depth = 4  
Best n_estimators = 120  
Best R2 = 0.5409705773728722  
Weight = 0.000
##### XGBoost
Best max_depth = 2  
Best n_estimators = 22  
Best R2 = 0.5428448378319348  
Weight = 0.000
##### SVR
Best C = 1.7157289141337408  
Best kernel = linear  
Best R2 = 0.5448564962294009  
Weight = 0.253
##### NuSVR
Best C = 0.1539035393298039  
Best kernel = linear  
Best nu = 0.44010303108936244  
Best R2 = 0.5592848657357319  
Weight = 0.179
##### LightGBM
Best max_depth = 2  
Best n_estimators = 73  
Best R2 = 0.5435463799154244  
Weight = 0.339

### 低溫

In [32]:
# Target variable to predict.
Y_feature = '最低氣溫'
# Every model in the ensemble is fed the single forecast feature '溫度';
# the comprehension evaluates the list literal once per model, so no list
# object is shared between models.
model_xcols = {
    model_name: ['溫度']
    for model_name in (
        'LinearRegression',
        'FCN',
        'RandomForest',
        'XGBoost',
        'SVR',
        'NuSVR',
        'LightGBM',
    )
}

flow_control(Y_feature, model_xcols, forecast_obs_df, speed_test=speed_test)

Start to tune hyperparameters


100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 21.50it/s]


LinearRegression
Best R2 = 0.2154177188638925


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [20:54<00:00, 62.72s/it]


FCN
Best L2_factor = 0.019663323472363207
Best dropout_factor = 0.16052608145482855
Best R2 = 0.25242671218295076


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [02:05<00:00,  6.26s/it]


RandomForest
Best max_depth = 4
Best n_estimators = 156
Best R2 = 0.24889454702186922


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [01:34<00:00,  4.73s/it]


XGBoost
Best max_depth = 2
Best n_estimators = 15
Best R2 = 0.24103306292102109


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:58<00:00,  2.91s/it]


SVR
Best C = 0.0728374975766668
Best kernel = linear
Best R2 = 0.20139612214561067


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:35<00:00,  1.79s/it]


NuSVR
Best C = 0.9995438427496642
Best kernel = rbf
Best nu = 0.4900179080149257
Best R2 = 0.22033912881474513


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [01:17<00:00,  3.85s/it]


LightGBM
Best max_depth = 5
Best n_estimators = 26
Best R2 = 0.24585201132136797
Start to determine Ensemble weights.


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [10:23<00:00, 12.46s/it]

最低氣溫





Unnamed: 0,Model,Avg MAE,Std MAE,90th percentile,10th percentile
0,Weighted_Ensemble,0.89196,0.074301,0.984879,0.805257
1,NuSVR,0.901111,0.077133,0.985382,0.814613
2,XGBoost,0.895446,0.077031,0.985389,0.796593
3,Ensemble,0.89372,0.075778,0.985525,0.807374
4,LightGBM,0.90102,0.071721,0.997967,0.804125
5,RandomForest,0.898036,0.074205,1.001096,0.800107
6,LinearRegression,0.915708,0.080417,1.029123,0.814357
7,SVR,0.912456,0.078684,1.029161,0.821073
8,FCN,0.926109,0.080177,1.038114,0.815247


Weights:
LinearRegression: 0.021
FCN: 0.000
RandomForest: 0.253
XGBoost: 0.257
SVR: 0.156
NuSVR: 0.033
LightGBM: 0.280
 
 
**Copy and Paste following lines into the next cell.**
##### LinearRegression
Best R2 = 0.2154177188638925  
Weight = 0.021
##### FCN
Best L2_factor = 0.019663323472363207  
Best dropout_factor = 0.16052608145482855  
Best R2 = 0.25242671218295076  
Weight = 0.000
##### RandomForest
Best max_depth = 4  
Best n_estimators = 156  
Best R2 = 0.24889454702186922  
Weight = 0.253
##### XGBoost
Best max_depth = 2  
Best n_estimators = 15  
Best R2 = 0.24103306292102109  
Weight = 0.257
##### SVR
Best C = 0.0728374975766668  
Best kernel = linear  
Best R2 = 0.20139612214561067  
Weight = 0.156
##### NuSVR
Best C = 0.9995438427496642  
Best kernel = rbf  
Best nu = 0.4900179080149257  
Best R2 = 0.22033912881474513  
Weight = 0.033
##### LightGBM
Best max_depth = 5  
Best n_estimators = 26  
Best R2 = 0.24585201132136797  
Weight = 0.280
 
 


#### 最佳超參數與權重

##### LinearRegression
Best R2 = 0.14205229333746497  
Weight = 0.000
##### FCN
Best L2_factor = 0.0037237461029340944  
Best dropout_factor = 0.37515051123242515  
Best R2 = 0.1970300802761308  
Weight = 0.000
##### RandomForest
Best max_depth = 5  
Best n_estimators = 77  
Best R2 = 0.25684110582949154  
Weight = 0.797
##### XGBoost
Best max_depth = 2  
Best n_estimators = 10  
Best R2 = 0.22489063953343127  
Weight = 0.000
##### SVR
Best C = 0.11744054294023308  
Best kernel = rbf  
Best R2 = 0.2016527418134913  
Weight = 0.203
##### NuSVR
Best C = 0.8638927850606558  
Best kernel = rbf  
Best nu = 0.42847121164079394  
Best R2 = 0.1889467419868108  
Weight = 0.000
##### LightGBM
Best max_depth = 10  
Best n_estimators = 29  
Best R2 = 0.2075151445577908  
Weight = 0.000

### 平均溫

In [33]:
# Target variable to predict.
Y_feature = '氣溫'
# Every model in the ensemble is fed the single forecast feature '溫度';
# the comprehension evaluates the list literal once per model, so no list
# object is shared between models.
model_xcols = {
    model_name: ['溫度']
    for model_name in (
        'LinearRegression',
        'FCN',
        'RandomForest',
        'XGBoost',
        'SVR',
        'NuSVR',
        'LightGBM',
    )
}

flow_control(Y_feature, model_xcols, forecast_obs_df, speed_test=speed_test)

Start to tune hyperparameters


100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 20.44it/s]


LinearRegression
Best R2 = 0.4291458173205749


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [20:48<00:00, 62.44s/it]


FCN
Best L2_factor = 0.0036126748601312346
Best dropout_factor = 0.1817838973041575
Best R2 = 0.43198895824383604


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [02:04<00:00,  6.22s/it]


RandomForest
Best max_depth = 5
Best n_estimators = 143
Best R2 = 0.4602196826325705


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [01:34<00:00,  4.74s/it]


XGBoost
Best max_depth = 2
Best n_estimators = 14
Best R2 = 0.4630274272034281


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:07<00:00,  2.61it/s]


SVR
Best C = 0.08008218341041393
Best kernel = linear
Best R2 = 0.4306606781368645


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:42<00:00,  2.13s/it]


NuSVR
Best C = 0.028340029384373234
Best kernel = linear
Best nu = 0.38106739527477745
Best R2 = 0.4413085334182674


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [01:32<00:00,  4.64s/it]


LightGBM
Best max_depth = 2
Best n_estimators = 142
Best R2 = 0.43573814859162097
Start to determine Ensemble weights.


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [10:02<00:00, 12.05s/it]

氣溫





Unnamed: 0,Model,Avg MAE,Std MAE,90th percentile,10th percentile
0,Weighted_Ensemble,0.708176,0.062086,0.777683,0.621395
1,RandomForest,0.712661,0.062474,0.786769,0.642392
2,XGBoost,0.717158,0.062309,0.787743,0.644981
3,Ensemble,0.714811,0.064601,0.789623,0.627709
4,LightGBM,0.724884,0.065953,0.792515,0.646569
5,NuSVR,0.734197,0.066755,0.808168,0.631763
6,LinearRegression,0.73989,0.066586,0.813392,0.62961
7,SVR,0.735758,0.067559,0.818756,0.638055
8,FCN,0.74907,0.065461,0.82523,0.661131


Weights:
LinearRegression: 0.000
FCN: 0.000
RandomForest: 0.484
XGBoost: 0.003
SVR: 0.000
NuSVR: 0.397
LightGBM: 0.115
 
 
**Copy and Paste following lines into the next cell.**
##### LinearRegression
Best R2 = 0.4291458173205749  
Weight = 0.000
##### FCN
Best L2_factor = 0.0036126748601312346  
Best dropout_factor = 0.1817838973041575  
Best R2 = 0.43198895824383604  
Weight = 0.000
##### RandomForest
Best max_depth = 5  
Best n_estimators = 143  
Best R2 = 0.4602196826325705  
Weight = 0.484
##### XGBoost
Best max_depth = 2  
Best n_estimators = 14  
Best R2 = 0.4630274272034281  
Weight = 0.003
##### SVR
Best C = 0.08008218341041393  
Best kernel = linear  
Best R2 = 0.4306606781368645  
Weight = 0.000
##### NuSVR
Best C = 0.028340029384373234  
Best kernel = linear  
Best nu = 0.38106739527477745  
Best R2 = 0.4413085334182674  
Weight = 0.397
##### LightGBM
Best max_depth = 2  
Best n_estimators = 142  
Best R2 = 0.43573814859162097  
Weight = 0.115
 
 


#### 最佳超參數與權重

##### LinearRegression
Best R2 = 0.412985625467958  
Weight = 0.000
##### FCN
Best L2_factor = 0.0010283389751594885  
Best dropout_factor = 0.3059333281801748  
Best R2 = 0.4410662110913644  
Weight = 0.000
##### RandomForest
Best max_depth = 4  
Best n_estimators = 165  
Best R2 = 0.44955960316903565  
Weight = 0.495
##### XGBoost
Best max_depth = 2  
Best n_estimators = 74  
Best R2 = 0.4151441540062713  
Weight = 0.094
##### SVR
Best C = 0.008473094131137855  
Best kernel = linear  
Best R2 = 0.4264848817494777  
Weight = 0.279
##### NuSVR
Best C = 184.29333614114873  
Best kernel = linear  
Best nu = 0.3423328614196417  
Best R2 = 0.4249491691133574  
Weight = 0.131
##### LightGBM
Best max_depth = 10  
Best n_estimators = 25  
Best R2 = 0.4268000484226063  
Weight = 0.002

### 風速

In [34]:
# Target variable to predict.
Y_feature = '風速'
# Start every model on the same wind + temperature feature set ...
model_xcols = {
    model_name: ['風速', '東西風', '南北風', '溫度']
    for model_name in ('FCN', 'RandomForest', 'XGBoost', 'LightGBM', 'SVR', 'NuSVR')
}
# ... then widen RandomForest's inputs with the weather-type flags and
# relative humidity. Re-assigning an existing key keeps its position in the
# dict, so the model order is unchanged.
model_xcols['RandomForest'] = [
    '風速', '東西風', '南北風',
    '晴', '多雲', '陰', '短暫陣雨', '短暫陣雨或雷雨', '午後短暫雷陣雨', '陣雨或雷雨',
    '相對溼度', '溫度',
]

flow_control(Y_feature, model_xcols, forecast_obs_df, speed_test=speed_test)

Start to tune hyperparameters


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [21:35<00:00, 64.80s/it]


FCN
Best L2_factor = 0.12935373036681566
Best dropout_factor = 0.4128124094325431
Best R2 = 0.5992917762441617


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [06:57<00:00, 20.89s/it]


RandomForest
Best max_depth = 9
Best n_estimators = 167
Best R2 = 0.5159077068868289


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [01:33<00:00,  4.66s/it]


XGBoost
Best max_depth = 4
Best n_estimators = 122
Best R2 = 0.4505540300462423


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [01:18<00:00,  3.93s/it]


LightGBM
Best max_depth = 9
Best n_estimators = 19
Best R2 = 0.5063642637143637


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [01:11<00:00,  3.57s/it]


SVR
Best C = 0.7357754502543328
Best kernel = linear
Best R2 = 0.5655356922928988


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [01:39<00:00,  4.96s/it]


NuSVR
Best C = 0.21962982174665854
Best kernel = linear
Best nu = 0.7616588175508587
Best R2 = 0.5581192572123792
Start to determine Ensemble weights.


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [13:06<00:00, 15.73s/it]

風速





Unnamed: 0,Model,Avg MAE,Std MAE,90th percentile,10th percentile
0,Weighted_Ensemble,0.631981,0.117493,0.75345,0.491402
1,Ensemble,0.637492,0.119968,0.760797,0.482972
2,RandomForest,0.674442,0.117872,0.794064,0.52639
3,SVR,0.682671,0.117353,0.830542,0.546494
4,LightGBM,0.71092,0.119312,0.833085,0.576853
5,XGBoost,0.694569,0.121323,0.844572,0.556417
6,NuSVR,0.687477,0.119034,0.857087,0.541952
7,FCN,0.74184,0.138866,0.935429,0.535835


Weights:
FCN: 0.063
RandomForest: 0.273
XGBoost: 0.214
LightGBM: 0.000
SVR: 0.450
NuSVR: 0.000
 
 
**Copy and Paste following lines into the next cell.**
##### FCN
Best L2_factor = 0.12935373036681566  
Best dropout_factor = 0.4128124094325431  
Best R2 = 0.5992917762441617  
Weight = 0.063
##### RandomForest
Best max_depth = 9  
Best n_estimators = 167  
Best R2 = 0.5159077068868289  
Weight = 0.273
##### XGBoost
Best max_depth = 4  
Best n_estimators = 122  
Best R2 = 0.4505540300462423  
Weight = 0.214
##### LightGBM
Best max_depth = 9  
Best n_estimators = 19  
Best R2 = 0.5063642637143637  
Weight = 0.000
##### SVR
Best C = 0.7357754502543328  
Best kernel = linear  
Best R2 = 0.5655356922928988  
Weight = 0.450
##### NuSVR
Best C = 0.21962982174665854  
Best kernel = linear  
Best nu = 0.7616588175508587  
Best R2 = 0.5581192572123792  
Weight = 0.000
 
 


#### 最佳超參數與權重

##### FCN
Best L2_factor = 0.05592638704213049  
Best dropout_factor = 0.07847548007604593  
Best R2 = 0.5813076958470627  
Weight = 0.167
##### RandomForest
Best max_depth = 11  
Best n_estimators = 42  
Best R2 = 0.5347812133985134  
Weight = 0.536
##### XGBoost
Best max_depth = 2  
Best n_estimators = 129  
Best R2 = 0.4174075328171598  
Weight = 0.000
##### LightGBM
Best max_depth = 9  
Best n_estimators = 14  
Best R2 = 0.5009043458667409  
Weight = 0.000
##### SVR
Best C = 23.778858785551915  
Best kernel = linear  
Best R2 = 0.5609041655735929  
Weight = 0.297
##### NuSVR
Best C = 41.50123732942291  
Best kernel = linear  
Best nu = 0.5195660952664151  
Best R2 = 0.5516527526034817  
Weight = 0.000

## 預測電力資料

台電官網上可以抓到的歷史電力資料，據我後來觀察，應該是每天用電負載尖峰的那一刻，每個機組的發電功率，  
所以我們在這邊的預測標的也會是這個數值。

以下所有電力相關數字單位皆為萬瓩，1萬瓩 = 10MW = 10,000,000 W

Note: 這邊是以即時氣象觀測資料預測電力資料，但是這兩組數據基本上是同時得知的，  
所以實務上我們是用氣象預報預測第二天的氣象觀測，再用這個預測值預測第二天的電力資料，  
因此實際上的預測誤差會比這邊顯示的數值高一點。  
實際的預測情形可以到 <a href='http://ec2-54-206-30-159.ap-southeast-2.compute.amazonaws.com:8501'> 這個網站 </a> 查看。

### 風力

在我蒐集的資料範圍中，風力發電數值的標準差約為 68萬瓩

In [35]:
# Target variable to predict.
Y_feature = '風力'

# All models share one feature set (wind speeds plus calendar/season flags).
# The list literal is evaluated once per model inside the comprehension, so
# each model owns an independent list.
model_xcols = {
    model_name: [
        '風速', '日期數字', '假日', '週六', '週日', '補班',
        '1~3月', '11~12月', '午後平均風速', '下午平均風速', '傍晚平均風速',
    ]
    for model_name in (
        'LinearRegression',
        'FCN',
        'RandomForest',
        'XGBoost',
        'LightGBM',
        'SVR',
        'NuSVR',
    )
}

flow_control(Y_feature, model_xcols, weather_power_df, speed_test=speed_test)

Start to tune hyperparameters


100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 16.40it/s]


LinearRegression
Best R2 = 0.6171993026861705


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [24:42<00:00, 74.12s/it]


FCN
Best L2_factor = 0.008603606007001316
Best dropout_factor = 0.1606367016172888
Best R2 = 0.7731515107318294


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [05:51<00:00, 17.56s/it]


RandomForest
Best max_depth = 14
Best n_estimators = 113
Best R2 = 0.776186397135709


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [02:08<00:00,  6.40s/it]


XGBoost
Best max_depth = 2
Best n_estimators = 97
Best R2 = 0.7679023222876239


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [02:10<00:00,  6.53s/it]


LightGBM
Best max_depth = 8
Best n_estimators = 43
Best R2 = 0.7663081909480964


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:16<00:00,  1.21it/s]


SVR
Best C = 2.0158720708721565
Best kernel = sigmoid
Best R2 = 0.6402239147848072


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:22<00:00,  1.14s/it]


NuSVR
Best C = 155.76752701760665
Best kernel = rbf
Best nu = 0.663339231324241
Best R2 = 0.7683295750227099
Start to determine Ensemble weights.


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [12:47<00:00, 15.34s/it]

風力





Unnamed: 0,Model,Avg MAE,Std MAE,90th percentile,10th percentile
0,Weighted_Ensemble,21.947171,2.050298,24.789188,19.359038
1,NuSVR,23.596613,1.924787,25.525252,21.0113
2,Ensemble,23.78969,2.088476,26.20623,20.907698
3,RandomForest,23.600535,2.242565,26.243521,21.095757
4,XGBoost,24.058891,2.2245,26.57067,21.051337
5,LightGBM,24.167063,2.258272,27.054718,21.372469
6,FCN,24.959707,2.702547,28.785636,21.698703
7,LinearRegression,32.124057,2.344379,34.829219,28.415752
8,SVR,32.594511,2.207381,35.875877,29.565865


Weights:
LinearRegression: 0.000
FCN: 0.144
RandomForest: 0.202
XGBoost: 0.267
LightGBM: 0.000
SVR: 0.000
NuSVR: 0.387
 
 
**Copy and Paste following lines into the next cell.**
##### LinearRegression
Best R2 = 0.6171993026861705  
Weight = 0.000
##### FCN
Best L2_factor = 0.008603606007001316  
Best dropout_factor = 0.1606367016172888  
Best R2 = 0.7731515107318294  
Weight = 0.144
##### RandomForest
Best max_depth = 14  
Best n_estimators = 113  
Best R2 = 0.776186397135709  
Weight = 0.202
##### XGBoost
Best max_depth = 2  
Best n_estimators = 97  
Best R2 = 0.7679023222876239  
Weight = 0.267
##### LightGBM
Best max_depth = 8  
Best n_estimators = 43  
Best R2 = 0.7663081909480964  
Weight = 0.000
##### SVR
Best C = 2.0158720708721565  
Best kernel = sigmoid  
Best R2 = 0.6402239147848072  
Weight = 0.000
##### NuSVR
Best C = 155.76752701760665  
Best kernel = rbf  
Best nu = 0.663339231324241  
Best R2 = 0.7683295750227099  
Weight = 0.387
 
 


#### 最佳超參數與權重

##### LinearRegression
Best R2 = 0.6044580733181938  
Weight = 0.000
##### FCN
Best L2_factor = 0.03002014221111957  
Best dropout_factor = 0.1129475402935946  
Best R2 = 0.7277066975377966  
Weight = 0.062
##### RandomForest
Best max_depth = 7  
Best n_estimators = 188  
Best R2 = 0.739926308023377  
Weight = 0.000
##### XGBoost
Best max_depth = 2  
Best n_estimators = 82  
Best R2 = 0.7433508737111479  
Weight = 0.307
##### LightGBM
Best max_depth = 10  
Best n_estimators = 123  
Best R2 = 0.7397801530188338  
Weight = 0.260
##### SVR
Best C = 43.4883715636823  
Best kernel = rbf  
Best R2 = 0.7397300609763093  
Weight = 0.201
##### NuSVR
Best C = 56.8991555460406  
Best kernel = rbf  
Best nu = 0.42159107656237693  
Best R2 = 0.7347554525194468  
Weight = 0.170

### 太陽能


太陽能發電數值的原始標準差約為 260 萬瓩

In [36]:
# Target variable to predict.
Y_feature = '太陽能'

# All models share one feature set (temperatures, global solar radiation,
# calendar flags and day length). The list literal is evaluated once per model
# inside the comprehension, so each model owns an independent list.
model_xcols = {
    model_name: [
        '氣溫', '最高氣溫', '最低氣溫', '全天空日射量', '日期數字',
        '假日', '週六', '週日', '補班', '白日長度',
    ]
    for model_name in (
        'LinearRegression',
        'FCN',
        'RandomForest',
        'XGBoost',
        'LightGBM',
        'SVR',
        'NuSVR',
    )
}

flow_control(Y_feature, model_xcols, weather_power_df, speed_test=speed_test)

Start to tune hyperparameters


100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 19.03it/s]


LinearRegression
Best R2 = 0.6846159903974504


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [24:14<00:00, 72.72s/it]


FCN
Best L2_factor = 0.0015577779362233922
Best dropout_factor = 0.2506061455703462
Best R2 = 0.7336825040857166


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [07:27<00:00, 22.37s/it]


RandomForest
Best max_depth = 12
Best n_estimators = 193
Best R2 = 0.696177487768557


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [03:25<00:00, 10.28s/it]


XGBoost
Best max_depth = 2
Best n_estimators = 100
Best R2 = 0.6769264648868403


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [02:11<00:00,  6.57s/it]


LightGBM
Best max_depth = 3
Best n_estimators = 69
Best R2 = 0.6695968519976954


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:13<00:00,  1.53it/s]


SVR
Best C = 35.54674592854777
Best kernel = linear
Best R2 = 0.6887863921292163


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:06<00:00,  2.88it/s]


NuSVR
Best C = 2.337682125183769
Best kernel = linear
Best nu = 0.3990382069279208
Best R2 = 0.6754387168561933
Start to determine Ensemble weights.


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [12:40<00:00, 15.21s/it]

太陽能





Unnamed: 0,Model,Avg MAE,Std MAE,90th percentile,10th percentile
0,Weighted_Ensemble,93.964408,9.989523,108.105628,79.457706
1,Ensemble,99.15715,9.637068,112.176561,85.072779
2,FCN,99.260213,12.406962,113.720331,82.676511
3,RandomForest,99.730024,10.369802,114.708233,87.260225
4,LightGBM,106.107843,10.142168,118.116547,88.95562
5,XGBoost,107.491957,11.023897,123.184077,92.972984
6,SVR,113.940246,9.654999,126.604544,100.79461
7,LinearRegression,113.372975,10.596272,128.038573,97.926668
8,NuSVR,117.002776,9.710519,128.660305,100.643472


Weights:
LinearRegression: 0.148
FCN: 0.387
RandomForest: 0.224
XGBoost: 0.180
LightGBM: 0.061
SVR: 0.000
NuSVR: 0.000
 
 
**Copy and Paste following lines into the next cell.**
##### LinearRegression
Best R2 = 0.6846159903974504  
Weight = 0.148
##### FCN
Best L2_factor = 0.0015577779362233922  
Best dropout_factor = 0.2506061455703462  
Best R2 = 0.7336825040857166  
Weight = 0.387
##### RandomForest
Best max_depth = 12  
Best n_estimators = 193  
Best R2 = 0.696177487768557  
Weight = 0.224
##### XGBoost
Best max_depth = 2  
Best n_estimators = 100  
Best R2 = 0.6769264648868403  
Weight = 0.180
##### LightGBM
Best max_depth = 3  
Best n_estimators = 69  
Best R2 = 0.6695968519976954  
Weight = 0.061
##### SVR
Best C = 35.54674592854777  
Best kernel = linear  
Best R2 = 0.6887863921292163  
Weight = 0.000
##### NuSVR
Best C = 2.337682125183769  
Best kernel = linear  
Best nu = 0.3990382069279208  
Best R2 = 0.6754387168561933  
Weight = 0.000
 
 


#### 最佳超參數與權重

##### LinearRegression
Best R2 = 0.6893254685091934  
Weight = 0.000
##### FCN
Best L2_factor = 0.16993690927931668  
Best dropout_factor = 0.26311208913533524  
Best R2 = 0.7702212180762896  
Weight = 0.286
##### RandomForest
Best max_depth = 12  
Best n_estimators = 134  
Best R2 = 0.7258615014540506  
Weight = 0.000
##### XGBoost
Best max_depth = 3  
Best n_estimators = 80  
Best R2 = 0.7180991262122053  
Weight = 0.110
##### LightGBM
Best max_depth = 5  
Best n_estimators = 56  
Best R2 = 0.6956657014942508  
Weight = 0.096
##### SVR
Best C = 6.694986116951681  
Best kernel = linear  
Best R2 = 0.6946904903081259  
Weight = 0.000
##### NuSVR
Best C = 177.2132188450082  
Best kernel = rbf  
Best nu = 0.895376762270945  
Best R2 = 0.7579144014682827  
Weight = 0.508

### 尖峰負載

尖峰負載的原始標準差約為 410 萬瓩

In [37]:
# Target column for this run: the daily peak load.
Y_feature = '尖峰負載'

# All regressors share one set of input columns. The list literal sits inside
# the comprehension so each model still receives its own independent list.
model_xcols = {
    model_name: ['氣溫', '最高氣溫', '最低氣溫', '日期數字', '假日', '週六', '週日', '補班', '1~3月', '11~12月']
    for model_name in (
        'LinearRegression',
        'FCN',
        'RandomForest',
        'XGBoost',
        'LightGBM',
        'SVR',
        'NuSVR',
    )
}

# flow_control is defined outside this cell; judging by the cell output it
# tunes hyperparameters per model and then determines the ensemble weights.
flow_control(
    Y_feature,
    model_xcols,
    weather_power_df,
    speed_test=speed_test,
)

Start to tune hyperparameters


100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 19.76it/s]


LinearRegression
Best R2 = 0.8925183491953909


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [25:07<00:00, 75.36s/it]


FCN
Best L2_factor = 0.10425819337184065
Best dropout_factor = 0.0035655506712701512
Best R2 = 0.9568310894068462


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [05:21<00:00, 16.09s/it]


RandomForest
Best max_depth = 8
Best n_estimators = 99
Best R2 = 0.9339271241423087


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [03:09<00:00,  9.49s/it]


XGBoost
Best max_depth = 3
Best n_estimators = 90
Best R2 = 0.9441996765320942


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [03:03<00:00,  9.18s/it]


LightGBM
Best max_depth = 9
Best n_estimators = 164
Best R2 = 0.9060664356971319


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:11<00:00,  1.72it/s]


SVR
Best C = 13.515421532403904
Best kernel = linear
Best R2 = 0.8825791395735105


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:11<00:00,  1.78it/s]


NuSVR
Best C = 195.88226591434008
Best kernel = rbf
Best nu = 0.7047373423919561
Best R2 = 0.9589082462351279
Start to determine Ensemble weights.


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [12:28<00:00, 14.96s/it]

尖峰負載





Unnamed: 0,Model,Avg MAE,Std MAE,90th percentile,10th percentile
0,Weighted_Ensemble,52.566298,5.832317,59.407225,46.052171
1,NuSVR,55.96355,7.188452,64.515882,46.065013
2,FCN,59.373526,7.211713,68.306404,49.643449
3,Ensemble,62.949014,6.270494,69.540912,54.722979
4,XGBoost,72.266061,7.345056,82.190014,62.586442
5,RandomForest,74.5254,8.684421,87.289745,64.750575
6,LightGBM,87.886586,9.92412,99.195692,74.183713
7,LinearRegression,109.662984,8.02121,118.353995,99.154899
8,SVR,110.876763,8.435909,121.85058,99.83259


Weights:
LinearRegression: 0.000
FCN: 0.455
RandomForest: 0.000
XGBoost: 0.115
LightGBM: 0.109
SVR: 0.004
NuSVR: 0.317
 
 
**Copy and Paste following lines into the next cell.**
##### LinearRegression
Best R2 = 0.8925183491953909  
Weight = 0.000
##### FCN
Best L2_factor = 0.10425819337184065  
Best dropout_factor = 0.0035655506712701512  
Best R2 = 0.9568310894068462  
Weight = 0.455
##### RandomForest
Best max_depth = 8  
Best n_estimators = 99  
Best R2 = 0.9339271241423087  
Weight = 0.000
##### XGBoost
Best max_depth = 3  
Best n_estimators = 90  
Best R2 = 0.9441996765320942  
Weight = 0.115
##### LightGBM
Best max_depth = 9  
Best n_estimators = 164  
Best R2 = 0.9060664356971319  
Weight = 0.109
##### SVR
Best C = 13.515421532403904  
Best kernel = linear  
Best R2 = 0.8825791395735105  
Weight = 0.004
##### NuSVR
Best C = 195.88226591434008  
Best kernel = rbf  
Best nu = 0.7047373423919561  
Best R2 = 0.9589082462351279  
Weight = 0.317
 
 


#### 最佳超參數與權重

##### LinearRegression
Best R2 = 0.8890289107063775  
Weight = 0.000
##### FCN
Best L2_factor = 0.029438708713891412  
Best dropout_factor = 0.03116915624902694  
Best R2 = 0.9577894981452781  
Weight = 0.549
##### RandomForest
Best max_depth = 13  
Best n_estimators = 184  
Best R2 = 0.9323579004467799  
Weight = 0.013
##### XGBoost
Best max_depth = 2  
Best n_estimators = 104  
Best R2 = 0.9421413919017114  
Weight = 0.202
##### LightGBM
Best max_depth = 7  
Best n_estimators = 195  
Best R2 = 0.902646535842623  
Weight = 0.181
##### SVR
Best C = 37.19208540364481  
Best kernel = linear  
Best R2 = 0.8797513372128146  
Weight = 0.000
##### NuSVR
Best C = 34.75381072717423  
Best kernel = rbf  
Best nu = 0.42062297329040543  
Best R2 = 0.8956020285392687  
Weight = 0.055

### 夜尖峰

通常台灣全天用電的峰值會發生在下午 1 到 2 點，但是在非工作日或是氣溫較低的時候，有時用電峰值會發生在傍晚 5~7 點左右  
這種狀況之下，台電的歷史資料中太陽能的部分就會變成 0 或者很接近 0，因為取樣時太陽快要或已經下山  
為了處理「夜尖峰」狀況對於太陽能數值預測的影響，我也嘗試預測了夜尖峰的發生與否  
這個問題跟前面預測數值的回歸問題不同，基本上是個分類問題，所以衡量指標變成了 f1-score

In [38]:
# Target column for this run: whether a night-time peak occurs.
Y_feature = '夜尖峰'

# All classifiers share one set of input columns. The list literal sits inside
# the comprehension so each model still receives its own independent list.
model_xcols = {
    model_name: ['氣溫', '最高氣溫', '最低氣溫', '日期數字', '假日', '週六', '週日', '補班', '白日長度', '午後平均氣溫', '下午平均氣溫', '傍晚平均氣溫']
    for model_name in (
        'FCN',
        'LogisticRegression',
        'RandomForest',
        'XGBoost',
        'LightGBM',
        'SVC',
    )
}

# This target is treated as a classification problem, so flow_control
# (defined outside this cell) is run in classifier mode.
flow_control(
    Y_feature,
    model_xcols,
    weather_power_df,
    mode='classifier',
    speed_test=speed_test,
)

Start to tune hyperparameters


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [26:27<00:00, 79.37s/it]


FCN
Best L2_factor = 0.03377078593016343
Best dropout_factor = 0.31125954161240865
Best F1 = 0.8644128449006498


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:15<00:00,  1.28it/s]


LogisticRegression
Best C = 6.509226859076185
Best solver = saga
Best F1 = 0.8509791126002184


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [03:30<00:00, 10.53s/it]


RandomForest
Best max_depth = 14
Best n_estimators = 200
Best F1 = 0.7834369628549726


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [01:31<00:00,  4.60s/it]


XGBoost
Best max_depth = 5
Best n_estimators = 11
Best F1 = 0.8376364543947514


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [02:07<00:00,  6.38s/it]


LightGBM
Best max_depth = 10
Best n_estimators = 44
Best F1 = 0.7922909461600137


100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:08<00:00,  2.24it/s]


SVC
Best C = 9.80782542344959
Best kernel = rbf
Best F1 = 0.8535587363934832
Start to determine Ensemble weights.


100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [15:06<00:00, 18.13s/it]

夜尖峰





Unnamed: 0,Model,Avg F1,Std F1,90th percentile,10th percentile
0,Ensemble,0.877525,0.046048,0.93617,0.810811
1,Weighted_Ensemble,0.861186,0.050889,0.931034,0.789474
2,FCN,0.85089,0.056281,0.915254,0.784314
3,XGBoost,0.851554,0.054796,0.916667,0.780488
4,LogisticRegression,0.860657,0.057658,0.9375,0.77551
5,SVC,0.859601,0.056362,0.933333,0.774194
6,LightGBM,0.804717,0.053897,0.878049,0.733333
7,RandomForest,0.787811,0.067726,0.857143,0.689655


Weights:
FCN: 0.220
LogisticRegression: 0.267
RandomForest: 0.146
XGBoost: 0.186
LightGBM: 0.043
SVC: 0.138
 
 
**Copy and Paste following lines into the next cell.**
##### FCN
Best L2_factor = 0.03377078593016343  
Best dropout_factor = 0.31125954161240865  
Best R2 = 0.8644128449006498  
Weight = 0.220
##### LogisticRegression
Best C = 6.509226859076185  
Best solver = saga  
Best R2 = 0.8509791126002184  
Weight = 0.267
##### RandomForest
Best max_depth = 14  
Best n_estimators = 200  
Best R2 = 0.7834369628549726  
Weight = 0.146
##### XGBoost
Best max_depth = 5  
Best n_estimators = 11  
Best R2 = 0.8376364543947514  
Weight = 0.186
##### LightGBM
Best max_depth = 10  
Best n_estimators = 44  
Best R2 = 0.7922909461600137  
Weight = 0.043
##### SVC
Best C = 9.80782542344959  
Best kernel = rbf  
Best R2 = 0.8535587363934832  
Weight = 0.138
 
 


#### 最佳超參數與權重

##### FCN
Best L2_factor = 0.002576271940664411  
Best dropout_factor = 0.1252443332428825  
Best F1 = 0.8748909576667637  
Weight = 0.153
##### LogisticRegression
Best C = 0.7975026697912178  
Best solver = sag  
Best F1 = 0.8541996823354088  
Weight = 0.331
##### RandomForest
Best max_depth = 14  
Best n_estimators = 95  
Best F1 = 0.8186294971282218  
Weight = 0.208
##### XGBoost
Best max_depth = 15  
Best n_estimators = 33  
Best F1 = 0.8624582788638403  
Weight = 0.291
##### LightGBM
Best max_depth = 2  
Best n_estimators = 56  
Best F1 = 0.8285599799076798  
Weight = 0.000
##### SVC
Best C = 1.2964550960648644  
Best kernel = rbf  
Best F1 = 0.8649222578257927  
Weight = 0.016