# 從氣象署預報預測真實氣象觀測數值

這個筆記本包含了模型第一部分的建立  
以及超參數的調整與集成學習的權重決定

## 初始化

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
#這兩行讓 matplotlib 的圖可以顯示中文，同時正常顯示負號
matplotlib.rc('font', family='Microsoft JhengHei')
plt.rcParams['axes.unicode_minus'] = False
import datetime
from copy import deepcopy
import os
import joblib
import json
from tqdm import tqdm
import optuna

# 設置Optuna日誌級別為 WARNING，僅顯示警告及以上級別的信息
optuna.logging.set_verbosity(optuna.logging.WARNING)

pd.set_option('future.no_silent_downcasting', True)

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.svm import SVR, NuSVR

from scipy.optimize import minimize

In [3]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim

In [4]:
from Pytorch_models.metrics import Array_Metrics
from Pytorch_models import models as pytorch_models
from Pytorch_models import api
MAE = Array_Metrics.mae
R2_score = Array_Metrics.r2

In [5]:
from utils.prepare_data import prepare_forecast_observation_df, prepare_data

In [6]:
speed_test = True

start_date = '2023-08-01'
end_date = '2025-01-01'

# 此值為 False 則重新計算，True 則從存檔中讀取
optuna_done = {
    '日照率': False,
    '最高氣溫': False,
    '最低氣溫': False,
    '氣溫': False,
    '風速': False,
}

weights_determined = {
    '日照率': False,
    '最高氣溫': False,
    '最低氣溫': False,
    '氣溫': False,
    '風速': False,
}

## 讀取並整理資料

讀取先前經由爬蟲定時抓取的預報與觀測資料，並調整格式

In [7]:
data_path = './historical/data/'
train_model_path = './trained_model_parameters/model_meta_2024-09-03/'

In [8]:
forecast_obs_df = prepare_forecast_observation_df(data_path, start_date=start_date, end_date=end_date)
weather_power_df = prepare_data(data_path, start_date=start_date, end_date=end_date)
weather_power_df['夜尖峰'] = [0 if se > 20 else 1 for se in weather_power_df['太陽能']]

## 函數

### FCN model API

In [9]:
def FCN_model(input_f, output_f, feature_counts, dropout_factor=0, L2_factor=1e-15, mode='regressor'):
    if mode == 'regressor':
        model = pytorch_models.SimpleNN(input_f, output_f, feature_counts, dropout_factor)
    elif mode == 'classifier':
        model = pytorch_models.SimpleNN_classifer(input_f, output_f, feature_counts, dropout_factor)
    Model_API = api.Model_API(model, L2_factor=L2_factor, classifer=(mode=='classifier'))
    return Model_API

### Hyper Parameter Tuning

這部分的函數有：  
1. get_XY_from_forecast_and_observation: 從 DataFrame 中提取需要的 X 與 Y 兩個 numpy array。
2. five_fold_test: 執行一次 5-fold 測試，會呼叫 get_XY_from_forecast_and_observation。
3. hyperparameter_tuning: 針對特定的模型與超參數組合，呼叫 five_fold_test 執行多次 5-fold 測試，並回傳 R2 值。
4. optuna_operation: 利用第三方套件 optuna 執行超參數調整，會呼叫 hyperparameter_tuning。

流程控制函數 flow_control 會呼叫 optuna_operation，而主程式只會直接呼叫 flow_control。

In [10]:
def get_XY(data_df, Y_feature, X_features=None):
    date_related_cols = ['日期數字', '假日', '週六', '週日', '補班', '1~3月', '11~12月', '白日長度']
    
    if Y_feature in ['最高氣溫', '最低氣溫', '氣溫', '風速', '日照率', '全天空日射量']:
        target = 'obs'
    elif Y_feature in ['風力', '太陽能', '尖峰負載', '夜尖峰']:
        target = 'pwd'

    X_cols = []
    if X_features is None:
        for this_col in data_df.columns:
            if '_' in this_col:
                X_cols.append(this_col)
        if target == 'pwd':
            X_cols += date_related_cols
    else:
        for col in data_df.columns:
            if target == 'obs':
                dash_splited = col.split('預報_')
            elif target == 'pwd':
                dash_splited = col.split('_')
            if len(dash_splited) >= 2:
                if dash_splited[0] in X_features:
                    X_cols.append(col)
            else:
                if col in date_related_cols and col in X_features:
                    X_cols.append(col)

    Xs = np.array(data_df[X_cols])
    Ys = np.array(data_df[Y_feature])

    Xs = Xs[np.invert(np.isnan(Ys)),:]
    Ys = Ys[np.invert(np.isnan(Ys))]

    return Xs, Ys, X_cols

In [11]:
def five_fold_test(Xs, Ys, model=XGBRegressor(), mode='regressor',
                   deep_learning=False, fold_n=5, standard_scale=True, always_test_last_chunk=False):
    
    def metric(Y_test, Y_pred, mode=mode):
        if mode == 'regressor':
            return 1 - np.mean((Y_test - Y_pred)**2) / np.var(Y_test)
        elif mode == 'classifier':
            return f1_score(Y_test, Y_pred)

    shuffle = not always_test_last_chunk
    kf = KFold(n_splits=fold_n, shuffle=shuffle)
    
    XY_folds = {}
    for i, (train_index, test_index) in enumerate(kf.split(Xs)):
        XY_folds[i] = (train_index, test_index)
    
    metric_test_list = []
    metric_train_list = []

    if always_test_last_chunk:
        iters = [fold_n-1]
    else:
        iters = range(fold_n)
    
    for i in iters:
        if deep_learning:
            input_f = model.model.params['input_f']
            output_f = model.model.params['output_f']
            feature_counts = model.model.params['feature_counts']
            dropout_factor = model.model.params['dropout_factor']
            L2_factor = model.L2_factor
            model = FCN_model(input_f=input_f, output_f=output_f, feature_counts=feature_counts,
                              dropout_factor=dropout_factor, L2_factor=L2_factor,mode=mode)
            
        X_train = Xs[XY_folds[i][0]]
        X_test = Xs[XY_folds[i][1]]
        Y_train = Ys[XY_folds[i][0]]
        Y_test = Ys[XY_folds[i][1]]

        if deep_learning:
            X_train_DL, X_val, Y_train_DL, Y_val = train_test_split(X_train, Y_train, test_size=0.20)
    
        if standard_scale:
            scaler = StandardScaler()
            scaler.fit(X_train)
            X_train = scaler.transform(X_train)
            X_test = scaler.transform(X_test)
            if deep_learning:
                X_val = scaler.transform(X_val)
            
        if deep_learning:
            _ = model.fit(X_train_DL, Y_train_DL, X_val, Y_val)
        else:
            _ = model.fit(X_train, Y_train)
    
        Y_pred = model.predict(X_test)
        metric_test_list.append(metric(Y_test, Y_pred))

        Y_pred = model.predict(X_train)
        metric_train_list.append(metric(Y_train, Y_pred))

    metric_test = np.mean(metric_test_list)
    metric_train = np.mean(metric_train_list)
    return metric_test, metric_train

In [12]:
def hyperparameter_tuning(trial, Xs, Ys, model_label='RandomForest', n_iters=50, always_test_last_chunk=False):
    deep_learning = False
    standard_scale = True
    if model_label in ['RandomForest', 'XGBoost', 'LightGBM']:
        cfg = {'max_depth': trial.suggest_int('max_depth', 2, 15),
               'n_estimators': trial.suggest_int('n_estimators', 10, 200)}
        max_depth = cfg['max_depth']
        n_estimators = cfg['n_estimators']
    
        if model_label == 'RandomForest':
            model = RandomForestRegressor(max_depth=max_depth, n_estimators=n_estimators)
        elif model_label == 'XGBoost':
            model = XGBRegressor(max_depth=max_depth, n_estimators=n_estimators)
        elif model_label == 'LightGBM':
            model = LGBMRegressor(force_col_wise=True, verbose=-1, max_depth=max_depth, n_estimators=n_estimators)
    elif model_label == 'SVR':
        cfg = {'C': trial.suggest_float('C', 1e-3, 2e2, log=True),
               'kernel': trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf', 'sigmoid'])}
        C = cfg['C']
        kernel = cfg['kernel']
        model = SVR(C=C, kernel=kernel)
    elif model_label == 'NuSVR':
        cfg = {'C': trial.suggest_float('C', 1e-3, 2e2, log=True),
               'kernel': trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf', 'sigmoid']),
               'nu': trial.suggest_float('nu', 0.1, 0.9)}
        C = cfg['C']
        kernel = cfg['kernel']
        nu = cfg['nu']
        model = NuSVR(C=C, kernel=kernel, nu=nu)
    elif model_label == 'FCN':
        deep_learning = True
        standard_scale = False
        cfg = {'L2_factor': trial.suggest_float('L2_factor', 1e-3, 1, log=True),
               'dropout_factor': trial.suggest_float('dropout_factor', 0, 0.5)}
        L2_factor = cfg['L2_factor']
        dropout_factor = cfg['dropout_factor']
        input_f = Xs.shape[1] 
        output_f = 1 
        feature_counts = [16, 16, 16, 8]
        model = FCN_model(input_f=input_f, output_f=output_f, feature_counts=feature_counts,
                          dropout_factor=dropout_factor, L2_factor=L2_factor)
    elif model_label == 'LinearRegression':
        model = LinearRegression()
        
    R2_list = []
    iterator = range(n_iters)
    for i in iterator:
        R2, _ = five_fold_test(Xs, Ys, model, deep_learning=deep_learning, standard_scale=standard_scale, always_test_last_chunk=always_test_last_chunk)
        R2_list.append(R2)

    return np.mean(R2_list) - np.std(R2_list)

In [13]:
def optuna_operation(model_xcols, Y_feature, data_df, speed_test=False,
                     optuna_n_trials=30, n_iters=30, always_test_last_chunk=False):
    model_hyperparameters_dict = {}
    model_r2_dict = {}
    
    if always_test_last_chunk:
        n_iters = 1

    model_labels = list(model_xcols.keys())
    
    for model_label in model_labels:
        X_features = model_xcols[model_label]
        Xs, Ys, _ = get_XY(data_df, Y_feature, X_features)

        this_n_iters = n_iters
        this_optuna_n_trials = optuna_n_trials

        if model_label == 'FCN':
            this_n_iters = min(this_n_iters, 1)
            if speed_test:
                this_optuna_n_trials = 4

        if model_label == 'LinearRegression':
            this_optuna_n_trials = 1
            this_n_iters = 10
            
        def target_func(trial, model_label=model_label, Xs=Xs, Ys=Ys, n_iters=this_n_iters, always_test_last_chunk=always_test_last_chunk):
            return hyperparameter_tuning(trial, model_label=model_label, Xs=Xs, Ys=Ys, n_iters=n_iters, always_test_last_chunk=always_test_last_chunk)
        
        sampler = optuna.samplers.TPESampler()
        study = optuna.create_study(sampler=sampler, direction='maximize')
        with tqdm(total=this_optuna_n_trials) as pbar:
            for _ in range(this_optuna_n_trials):
                study.optimize(target_func, n_trials=1, catch=(Exception,))
                pbar.update(1)
        
        print(model_label)
        for key, v in study.best_params.items():
            print(f"Best {key} = {v}")
        print(f"Best R2 = {study.best_value}")
    
        model_hyperparameters_dict[model_label] = study.best_params
        model_r2_dict[model_label] = study.best_value

    return model_hyperparameters_dict, model_r2_dict

### Ensemble

這部分的函數有：
1. cross_correlation_matrix: 由不同模型的預測誤差產生相關矩陣。
2. sovle_optimal_weights: 由誤差相關矩陣解出最佳權重。
3. assign_model: 由 model_label 與 model_hyperparameters_dict 產生一個模型 instance。
4. find_avg_score_with_given_model_list: 算出 N 組模型預測誤差樣本，再從中解出最佳權重，並提供不同模型的平均誤差。
5. save_model_metadata: 儲存這份筆記本得到的每個被預測值所採用的模型組合，以及每個模型採用的特徵、超參數與權重。

流程控制函數中會呼叫 find_avg_score_with_given_model_list 與 save_model_metadata

In [14]:
def cross_correlation_matrix(residuals):
    N = len(residuals)
    matrix = np.zeros((N, N))
    for i in range(N):
        for j in range(i, N):
            matrix[i][j] = np.mean(np.array(residuals[i]) * np.array(residuals[j]))

    for i in range(1, N):
        for j in range(i):
            matrix[i][j] = matrix[j][i]

    return matrix

In [15]:
def sovle_optimal_weights(matrix):
    N = matrix.shape[0]
    def objective(weights):
        return weights.T @ matrix @ weights

    initial_weights = np.array([1/N] * N)
    constraints = ({'type': 'eq', 'fun': lambda w: np.sum(w) - 1})
    bounds = [(0, 1)] * N
    result = minimize(objective, initial_weights, method='SLSQP', bounds=bounds, constraints=constraints)
    
    optimal_weights = result.x
    return optimal_weights

In [16]:
def assign_model(model_label, Xs, model_hyperparameters_dict, mode='regressor'):
    if mode == 'regressor':
        if model_label == 'LinearRegression':
            model = LinearRegression()
        elif model_label == 'RandomForest':
            model = RandomForestRegressor(**model_hyperparameters_dict[model_label])
        elif model_label == 'XGBoost':
            model = XGBRegressor(**model_hyperparameters_dict[model_label])
        elif model_label == 'LightGBM':
            model = LGBMRegressor(force_col_wise=True, verbose=-1, **model_hyperparameters_dict[model_label])
        elif model_label == 'SVR':
            model = SVR(**model_hyperparameters_dict[model_label])
        elif model_label == 'NuSVR':
            model = NuSVR(**model_hyperparameters_dict[model_label])
        elif model_label == 'FCN':
            input_f = Xs.shape[1]
            output_f = 1
            feature_counts = [16, 16, 16, 8]
            model = FCN_model(input_f=input_f, output_f=output_f, feature_counts=feature_counts,
                      **model_hyperparameters_dict[model_label])
    elif mode == 'classifier':
        if model_label == 'FCN':
            input_f = Xs.shape[1]
            output_f = 1
            feature_counts = [16, 16, 16, 8]
            model = FCN_model(input_f=input_f, output_f=output_f, feature_counts=feature_counts,
                                mode='classifier', 
                                **model_hyperparameters_dict[model_label])
        elif model_label == 'RandomForest':
            model = RandomForestClassifier(**model_hyperparameters_dict[model_label])
        elif model_label == 'XGBoost':
            model = XGBClassifier(**model_hyperparameters_dict[model_label])
        elif model_label == 'LightGBM':
            model = LGBMClassifier(force_col_wise=True, verbose=-1, **model_hyperparameters_dict[model_label])
        elif model_label == 'SVC':
            model = SVC(**model_hyperparameters_dict[model_label])
        elif model_label == 'NuSVC':
            model = NuSVC(**model_hyperparameters_dict[model_label])
        elif model_label == 'LogisticRegression':
            model = LogisticRegression(**model_hyperparameters_dict[model_label])
    return model

In [17]:
def find_avg_score_with_given_model_list(model_hyperparameters_dict, model_xcols,
                                         data_df, Y_feature, mode='regressor',
                                         n_iters=200, weights=None):

    if mode == 'regressor':
        metric_name = 'MAE'
    elif mode == 'classifier':
        metric_name = 'F1'
    
    def get_prediction(model_label, Y_train, train_ind, test_ind,
                       model_hyperparameters_dict=model_hyperparameters_dict,
                       model_xcols=model_xcols,
                       data_df=data_df, 
                       Y_feature=Y_feature):
        
        X_features = model_xcols[model_label]
        Xs, _, _ = get_XY(data_df, Y_feature=Y_feature, X_features=X_features)

        model = assign_model(model_label, Xs, model_hyperparameters_dict=model_hyperparameters_dict, mode=mode)

        deep_learning = False
        if model_label == 'FCN':
            deep_learning = True
    
        X_train = Xs[train_ind]
        X_test = Xs[test_ind]
        
        if deep_learning:
            X_train_dl, X_val, Y_train_dl, Y_val = train_test_split(X_train, Y_train, test_size=0.20)
            _ = model.fit(X_train_dl, Y_train_dl, X_val, Y_val)
        else:
            scaler = StandardScaler()
            X_scaler = scaler.fit(X_train)
            X_train = X_scaler.transform(X_train)
            X_test = X_scaler.transform(X_test)
            _ = model.fit(X_train, Y_train)
        YP = model.predict(X_test)
        return YP        
    
    if weights is None:
        ensemble_models = list(model_hyperparameters_dict.keys())
    else:
        ensemble_models = list(weights.keys())
        
    Y_pred_iters = []
    Y_test_iters = []
    metric = []

    X_features=model_xcols[ensemble_models[0]]
    Xs, Ys, _ = get_XY(data_df, Y_feature=Y_feature, X_features=X_features)

    matrix = np.zeros((len(ensemble_models), len(ensemble_models)))
    for i in tqdm(range(n_iters)):
        train_ind, test_ind, _, _ = train_test_split(np.arange(Xs.shape[0]), np.arange(Xs.shape[0]), test_size=0.20)
        
        Y_train = Ys[train_ind]
        Y_test = Ys[test_ind]
        
        Y_preds = []
        this_metric = []
        for model_label in ensemble_models:
            YP = get_prediction(model_label, Y_train, train_ind, test_ind)
            if mode == 'regressor':
                this_metric.append(MAE(Y_test, YP))
            elif mode == 'classifier':
                YP[np.where(YP<0.5)] = 0
                YP[np.where(YP>=0.5)] = 1
                this_metric.append(f1_score(Y_test, YP))
            Y_preds.append(YP)
            
        residuals = Y_preds - np.array([Y_test] * len(Y_preds)).reshape(len(Y_preds),-1)
        if weights is None:
            matrix += cross_correlation_matrix(residuals)

        uniform_ensemble_YP = np.mean(Y_preds, axis=0)
        if mode == 'regressor':
            this_metric.append(MAE(Y_test, uniform_ensemble_YP))
        elif mode == 'classifier':
            uniform_ensemble_YP[np.where(uniform_ensemble_YP<0.5)] = 0
            uniform_ensemble_YP[np.where(uniform_ensemble_YP>=0.5)] = 1
            this_metric.append(f1_score(Y_test, uniform_ensemble_YP))

        metric.append(this_metric)
        Y_pred_iters.append(Y_preds)
        Y_test_iters.append(Y_test)

    if weights is None:
        matrix = matrix / n_iters
        optimal_weights = sovle_optimal_weights(matrix)
    else:
        optimal_weights = weights

    weighted_metric = []
    for i in range(n_iters):
        weighted_YP = np.sum(Y_pred_iters[i] * np.concatenate([optimal_weights.reshape(-1,1),] * Y_test_iters[0].shape[0], axis = 1), axis=0)
        if mode == 'regressor':
            weighted_metric.append(MAE(Y_test_iters[i], weighted_YP))
        elif mode == 'classifier':
            weighted_YP[np.where(weighted_YP<0.5)] = 0
            weighted_YP[np.where(weighted_YP>=0.5)] = 1
            weighted_metric.append(f1_score(Y_test_iters[i], weighted_YP))
    weighted_metric = np.array(weighted_metric).reshape(-1, 1)
    array_metric = np.array(metric)
    array_metric = np.concatenate([metric, weighted_metric], axis=1)
    
    metric_dict = {
        'Model': ensemble_models + ['Ensemble', 'Weighted_Ensemble'],
        f'Avg {metric_name}': list(np.mean(array_metric, axis=0)), 
        f'Std {metric_name}': list(np.std(array_metric, axis=0)),
        '90th percentile': list(np.sort(array_metric, axis=0)[int(array_metric.shape[0] * 0.9) - 1]),
        '10th percentile': list(np.sort(array_metric, axis=0)[int(array_metric.shape[0] * 0.1) - 1])
        }
    
    df = pd.DataFrame(metric_dict)
    if mode == 'regressor':
        df = df.sort_values('90th percentile').reset_index(drop=True)
    elif mode == 'classifier':
        df = df.sort_values('10th percentile').reset_index(drop=True)

    if weights is not None:
        return df
        
    optimal_weights_dict = {}
    for i, w in enumerate(optimal_weights):
        optimal_weights_dict[ensemble_models[i]] = w
        
    return df, optimal_weights_dict

In [18]:
def save_model_metadata(file_path, model_xcols, model_hyperparameters_dict, optimal_weights):
    model_labels = list(model_hyperparameters_dict)
    output_dict = {
        'X_feature_dict':{},
        'hyperparameters_dict':{},
        'weights':{}
    }
    for model_label in model_labels:
        if optimal_weights[model_label] > 0.0005:
            output_dict['X_feature_dict'][model_label] = model_xcols[model_label]
            output_dict['hyperparameters_dict'][model_label] = model_hyperparameters_dict[model_label]
            output_dict['weights'][model_label] = optimal_weights[model_label]

    with open(file_path, 'w') as f:
        json.dump(output_dict, f)

### 流程控制

In [19]:
def flow_control(Y_feature, model_xcols, data_df, speed_test=False,
                 train_model_path=train_model_path, optuna_done=optuna_done, weights_determined=weights_determined):

    n_iter_dict = {
        'hyper_parameter': 30,
        'ensemble_weight': 200
    }
    if speed_test:
        n_iter_dict = {
            'hyper_parameter': 1,
            'ensemble_weight': 20
        }
    
    this_model_path = f'{train_model_path}{Y_feature}/'
    os.makedirs(this_model_path, exist_ok=True)

    # 如果指定的 meta 檔存在，並且初始參數規定不須重新計算，則套用存檔數值。
    if os.path.exists(f'{train_model_path}{Y_feature}/meta.json'):
        with open(f'{train_model_path}{Y_feature}/meta.json', 'r') as f:
            meta = json.load(f)
    else:
        optuna_done[Y_feature] = False
        weights_determined[Y_feature] = False

    # 超參數
    if optuna_done[Y_feature]:
        model_xcols = meta['X_feature_dict']
        model_hyperparameters_dict = meta['hyperparameters_dict']
    else: 
        model_hyperparameters_dict, model_r2_dict = optuna_operation(model_xcols, Y_feature, data_df,
                                                                     n_iters=n_iter_dict['hyper_parameter'], speed_test=speed_test)

    # 集成權重
    if weights_determined[Y_feature]:
        optimal_weights = meta['weights']
        df = pd.read_csv(f'{this_model_path}predict_MAE.df')
        display(df)
        print('Weights:')
        for i, k in enumerate(model_hyperparameters_dict.keys()):
            print(f'{k}: {optimal_weights[k]:.3f}')
    else:
        if 'FCN' in model_hyperparameters_dict.keys():
            n_iters = int(n_iter_dict['ensemble_weight']/4)
        else:
            n_iters = n_iter_dict['ensemble_weight']
        df, optimal_weights = find_avg_score_with_given_model_list(model_hyperparameters_dict, model_xcols, data_df, Y_feature, n_iters=n_iters)
        display(df)
        df.to_csv(f'{this_model_path}predict_MAE.df', index=False, encoding='utf-8-sig')
        print('Weights:')
        for i, k in enumerate(model_hyperparameters_dict.keys()):
            print(f'{k}: {optimal_weights[k]:.3f}')

    print(' ')
    print(' ')
    print('**Copy and Paste following lines into the next cell.**')
    for model_label in model_hyperparameters_dict.keys():
        print('##### ' + model_label)
        for key, v in model_hyperparameters_dict[model_label].items():
            print(f"Best {key} = {v}  ")
        if 'model_r2_dict' in locals().keys():
            print(f"Best R2 = {model_r2_dict[model_label]}  ")
        print(f'Weight = {optimal_weights[model_label]:.3f}')
    print(' ')
    print(' ')
    
    save_model_metadata(this_model_path + 'meta.json', model_xcols, model_hyperparameters_dict, optimal_weights)

## 日照率

In [20]:
# 被預測的標的
Y_feature = '日照率'
# 定義集成學習使用的模型以及模型們各自使用的 X 特徵
model_xcols = {
        'RandomForest': ['晴', '多雲', '陰', '短暫陣雨', '短暫陣雨或雷雨', '午後短暫雷陣雨', '陣雨或雷雨', '相對溼度'],
        'XGBoost': ['晴', '多雲', '陰', '短暫陣雨', '短暫陣雨或雷雨', '午後短暫雷陣雨', '陣雨或雷雨', '相對溼度'],
        'LightGBM': ['晴', '多雲', '陰', '短暫陣雨', '短暫陣雨或雷雨', '午後短暫雷陣雨', '陣雨或雷雨', '相對溼度'],
        'SVR': ['晴', '多雲', '陰', '短暫陣雨', '短暫陣雨或雷雨', '午後短暫雷陣雨', '陣雨或雷雨', '相對溼度'],
        'NuSVR': ['晴', '多雲', '陰', '短暫陣雨', '短暫陣雨或雷雨', '午後短暫雷陣雨', '陣雨或雷雨', '相對溼度'],
        'FCN': ['晴', '多雲', '陰', '短暫陣雨', '短暫陣雨或雷雨', '午後短暫雷陣雨', '陣雨或雷雨', '相對溼度'],
    }

flow_control(Y_feature, model_xcols, forecast_obs_df, speed_test=speed_test)

100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [00:13<00:00,  2.15it/s]


RandomForest
Best max_depth = 5
Best n_estimators = 58
Best R2 = 0.6256715903550362


100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [00:15<00:00,  1.98it/s]


XGBoost
Best max_depth = 4
Best n_estimators = 189
Best R2 = 0.5877245861952668


100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [00:07<00:00,  3.97it/s]


LightGBM
Best max_depth = 4
Best n_estimators = 67
Best R2 = 0.5674625412883484


100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [00:00<00:00, 33.97it/s]


SVR
Best C = 11.518378182409693
Best kernel = sigmoid
Best R2 = 0.6566459940695035


100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [00:00<00:00, 48.91it/s]


NuSVR
Best C = 0.570362585613368
Best kernel = linear
Best nu = 0.6108608664976618
Best R2 = 0.6797318701856108


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [03:12<00:00, 48.10s/it]


FCN
Best L2_factor = 0.02578546166740118
Best dropout_factor = 0.34243967658484603
Best R2 = 0.6010095954700934


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:46<00:00,  9.31s/it]


Unnamed: 0,Model,Avg MAE,Std MAE,90th percentile,10th percentile
0,RandomForest,13.236224,0.815073,13.390493,14.605342
1,Ensemble,13.048759,1.125362,13.719504,14.583912
2,Weighted_Ensemble,12.762585,1.132402,13.805559,14.056757
3,SVR,13.192176,1.230251,14.077248,14.634167
4,NuSVR,13.48581,0.814391,14.141957,14.220147
5,LightGBM,15.306682,0.92834,15.704966,16.755512
6,XGBoost,14.277844,1.780111,16.001747,16.142594
7,FCN,14.916318,1.698531,16.350252,16.674059


Weights:
RandomForest: 0.410
XGBoost: 0.000
LightGBM: 0.000
SVR: 0.284
NuSVR: 0.118
FCN: 0.189
 
 
**Copy and Paste following lines into the next cell.**
##### RandomForest
Best max_depth = 5  
Best n_estimators = 58  
Best R2 = 0.6256715903550362  
Weight = 0.410
##### XGBoost
Best max_depth = 4  
Best n_estimators = 189  
Best R2 = 0.5877245861952668  
Weight = 0.000
##### LightGBM
Best max_depth = 4  
Best n_estimators = 67  
Best R2 = 0.5674625412883484  
Weight = 0.000
##### SVR
Best C = 11.518378182409693  
Best kernel = sigmoid  
Best R2 = 0.6566459940695035  
Weight = 0.284
##### NuSVR
Best C = 0.570362585613368  
Best kernel = linear  
Best nu = 0.6108608664976618  
Best R2 = 0.6797318701856108  
Weight = 0.118
##### FCN
Best L2_factor = 0.02578546166740118  
Best dropout_factor = 0.34243967658484603  
Best R2 = 0.6010095954700934  
Weight = 0.189
 
 


##### RandomForest
Best max_depth = 6  
Best n_estimators = 118  
Best R2 = 0.58590582473441  
Weight = 0.000
##### XGBoost
Best max_depth = 2  
Best n_estimators = 46  
Best R2 = 0.5703384347670741  
Weight = 0.098
##### LightGBM
Best max_depth = 2  
Best n_estimators = 51  
Best R2 = 0.505207647156635  
Weight = 0.000
##### SVR
Best C = 0.14990594685121092  
Best kernel = linear  
Best R2 = 0.6340689844535041  
Weight = 0.661
##### NuSVR
Best C = 30.749226509121666  
Best kernel = rbf  
Best nu = 0.7479722390341714  
Best R2 = 0.6075744670435312  
Weight = 0.197
##### FCN
Best L2_factor = 0.0010602996524167788  
Best dropout_factor = 0.2182544879896376  
Best R2 = 0.5924729617089556  
Weight = 0.044

## 溫度

### 高溫

In [21]:
Y_feature = '最高氣溫'
model_xcols = {
    'LinearRegression': ['溫度'],
    'FCN': ['溫度'],
    'RandomForest': ['溫度'],
    'XGBoost': ['溫度'],
    'SVR': ['溫度'],
    'NuSVR': ['溫度'],
    'LightGBM': ['溫度'],
}

flow_control(Y_feature, model_xcols, forecast_obs_df, speed_test=speed_test)

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 21.68it/s]


LinearRegression
Best R2 = 0.5538733358169409


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [03:39<00:00, 54.77s/it]


FCN
Best L2_factor = 0.0016713876561696985
Best dropout_factor = 0.22816777777281472
Best R2 = 0.5696605288176384


100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [00:15<00:00,  1.95it/s]


RandomForest
Best max_depth = 8
Best n_estimators = 195
Best R2 = 0.5857433548541846


100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [00:10<00:00,  2.99it/s]


XGBoost
Best max_depth = 3
Best n_estimators = 25
Best R2 = 0.5832692262670349


100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [00:01<00:00, 15.11it/s]


SVR
Best C = 0.0530938555233717
Best kernel = linear
Best R2 = 0.5691599259949109


100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [00:00<00:00, 34.48it/s]


NuSVR
Best C = 1.1703419803709574
Best kernel = linear
Best nu = 0.4806549699363593
Best R2 = 0.5749890373392107


100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [00:09<00:00,  3.19it/s]


LightGBM
Best max_depth = 7
Best n_estimators = 52
Best R2 = 0.5886040366004186


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [01:00<00:00, 12.03s/it]


Unnamed: 0,Model,Avg MAE,Std MAE,90th percentile,10th percentile
0,RandomForest,1.084352,0.087632,1.079866,1.245451
1,Weighted_Ensemble,1.046371,0.076856,1.088678,1.123768
2,SVR,1.072317,0.073179,1.110753,1.125063
3,Ensemble,1.065532,0.078677,1.124655,1.141473
4,LightGBM,1.080364,0.072192,1.149321,1.167973
5,LinearRegression,1.113703,0.079222,1.153975,1.207452
6,NuSVR,1.101816,0.081852,1.163335,1.172897
7,XGBoost,1.101507,0.10001,1.176288,1.239496
8,FCN,1.119185,0.066226,1.177173,1.177915


Weights:
LinearRegression: 0.000
FCN: 0.000
RandomForest: 0.192
XGBoost: 0.000
SVR: 0.382
NuSVR: 0.000
LightGBM: 0.426
 
 
**Copy and Paste following lines into the next cell.**
##### LinearRegression
Best R2 = 0.5538733358169409  
Weight = 0.000
##### FCN
Best L2_factor = 0.0016713876561696985  
Best dropout_factor = 0.22816777777281472  
Best R2 = 0.5696605288176384  
Weight = 0.000
##### RandomForest
Best max_depth = 8  
Best n_estimators = 195  
Best R2 = 0.5857433548541846  
Weight = 0.192
##### XGBoost
Best max_depth = 3  
Best n_estimators = 25  
Best R2 = 0.5832692262670349  
Weight = 0.000
##### SVR
Best C = 0.0530938555233717  
Best kernel = linear  
Best R2 = 0.5691599259949109  
Weight = 0.382
##### NuSVR
Best C = 1.1703419803709574  
Best kernel = linear  
Best nu = 0.4806549699363593  
Best R2 = 0.5749890373392107  
Weight = 0.000
##### LightGBM
Best max_depth = 7  
Best n_estimators = 52  
Best R2 = 0.5886040366004186  
Weight = 0.426
 
 


##### FCN
Best L2_factor = 0.006366896218364759  
Best dropout_factor = 0.19396856681257302  
Best R2 = 0.6048279316550031  
##### RandomForest
Best max_depth = 4  
Best n_estimators = 47  
Best R2 = 0.6352240070534566  
##### XGBoost
Best max_depth = 2  
Best n_estimators = 12  
Best R2 = 0.6272021513938951  
##### LightGBM
Best max_depth = 4  
Best n_estimators = 26  
Best R2 = 0.6061314180772585  
##### SVR
Best C = 0.05583870083012335  
Best kernel = linear  
Best R2 = 0.604340159328353  
##### NuSVR
Best C = 0.09975849736100006  
Best kernel = linear  
Best nu = 0.7091007437639489  
Best R2 = 0.6044918833713732

### 低溫

In [22]:
Y_feature = '最低氣溫'
model_xcols = {
    'LinearRegression': ['溫度'],
    'FCN': ['溫度'],
    'RandomForest': ['溫度'],
    'XGBoost': ['溫度'],
    'SVR': ['溫度'],
    'NuSVR': ['溫度'],
    'LightGBM': ['溫度'],
}

flow_control(Y_feature, model_xcols, forecast_obs_df, speed_test=speed_test)

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 21.17it/s]


LinearRegression
Best R2 = 0.14978603084028827


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [03:31<00:00, 52.92s/it]


FCN
Best L2_factor = 0.011081867468176031
Best dropout_factor = 0.11693877198549735
Best R2 = 0.16990631172552542


100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [00:13<00:00,  2.19it/s]


RandomForest
Best max_depth = 5
Best n_estimators = 157
Best R2 = 0.2977480630521038


100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [00:08<00:00,  3.60it/s]


XGBoost
Best max_depth = 2
Best n_estimators = 13
Best R2 = 0.250154850563415


100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [00:02<00:00, 13.37it/s]


SVR
Best C = 0.01387771060407978
Best kernel = linear
Best R2 = 0.19300005311244378


100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [00:02<00:00, 10.60it/s]


NuSVR
Best C = 0.46096049273842093
Best kernel = rbf
Best nu = 0.8366394649782175
Best R2 = 0.2375091479359269


100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [00:07<00:00,  4.04it/s]


LightGBM
Best max_depth = 11
Best n_estimators = 29
Best R2 = 0.2702920072112119


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:53<00:00, 10.68s/it]


Unnamed: 0,Model,Avg MAE,Std MAE,90th percentile,10th percentile
0,RandomForest,0.834889,0.074504,0.889315,0.914716
1,NuSVR,0.847274,0.079008,0.892881,0.928477
2,Weighted_Ensemble,0.824584,0.08123,0.896341,0.901767
3,Ensemble,0.84131,0.076419,0.898083,0.926287
4,LightGBM,0.831741,0.073379,0.8988,0.901253
5,XGBoost,0.83417,0.087076,0.901481,0.922774
6,FCN,0.871342,0.064561,0.918776,0.934279
7,LinearRegression,0.894463,0.079406,0.946359,0.991046
8,SVR,0.898168,0.059319,0.950898,0.974756


Weights:
LinearRegression: 0.000
FCN: 0.000
RandomForest: 0.190
XGBoost: 0.441
SVR: 0.000
NuSVR: 0.000
LightGBM: 0.370
 
 
**Copy and Paste following lines into the next cell.**
##### LinearRegression
Best R2 = 0.14978603084028827  
Weight = 0.000
##### FCN
Best L2_factor = 0.011081867468176031  
Best dropout_factor = 0.11693877198549735  
Best R2 = 0.16990631172552542  
Weight = 0.000
##### RandomForest
Best max_depth = 5  
Best n_estimators = 157  
Best R2 = 0.2977480630521038  
Weight = 0.190
##### XGBoost
Best max_depth = 2  
Best n_estimators = 13  
Best R2 = 0.250154850563415  
Weight = 0.441
##### SVR
Best C = 0.01387771060407978  
Best kernel = linear  
Best R2 = 0.19300005311244378  
Weight = 0.000
##### NuSVR
Best C = 0.46096049273842093  
Best kernel = rbf  
Best nu = 0.8366394649782175  
Best R2 = 0.2375091479359269  
Weight = 0.000
##### LightGBM
Best max_depth = 11  
Best n_estimators = 29  
Best R2 = 0.2702920072112119  
Weight = 0.370
 
 


##### FCN
Best L2_factor = 0.006578980595757985  
Best dropout_factor = 0.2827579002701865  
Best R2 = 0.12796332166141297  
##### RandomForest
Best max_depth = 5  
Best n_estimators = 112  
Best R2 = 0.23976560718955822  
##### XGBoost
Best max_depth = 2  
Best n_estimators = 13  
Best R2 = 0.26960230752782566  
##### LightGBM
Best max_depth = 6  
Best n_estimators = 69  
Best R2 = 0.14645030056603767  
##### SVR
Best C = 0.8539784496032792  
Best kernel = rbf  
Best R2 = 0.1879279073200762  
##### NuSVR
Best C = 0.5959731111440211  
Best kernel = rbf  
Best nu = 0.6158415473805077  
Best R2 = 0.18223342242299245  

### 平均溫

In [23]:
Y_feature = '氣溫'
model_xcols = {
    'LinearRegression': ['溫度'],
    'FCN': ['溫度'],
    'RandomForest': ['溫度'],
    'XGBoost': ['溫度'],
    'SVR': ['溫度'],
    'NuSVR': ['溫度'],
    'LightGBM': ['溫度'],
}

flow_control(Y_feature, model_xcols, forecast_obs_df, speed_test=speed_test)

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 21.66it/s]


LinearRegression
Best R2 = 0.41398139372708004


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [03:30<00:00, 52.57s/it]


FCN
Best L2_factor = 0.0011813421384085827
Best dropout_factor = 0.1520304634254095
Best R2 = 0.44494233769019986


100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [00:13<00:00,  2.24it/s]


RandomForest
Best max_depth = 3
Best n_estimators = 124
Best R2 = 0.4844065781110033


100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [00:08<00:00,  3.56it/s]


XGBoost
Best max_depth = 3
Best n_estimators = 12
Best R2 = 0.46015595923171226


100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [00:01<00:00, 24.86it/s]


SVR
Best C = 0.027619131207129343
Best kernel = linear
Best R2 = 0.44839943517297653


100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [00:04<00:00,  7.05it/s]


NuSVR
Best C = 0.0334169706915242
Best kernel = linear
Best nu = 0.2769792606019767
Best R2 = 0.4592002928006137


100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [00:06<00:00,  4.59it/s]


LightGBM
Best max_depth = 3
Best n_estimators = 42
Best R2 = 0.4582993916467588


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:49<00:00,  9.87s/it]


Unnamed: 0,Model,Avg MAE,Std MAE,90th percentile,10th percentile
0,Weighted_Ensemble,0.698564,0.012636,0.706285,0.715685
1,Ensemble,0.702122,0.014625,0.710511,0.710981
2,NuSVR,0.701812,0.029169,0.716671,0.727851
3,SVR,0.705092,0.037963,0.727631,0.741108
4,RandomForest,0.721861,0.011493,0.730559,0.733635
5,LinearRegression,0.711462,0.037814,0.733254,0.756418
6,XGBoost,0.72074,0.018939,0.734642,0.749273
7,LightGBM,0.734959,0.014102,0.746051,0.754342
8,FCN,0.723521,0.026979,0.755268,0.755947


Weights:
LinearRegression: 0.000
FCN: 0.000
RandomForest: 0.000
XGBoost: 0.410
SVR: 0.000
NuSVR: 0.590
LightGBM: 0.000
 
 
**Copy and Paste following lines into the next cell.**
##### LinearRegression
Best R2 = 0.41398139372708004  
Weight = 0.000
##### FCN
Best L2_factor = 0.0011813421384085827  
Best dropout_factor = 0.1520304634254095  
Best R2 = 0.44494233769019986  
Weight = 0.000
##### RandomForest
Best max_depth = 3  
Best n_estimators = 124  
Best R2 = 0.4844065781110033  
Weight = 0.000
##### XGBoost
Best max_depth = 3  
Best n_estimators = 12  
Best R2 = 0.46015595923171226  
Weight = 0.410
##### SVR
Best C = 0.027619131207129343  
Best kernel = linear  
Best R2 = 0.44839943517297653  
Weight = 0.000
##### NuSVR
Best C = 0.0334169706915242  
Best kernel = linear  
Best nu = 0.2769792606019767  
Best R2 = 0.4592002928006137  
Weight = 0.590
##### LightGBM
Best max_depth = 3  
Best n_estimators = 42  
Best R2 = 0.4582993916467588  
Weight = 0.000
 
 


##### FCN
Best L2_factor = 0.021113413887458573  
Best dropout_factor = 0.04274367293005721  
Best R2 = 0.4378907066278211  
##### RandomForest
Best max_depth = 4  
Best n_estimators = 195  
Best R2 = 0.4844221100077036  
##### XGBoost
Best max_depth = 2  
Best n_estimators = 13  
Best R2 = 0.49093789697123874  
##### LightGBM
Best max_depth = 4  
Best n_estimators = 59  
Best R2 = 0.41187761974593445  
##### SVR
Best C = 0.018011043607669287  
Best kernel = linear  
Best R2 = 0.43636030323831526  
##### NuSVR
Best C = 0.43233258907999045  
Best kernel = linear  
Best nu = 0.39503695680211226  
Best R2 = 0.44385816055128674 

## 風速

In [24]:
Y_feature = '風速'
model_xcols = {
    'FCN': ['風速', '東西風', '南北風', '溫度'],
    'RandomForest': ['風速', '東西風', '南北風', '晴', '多雲', '陰', '短暫陣雨', '短暫陣雨或雷雨', '午後短暫雷陣雨', '陣雨或雷雨', '相對溼度', '溫度'],
    'XGBoost': ['風速', '東西風', '南北風', '溫度'],
    'LightGBM': ['風速', '東西風', '南北風', '溫度'],
    'SVR': ['風速', '東西風', '南北風', '溫度'],
    'NuSVR': ['風速', '東西風', '南北風', '溫度'],
}

flow_control(Y_feature, model_xcols, forecast_obs_df, speed_test=speed_test)

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [04:09<00:00, 62.40s/it]


FCN
Best L2_factor = 0.2011096770080241
Best dropout_factor = 0.3972764762036274
Best R2 = 0.5532967042584036


100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [00:40<00:00,  1.35s/it]


RandomForest
Best max_depth = 14
Best n_estimators = 146
Best R2 = 0.6239770661540684


100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [00:14<00:00,  2.08it/s]


XGBoost
Best max_depth = 11
Best n_estimators = 150
Best R2 = 0.5090891487925635


100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [00:06<00:00,  4.44it/s]


LightGBM
Best max_depth = 6
Best n_estimators = 17
Best R2 = 0.575102325725823


100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [00:54<00:00,  1.82s/it]


SVR
Best C = 5.7242894919250995
Best kernel = linear
Best R2 = 0.5940460533782137


100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [01:30<00:00,  3.02s/it]


NuSVR
Best C = 199.9134778145975
Best kernel = linear
Best nu = 0.7582256867574979
Best R2 = 0.6391723864759884


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [01:24<00:00, 16.86s/it]


Unnamed: 0,Model,Avg MAE,Std MAE,90th percentile,10th percentile
0,Weighted_Ensemble,0.696323,0.156028,0.770203,0.923893
1,Ensemble,0.713814,0.163786,0.78497,0.943511
2,SVR,0.746828,0.138068,0.799529,0.945077
3,NuSVR,0.747398,0.142989,0.812571,0.95157
4,LightGBM,0.765767,0.166992,0.828557,1.026628
5,RandomForest,0.725019,0.156478,0.828786,0.937289
6,XGBoost,0.813609,0.193925,0.939654,1.087082
7,FCN,0.867697,0.173703,1.03195,1.048868


Weights:
FCN: 0.000
RandomForest: 0.391
XGBoost: 0.000
LightGBM: 0.000
SVR: 0.000
NuSVR: 0.609
 
 
**Copy and Paste following lines into the next cell.**
##### FCN
Best L2_factor = 0.2011096770080241  
Best dropout_factor = 0.3972764762036274  
Best R2 = 0.5532967042584036  
Weight = 0.000
##### RandomForest
Best max_depth = 14  
Best n_estimators = 146  
Best R2 = 0.6239770661540684  
Weight = 0.391
##### XGBoost
Best max_depth = 11  
Best n_estimators = 150  
Best R2 = 0.5090891487925635  
Weight = 0.000
##### LightGBM
Best max_depth = 6  
Best n_estimators = 17  
Best R2 = 0.575102325725823  
Weight = 0.000
##### SVR
Best C = 5.7242894919250995  
Best kernel = linear  
Best R2 = 0.5940460533782137  
Weight = 0.000
##### NuSVR
Best C = 199.9134778145975  
Best kernel = linear  
Best nu = 0.7582256867574979  
Best R2 = 0.6391723864759884  
Weight = 0.609
 
 


##### RandomForest
Best max_depth = 8  
Best n_estimators = 200  
Best R2 = 0.44476053459418435 
##### XGBoost
Best max_depth = 2  
Best n_estimators = 12  
Best R2 = 0.3950520622531771  
##### LightGBM
Best max_depth = 3  
Best n_estimators = 11  
Best R2 = 0.3505410125996822  
##### SVR
Best C = 0.4404862253598064  
Best kernel = linear  
Best R2 = 0.48936860838016016  
##### NuSVR
Best C = 0.017405206673103454  
Best kernel = linear  
Best nu = 0.623766887251527  
Best R2 = 0.45169718650072266