In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
#這兩行讓 matplotlib 的圖可以顯示中文，同時正常顯示負號
matplotlib.rc('font', family='Microsoft JhengHei')
plt.rcParams['axes.unicode_minus'] = False
import datetime
from copy import deepcopy
import os
import joblib
import json
from tqdm import tqdm
import optuna
import logging

from IPython.display import display

# 設置Optuna日誌級別為 WARNING，僅顯示警告及以上級別的信息
optuna.logging.set_verbosity(optuna.logging.WARNING)

pd.set_option('future.no_silent_downcasting', True)

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.svm import SVR, NuSVR

from scipy.optimize import minimize

In [3]:
from Pytorch_models.metrics import Array_Metrics
from Pytorch_models import models as pytorch_models
from Pytorch_models import api
MAE = Array_Metrics.mae
R2_score = Array_Metrics.r2

In [4]:
from utils.prepare_data import prepare_data
from utils.sun_light import calculate_daytime

# 初始化

In [5]:
start_date = '2023-08-01'
end_date = '2025-01-01'

In [6]:
historical_data_path = './historical/data/'
#train_model_path = './trained_model_parameters/model_B_with_one_month_delay/'
train_model_path = './trained_model_parameters/model_meta_2024-09-03/'

In [7]:
optuna_done = {
    '風力': False,
    '太陽能': False,
    '尖峰負載': False,
    '夜尖峰': False,
}

weights_determined = {
    '風力': False,
    '太陽能': False,
    '尖峰負載': False,
    '夜尖峰': False,
}

In [16]:
weather_power_df = prepare_data(historical_data_path, start_date=start_date, end_date=end_date)
weather_power_df['夜尖峰'] = [0 if se > 20 else 1 for se in weather_power_df['太陽能']]
print(weather_power_df.columns)
weather_power_df

Index(['日期', '尖峰負載', '核能', '燃煤', '汽電共生', '燃氣', '燃油', '水力', '風力', '太陽能',
       '氣溫_臺北', '最高氣溫_臺北', '最低氣溫_臺北', '風速_臺北', '全天空日射量_臺北', '總雲量_臺北', '東西風_臺北',
       '南北風_臺北', '氣溫_高雄', '最高氣溫_高雄', '最低氣溫_高雄', '風速_高雄', '全天空日射量_高雄', '總雲量_高雄',
       '東西風_高雄', '南北風_高雄', '氣溫_嘉義', '最高氣溫_嘉義', '最低氣溫_嘉義', '風速_嘉義', '全天空日射量_嘉義',
       '總雲量_嘉義', '東西風_嘉義', '南北風_嘉義', '氣溫_東吉島', '最高氣溫_東吉島', '最低氣溫_東吉島',
       '風速_東吉島', '全天空日射量_東吉島', '總雲量_東吉島', '東西風_東吉島', '南北風_東吉島', '氣溫_臺中電廠',
       '最高氣溫_臺中電廠', '最低氣溫_臺中電廠', '風速_臺中電廠', '東西風_臺中電廠', '南北風_臺中電廠', '日期數字',
       '假日', '週六', '週日', '補班', '1~3月', '11~12月', '白日長度', '夜尖峰'],
      dtype='object')


Unnamed: 0,日期,尖峰負載,核能,燃煤,汽電共生,燃氣,燃油,水力,風力,太陽能,...,南北風_臺中電廠,日期數字,假日,週六,週日,補班,1~3月,11~12月,白日長度,夜尖峰
0,2023-08-01,3667.5000,187.70,1105.90,139.60,1454.80,36.70,121.10,89.90,494.8000,...,2.954423,212.0,0,0,0,0,0,0,13.166667,0
1,2023-08-02,3666.3000,188.30,979.30,146.60,1499.20,92.90,180.00,88.90,462.1000,...,2.699589,213.0,0,0,0,0,0,0,13.150000,0
2,2023-08-03,3431.3000,189.40,1032.80,143.80,1352.70,37.20,157.60,75.00,412.7000,...,1.854368,214.0,0,0,0,0,0,0,13.150000,0
3,2023-08-04,3695.2000,189.60,1169.10,138.90,1403.30,37.30,117.40,143.60,460.9000,...,2.257742,215.0,0,0,0,0,0,0,13.116667,0
4,2023-08-05,3173.0000,189.90,1223.40,61.60,1337.00,38.60,183.70,111.30,0.0000,...,1.898843,216.0,0,1,0,0,0,0,13.116667,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
386,2024-08-29,3997.0806,93.47,1208.45,176.20,1626.72,65.26,-10.90,5.82,747.8806,...,1.597478,606.0,0,0,0,0,0,0,12.616667,0
387,2024-08-30,3953.5619,93.47,1179.84,165.40,1615.99,60.69,-1.91,8.66,742.5219,...,2.068096,607.0,0,0,0,0,0,0,12.600000,0
388,2024-08-31,3572.7569,93.35,1071.82,73.08,1393.02,35.33,29.47,20.76,803.9969,...,2.161293,608.0,0,1,0,0,0,0,12.583333,0
389,2024-09-01,3372.6600,93.24,1130.09,82.33,1725.32,37.61,355.48,0.95,730.9367,...,1.597478,609.0,0,0,1,0,0,0,12.550000,0


# 函數

## FCN model API

In [9]:
def FCN_model(input_f, output_f, feature_counts, dropout_factor=0, L2_factor=1e-15, mode='regressor'):
    if mode == 'regressor':
        model = pytorch_models.SimpleNN(input_f, output_f, feature_counts, dropout_factor)
    elif mode == 'classifier':
        model = pytorch_models.SimpleNN_classifer(input_f, output_f, feature_counts, dropout_factor)
    Model_API = api.Model_API(model, L2_factor=L2_factor, classifer=(mode=='classifier'))
    return Model_API

## Hyper parameter tuning

這部分的函數有：  
1. get_XY: 從 DataFrame 中提取需要的 X 與 Y 兩個 numpy array。
2. five_fold_test: 執行一次 5-fold 測試，會呼叫 get_XY_from_forecast_and_observation。
3. hyperparameter_tuning: 針對特定的模型與超參數組合，呼叫 five_fold_test 執行多次 5-fold 測試，並回傳 R2 值。
4. optuna_operation: 利用第三方套件 optuna 執行超參數調整，會呼叫 hyperparameter_tuning。

流程控制函數 flow_control 會呼叫 optuna_operation，而主程式只會直接呼叫 flow_control。

In [10]:
def get_XY(data_df, Y_feature, X_features=None):
    date_related_cols = ['日期數字', '假日', '週六', '週日', '補班', '1~3月', '11~12月', '白日長度']
    
    if Y_feature in ['最高氣溫', '最低氣溫', '氣溫', '風速', '日照率', '全天空日射量']:
        target = 'obs'
    elif Y_feature in ['風力', '太陽能', '尖峰負載', '夜尖峰']:
        target = 'pwd'

    X_cols = []
    if X_features is None:
        for this_col in data_df.columns:
            if '_' in this_col:
                X_cols.append(this_col)
        if target == 'pwd':
            X_cols += date_related_cols
    else:
        for col in data_df.columns:
            dash_splited = col.split('_')
            if len(dash_splited) >= 2:
                if dash_splited[0] in X_features:
                    X_cols.append(col)
            else:
                if col in date_related_cols and col in X_features:
                    X_cols.append(col)

    Xs = np.array(data_df[X_cols])
    Ys = np.array(data_df[Y_feature])

    return Xs, Ys, X_cols

In [None]:
def five_fold_test(Xs, Ys, model=XGBRegressor(), mode='regressor',
                   deep_learning=False, fold_n=5, standard_scale=True, always_test_last_chunk=False):
    
    def metric(Y_test, Y_pred, mode=mode):
        if mode == 'regressor':
            return 1 - np.mean((Y_test - Y_pred)**2) / np.var(Y_test)
        elif mode == 'classifier':
            return f1_score(Y_test, Y_pred)

    shuffle = not always_test_last_chunk
    kf = KFold(n_splits=fold_n, shuffle=shuffle)
    
    XY_folds = {}
    for i, (train_index, test_index) in enumerate(kf.split(Xs)):
        XY_folds[i] = (train_index, test_index)
    
    metric_test_list = []
    metric_train_list = []

    if always_test_last_chunk:
        iters = [fold_n-1]
    else:
        iters = range(fold_n)
    
    for i in iters:
        if deep_learning:
            input_f = model.model.params['input_f']
            output_f = model.model.params['output_f']
            feature_counts = model.model.params['feature_counts']
            dropout_factor = model.model.params['dropout_factor']
            L2_factor = model.L2_factor
            model = FCN_model(input_f=input_f, output_f=output_f, feature_counts=feature_counts,
                              dropout_factor=dropout_factor, L2_factor=L2_factor,mode=mode)
            
        X_train = Xs[XY_folds[i][0]]
        X_test = Xs[XY_folds[i][1]]
        Y_train = Ys[XY_folds[i][0]]
        Y_test = Ys[XY_folds[i][1]]

        if deep_learning:
            X_train_DL, X_val, Y_train_DL, Y_val = train_test_split(X_train, Y_train, test_size=0.20)
    
        if standard_scale:
            scaler = StandardScaler()
            scaler.fit(X_train)
            X_train = scaler.transform(X_train)
            X_test = scaler.transform(X_test)
            if deep_learning:
                X_val = scaler.transform(X_val)
            
        if deep_learning:
            _ = model.fit(X_train_DL, Y_train_DL, X_val, Y_val)
        else:
            _ = model.fit(X_train, Y_train)
    
        Y_pred = model.predict(X_test)
        metric_test_list.append(metric(Y_test, Y_pred))

        Y_pred = model.predict(X_train)
        metric_train_list.append(metric(Y_train, Y_pred))

    metric_test = np.mean(metric_test_list)
    metric_train = np.mean(metric_train_list)
    return metric_test, metric_train

In [16]:
def hyperparameter_tuning(trial, Xs, Ys, model_label='RandomForest', n_iters=50, always_test_last_chunk=False):
    deep_learning = False
    standard_scale = True
    if model_label in ['RandomForest', 'XGBoost', 'LightGBM']:
        cfg = {'max_depth': trial.suggest_int('max_depth', 2, 15),
               'n_estimators': trial.suggest_int('n_estimators', 10, 200)}
        max_depth = cfg['max_depth']
        n_estimators = cfg['n_estimators']
    
        if model_label == 'RandomForest':
            model = RandomForestRegressor(max_depth=max_depth, n_estimators=n_estimators)
        elif model_label == 'XGBoost':
            model = XGBRegressor(max_depth=max_depth, n_estimators=n_estimators)
        elif model_label == 'LightGBM':
            model = LGBMRegressor(force_col_wise=True, verbose=-1, max_depth=max_depth, n_estimators=n_estimators)
    elif model_label == 'SVR':
        cfg = {'C': trial.suggest_float('C', 1e-3, 2e2, log=True),
               'kernel': trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf', 'sigmoid'])}
        C = cfg['C']
        kernel = cfg['kernel']
        model = SVR(C=C, kernel=kernel)
    elif model_label == 'NuSVR':
        cfg = {'C': trial.suggest_float('C', 1e-3, 2e2, log=True),
               'kernel': trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf', 'sigmoid']),
               'nu': trial.suggest_float('nu', 0.1, 0.9)}
        C = cfg['C']
        kernel = cfg['kernel']
        nu = cfg['nu']
        model = NuSVR(C=C, kernel=kernel, nu=nu)
    elif model_label == 'FCN':
        deep_learning = True
        standard_scale = False
        cfg = {'L2_factor': trial.suggest_float('L2_factor', 1e-3, 1, log=True),
               'dropout_factor': trial.suggest_float('dropout_factor', 0, 0.5)}
        L2_factor = cfg['L2_factor']
        dropout_factor = cfg['dropout_factor']
        input_f = Xs.shape[1] 
        output_f = 1 
        feature_counts = [16, 16, 16, 8]
        model = FCN_model(input_f=input_f, output_f=output_f, feature_counts=feature_counts,
                          dropout_factor=dropout_factor, L2_factor=L2_factor)
    elif model_label == 'LinearRegression':
        model = LinearRegression()
        
    R2_list = []
    iterator = range(n_iters)
    for i in iterator:
        R2, _ = five_fold_test(Xs, Ys, model, deep_learning=deep_learning, standard_scale=standard_scale, always_test_last_chunk=always_test_last_chunk)
        R2_list.append(R2)

    return np.mean(R2_list) - np.std(R2_list)

In [17]:
def optuna_operation(model_xcols, Y_feature, weather_power_df,
                     optuna_n_trials=30, n_iters=20, always_test_last_chunk=False, afternoon_peak_only=True):
    model_hyperparameters_dict = {}
    model_r2_dict = {}
    
    if always_test_last_chunk:
        n_iters = 1

    model_labels = list(model_xcols.keys())
    
    for model_label in model_labels:
        X_features = model_xcols[model_label]
        Xs, Ys, _ = get_XY(weather_power_df, X_features, Y_feature)

        this_n_iters = n_iters
        this_optuna_n_trials = optuna_n_trials

        if model_label == 'FCN':
            this_n_iters = min(this_n_iters, 1)

        if model_label == 'LinearRegression':
            this_optuna_n_trials = 1
            this_n_iters = 1

        if afternoon_peak_only and Y_feature=='太陽能':
            flag = np.where(Ys>50)[0]
            Ys = Ys[flag]
            Xs = Xs[flag]

        def target_func(trial, model_label=model_label, Xs=Xs, Ys=Ys, n_iters=this_n_iters, always_test_last_chunk=always_test_last_chunk):
            return hyperparameter_tuning(trial, model_label=model_label, Xs=Xs, Ys=Ys, n_iters=n_iters, always_test_last_chunk=always_test_last_chunk)
        
        sampler = optuna.samplers.TPESampler()
        study = optuna.create_study(sampler=sampler, direction='maximize')
        with tqdm(total=this_optuna_n_trials) as pbar:
            for _ in range(this_optuna_n_trials):
                study.optimize(target_func, n_trials=1, catch=(Exception,))
                pbar.update(1)
        
        print(model_label)
        for key, v in study.best_params.items():
            print(f"Best {key} = {v}")
        print(f"Best R2 = {study.best_value}")
    
        model_hyperparameters_dict[model_label] = study.best_params
        model_r2_dict[model_label] = study.best_value

    return model_hyperparameters_dict, model_r2_dict

## Ensemble Learning

In [18]:
def cross_correlation_matrix(residuals):
    N = len(residuals)
    matrix = np.zeros((N, N))
    for i in range(N):
        for j in range(i, N):
            matrix[i][j] = np.mean(np.array(residuals[i]) * np.array(residuals[j]))

    for i in range(1, N):
        for j in range(i):
            matrix[i][j] = matrix[j][i]

    return matrix

In [19]:
def sovle_optimal_weights(matrix):
    N = matrix.shape[0]
    def objective(weights):
        return weights.T @ matrix @ weights

    initial_weights = np.array([1/N] * N)
    constraints = ({'type': 'eq', 'fun': lambda w: np.sum(w) - 1})
    bounds = [(0, 1)] * N
    result = minimize(objective, initial_weights, method='SLSQP', bounds=bounds, constraints=constraints)
    
    optimal_weights = result.x
    return optimal_weights

In [None]:
def assign_model(model_label, Xs, model_hyperparameters_dict, mode='regressor'):
    if mode == 'regressor':
        if model_label == 'LinearRegression':
            model = LinearRegression()
        elif model_label == 'RandomForest':
            model = RandomForestRegressor(**model_hyperparameters_dict[model_label])
        elif model_label == 'XGBoost':
            model = XGBRegressor(**model_hyperparameters_dict[model_label])
        elif model_label == 'LightGBM':
            model = LGBMRegressor(force_col_wise=True, verbose=-1, **model_hyperparameters_dict[model_label])
        elif model_label == 'SVR':
            model = SVR(**model_hyperparameters_dict[model_label])
        elif model_label == 'NuSVR':
            model = NuSVR(**model_hyperparameters_dict[model_label])
        elif model_label == 'FCN':
            input_f = Xs.shape[1]
            output_f = 1
            feature_counts = [16, 16, 16, 8]
            model = FCN_model(input_f=input_f, output_f=output_f, feature_counts=feature_counts,
                      **model_hyperparameters_dict[model_label])
    elif mode == 'classifier':
        if model_label == 'FCN':
            input_f = Xs.shape[1]
            output_f = 1
            feature_counts = [16, 16, 16, 8]
            model = FCN_model(input_f=input_f, output_f=output_f, feature_counts=feature_counts,
                                mode='classifier', 
                                **model_hyperparameters_dict[model_label])
        elif model_label == 'RandomForest':
            model = RandomForestClassifier(**model_hyperparameters_dict[model_label])
        elif model_label == 'XGBoost':
            model = XGBClassifier(**model_hyperparameters_dict[model_label])
        elif model_label == 'LightGBM':
            model = LGBMClassifier(force_col_wise=True, verbose=-1, **model_hyperparameters_dict[model_label])
        elif model_label == 'SVC':
            model = SVC(**model_hyperparameters_dict[model_label])
        elif model_label == 'NuSVC':
            model = NuSVC(**model_hyperparameters_dict[model_label])
        elif model_label == 'LogisticRegression':
            model = LogisticRegression(**model_hyperparameters_dict[model_label])
    return model

In [20]:
def find_avg_score_with_given_model_list(model_hyperparameters_dict, model_xcols,
                                         data_df, Y_feature, mode='regressor',
                                         n_iters=200, weights=None):

    if mode == 'regressor':
        metric_name = 'MAE'
    elif mode == 'classifier':
        metric_name = 'F1'
    
    if type(data_df) in [list, tuple]:
        if len(data_df) == 2:
            forecast_df, observation_df = data_df
            target = 'obs'
        else:
            raise Exception('Input data_df should be a DataFrame or a list contain 2 DataFrames.')
    else:
        target = 'pwd'
    
    def get_prediction(model_label, Y_train, train_ind, test_ind,
                       model_hyperparameters_dict=model_hyperparameters_dict,
                       model_xcols=model_xcols,
                       forecast_df=forecast_df,
                       observation_df=observation_df, 
                       Y_feature=Y_feature,
                       target=target):
        
        X_features = model_xcols[model_label]
        if target == 'pwd':
            Xs, _, _ = get_XY(data_df, Y_feature=Y_feature, X_features=X_features)
        elif target == 'obs':
            Xs, _, _, _ = get_XY_from_forecast_and_observation(forecast_df, observation_df, X_features, Y_feature)
        model = assign_model(model_label, Xs, model_hyperparameters_dict=model_hyperparameters_dict, mode=mode)

        deep_learning = False
        if model_label == 'FCN':
            deep_learning = True
    
        X_train = Xs[train_ind]
        X_test = Xs[test_ind]
        
        if deep_learning:
            X_train_dl, X_val, Y_train_dl, Y_val = train_test_split(X_train, Y_train, test_size=0.20)
            _ = model.fit(X_train_dl, Y_train_dl, X_val, Y_val)
        else:
            scaler = StandardScaler()
            X_scaler = scaler.fit(X_train)
            X_train = X_scaler.transform(X_train)
            X_test = X_scaler.transform(X_test)
            _ = model.fit(X_train, Y_train)
        YP = model.predict(X_test)
        return YP        
    
    if weights is None:
        ensemble_models = list(model_hyperparameters_dict.keys())
    else:
        ensemble_models = list(weights.keys())
        
    Y_pred_iters = []
    Y_test_iters = []
    metric = []

    X_features=model_xcols[ensemble_models[0]]
    if target == 'pwd':
        Xs, Ys, _ = get_XY(data_df, Y_feature=Y_feature, X_features=X_features)
    elif target == 'obs':
        Xs, Ys, _, _ = get_XY_from_forecast_and_observation(forecast_df, observation_df, X_features, Y_feature)

    matrix = np.zeros((len(ensemble_models), len(ensemble_models)))
    for i in tqdm(range(n_iters)):
        train_ind, test_ind, _, _ = train_test_split(np.arange(Xs.shape[0]), np.arange(Xs.shape[0]), test_size=0.20)
        
        Y_train = Ys[train_ind]
        Y_test = Ys[test_ind]
        
        Y_preds = []
        this_metric = []
        for model_label in ensemble_models:
            YP = get_prediction(model_label, Y_train, train_ind, test_ind)
            if mode == 'regressor':
                this_metric.append(MAE(Y_test, YP))
            elif mode == 'classifier':
                YP[np.where(YP<0.5)] = 0
                YP[np.where(YP>=0.5)] = 1
                this_metric.append(f1_score(Y_test, YP))
            Y_preds.append(YP)
            
        residuals = Y_preds - np.array([Y_test] * len(Y_preds)).reshape(len(Y_preds),-1)
        if weights is None:
            matrix += cross_correlation_matrix(residuals)

        uniform_ensemble_YP = np.mean(Y_preds, axis=0)
        if mode == 'regressor':
            this_metric.append(MAE(Y_test, uniform_ensemble_YP))
        elif mode == 'classifier':
            uniform_ensemble_YP[np.where(uniform_ensemble_YP<0.5)] = 0
            uniform_ensemble_YP[np.where(uniform_ensemble_YP>=0.5)] = 1
            this_metric.append(f1_score(Y_test, uniform_ensemble_YP))

        metric.append(this_metric)
        Y_pred_iters.append(Y_preds)
        Y_test_iters.append(Y_test)

    if weights is None:
        matrix = matrix / n_iters
        optimal_weights = sovle_optimal_weights(matrix)
    else:
        optimal_weights = weights

    weighted_metric = []
    for i in range(n_iters):
        weighted_YP = np.sum(Y_pred_iters[i] * np.concatenate([optimal_weights.reshape(-1,1),] * Y_test_iters[0].shape[0], axis = 1), axis=0)
        if mode == 'regressor':
            weighted_metric.append(MAE(Y_test_iters[i], weighted_YP))
        elif mode == 'classifier':
            weighted_YP[np.where(weighted_YP<0.5)] = 0
            weighted_YP[np.where(weighted_YP>=0.5)] = 1
            weighted_metric.append(f1_score(Y_test_iters[i], weighted_YP))
    weighted_metric = np.array(weighted_metric).reshape(-1, 1)
    array_metric = np.array(metric)
    array_metric = np.concatenate([metric, weighted_metric], axis=1)
    
    metric_dict = {
        'Model': ensemble_models + ['Ensemble', 'Weighted_Ensemble'],
        f'Avg {metric_name}': list(np.mean(array_metric, axis=0)), 
        f'Std {metric_name}': list(np.std(array_metric, axis=0)),
        '90th percentile': list(np.sort(array_metric, axis=0)[int(array_metric.shape[0] * 0.9) - 1]),
        '10th percentile': list(np.sort(array_metric, axis=0)[int(array_metric.shape[0] * 0.1) - 1])
        }
    
    df = pd.DataFrame(metric_dict)
    if mode == 'regressor':
        df = df.sort_values('90th percentile').reset_index(drop=True)
    elif mode == 'classifier':
        df = df.sort_values('10th percentile').reset_index(drop=True)

    if weights is not None:
        return df
        
    optimal_weights_dict = {}
    for i, w in enumerate(optimal_weights):
        optimal_weights_dict[ensemble_models[i]] = w
        
    return df, optimal_weights_dict

In [21]:
def save_model_metadata(file_path, model_xcols, model_hyperparameters_dict, optimal_weights):
    model_labels = list(model_hyperparameters_dict)
    output_dict = {
        'X_feature_dict':{},
        'hyperparameters_dict':{},
        'weights':{}
    }
    for model_label in model_labels:
        if optimal_weights[model_label] > 0.0005:
            output_dict['X_feature_dict'][model_label] = model_xcols[model_label]
            output_dict['hyperparameters_dict'][model_label] = model_hyperparameters_dict[model_label]
            output_dict['weights'][model_label] = optimal_weights[model_label]

    with open(file_path, 'w') as f:
        json.dump(output_dict, f)

### 流程控制

In [25]:
def flow_control(Y_feature, model_xcols, data_df, speed_test=False, mode='regression',
                 train_model_path=train_model_path, optuna_done=optuna_done, weights_determined=weights_determined):

    if mode == 'regression':
        metric_name = 'MAE'
    elif mode == 'classifier':
        metric_name = 'F1'
    
    n_iter_dict = {
        'hyper_parameter': 30,
        'ensemble_weight': 200
    }
    if speed_test:
        n_iter_dict = {
            'hyper_parameter': 1,
            'ensemble_weight': 20
        }
    
    this_model_path = f'{train_model_path}{Y_feature}/'
    os.makedirs(this_model_path, exist_ok=True)

    # 如果指定的 meta 檔存在，並且初始參數規定不須重新計算，則套用存檔數值。
    if os.path.exists(f'{train_model_path}{Y_feature}/meta.json'):
        with open(f'{train_model_path}{Y_feature}/meta.json', 'r') as f:
            meta = json.load(f)
    else:
        optuna_done[Y_feature] = False
        weights_determined[Y_feature] = False

    # 超參數
    if optuna_done[Y_feature]:
        model_xcols = meta['X_feature_dict']
        model_hyperparameters_dict = meta['hyperparameters_dict']
    else: 
        model_hyperparameters_dict, model_r2_dict = optuna_operation(model_xcols, Y_feature, data_df, n_iters=n_iter_dict['hyper_parameter'])

    # 集成權重
    if weights_determined[Y_feature]:
        optimal_weights = meta['weights']
        df = pd.read_csv(f'{this_model_path}predict_{metric_name}.df')
        display(df)
        print('Weights:')
        for i, k in enumerate(model_hyperparameters_dict.keys()):
            print(f'{k}: {optimal_weights[k]:.3f}')
    else:
        if 'FCN' in model_hyperparameters_dict.keys():
            n_iters = int(n_iter_dict['ensemble_weight']/4)
        else:
            n_iters = n_iter_dict['ensemble_weight']
        df, optimal_weights = find_avg_score_with_given_model_list(model_hyperparameters_dict, model_xcols, data_df, Y_feature, n_iters=n_iters)
        display(df)
        df.to_csv(f'{this_model_path}predict_{metric_name}.df', index=False, encoding='utf-8-sig')
        print('Weights:')
        for i, k in enumerate(model_hyperparameters_dict.keys()):
            print(f'{k}: {optimal_weights[k]:.3f}')

    print(' ')
    print(' ')
    print('**Copy and Paste following lines into the next cell.**')
    for model_label in model_hyperparameters_dict.keys():
        print('##### ' + model_label)
        for key, v in model_hyperparameters_dict[model_label].items():
            print(f"Best {key} = {v}  ")
        if 'model_r2_dict' in locals().keys():
            print(f"Best R2 = {model_r2_dict[model_label]}  ")
        print(f'Weight = {optimal_weights[model_label]:.3f}')
    print(' ')
    print(' ')
    
    save_model_metadata(Y_feature, this_model_path + 'meta.json', model_xcols, model_hyperparameters_dict, optimal_weights)

# 風力

## 超參數

In [19]:
Y_feature = '風力'

model_xcols = {
    'LinearRegression': ['風速', '日期數字', '假日', '週六', '週日', '補班', '1~3月', '11~12月'],
    'FCN': ['風速', '日期數字', '假日', '週六', '週日', '補班', '1~3月', '11~12月'],
    'RandomForest': ['風速', '日期數字', '假日', '週六', '週日', '補班', '1~3月', '11~12月'],
    'XGBoost': ['風速', '日期數字', '假日', '週六', '週日', '補班', '1~3月', '11~12月'],
    'LightGBM': ['風速', '日期數字', '假日', '週六', '週日', '補班', '1~3月', '11~12月'],
    'SVR': ['風速', '日期數字', '假日', '週六', '週日', '補班', '1~3月', '11~12月'],
    'NuSVR': ['風速', '日期數字', '假日', '週六', '週日', '補班', '1~3月', '11~12月'],
}

if not optuna_done[Y_feature]:
    model_hyperparameters_dict = optuna_operation(model_xcols, Y_feature, weather_power_df)
else:
    model_hyperparameters_dict = {
        'LinearRegression': {},
        'FCN': {'L2_factor':0.013006, 'dropout_factor':0.39590},
        'RandomForest': {'max_depth': 10, 'n_estimators': 71},
        'XGBoost': {'max_depth': 2, 'n_estimators': 76},
        'LightGBM': {'max_depth': 13, 'n_estimators': 76},
        'SVR': {'C': 151.362, 'kernel': 'rbf'},
        'NuSVR': {'C': 6.6276, 'kernel': 'linear', 'nu': 0.3052}
    }

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 21.14it/s]


LinearRegression
Best R2 = 0.6280852942234002


100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [38:06<00:00, 76.21s/it]


FCN
Best L2_factor = 0.013005685305042187
Best dropout_factor = 0.3959026641543964
Best R2 = 0.7200460389841302


100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [12:34<00:00, 25.15s/it]


RandomForest
Best max_depth = 10
Best n_estimators = 71
Best R2 = 0.737423921427763


100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [04:36<00:00,  9.23s/it]


XGBoost
Best max_depth = 2
Best n_estimators = 76
Best R2 = 0.72186083780968


100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [03:41<00:00,  7.38s/it]


LightGBM
Best max_depth = 13
Best n_estimators = 76
Best R2 = 0.7303359820446095


100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [00:43<00:00,  1.45s/it]


SVR
Best C = 151.36176482779152
Best kernel = rbf
Best R2 = 0.7489775421486635


100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [01:29<00:00,  2.97s/it]

NuSVR
Best C = 6.627594643188466
Best kernel = linear
Best nu = 0.30521882998925987
Best R2 = 0.6464490390568155
##### LinearRegression
Best R2 = 0.6280852942234002  
##### FCN
Best L2_factor = 0.013005685305042187  
Best dropout_factor = 0.3959026641543964  
Best R2 = 0.7200460389841302  
##### RandomForest
Best max_depth = 10  
Best n_estimators = 71  
Best R2 = 0.737423921427763  
##### XGBoost
Best max_depth = 2  
Best n_estimators = 76  
Best R2 = 0.72186083780968  
##### LightGBM
Best max_depth = 13  
Best n_estimators = 76  
Best R2 = 0.7303359820446095  
##### SVR
Best C = 151.36176482779152  
Best kernel = rbf  
Best R2 = 0.7489775421486635  
##### NuSVR
Best C = 6.627594643188466  
Best kernel = linear  
Best nu = 0.30521882998925987  
Best R2 = 0.6464490390568155  





##### LinearRegression
Best R2 = 0.6280852942234002  
##### FCN
Best L2_factor = 0.013005685305042187  
Best dropout_factor = 0.3959026641543964  
Best R2 = 0.7200460389841302  
##### RandomForest
Best max_depth = 10  
Best n_estimators = 71  
Best R2 = 0.737423921427763  
##### XGBoost
Best max_depth = 2  
Best n_estimators = 76  
Best R2 = 0.72186083780968  
##### LightGBM
Best max_depth = 13  
Best n_estimators = 76  
Best R2 = 0.7303359820446095  
##### SVR
Best C = 151.36176482779152  
Best kernel = rbf  
Best R2 = 0.7489775421486635  
##### NuSVR
Best C = 6.627594643188466  
Best kernel = linear  
Best nu = 0.30521882998925987  
Best R2 = 0.6464490390568155   

## 集成

In [20]:
if weights_determined[Y_feature]:
    optimal_weights = {
        'LinearRegression': 0.000,
        'FCN': 0.122,
        'RandomForest': 0.023,
        'XGBoost': 0.410,
        'LightGBM': 0.130,
        'SVR': 0.300,
        'NuSVR': 0.015,
    }
    print('Weights:')
    for i, k in enumerate(model_hyperparameters_dict.keys()):
        print(f'{k}: {optimal_weights[k]:.3f}')
else:
    if 'FCN' in model_hyperparameters_dict.keys():
        n_iters = 50
    else:
        n_iters = 200
    df, optimal_weights = find_avg_score_with_given_model_list(model_hyperparameters_dict, model_xcols, weather_power_df, Y_feature, n_iters=n_iters)
    display(df)
    print('Weights:')
    for i, k in enumerate(model_hyperparameters_dict.keys()):
        print(f'{k}: {optimal_weights[k]:.3f}')

this_model_path = train_model_path + f'{Y_feature}/'
os.makedirs(this_model_path, exist_ok=True)

save_model_metadata(Y_feature, this_model_path + 'meta.json', model_xcols, model_hyperparameters_dict, optimal_weights)

100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [15:18<00:00, 18.37s/it]


Unnamed: 0,Model,Avg MAE,Std MAE,90th percentile
0,Weighted_Ensemble,24.199624,2.085586,27.01472
1,XGBoost,24.934541,2.325935,27.778277
2,LightGBM,25.360287,2.289458,28.179406
3,RandomForest,25.745485,2.381327,28.41214
4,SVR,26.475461,2.236221,28.824463
5,Ensemble,25.858541,2.249777,29.041296
6,FCN,28.323991,2.764928,32.447912
7,LinearRegression,33.064959,2.697455,37.106278
8,NuSVR,34.024622,2.593434,37.299503


Weights:
LinearRegression: 0.000
FCN: 0.122
RandomForest: 0.023
XGBoost: 0.410
LightGBM: 0.130
SVR: 0.300
NuSVR: 0.015


# 太陽能

## 超參數

In [22]:
Y_feature = '太陽能'

universal_xcols = ['氣溫', '最高氣溫', '最低氣溫', '全天空日射量', '日期數字', '假日', '週六', '週日', '補班', '白日長度']

model_xcols = {
    'LinearRegression': universal_xcols,
    'FCN': universal_xcols,
    'RandomForest': universal_xcols,
    'XGBoost': universal_xcols,
    'LightGBM': universal_xcols,
    'SVR': universal_xcols,
    'NuSVR': universal_xcols,
}

if not optuna_done[Y_feature]:
    model_hyperparameters_dict = optuna_operation(model_xcols, Y_feature, weather_power_df, afternoon_peak_only=False)
else:
    model_hyperparameters_dict = {
        'LinearRegression': {},
        'FCN': {'L2_factor': 0.10538, 'dropout_factor': 0.33064},
        'RandomForest': {'max_depth': 7, 'n_estimators': 86},
        'XGBoost': {'max_depth': 2, 'n_estimators': 48},
        'LightGBM': {'max_depth': 2, 'n_estimators': 52},
        'SVR': {'C': 3.9957, 'kernel': 'linear'},
        'NuSVR': {'C': 9.01112, 'kernel': 'linear', 'nu': 0.5692}
    }

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  7.50it/s]


LinearRegression
Best R2 = 0.6875719046272957


100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [38:30<00:00, 77.03s/it]


FCN
Best L2_factor = 0.018184979763796686
Best dropout_factor = 0.37066766054759914
Best R2 = 0.7101254497981448


100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [10:40<00:00, 21.33s/it]


RandomForest
Best max_depth = 13
Best n_estimators = 63
Best R2 = 0.6804339794603214


100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [04:20<00:00,  8.68s/it]


XGBoost
Best max_depth = 2
Best n_estimators = 41
Best R2 = 0.6878504561506845


100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [02:16<00:00,  4.55s/it]


LightGBM
Best max_depth = 7
Best n_estimators = 38
Best R2 = 0.6549993136006089


100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [00:38<00:00,  1.29s/it]


SVR
Best C = 9.259567086481182
Best kernel = linear
Best R2 = 0.6769261782554706


100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [01:03<00:00,  2.11s/it]

NuSVR
Best C = 10.228824707487302
Best kernel = linear
Best nu = 0.4459203293886881
Best R2 = 0.6798091490334639
##### LinearRegression
Best R2 = 0.6875719046272957  
##### FCN
Best L2_factor = 0.018184979763796686  
Best dropout_factor = 0.37066766054759914  
Best R2 = 0.7101254497981448  
##### RandomForest
Best max_depth = 13  
Best n_estimators = 63  
Best R2 = 0.6804339794603214  
##### XGBoost
Best max_depth = 2  
Best n_estimators = 41  
Best R2 = 0.6878504561506845  
##### LightGBM
Best max_depth = 7  
Best n_estimators = 38  
Best R2 = 0.6549993136006089  
##### SVR
Best C = 9.259567086481182  
Best kernel = linear  
Best R2 = 0.6769261782554706  
##### NuSVR
Best C = 10.228824707487302  
Best kernel = linear  
Best nu = 0.4459203293886881  
Best R2 = 0.6798091490334639  





##### LinearRegression
Best R2 = 0.6635612498392781  
##### FCN
Best L2_factor = 0.1053827379092039  
Best dropout_factor = 0.33064438957159265  
Best R2 = 0.7176992185797516  
##### RandomForest
Best max_depth = 7  
Best n_estimators = 86  
Best R2 = 0.7111976622077838  
##### XGBoost
Best max_depth = 2  
Best n_estimators = 48  
Best R2 = 0.6828467775270773  
##### LightGBM
Best max_depth = 2  
Best n_estimators = 52  
Best R2 = 0.7244731778954127  
##### SVR
Best C = 3.9956951625086528  
Best kernel = linear  
Best R2 = 0.6759829409350249  
##### NuSVR
Best C = 9.011120964017998  
Best kernel = linear  
Best nu = 0.5691703607433763  
Best R2 = 0.6795263138372115

## 集成

In [23]:
if weights_determined[Y_feature]:
    optimal_weights = {
        'LinearRegression': 0.134,
        'FCN': 0.365,
        'RandomForest': 0.255,
        'XGBoost': 0.246,
        'LightGBM': 0.000,
        'SVR': 0.000,
        'NuSVR': 0.000
    }
    print('Weights:')
    for i, k in enumerate(model_hyperparameters_dict.keys()):
        print(f'{k}: {optimal_weights[k]:.3f}')
else:
    if 'FCN' in model_hyperparameters_dict.keys():
        n_iters = 50
    else:
        n_iters = 200
    df, optimal_weights = find_avg_score_with_given_model_list(model_hyperparameters_dict, model_xcols, weather_power_df, Y_feature, n_iters=n_iters)
    display(df)
    print('Weights:')
    for i, k in enumerate(model_hyperparameters_dict.keys()):
        print(f'{k}: {optimal_weights[k]:.3f}')

this_model_path = train_model_path + f'{Y_feature}/'
os.makedirs(this_model_path, exist_ok=True)

save_model_metadata(Y_feature, this_model_path + 'meta.json', model_xcols, model_hyperparameters_dict, optimal_weights)

100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [13:42<00:00, 16.45s/it]


Unnamed: 0,Model,Avg MAE,Std MAE,90th percentile
0,Weighted_Ensemble,95.205535,9.997782,107.06464
1,Ensemble,98.24866,9.591001,109.546216
2,RandomForest,98.239794,10.756025,110.935188
3,LightGBM,104.635877,9.261996,115.623725
4,XGBoost,103.563825,10.366533,116.412344
5,NuSVR,110.886522,8.780802,123.151892
6,FCN,102.917325,13.631693,124.791749
7,LinearRegression,112.42371,9.528354,125.082204
8,SVR,112.850473,8.976209,125.269122


Weights:
LinearRegression: 0.199
FCN: 0.335
RandomForest: 0.181
XGBoost: 0.206
LightGBM: 0.080
SVR: 0.000
NuSVR: 0.000


# 尖峰負載

## 超參數

In [23]:
Y_feature = '尖峰負載'

model_xcols = {
    'LinearRegression': ['氣溫', '最高氣溫', '最低氣溫', '日期數字', '假日', '週六', '週日', '補班', '1~3月', '11~12月'],
    'FCN': ['氣溫', '最高氣溫', '最低氣溫', '日期數字', '假日', '週六', '週日', '補班', '1~3月', '11~12月'],
    'RandomForest': ['氣溫', '最高氣溫', '最低氣溫', '日期數字', '假日', '週六', '週日', '補班', '1~3月', '11~12月'],
    'XGBoost': ['氣溫', '最高氣溫', '最低氣溫', '日期數字', '假日', '週六', '週日', '補班', '1~3月', '11~12月'],
    'LightGBM': ['氣溫', '最高氣溫', '最低氣溫', '日期數字', '假日', '週六', '週日', '補班', '1~3月', '11~12月'],
    'SVR': ['氣溫', '最高氣溫', '最低氣溫', '日期數字', '假日', '週六', '週日', '補班', '1~3月', '11~12月'],
    'NuSVR': ['氣溫', '最高氣溫', '最低氣溫', '日期數字', '假日', '週六', '週日', '補班', '1~3月', '11~12月'],
}

if not optuna_done[Y_feature]:
    model_hyperparameters_dict = optuna_operation(model_xcols, Y_feature, weather_power_df)
else:
    model_hyperparameters_dict = {
        'LinearRegression': {},
        'FCN': {'L2_factor': 0.06428, 'dropout_factor': 0.16268},
        'RandomForest': {'max_depth': 12, 'n_estimators': 84},
        'XGBoost': {'max_depth': 2, 'n_estimators': 87},
        'LightGBM': {'max_depth': 14, 'n_estimators': 80},
        'SVR': {'C': 196.04, 'kernel': 'rbf'},
        'NuSVR': {'C': 25.491, 'kernel': 'linear', 'nu': 0.3246}
    }

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 17.39it/s]


LinearRegression
Best R2 = 0.9084907998139133


100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [37:35<00:00, 75.20s/it]


FCN
Best L2_factor = 0.06427546813184054
Best dropout_factor = 0.16268006046854297
Best R2 = 0.9454512253809562


100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [12:55<00:00, 25.86s/it]


RandomForest
Best max_depth = 12
Best n_estimators = 84
Best R2 = 0.9121767406884245


100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [05:59<00:00, 11.97s/it]


XGBoost
Best max_depth = 2
Best n_estimators = 87
Best R2 = 0.930225265253464


100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [04:35<00:00,  9.20s/it]


LightGBM
Best max_depth = 14
Best n_estimators = 80
Best R2 = 0.8808688778872141


100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [00:33<00:00,  1.12s/it]


SVR
Best C = 196.04254877334432
Best kernel = rbf
Best R2 = 0.9361217780088962


100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [01:01<00:00,  2.04s/it]

NuSVR
Best C = 25.491055542125657
Best kernel = linear
Best nu = 0.32464880623200465
Best R2 = 0.9037566008510232
##### LinearRegression
Best R2 = 0.9084907998139133  
##### FCN
Best L2_factor = 0.06427546813184054  
Best dropout_factor = 0.16268006046854297  
Best R2 = 0.9454512253809562  
##### RandomForest
Best max_depth = 12  
Best n_estimators = 84  
Best R2 = 0.9121767406884245  
##### XGBoost
Best max_depth = 2  
Best n_estimators = 87  
Best R2 = 0.930225265253464  
##### LightGBM
Best max_depth = 14  
Best n_estimators = 80  
Best R2 = 0.8808688778872141  
##### SVR
Best C = 196.04254877334432  
Best kernel = rbf  
Best R2 = 0.9361217780088962  
##### NuSVR
Best C = 25.491055542125657  
Best kernel = linear  
Best nu = 0.32464880623200465  
Best R2 = 0.9037566008510232  





##### LinearRegression
Best R2 = 0.9084907998139133  
##### FCN
Best L2_factor = 0.06427546813184054  
Best dropout_factor = 0.16268006046854297  
Best R2 = 0.9454512253809562  
##### RandomForest
Best max_depth = 12  
Best n_estimators = 84  
Best R2 = 0.9121767406884245  
##### XGBoost
Best max_depth = 2  
Best n_estimators = 87  
Best R2 = 0.930225265253464  
##### LightGBM
Best max_depth = 14  
Best n_estimators = 80  
Best R2 = 0.8808688778872141  
##### SVR
Best C = 196.04254877334432  
Best kernel = rbf  
Best R2 = 0.9361217780088962  
##### NuSVR
Best C = 25.491055542125657  
Best kernel = linear  
Best nu = 0.32464880623200465  
Best R2 = 0.9037566008510232  

## 集成

In [24]:
if weights_determined[Y_feature]:
    optimal_weights = {
        'LinearRegression': 0.000,
        'FCN': 0.273,
        'RandomForest': 0.000,
        'XGBoost': 0.198,
        'LightGBM': 0.065,
        'SVR': 0.463,
        'NuSVR': 0.000
    }
    print('Weights:')
    for i, k in enumerate(model_hyperparameters_dict.keys()):
        print(f'{k}: {optimal_weights[k]:.3f}')
else:
    if 'FCN' in model_hyperparameters_dict.keys():
        n_iters = 50
    else:
        n_iters = 200
    df, optimal_weights = find_avg_score_with_given_model_list(model_hyperparameters_dict, model_xcols, weather_power_df, Y_feature, n_iters=n_iters)
    display(df)
    print('Weights:')
    for i, k in enumerate(model_hyperparameters_dict.keys()):
        print(f'{k}: {optimal_weights[k]:.3f}')

this_model_path = train_model_path + f'{Y_feature}/'
os.makedirs(this_model_path, exist_ok=True)

save_model_metadata(Y_feature, this_model_path + 'meta.json', model_xcols, model_hyperparameters_dict, optimal_weights)

100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [13:11<00:00, 15.82s/it]


Unnamed: 0,Model,Avg MAE,Std MAE,90th percentile
0,Weighted_Ensemble,56.46706,5.360927,64.073659
1,SVR,57.630002,6.102505,64.643932
2,Ensemble,67.834988,6.890786,76.767072
3,FCN,69.571028,9.613673,78.913223
4,XGBoost,73.844534,6.641479,82.271831
5,RandomForest,77.023023,9.758401,88.121464
6,LightGBM,93.276944,10.538114,106.600944
7,LinearRegression,114.109195,8.703702,124.941865
8,NuSVR,117.0021,8.555598,127.197825


Weights:
LinearRegression: 0.000
FCN: 0.273
RandomForest: 0.000
XGBoost: 0.198
LightGBM: 0.065
SVR: 0.463
NuSVR: 0.000


# 夜尖峰

In [25]:
from sklearn.svm import SVC, NuSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [26]:
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)

## 超參數

In [28]:
def five_fold_classify_test(Xs, Ys, model=XGBRegressor(), deep_learning=False, fold_n=5, standard_scale=True, always_test_last_chunk=False):

    shuffle = not always_test_last_chunk
    kf = KFold(n_splits=fold_n, shuffle=shuffle)
    
    XY_folds = {}
    for i, (train_index, test_index) in enumerate(kf.split(Xs)):
        XY_folds[i] = (train_index, test_index)
    
    f1_test_list = []
    f1_train_list = []

    if always_test_last_chunk:
        iters = [fold_n-1]
    else:
        iters = range(fold_n)
    
    for i in iters:
        X_train = Xs[XY_folds[i][0]]
        X_test = Xs[XY_folds[i][1]]
        Y_train = Ys[XY_folds[i][0]]
        Y_test = Ys[XY_folds[i][1]]

        if deep_learning:
            X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.20)
    
        if standard_scale:
            scaler = StandardScaler()
            scaler.fit(X_train)
            X_train = scaler.transform(X_train)
            X_test = scaler.transform(X_test)
            
        if deep_learning:
            _ = model.fit(X_train, Y_train, X_val, Y_val)
        else:
            _ = model.fit(X_train, Y_train)

        Y_pred = model.predict(X_test)
        # if deep_learning:
        #     Y_pred[np.where(Y_pred<0.5)] = 0
        #     Y_pred[np.where(Y_pred>=0.5)] = 1
        f1 = f1_score(Y_test, Y_pred)
        f1_test_list.append(f1)   

        Y_pred = model.predict(X_train)
        # if deep_learning:
        #     Y_pred[np.where(Y_pred<0.5)] = 0
        #     Y_pred[np.where(Y_pred>=0.5)] = 1
        f1 = f1_score(Y_train, Y_pred)
        f1_train_list.append(f1)

    # if deep_learning:
    #     print(R2_train_list, R2_test_list)

    f1_test = np.mean(f1_test_list)
    f1_train = np.mean(f1_train_list)
    return f1_test, f1_train

In [29]:
def classfier_hyperparameter_tuning(trial, Xs, Ys, model_label='RandomForest', n_iters=50, always_test_last_chunk=False):
    deep_learning = False
    standard_scale = True
    if model_label in ['RandomForest', 'XGBoost', 'LightGBM']:
        cfg = {'max_depth': trial.suggest_int('max_depth', 2, 15),
               'n_estimators': trial.suggest_int('n_estimators', 10, 100)}
        max_depth = cfg['max_depth']
        n_estimators = cfg['n_estimators']
    
        if model_label == 'RandomForest':
            model = RandomForestClassifier(max_depth=max_depth, n_estimators=n_estimators)
        elif model_label == 'XGBoost':
            model = XGBClassifier(max_depth=max_depth, n_estimators=n_estimators)
        elif model_label == 'LightGBM':
            model = LGBMClassifier(force_col_wise=True, verbose=-1, max_depth=max_depth, n_estimators=n_estimators)
    elif model_label == 'SVC':
        cfg = {'C': trial.suggest_float('C', 1e-3, 2e2, log=True),
               'kernel': trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf', 'sigmoid'])}
        C = cfg['C']
        kernel = cfg['kernel']
        model = SVC(C=C, kernel=kernel)
    elif model_label == 'LogisticRegression':
        cfg = {'C': trial.suggest_float('C', 1e-3, 2e2, log=True),
               'solver': trial.suggest_categorical('solver', ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'])}
        C = cfg['C']
        solver = cfg['solver']
        model = LogisticRegression(C=C, solver=solver)
    elif model_label == 'FCN':
        deep_learning = True
        standard_scale = False
        cfg = {'L2_factor': trial.suggest_float('L2_factor', 1e-3, 1, log=True),
               'dropout_factor': trial.suggest_float('dropout_factor', 0, 0.5)}
        L2_factor = cfg['L2_factor']
        dropout_factor = cfg['dropout_factor']
        input_f = Xs.shape[1] 
        output_f = 1 
        feature_counts = [16, 16, 16, 8]
        model = FCN_model(input_f=input_f, output_f=output_f, feature_counts=feature_counts,
                          dropout_factor=dropout_factor, L2_factor=L2_factor, mode='classifier')

    f1_list = []
    if deep_learning:
        iterator = tqdm(range(n_iters))
    else:
        iterator = range(n_iters)
    for i in iterator:
        f1, _ = five_fold_classify_test(Xs, Ys, model, deep_learning=deep_learning, standard_scale=standard_scale, always_test_last_chunk=always_test_last_chunk)
        f1_list.append(f1)

    return np.mean(f1_list) - np.std(f1_list)

In [31]:
Y_feature = '夜尖峰'
model_xcols = {
    'FCN': ['氣溫', '最高氣溫', '最低氣溫', '日期數字', '假日', '週六', '週日', '補班', '白日長度'],
    'LogisticRegression': ['氣溫', '最高氣溫', '最低氣溫', '日期數字', '假日', '週六', '週日', '補班', '白日長度'],
    'RandomForest': ['氣溫', '最高氣溫', '最低氣溫', '日期數字', '假日', '週六', '週日', '補班', '白日長度'],
    'XGBoost': ['氣溫', '最高氣溫', '最低氣溫', '日期數字', '假日', '週六', '週日', '補班', '白日長度'],
    'LightGBM': ['氣溫', '最高氣溫', '最低氣溫', '日期數字', '假日', '週六', '週日', '補班', '白日長度'],
    'SVC': ['氣溫', '最高氣溫', '最低氣溫', '日期數字', '假日', '週六', '週日', '補班', '白日長度']
}

if not optuna_done[Y_feature]:
    model_hyperparameters_dict = {}
    model_r2_dict = {}
    optuna_n_trials = 30
    n_iters = 20
    always_test_last_chunk = False
    if always_test_last_chunk:
        n_iters = 1

    model_labels = model_xcols.keys()
    
    for model_label in model_labels:
        X_features = model_xcols[model_label]
        Xs, Ys, X_cols = get_XY(weather_power_df, Y_feature=Y_feature)
        if model_label == 'FCN':
            n_iters = min(n_iters, 5)
        def target_func(trial, model_label=model_label, Xs=Xs, Ys=Ys, n_iters=n_iters, always_test_last_chunk=always_test_last_chunk):
            return classfier_hyperparameter_tuning(trial, model_label=model_label, Xs=Xs, Ys=Ys, n_iters=n_iters, always_test_last_chunk=always_test_last_chunk)
        
        sampler = optuna.samplers.TPESampler()
        study = optuna.create_study(sampler=sampler, direction='maximize')

        if model_label == 'FCN':
            optuna.logging.set_verbosity(optuna.logging.INFO)
            study.optimize(target_func, n_trials=optuna_n_trials)
        else:
            optuna.logging.set_verbosity(optuna.logging.WARNING)
            with tqdm(total=optuna_n_trials) as pbar:
                for _ in range(optuna_n_trials):
                    study.optimize(target_func, n_trials=1, catch=(Exception,))
                    pbar.update(1)
    
        print(model_label)
        for key, v in study.best_params.items():
            print(f"Best {key} = {v}")
        print(f"Best F1 = {study.best_value}")
    
        model_hyperparameters_dict[model_label] = study.best_params
        model_r2_dict[model_label] = study.best_value
        
    for model_label in model_labels:
        print('##### ' + model_label)
        for key, v in model_hyperparameters_dict[model_label].items():
            print(f"Best {key} = {v}  ")
        print(f"Best F1 = {model_r2_dict[model_label]}  ")
else:
    model_hyperparameters_dict = {
        'FCN': {'L2_factor': 0.004164, 'dropout_factor': 0.04681},
        'LogisticRegression': {'C': 1.734, 'solver': 'lbfgs'},
        'RandomForest': {'max_depth': 12, 'n_estimators': 66},
        'XGBoost': {'max_depth': 13, 'n_estimators': 58},
        'LightGBM': {'max_depth': 4, 'n_estimators': 47},
        'SVC': {'C': 0.5998, 'kernel': 'linear'}
    }

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [04:39<00:00, 55.80s/it]
[I 2024-08-23 07:59:49,576] Trial 0 finished with value: 0.9019225096322319 and parameters: {'L2_factor': 0.007954844122301383, 'dropout_factor': 0.12922623333443561}. Best is trial 0 with value: 0.9019225096322319.
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [04:33<00:00, 54.74s/it]
[I 2024-08-23 08:04:23,302] Trial 1 finished with value: 0.915618958116226 and parameters: {'L2_factor': 0.007004763414176608, 'dropout_factor': 0.22686705584943279}. Best is trial 1 with value: 0.915618958116226.
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [04:10<00:00, 50.19s/it]
[I 2024-08-23 08:08:34,274] Trial 2 finished with value: 0.8892158035110497 and parameters: {'L2_factor': 0.0013742869015804546, 'dropout_factor': 0.3541162880512968}. Best is trial 1 with value: 0.915618958116

FCN
Best L2_factor = 0.004163987402538657
Best dropout_factor = 0.04681459826313567
Best F1 = 0.9357051406546071


100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [00:12<00:00,  2.37it/s]


LogisticRegression
Best C = 1.734469249164027
Best solver = lbfgs
Best F1 = 0.8651748530262853


100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [00:43<00:00,  1.45s/it]


RandomForest
Best max_depth = 12
Best n_estimators = 66
Best F1 = 0.7867848119055804


100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [00:35<00:00,  1.20s/it]


XGBoost
Best max_depth = 13
Best n_estimators = 58
Best F1 = 0.8620380494806515


100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [00:51<00:00,  1.71s/it]


LightGBM
Best max_depth = 4
Best n_estimators = 47
Best F1 = 0.8034221631560704


100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [00:05<00:00,  5.28it/s]

SVC
Best C = 0.5998126005686496
Best kernel = linear
Best F1 = 0.867829947637113
##### FCN
Best L2_factor = 0.004163987402538657  
Best dropout_factor = 0.04681459826313567  
Best F1 = 0.9357051406546071  
##### LogisticRegression
Best C = 1.734469249164027  
Best solver = lbfgs  
Best F1 = 0.8651748530262853  
##### RandomForest
Best max_depth = 12  
Best n_estimators = 66  
Best F1 = 0.7867848119055804  
##### XGBoost
Best max_depth = 13  
Best n_estimators = 58  
Best F1 = 0.8620380494806515  
##### LightGBM
Best max_depth = 4  
Best n_estimators = 47  
Best F1 = 0.8034221631560704  
##### SVC
Best C = 0.5998126005686496  
Best kernel = linear  
Best F1 = 0.867829947637113  





##### FCN
Best L2_factor = 0.004163987402538657  
Best dropout_factor = 0.04681459826313567  
Best F1 = 0.9357051406546071  
##### LogisticRegression
Best C = 1.734469249164027  
Best solver = lbfgs  
Best F1 = 0.8651748530262853  
##### RandomForest
Best max_depth = 12  
Best n_estimators = 66  
Best F1 = 0.7867848119055804  
##### XGBoost
Best max_depth = 13  
Best n_estimators = 58  
Best F1 = 0.8620380494806515  
##### LightGBM
Best max_depth = 4  
Best n_estimators = 47  
Best F1 = 0.8034221631560704  
##### SVC
Best C = 0.5998126005686496  
Best kernel = linear  
Best F1 = 0.867829947637113 

## 集成

In [32]:
def ensemble_classification_weights(model_hyperparameters_dict, model_xcols, data_df, Y_feature, n_iters=200):
    ensemble_models = list(model_hyperparameters_dict.keys())
    Y_pred_iters = []
    Y_test_iters = []
    f1s = []

    Xs, Ys, X_cols = get_XY(data_df, Y_feature)
    n_samples = Xs.shape[0]

    matrix = np.zeros((len(ensemble_models), len(ensemble_models)))
    for i in tqdm(range(n_iters)):
        train_ind, test_ind, _, _ = train_test_split(np.arange(Xs.shape[0]), np.arange(Xs.shape[0]), test_size=0.20)
        
        Y_train = Ys[train_ind]
        Y_test = Ys[test_ind]
        
        Y_preds = []
        this_f1 = []
        
        for model_label in ensemble_models:
            X_features = model_xcols[model_label]
            Xs, Ys, X_cols = get_XY(data_df, Y_feature, X_features)
            deep_learning = False
            if model_label == 'FCN':
                input_f = Xs.shape[1]
                output_f = 1
                feature_counts = [16, 16, 16, 8]
                deep_learning = True
                model = FCN_model(input_f=input_f, output_f=output_f, feature_counts=feature_counts,
                                  mode='classifier', 
                                  **model_hyperparameters_dict[model_label])
            elif model_label == 'RandomForest':
                model = RandomForestClassifier(**model_hyperparameters_dict[model_label])
            elif model_label == 'XGBoost':
                model = XGBClassifier(**model_hyperparameters_dict[model_label])
            elif model_label == 'LightGBM':
                model = LGBMClassifier(force_col_wise=True, verbose=-1, **model_hyperparameters_dict[model_label])
            elif model_label == 'SVC':
                model = SVC(**model_hyperparameters_dict[model_label])
            elif model_label == 'NuSVC':
                model = NuSVC(**model_hyperparameters_dict[model_label])
            elif model_label == 'LogisticRegression':
                model = LogisticRegression(**model_hyperparameters_dict[model_label])
        
            X_train = Xs[train_ind]
            X_test = Xs[test_ind]

            if deep_learning:
                X_train_dl, X_val, Y_train_dl, Y_val = train_test_split(X_train, Y_train, test_size=0.20)

            
            if deep_learning:
                scaler = StandardScaler()
                X_scaler = scaler.fit(X_train_dl)
                X_train_dl = X_scaler.transform(X_train_dl)
                X_test = X_scaler.transform(X_test)
                X_val = X_scaler.transform(X_val)
                _ = model.fit(X_train_dl, Y_train_dl, X_val, Y_val)
            else:
                scaler = StandardScaler()
                X_scaler = scaler.fit(X_train)
                X_train = X_scaler.transform(X_train)
                X_test = X_scaler.transform(X_test)
                _ = model.fit(X_train, Y_train)

            YP = model.predict(X_test)
            YP[np.where(YP<0.5)] = 0
            YP[np.where(YP>=0.5)] = 1
            Y_preds.append(YP)
            this_f1.append(f1_score(Y_test, YP))

        #print(Y_preds)
        residuals = Y_preds - np.array([Y_test] * len(Y_preds)).reshape(len(Y_preds),-1)
        matrix += cross_correlation_matrix(residuals)

        mean_YP = np.mean(Y_preds, axis=0)
        mean_YP[np.where(mean_YP<0.5)] = 0
        mean_YP[np.where(mean_YP>=0.5)] = 1
        this_f1.append(f1_score(Y_test, mean_YP))
        f1s.append(this_f1)
        Y_pred_iters.append(Y_preds)
        Y_test_iters.append(Y_test)

    matrix = matrix / n_iters
    optimal_weights = sovle_optimal_weights(matrix)

    optimal_weights_dict = {}
    for i, w in enumerate(optimal_weights):
        optimal_weights_dict[ensemble_models[i]] = w

    weighted_f1s = []
    for i in range(n_iters):
        this_pred = np.sum(Y_pred_iters[i] * np.concatenate([optimal_weights.reshape(-1,1),] * Y_test_iters[0].shape[0], axis = 1), axis=0)
        this_pred[np.where(this_pred<0.5)] = 0
        this_pred[np.where(this_pred>=0.5)] = 1
        weighted_f1s.append(f1_score(Y_test_iters[i], this_pred))
    weighted_f1s = np.array(weighted_f1s).reshape(-1, 1)
    array_f1s = np.array(f1s)
    array_f1s = np.concatenate([f1s, weighted_f1s], axis=1)
    
    f1_dict = {'Model': ensemble_models + ['Ensemble', 'Weighted_Ensemble'],
                'Avg F1': list(np.mean(array_f1s, axis=0)), 
                'Std F1': list(np.std(array_f1s, axis=0)),
                '10th percentile': list(np.sort(array_f1s, axis=0)[int(array_f1s.shape[0] * 0.1) - 1])}
    df = pd.DataFrame(f1_dict)
    df = df.sort_values('10th percentile', ascending=False).reset_index(drop=True)
    return df, optimal_weights_dict


In [33]:
if weights_determined[Y_feature]:
    optimal_weights = {
        'FCN': 0.257,
        'LogisticRegression': 0.140,
        'RandomForest': 0.206,
        'XGBoost': 0.182,
        'LightGBM': 0.085,
        'SVC': 0.130
    }
    print('Weights:')
    for i, k in enumerate(model_hyperparameters_dict.keys()):
        print(f'{k}: {optimal_weights[k]:.3f}')
else:
    if 'FCN' in model_hyperparameters_dict.keys():
        n_iters = 50
    else:
        n_iters = 200
    df, optimal_weights= ensemble_classification_weights(model_hyperparameters_dict, model_xcols, weather_power_df, Y_feature, n_iters=n_iters)
    display(df)
    print('Weights:')
    for i, k in enumerate(model_hyperparameters_dict.keys()):
        print(f'{k}: {optimal_weights[k]:.3f}')

this_model_path = train_model_path + f'{Y_feature}/'
os.makedirs(this_model_path, exist_ok=True)

save_model_metadata(Y_feature, this_model_path + 'meta.json', model_xcols, model_hyperparameters_dict, optimal_weights)

100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [12:29<00:00, 14.99s/it]


Unnamed: 0,Model,Avg F1,Std F1,10th percentile
0,Ensemble,0.865519,0.055901,0.8
1,Weighted_Ensemble,0.858722,0.05099,0.8
2,LogisticRegression,0.855052,0.045831,0.790698
3,XGBoost,0.857315,0.057343,0.777778
4,SVC,0.856837,0.054064,0.765957
5,LightGBM,0.826234,0.060027,0.756757
6,RandomForest,0.832776,0.056265,0.75
7,FCN,0.848015,0.066077,0.740741


Weights:
FCN: 0.257
LogisticRegression: 0.140
RandomForest: 0.206
XGBoost: 0.182
LightGBM: 0.085
SVC: 0.130
