In [25]:
import os
import math
import pickle
import numpy as np
import pandas as pd
import torch
import torch.optim as optim
import torch.nn as nn

from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import VotingRegressor
import xgboost
from itertools import combinations
from torch.utils.data import DataLoader, TensorDataset

from utils.weather_api import WeatherApi
from utils.common_function import splitData
from enums.enums import Model, Date, Data, Rmse
from sklearn.metrics import accuracy_score
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import Ridge
import itertools

# Load the feature/target tables for the selected site and split them.
# NOTE(review): the meaning of the 365 passed to splitData is defined in
# utils.common_function -- presumably the hold-out size; confirm there.
area = 'Billings_MT'
# Swanton_OH
weatherApi = WeatherApi()
X, y = weatherApi.get_weather_data_from_excel(area)
X_train, X_test, y_train, y_test = splitData(X, y, 365)

# Remove the 'date' column from the feature frames when it is present.
if 'date' in X_train.columns:
    X_train = X_train.drop('date', axis=1)
    X_test = X_test.drop('date', axis=1)

# Remove the first target column when its name contains the text 'date'.
first_target_column = y_train.columns[0]
if 'date' in first_target_column:
    y_train = y_train.drop(columns=[first_target_column])
    y_test = y_test.drop(columns=[y_test.columns[0]])

# Apply MinMaxScaler: fit on the training features only, then reuse the
# fitted scaler for the test features.
scaler = MinMaxScaler()
train_scaled_values = scaler.fit_transform(X_train)
test_scaled_values = scaler.transform(X_test)
X_train_scaled = pd.DataFrame(train_scaled_values, columns=X_train.columns)
X_test_scaled = pd.DataFrame(test_scaled_values, columns=X_test.columns)

# Calendar of daily dates that is stored next to each saved result.
date_range = pd.date_range(start='2023-08-01', end='2024-07-30')
date_df = pd.DataFrame(date_range, columns=['date'])


In [26]:
def _load_saved_result(model_tag):
    """Unpickle the saved result object for one base model.

    Parameters
    ----------
    model_tag : str
        File-name prefix of the model, e.g. ``'RF'`` or ``'Xgboost'``. The file
        read is ``result_model_fold/{area}/{model_tag}_model_with_{area}.pkl``,
        where ``area`` is the notebook-level site name.

    Returns
    -------
    object
        Whatever was pickled into the file; the code below indexes it with
        ``Model.MODEL`` to obtain the estimator.

    Raises
    ------
    FileNotFoundError
        If the pickle file for ``model_tag`` does not exist.
    """
    file_name = f'result_model_fold/{area}/{model_tag}_model_with_{area}.pkl'
    with open(file_name, 'rb') as f:
        # SECURITY: pickle.load can execute arbitrary code. Only load files
        # that were produced by this project.
        return pickle.load(f)


# Keep both the full saved object and the estimator for every base model,
# under the same notebook-level names as before.
mlp = _load_saved_result('MLP')
mlp_model = mlp[Model.MODEL]

rf = _load_saved_result('RF')
rf_model = rf[Model.MODEL]

extra = _load_saved_result('ExtraTree')
extra_model = extra[Model.MODEL]

decision = _load_saved_result('DecisionTree')
decision_model = decision[Model.MODEL]

gradient = _load_saved_result('GradientBoosting')
gradient_model = gradient[Model.MODEL]

xgboost_ = _load_saved_result('Xgboost')
xgboost_model = xgboost_[Model.MODEL]

In [27]:
# Base learners available to the ensemble; the short tag is reused in the
# name of the saved file.
estimators = [
    ('RF', rf_model),
    ('EXTRA', extra_model),
    ('MLP', mlp_model),
    ('DT', decision_model),
    ('XG', xgboost_model)
]
# Every combination of base learners, from pairs up to all of them.
combination_result = [
    combi
    for r in range(2, len(estimators) + 1)
    for combi in itertools.combinations(estimators, r)
]

kf = KFold(n_splits=5, shuffle=True, random_state=42)
fold_cnt = 0  # running count of fitted folds across all combinations

# The output directory depends only on `area`, so create it once instead of
# once per combination.
path = f'result_model_fold/{area}'
os.makedirs(path, exist_ok=True)

for subset in combination_result:

    # Start from +inf rather than a fixed number such as 100: with a finite
    # sentinel, a combination whose folds all score above it would never
    # update the incumbent and the previous combination's model would be
    # saved under this combination's name.
    best_valid_rmse = float('inf')
    staking_best_model = None
    for train_index, val_index in kf.split(X_train_scaled):
        X_train_fold, X_val_fold = X_train_scaled.iloc[train_index], X_train_scaled.iloc[val_index]
        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
        fold_cnt += 1
        # Build the stacking model for this combination, with Ridge as the
        # final estimator, and wrap it for the multi-column target frame.
        stacking_regressor = StackingRegressor(
            estimators=list(subset),
            final_estimator=Ridge()
        )
        multi_output_staking_reg = MultiOutputRegressor(stacking_regressor)

        # Train on this fold's training part.
        multi_output_staking_reg.fit(X_train_fold, y_train_fold)

        # Predict on the held-out part of the fold and score it.
        y_pred_stack = multi_output_staking_reg.predict(X_val_fold)
        stacking_val_rmse = np.sqrt(mean_squared_error(y_val_fold, y_pred_stack))

        # Keep the fold with the lowest validation RMSE together with the
        # data it was trained and validated on.
        # NOTE(review): best_valid_rmse is the minimum over the folds, not
        # their mean, so it is an optimistic score for the combination.
        if stacking_val_rmse < best_valid_rmse:
            best_valid_rmse = stacking_val_rmse
            staking_best_model = multi_output_staking_reg
            staking_best_train_input = X_train_fold
            staking_best_train_output = y_train_fold
            staking_best_valid_input = X_val_fold
            staking_best_valid_output = y_val_fold
            print(f'best rmse : {best_valid_rmse}')

    # Join the model tags to name this combination.
    model_names = '_'.join([name for name, _ in subset])

    if staking_best_model is None:
        # No fold produced a comparable score; never fall back to a model
        # left over from an earlier combination.
        print(f'No valid fold for {model_names}; nothing saved')
        continue

    staking_pred = staking_best_model.predict(X_test_scaled)

    data_to_save = {
        Model.MODEL: staking_best_model,
        Data.TRAIN_INPUT_DATA: staking_best_train_input,
        Data.TRAIN_OUTPUT_DATA: staking_best_train_output,
        Data.VALID_INPUT_DATA: staking_best_valid_input,
        Data.VALID_OUTPUT_DATA: staking_best_valid_output,
        Data.TEST_INPUT_DATA: X_test_scaled,
        Data.TEST_OUTPUT_DATA: y_test,
        Data.PREDICTED_OUTPUT_DATA: staking_pred,
        Rmse.BEST_RMSE: best_valid_rmse,
        Date.DATE: date_df,
    }

    # Save the result for this combination.
    file_path = f'{path}/Stacking_{model_names}_model_with_{area}.pkl'
    with open(file_path, 'wb') as f:
        pickle.dump(data_to_save, f)

    print(f'Model saved to {file_path} with RMSE: {best_valid_rmse}')

best rmse : 4.216872181316406
best rmse : 4.090378873192463
best rmse : 4.078903988002115
Model saved to result_model_fold/Billings_MT/Stacking_RF_EXTRA_model_with_Billings_MT.pkl with RMSE: 4.078903988002115
best rmse : 4.094934497704948
best rmse : 4.013688640784395
best rmse : 3.998672481087394
Model saved to result_model_fold/Billings_MT/Stacking_RF_MLP_model_with_Billings_MT.pkl with RMSE: 3.998672481087394
best rmse : 4.205439138911264
best rmse : 4.076724824576408
best rmse : 4.059485787845395
Model saved to result_model_fold/Billings_MT/Stacking_RF_DT_model_with_Billings_MT.pkl with RMSE: 4.059485787845395
best rmse : 4.061077777673172


KeyboardInterrupt: 

In [None]:
# Base learners available to the ensemble; the short tag is reused in the
# name of the saved file.
estimators = [
    ('RF', rf_model),
    ('EXTRA', extra_model),
    ('MLP', mlp_model),
    ('DT', decision_model),
    ('XG', xgboost_model)
]
# Every combination of base learners, from pairs up to all of them.
combination_result = [
    combi
    for r in range(2, len(estimators) + 1)
    for combi in itertools.combinations(estimators, r)
]

kf = KFold(n_splits=5, shuffle=True, random_state=42)
fold_cnt = 0  # running count of fitted folds across all combinations

# The output directory depends only on `area`, so create it once instead of
# once per combination.
path = f'result_model_fold/{area}'
os.makedirs(path, exist_ok=True)

for subset in combination_result:

    # Start from +inf rather than a fixed number such as 100: with a finite
    # sentinel, a combination whose folds all score above it would never
    # update the incumbent and the previous combination's model would be
    # saved under this combination's name.
    best_valid_rmse = float('inf')
    voting_best_model = None
    for train_index, val_index in kf.split(X_train_scaled):
        X_train_fold, X_val_fold = X_train_scaled.iloc[train_index], X_train_scaled.iloc[val_index]
        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
        fold_cnt += 1
        # Build the voting model for this combination and wrap it for the
        # multi-column target frame.
        voting_regressor = VotingRegressor(
            estimators=list(subset)
        )
        multi_output_voting_reg = MultiOutputRegressor(voting_regressor)

        # Train on this fold's training part.
        multi_output_voting_reg.fit(X_train_fold, y_train_fold)

        # Predict on the held-out part of the fold and score it.
        y_pred_valid = multi_output_voting_reg.predict(X_val_fold)
        voting_val_rmse = np.sqrt(mean_squared_error(y_val_fold, y_pred_valid))

        # Keep the fold with the lowest validation RMSE together with the
        # data it was trained and validated on.
        # NOTE(review): best_valid_rmse is the minimum over the folds, not
        # their mean, so it is an optimistic score for the combination.
        if voting_val_rmse < best_valid_rmse:
            best_valid_rmse = voting_val_rmse
            voting_best_model = multi_output_voting_reg
            voting_best_train_input = X_train_fold
            voting_best_train_output = y_train_fold
            voting_best_valid_input = X_val_fold
            voting_best_valid_output = y_val_fold
            print(f'best rmse : {best_valid_rmse}')

    # Join the model tags to name this combination.
    model_names = '_'.join([name for name, _ in subset])

    if voting_best_model is None:
        # No fold produced a comparable score; never fall back to a model
        # left over from an earlier combination.
        print(f'No valid fold for {model_names}; nothing saved')
        continue

    voting_pred = voting_best_model.predict(X_test_scaled)

    data_to_save = {
        Model.MODEL: voting_best_model,
        Data.TRAIN_INPUT_DATA: voting_best_train_input,
        Data.TRAIN_OUTPUT_DATA: voting_best_train_output,
        Data.VALID_INPUT_DATA: voting_best_valid_input,
        Data.VALID_OUTPUT_DATA: voting_best_valid_output,
        Data.TEST_INPUT_DATA: X_test_scaled,
        Data.TEST_OUTPUT_DATA: y_test,
        Data.PREDICTED_OUTPUT_DATA: voting_pred,
        Rmse.BEST_RMSE: best_valid_rmse,
        Date.DATE: date_df,
    }

    # Save the result for this combination.
    file_path = f'{path}/Voting_{model_names}_model_with_{area}.pkl'
    with open(file_path, 'wb') as f:
        pickle.dump(data_to_save, f)

    print(f'Model saved to {file_path} with RMSE: {best_valid_rmse}')