In [24]:
import os
import math
import pickle
import numpy as np
import pandas as pd
import torch
import torch.optim as optim
import torch.nn as nn
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
import xgboost

from torch.utils.data import DataLoader, TensorDataset

from utils.weather_api import WeatherApi
from utils.common_function import splitData
from enums.enums import Model, Date, Data, Rmse

# Load the weather data for one site, split it, strip date columns and scale
# the features. Everything defined here is shared by the model cells below.
area = 'Swanton_OH'
weatherApi = WeatherApi()
X, y = weatherApi.get_weather_data_from_excel(area)
# NOTE(review): presumably the third argument (365) is the size of the test
# portion — confirm against utils.common_function.splitData.
X_train, X_test, y_train, y_test = splitData(X, y, 365)

# Drop the 'date' feature column from both feature frames when it is present.
if 'date' in X_train.columns:
    X_train, X_test = X_train.drop(columns=['date']), X_test.drop(columns=['date'])

# Drop the first target column when its name contains 'date' (substring test
# on the column name, not an exact match).
if 'date' in y_train.columns[0]:
    y_train, y_test = (
        y_train.drop(columns=[y_train.columns[0]]),
        y_test.drop(columns=[y_test.columns[0]]),
    )

# Apply MinMaxScaler: the scaler is fitted on the training features only and
# then re-used, unchanged, for the test features. The scaled frames get a
# fresh 0..n-1 index, which is why later cells index them with .iloc.
scaler = MinMaxScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

# Daily calendar saved next to every model's predictions. 2023-08-01 through
# 2024-07-30 inclusive is 365 days (the span contains 2024-02-29).
# NOTE(review): presumably meant to line up with the test rows — verify.
date_range = pd.date_range(start='2023-08-01', end='2024-07-30')
date_df = pd.DataFrame(date_range, columns=['date'])


RandomForest 모델 학습

In [None]:
# Random forest: exhaustive hyperparameter search over 5 shuffled folds.
# Each parameter combination is fitted on every fold's training part and
# scored on that fold's validation part. The single (fold, parameters) pair
# with the lowest validation RMSE is kept together with the fold data it used,
# then evaluated on the test set and pickled.
# NOTE(review): the winner is picked from one fold's score, not from the mean
# score across folds, so fold difficulty can drive the choice — confirm this
# is intended.

# Hyperparameters
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_leaf': [3, 5, 7, 9],
    'min_samples_split': [2, 3, 5, 7, 9]  # Note: Split value should be >=2
}

# Every combination, in the order four nested loops would visit them
# (the last parameter varies fastest).
rf_param_combos = [
    (n_estimators, max_depth, min_samples_leaf, min_samples_split)
    for n_estimators in param_grid['n_estimators']
    for max_depth in param_grid['max_depth']
    for min_samples_leaf in param_grid['min_samples_leaf']
    for min_samples_split in param_grid['min_samples_split']
]

# Start from +inf so the first evaluated model always becomes the current
# best, whatever the scale of the targets.
rf_best_rmse = float('inf')
rf_best_model = None
rf_best_params = None
rf_best_x_train = rf_best_x_valid = None
rf_best_y_train = rf_best_y_valid = None

kf = KFold(n_splits=5, shuffle=True, random_state=0)  # 5-fold
for fold_cnt, (train_index, val_index) in enumerate(kf.split(X_train_scaled), start=1):
    X_train_fold, X_val_fold = X_train_scaled.iloc[train_index], X_train_scaled.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    for n_estimators, max_depth, min_samples_leaf, min_samples_split in rf_param_combos:
        print(f'fold : {fold_cnt}, n_estimators : {n_estimators}, max_depth : {max_depth}, min_samples_leaf : {min_samples_leaf}, min_samples_split : {min_samples_split}')

        # Model initialization
        rf = RandomForestRegressor(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_leaf=min_samples_leaf,
            min_samples_split=min_samples_split,
            random_state=42,
        )

        # Fit on this fold's training part, predict its validation part
        rf.fit(X_train_fold, y_train_fold)
        y_val_pred = rf.predict(X_val_fold)

        # Mean of the per-target RMSEs. Same value as
        # mean_squared_error(..., squared=False), but without the `squared`
        # argument, which newer scikit-learn releases no longer accept.
        valid_rmse = np.sqrt(
            mean_squared_error(y_val_fold, y_val_pred, multioutput='raw_values')
        ).mean()

        # Check if this is the best model so far
        if valid_rmse < rf_best_rmse:
            rf_best_rmse = valid_rmse
            print(f'valid_rmse : {valid_rmse}')
            rf_best_model = rf
            rf_best_params = {
                'n_estimators': n_estimators,
                'max_depth': max_depth,
                'min_samples_leaf': min_samples_leaf,
                'min_samples_split': min_samples_split
            }
            print(f'best parameter : {rf_best_params}')
            rf_best_x_train = X_train_fold
            rf_best_x_valid = X_val_fold
            rf_best_y_train = y_train_fold
            rf_best_y_valid = y_val_fold

if rf_best_model is None:
    raise RuntimeError('RandomForest search finished without selecting a model')

# Using the best model to predict on the test set
rf_predict = rf_best_model.predict(X_test_scaled)

# Bundle the model, the fold data it was selected on, the test data and the
# predictions under the project's Model/Data/Date keys.
data_to_save = {
    Model.MODEL: rf_best_model,
    Model.BEST_PARAMETER: rf_best_params,
    Data.TRAIN_INPUT_DATA: rf_best_x_train,
    Data.TRAIN_OUTPUT_DATA: rf_best_y_train,
    Data.VALID_INPUT_DATA: rf_best_x_valid,
    Data.VALID_OUTPUT_DATA: rf_best_y_valid,
    Data.TEST_INPUT_DATA: X_test_scaled,
    Data.TEST_OUTPUT_DATA: y_test,
    Data.PREDICTED_OUTPUT_DATA: rf_predict,
    Date.DATE: date_df
}

path = f'result_model_fold/{area}'
file_path = f'{path}/RF_model_with_{area}.pkl'

os.makedirs(path, exist_ok=True)
with open(file_path, 'wb') as f:
    pickle.dump(data_to_save, f)


fold 0
fold : 1, n_estimators : 100, max_depth : 5, min_samples_leaf : 3, min_samples_split : 2
valid_rmse : 3.51334346798637
best parameter : {'n_estimators': 100, 'max_depth': 5, 'min_samples_leaf': 3, 'min_samples_split': 2}
fold : 1, n_estimators : 100, max_depth : 5, min_samples_leaf : 3, min_samples_split : 3
fold : 1, n_estimators : 100, max_depth : 5, min_samples_leaf : 3, min_samples_split : 5
fold : 1, n_estimators : 100, max_depth : 5, min_samples_leaf : 3, min_samples_split : 7
fold : 1, n_estimators : 100, max_depth : 5, min_samples_leaf : 3, min_samples_split : 9
valid_rmse : 3.5132219385950343
best parameter : {'n_estimators': 100, 'max_depth': 5, 'min_samples_leaf': 3, 'min_samples_split': 9}
fold : 1, n_estimators : 100, max_depth : 5, min_samples_leaf : 5, min_samples_split : 2
valid_rmse : 3.512236143090663
best parameter : {'n_estimators': 100, 'max_depth': 5, 'min_samples_leaf': 5, 'min_samples_split': 2}
fold : 1, n_estimators : 100, max_depth : 5, min_samples_lea

MLP 모델 학습

In [None]:
# MLP: search over two-hidden-layer architectures on 5 consecutive
# (unshuffled) folds. Each (first_layer, second_layer) pair is fitted on every
# fold's training part and scored on that fold's validation part; the single
# (fold, architecture) pair with the lowest validation RMSE is kept, evaluated
# on the test set and pickled.

# Start from +inf so the first evaluated model always becomes the current
# best, whatever the scale of the targets.
mlp_best_rmse = float('inf')
mlp_best_model = None
mlp_best_param = None
mlp_best_train_input_data = None
mlp_best_train_output_data = None
mlp_best_valid_input_data = None
mlp_best_valid_output_data = None

kf = KFold(n_splits=5, shuffle=False)
for fold_cnt, (train_index, valid_index) in enumerate(kf.split(X_train_scaled), start=1):
    # Select the validation rows with THIS loop's `valid_index`. Using any
    # other index variable would validate on rows unrelated to the current
    # split (and possibly on rows the model was trained on).
    X_train_fold, X_valid_fold = X_train_scaled.iloc[train_index], X_train_scaled.iloc[valid_index]
    y_train_fold, y_valid_fold = y_train.iloc[train_index], y_train.iloc[valid_index]

    for i in [64, 128, 256, 512]:
        for j in [4, 8, 16, 32, 63]:
            # Only consider layouts whose second layer is narrower than the
            # first (always true for the current candidate lists).
            if i <= j:
                continue

            print(f'fold : {fold_cnt}, first_layer : {i}, second_layer : {j}')
            model = MLPRegressor(hidden_layer_sizes=(i, j), max_iter=1000, early_stopping=True, random_state=42)
            model.fit(X_train_fold, y_train_fold)

            valid_predictions = model.predict(X_valid_fold)
            # Mean of the per-target RMSEs. Same value as
            # mean_squared_error(..., squared=False), but without the
            # `squared` argument, which newer scikit-learn releases no longer
            # accept.
            valid_rmse = np.sqrt(
                mean_squared_error(y_valid_fold, valid_predictions, multioutput='raw_values')
            ).mean()

            if valid_rmse < mlp_best_rmse:
                mlp_best_rmse = valid_rmse
                mlp_best_model = model
                mlp_best_param = {'hidden_layer_sizes': (i, j)}
                mlp_best_train_input_data = X_train_fold
                mlp_best_train_output_data = y_train_fold
                mlp_best_valid_input_data = X_valid_fold
                mlp_best_valid_output_data = y_valid_fold

if mlp_best_model is None:
    raise RuntimeError('MLP search finished without selecting a model')

# The test set is only touched once, after model selection is finished.
mlp_best_test_predictions = mlp_best_model.predict(X_test_scaled)

# Same layout as the other model cells; BEST_PARAMETER is included so the MLP
# artifact carries its selected architecture like the tree-based ones do.
data_to_save = {
    Model.MODEL: mlp_best_model,
    Model.BEST_PARAMETER: mlp_best_param,
    Data.TRAIN_INPUT_DATA: mlp_best_train_input_data,
    Data.TRAIN_OUTPUT_DATA: mlp_best_train_output_data,
    Data.VALID_INPUT_DATA: mlp_best_valid_input_data,
    Data.VALID_OUTPUT_DATA: mlp_best_valid_output_data,
    Data.TEST_INPUT_DATA: X_test_scaled,
    Data.TEST_OUTPUT_DATA: y_test,
    Data.PREDICTED_OUTPUT_DATA: mlp_best_test_predictions,
    Date.DATE: date_df,
}

path = f'result_model_fold/{area}'
file_path = f'{path}/MLP_model_with_{area}.pkl'

os.makedirs(path, exist_ok=True)
with open(file_path, 'wb') as f:
    pickle.dump(data_to_save, f)

fold : 1, first_layer : 64, second_layer : 4
fold : 1, first_layer : 64, second_layer : 8
fold : 1, first_layer : 64, second_layer : 16
fold : 1, first_layer : 64, second_layer : 32
fold : 1, first_layer : 64, second_layer : 63
fold : 1, first_layer : 128, second_layer : 4
fold : 1, first_layer : 128, second_layer : 8
fold : 1, first_layer : 128, second_layer : 16
fold : 1, first_layer : 128, second_layer : 32
fold : 1, first_layer : 128, second_layer : 63
fold : 1, first_layer : 256, second_layer : 4
fold : 1, first_layer : 256, second_layer : 8
fold : 1, first_layer : 256, second_layer : 16
fold : 1, first_layer : 256, second_layer : 32
fold : 1, first_layer : 256, second_layer : 63
fold : 1, first_layer : 512, second_layer : 4
fold : 1, first_layer : 512, second_layer : 8
fold : 1, first_layer : 512, second_layer : 16
fold : 1, first_layer : 512, second_layer : 32
fold : 1, first_layer : 512, second_layer : 63
fold : 2, first_layer : 64, second_layer : 4
fold : 2, first_layer : 64, 



fold : 4, first_layer : 64, second_layer : 8
fold : 4, first_layer : 64, second_layer : 16
fold : 4, first_layer : 64, second_layer : 32
fold : 4, first_layer : 64, second_layer : 63
fold : 4, first_layer : 128, second_layer : 4
fold : 4, first_layer : 128, second_layer : 8
fold : 4, first_layer : 128, second_layer : 16
fold : 4, first_layer : 128, second_layer : 32
fold : 4, first_layer : 128, second_layer : 63
fold : 4, first_layer : 256, second_layer : 4
fold : 4, first_layer : 256, second_layer : 8
fold : 4, first_layer : 256, second_layer : 16
fold : 4, first_layer : 256, second_layer : 32
fold : 4, first_layer : 256, second_layer : 63
fold : 4, first_layer : 512, second_layer : 4
fold : 4, first_layer : 512, second_layer : 8
fold : 4, first_layer : 512, second_layer : 16
fold : 4, first_layer : 512, second_layer : 32
fold : 4, first_layer : 512, second_layer : 63
fold : 5, first_layer : 64, second_layer : 4
fold : 5, first_layer : 64, second_layer : 8
fold : 5, first_layer : 64, 

Adaboost 모델 학습

In [None]:

# NOTE(review): this whole cell is disabled (commented out) and does not run.
# Before re-enabling it, check the following points, all visible below:
#   - GridSearchCV is used but is not among the notebook's visible imports.
#   - predict() is called on X_test rather than X_test_scaled, although the
#     model is fitted on X_train_scaled.
#   - In data_to_save, TRAIN_OUTPUT_DATA receives X_test_scaled and
#     TEST_INPUT_DATA receives y_train.
#   - Keys use `.value` (except Rmse.BEST_RMSE) and the output directory is
#     'result_model/', whereas the active cells use the enum members directly
#     and write to 'result_model_fold/'.

# # Use RandomForestRegressor as the AdaBoost base estimator
# base_tree = RandomForestRegressor()

# param_grid = {
#     'estimator__n_estimators': [100],
#     'estimator__learning_rate': [0.7],
#     'estimator__estimator__max_depth': [4, 8]
# }

# # Wrap the base estimator in AdaBoost (the original note said DecisionTree,
# # but base_tree above is a RandomForestRegressor)
# base_ada = AdaBoostRegressor(estimator=base_tree)
# fit_model = MultiOutputRegressor(base_ada)

# adaboost_model = GridSearchCV(estimator=fit_model, param_grid=param_grid, cv=10)
# adaboost_model_result = adaboost_model.fit(X_train_scaled, y_train)

# Adaboost_best_model = adaboost_model_result.best_estimator_

# Adaboost_predictions = Adaboost_best_model.predict(X_test)

# # Calculate and print RMSE
# Adaboost_best_rmse = math.sqrt(mean_squared_error(y_test, Adaboost_predictions))

# data_to_save = {
#     Model.MODEL.value: Adaboost_best_model,
#     Data.TRAIN_INPUT_DATA.value: X_train_scaled,
#     Data.TRAIN_OUTPUT_DATA.value: X_test_scaled,
#     Data.VALID_INPUT_DATA.value: [],
#     Data.VALID_OUTPUT_DATA.value: [],
#     Data.TEST_INPUT_DATA.value: y_train,
#     Data.TEST_OUTPUT_DATA.value: y_test,
#     Data.PREDICTED_OUTPUT_DATA.value: Adaboost_predictions,
#     Rmse.BEST_RMSE: Adaboost_best_rmse,
#     Date.DATE.value: date_df,
# }

# path = f'result_model/{area}'
# file_path = f'{path}/Adaboost_model_with_{area}.pkl'

# os.makedirs(path, exist_ok=True)
# with open(file_path, 'wb') as f:
#     pickle.dump(data_to_save, f)
# # except Exception as e:
# #     print(f"오류 발생: {e}")

DecisionTree 모델 학습

In [None]:
# Decision tree: exhaustive hyperparameter search over 5 shuffled folds.
# Each parameter combination is fitted on every fold's training part and
# scored on that fold's validation part; the single (fold, parameters) pair
# with the lowest validation RMSE is kept, evaluated on the test set and
# pickled.
param_range1 = [2, 4, 8, 16, 32, 64]  # max_depth candidates
param_range2 = [5, 10, 15, 20]        # min_samples_leaf candidates
param_range3 = ['squared_error']      # criterion candidates

# Every combination, in the order three nested loops would visit them
# (the last parameter varies fastest).
decisionTree_param_combos = [
    (max_depth, min_samples_leaf, criterion)
    for max_depth in param_range1
    for min_samples_leaf in param_range2
    for criterion in param_range3
]

# Start from +inf so the first evaluated model always becomes the current
# best, whatever the scale of the targets.
best_rmse = float('inf')
decisionTree_best_model = None
decisionTree_best_param = None
decisionTree_best_train_input = decisionTree_best_train_output = None
decisionTree_best_valid_input = decisionTree_best_valid_output = None

kf = KFold(n_splits=5, shuffle=True, random_state=42)
for fold_cnt, (train_index, val_index) in enumerate(kf.split(X_train_scaled), start=1):
    X_train_fold, X_val_fold = X_train_scaled.iloc[train_index], X_train_scaled.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    for max_depth, min_samples_leaf, criterion in decisionTree_param_combos:
        print(f'fold : {fold_cnt}, max_depth : {max_depth}, min_samples_leaf : {min_samples_leaf}, criterion : {criterion}')
        # Build the model
        model = DecisionTreeRegressor(
            max_depth=max_depth,
            min_samples_leaf=min_samples_leaf,
            criterion=criterion,
            random_state=42
        )

        model.fit(X_train_fold, y_train_fold)
        y_val_pred = model.predict(X_val_fold)
        # Square root of the MSE averaged over all targets.
        val_rmse = math.sqrt(mean_squared_error(y_val_fold, y_val_pred))

        if val_rmse < best_rmse:
            best_rmse = val_rmse
            decisionTree_best_model = model
            decisionTree_best_train_input = X_train_fold
            decisionTree_best_train_output = y_train_fold
            decisionTree_best_valid_input = X_val_fold
            decisionTree_best_valid_output = y_val_fold
            decisionTree_best_param = {
                'max_depth': max_depth,
                'min_samples_leaf': min_samples_leaf,
                'criterion': criterion
            }
            print(f'best rmse : {best_rmse}, best param : {decisionTree_best_param}')

if decisionTree_best_model is None:
    raise RuntimeError('DecisionTree search finished without selecting a model')

# Predict on the test set
decisionTree_pred = decisionTree_best_model.predict(X_test_scaled)

# Bundle the model, the fold data it was selected on, the test data and the
# predictions under the project's Model/Data/Date keys.
data_to_save = {
    Model.MODEL: decisionTree_best_model,
    Model.BEST_PARAMETER: decisionTree_best_param,
    Data.TRAIN_INPUT_DATA: decisionTree_best_train_input,
    Data.TRAIN_OUTPUT_DATA: decisionTree_best_train_output,
    Data.VALID_INPUT_DATA: decisionTree_best_valid_input,
    Data.VALID_OUTPUT_DATA: decisionTree_best_valid_output,
    Data.TEST_INPUT_DATA: X_test_scaled,
    Data.TEST_OUTPUT_DATA: y_test,
    Data.PREDICTED_OUTPUT_DATA: decisionTree_pred,
    Date.DATE: date_df,
}

path = f'result_model_fold/{area}'
file_path = f'{path}/DecisionTree_model_with_{area}.pkl'

os.makedirs(path, exist_ok=True)
with open(file_path, 'wb') as f:
    pickle.dump(data_to_save, f)

fold : 1, max_depth : 2, min_samples_leaf : 5, criterion : squared_error
best rmse : 5.662327173926173, best param : {'max_depth': 2, 'min_samples_leaf': 5, 'criterion': 'squared_error'}
fold : 1, max_depth : 2, min_samples_leaf : 10, criterion : squared_error
fold : 1, max_depth : 2, min_samples_leaf : 15, criterion : squared_error
fold : 1, max_depth : 2, min_samples_leaf : 20, criterion : squared_error
fold : 1, max_depth : 4, min_samples_leaf : 5, criterion : squared_error
best rmse : 4.7555328557846295, best param : {'max_depth': 4, 'min_samples_leaf': 5, 'criterion': 'squared_error'}
fold : 1, max_depth : 4, min_samples_leaf : 10, criterion : squared_error
fold : 1, max_depth : 4, min_samples_leaf : 15, criterion : squared_error
fold : 1, max_depth : 4, min_samples_leaf : 20, criterion : squared_error
fold : 1, max_depth : 8, min_samples_leaf : 5, criterion : squared_error
best rmse : 4.339781877668118, best param : {'max_depth': 8, 'min_samples_leaf': 5, 'criterion': 'squared_er

ExtraTree 모델 학습

In [25]:
# Extra-trees: exhaustive hyperparameter search over 5 shuffled folds.
# Each parameter combination is fitted on every fold's training part and
# scored on that fold's validation part; the single (fold, parameters) pair
# with the lowest validation RMSE is kept, evaluated on the test set and
# pickled. Any exception is caught and reported with a one-line message so
# the rest of the notebook can keep running; nothing is saved in that case.
try:
    # Parameter ranges
    param_range_n_estimators = [100, 200, 300]
    param_range_max_depth = [3, 5, 7]
    param_range_min_samples_leaf = [1, 2, 5, 10]
    param_range_min_samples_split = [2, 5, 10, 20]

    # Every combination, in the order four nested loops would visit them
    # (the last parameter varies fastest).
    extra_param_combos = [
        (n_estimators, max_depth, min_samples_leaf, min_samples_split)
        for n_estimators in param_range_n_estimators
        for max_depth in param_range_max_depth
        for min_samples_leaf in param_range_min_samples_leaf
        for min_samples_split in param_range_min_samples_split
    ]

    # Start from +inf so the first evaluated model always becomes the current
    # best, whatever the scale of the targets.
    best_rmse = float('inf')
    extra_best_model = None
    extra_best_param = None
    extra_best_train_input = extra_best_train_output = None
    extra_best_valid_input = extra_best_valid_output = None

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    for fold_cnt, (train_index, val_index) in enumerate(kf.split(X_train_scaled), start=1):
        X_train_fold, X_val_fold = X_train_scaled.iloc[train_index], X_train_scaled.iloc[val_index]
        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

        for n_estimators, max_depth, min_samples_leaf, min_samples_split in extra_param_combos:
            # Build the model
            print(f'fold : {fold_cnt}, n_estimators : {n_estimators}, max_depth : {max_depth}, min_samples_leaf : {min_samples_leaf}, min_samples_split : {min_samples_split}')
            model = ExtraTreesRegressor(
                n_estimators=n_estimators,
                max_depth=max_depth,
                min_samples_leaf=min_samples_leaf,
                min_samples_split=min_samples_split,
                random_state=42
            )

            # Train the model
            model.fit(X_train_fold, y_train_fold)

            # Predict on the validation data
            y_val_pred = model.predict(X_val_fold)
            # Mean of the per-target RMSEs. Same value as
            # mean_squared_error(..., squared=False), but without the
            # `squared` argument, which newer scikit-learn releases no longer
            # accept.
            val_rmse = np.sqrt(
                mean_squared_error(y_val_fold, y_val_pred, multioutput='raw_values')
            ).mean()

            # Keep the best-performing model
            if val_rmse < best_rmse:
                best_rmse = val_rmse
                extra_best_model = model
                extra_best_train_input = X_train_fold
                extra_best_train_output = y_train_fold
                extra_best_valid_input = X_val_fold
                extra_best_valid_output = y_val_fold
                extra_best_param = {
                    'n_estimators': n_estimators,
                    'max_depth': max_depth,
                    'min_samples_leaf': min_samples_leaf,
                    'min_samples_split': min_samples_split
                }
                print(f'best rmse : {best_rmse}, best param : {extra_best_param}')

    if extra_best_model is None:
        raise RuntimeError('ExtraTrees search finished without selecting a model')

    # Predict on the test set. The DataFrame is passed as-is so the input
    # carries the same column names the model was fitted with.
    extra_best_prediction = extra_best_model.predict(X_test_scaled)

    # Bundle the model, the fold data it was selected on, the test data and
    # the predictions under the project's Model/Data/Date keys.
    data_to_save = {
        Model.MODEL: extra_best_model,
        Model.BEST_PARAMETER: extra_best_param,
        Data.TRAIN_INPUT_DATA: extra_best_train_input,
        Data.TRAIN_OUTPUT_DATA: extra_best_train_output,
        Data.VALID_INPUT_DATA: extra_best_valid_input,
        Data.VALID_OUTPUT_DATA: extra_best_valid_output,
        Data.TEST_INPUT_DATA: X_test_scaled,
        Data.TEST_OUTPUT_DATA: y_test,
        Data.PREDICTED_OUTPUT_DATA: extra_best_prediction,
        Date.DATE: date_df,
    }

    path = f'result_model_fold/{area}'
    file_path = f'{path}/ExtraTree_model_with_{area}.pkl'

    os.makedirs(path, exist_ok=True)
    with open(file_path, 'wb') as f:
        pickle.dump(data_to_save, f)
except Exception as e:
    # Best-effort cell: report the failure and let later cells run.
    print(f"오류 발생: {e}")

fold : 1, n_estimators : 100, max_depth : 3, min_samples_leaf : 1, min_samples_split : 2
best rmse : 4.397273952583422, best param : {'n_estimators': 100, 'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 2}
fold : 1, n_estimators : 100, max_depth : 3, min_samples_leaf : 1, min_samples_split : 5
fold : 1, n_estimators : 100, max_depth : 3, min_samples_leaf : 1, min_samples_split : 10
fold : 1, n_estimators : 100, max_depth : 3, min_samples_leaf : 1, min_samples_split : 20
fold : 1, n_estimators : 100, max_depth : 3, min_samples_leaf : 2, min_samples_split : 2
fold : 1, n_estimators : 100, max_depth : 3, min_samples_leaf : 2, min_samples_split : 5
fold : 1, n_estimators : 100, max_depth : 3, min_samples_leaf : 2, min_samples_split : 10
fold : 1, n_estimators : 100, max_depth : 3, min_samples_leaf : 2, min_samples_split : 20
fold : 1, n_estimators : 100, max_depth : 3, min_samples_leaf : 5, min_samples_split : 2
fold : 1, n_estimators : 100, max_depth : 3, min_samples_leaf : 5,



GradientBoosting 모델 학습

In [29]:
# Gradient boosting (one regressor per target via MultiOutputRegressor):
# exhaustive hyperparameter search over 5 shuffled folds. Each parameter
# combination is fitted on every fold's training part and scored on that
# fold's validation part; the single (fold, parameters) pair with the lowest
# validation RMSE is kept, evaluated on the test set and pickled. Any
# exception is caught and reported with a one-line message so the rest of the
# notebook can keep running; nothing is saved in that case.
try:

    # Start from +inf so the first evaluated model always becomes the current
    # best, whatever the scale of the targets.
    best_rmse = float('inf')
    gradientBoosting_best_model = None
    gradientBoosting_best_param = None
    gradientBoosting_predictions = None
    gradientBoosting_best_train_input = gradientBoosting_best_train_output = None
    gradientBoosting_best_valid_input = gradientBoosting_best_valid_output = None

    n_estimators_list = [100, 200]
    max_depth_list = [3, 5, 7]
    min_samples_split_list = [2, 5]
    min_samples_leaf_list = [1, 2, 4, 6]

    # Every combination, in the order four nested loops would visit them
    # (the last parameter varies fastest).
    gradientBoosting_param_combos = [
        (n_estimators, max_depth, min_samples_split, min_samples_leaf)
        for n_estimators in n_estimators_list
        for max_depth in max_depth_list
        for min_samples_split in min_samples_split_list
        for min_samples_leaf in min_samples_leaf_list
    ]

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    for fold_cnt, (train_index, val_index) in enumerate(kf.split(X_train_scaled), start=1):
        X_train_fold, X_val_fold = X_train_scaled.iloc[train_index], X_train_scaled.iloc[val_index]
        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

        for n_estimators, max_depth, min_samples_split, min_samples_leaf in gradientBoosting_param_combos:
            print(f'fold : {fold_cnt}, n_estimators : {n_estimators}, max_depth : {max_depth}, min_samples_split : {min_samples_split}, min_samples_leaf : {min_samples_leaf}')
            model = MultiOutputRegressor(
                GradientBoostingRegressor(
                    n_estimators=n_estimators,
                    max_depth=max_depth,
                    min_samples_split=min_samples_split,
                    min_samples_leaf=min_samples_leaf,
                    # Fixed seed, as for every other model in this notebook,
                    # so that re-running the cell reproduces the same search.
                    random_state=42
                )
            )

            # Train the model
            model.fit(X_train_fold, y_train_fold)

            # Predict on the validation data
            val_predictions = model.predict(X_val_fold)
            # Square root of the MSE averaged over all targets.
            rmse = math.sqrt(mean_squared_error(y_val_fold, val_predictions))

            # Keep the model with the smallest RMSE
            if rmse < best_rmse:
                best_rmse = rmse
                gradientBoosting_best_train_input = X_train_fold
                gradientBoosting_best_train_output = y_train_fold
                gradientBoosting_best_valid_input = X_val_fold
                gradientBoosting_best_valid_output = y_val_fold
                gradientBoosting_best_model = model
                gradientBoosting_best_param = {
                    'n_estimators': n_estimators,
                    'max_depth': max_depth,
                    'min_samples_split': min_samples_split,
                    'min_samples_leaf': min_samples_leaf
                }
                print(f'best rmse : {best_rmse}, best param : {gradientBoosting_best_param}')

    if gradientBoosting_best_model is None:
        raise RuntimeError('GradientBoosting search finished without selecting a model')

    # Predict on the test data
    gradientBoosting_predictions = gradientBoosting_best_model.predict(X_test_scaled)


    # Save the results
    data_to_save = {
        Model.MODEL: gradientBoosting_best_model,
        Model.BEST_PARAMETER: gradientBoosting_best_param,
        Data.TRAIN_INPUT_DATA: gradientBoosting_best_train_input,
        Data.TRAIN_OUTPUT_DATA: gradientBoosting_best_train_output,
        Data.VALID_INPUT_DATA: gradientBoosting_best_valid_input,
        Data.VALID_OUTPUT_DATA: gradientBoosting_best_valid_output,
        Data.TEST_INPUT_DATA: X_test_scaled,
        Data.TEST_OUTPUT_DATA: y_test,
        Data.PREDICTED_OUTPUT_DATA: gradientBoosting_predictions,
        Date.DATE: date_df,
    }

    path = f'result_model_fold/{area}'
    file_path = f'{path}/GradientBoosting_model_with_{area}.pkl'

    os.makedirs(path, exist_ok=True)
    with open(file_path, 'wb') as f:
        pickle.dump(data_to_save, f)

except Exception as e:
    # Best-effort cell: report the failure and let later cells run.
    print(f"오류 발생: {e}")

fold : 1, n_estimators : 100, max_depth : 3, min_samples_split : 2, min_samples_leaf : 1
best rmse : 3.0288973981390805, best param : {'n_estimators': 100, 'max_depth': 3, 'min_samples_split': 2, 'min_samples_leaf': 1}
fold : 1, n_estimators : 100, max_depth : 3, min_samples_split : 2, min_samples_leaf : 2
best rmse : 3.0249611300651083, best param : {'n_estimators': 100, 'max_depth': 3, 'min_samples_split': 2, 'min_samples_leaf': 2}
fold : 1, n_estimators : 100, max_depth : 3, min_samples_split : 2, min_samples_leaf : 4
best rmse : 3.008250607447053, best param : {'n_estimators': 100, 'max_depth': 3, 'min_samples_split': 2, 'min_samples_leaf': 4}
fold : 1, n_estimators : 100, max_depth : 3, min_samples_split : 2, min_samples_leaf : 6
fold : 1, n_estimators : 100, max_depth : 3, min_samples_split : 5, min_samples_leaf : 1
fold : 1, n_estimators : 100, max_depth : 3, min_samples_split : 5, min_samples_leaf : 2
fold : 1, n_estimators : 100, max_depth : 3, min_samples_split : 5, min_sampl

Xgboost 모델 학습

In [27]:
# XGBoost: exhaustive hyperparameter search over 5 shuffled folds.
# Each parameter combination is fitted on every fold's training part and
# scored on that fold's validation part; the single (fold, parameters) pair
# with the lowest validation RMSE is kept, evaluated on the test set and
# pickled. Any exception is caught and reported with a one-line message so
# the rest of the notebook can keep running; nothing is saved in that case.
try:
    n_estimators_list = [100, 200, 300]
    max_depth_list = [3, 5, 7, 9]
    min_child_weight_list = [1, 2, 3, 5]
    learning_rate_list = [0.01, 0.1, 0.2, 0.3, 0.5]
    subsample_list = [0.5, 0.6, 0.7, 0.8, 0.9]

    # Every combination, in the order five nested loops would visit them
    # (the last parameter varies fastest).
    xgboost_param_combos = [
        (n_estimators, max_depth, min_child_weight, learning_rate, subsample)
        for n_estimators in n_estimators_list
        for max_depth in max_depth_list
        for min_child_weight in min_child_weight_list
        for learning_rate in learning_rate_list
        for subsample in subsample_list
    ]

    # Cross-validation setup using KFold
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    best_rmse = float("inf")
    xgboost_best_model = None
    xgboost_best_param = None
    xgboost_best_train_input = xgboost_best_train_output = None
    xgboost_best_valid_input = xgboost_best_valid_output = None

    # Loop over folds, then over hyperparameter combinations
    for fold_cnt, (train_index, val_index) in enumerate(kf.split(X_train_scaled), start=1):
        X_train_fold, X_val_fold = X_train_scaled.iloc[train_index], X_train_scaled.iloc[val_index]
        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

        for n_estimators, max_depth, min_child_weight, learning_rate, subsample in xgboost_param_combos:
            print(f'fold : {fold_cnt}, n_estimators : {n_estimators}, max_depth : {max_depth}, min_child_weight : {min_child_weight}, learning_rate : {learning_rate}, subsample : {subsample}')
            model = xgboost.XGBRegressor(
                n_estimators=n_estimators,
                max_depth=max_depth,
                min_child_weight=min_child_weight,
                learning_rate=learning_rate,
                subsample=subsample,
                random_state=42
            )

            model.fit(X_train_fold, y_train_fold)
            val_predictions = model.predict(X_val_fold)
            # Mean of the per-target RMSEs. Same value as
            # mean_squared_error(..., squared=False), but without the
            # `squared` argument, which newer scikit-learn releases no longer
            # accept.
            valid_rmse = np.sqrt(
                mean_squared_error(y_val_fold, val_predictions, multioutput='raw_values')
            ).mean()

            # Keep the best model
            if valid_rmse < best_rmse:
                best_rmse = valid_rmse
                xgboost_best_train_input = X_train_fold
                xgboost_best_train_output = y_train_fold
                xgboost_best_valid_input = X_val_fold
                xgboost_best_valid_output = y_val_fold
                xgboost_best_model = model
                xgboost_best_param = {
                    'n_estimators': n_estimators,
                    'max_depth': max_depth,
                    'min_child_weight': min_child_weight,
                    'learning_rate': learning_rate,
                    'subsample': subsample
                }
                print(f'best rmse : {best_rmse}, best param : {xgboost_best_param}')

    if xgboost_best_model is None:
        raise RuntimeError('XGBoost search finished without selecting a model')

    # Generic alias retained in case a later cell still reads `best_model`.
    best_model = xgboost_best_model

    # Predict the test data with the best model (the same object that is saved)
    xgboost_predictions = xgboost_best_model.predict(X_test_scaled)

    # Bundle the model, the fold data it was selected on, the test data and
    # the predictions under the project's Model/Data/Date keys.
    data_to_save = {
        Model.MODEL: xgboost_best_model,
        Model.BEST_PARAMETER: xgboost_best_param,
        Data.TRAIN_INPUT_DATA: xgboost_best_train_input,
        Data.TRAIN_OUTPUT_DATA: xgboost_best_train_output,
        Data.VALID_INPUT_DATA: xgboost_best_valid_input,
        Data.VALID_OUTPUT_DATA: xgboost_best_valid_output,
        Data.TEST_INPUT_DATA: X_test_scaled,
        Data.TEST_OUTPUT_DATA: y_test,
        Data.PREDICTED_OUTPUT_DATA: xgboost_predictions,
        Date.DATE: date_df,
    }

    path = f'result_model_fold/{area}'
    file_path = f'{path}/Xgboost_model_with_{area}.pkl'

    os.makedirs(path, exist_ok=True)
    with open(file_path, 'wb') as f:
        pickle.dump(data_to_save, f)
except Exception as e:
    # Best-effort cell: report the failure and let later cells run.
    print(f"오류 발생: {e}")

fold : 1, n_estimators : 100, max_depth : 3, min_child_weight : 1, learning_rate : 0.01, subsample : 0.5
best rmse : 5.64080273534715, best param : {'n_estimators': 100, 'max_depth': 3, 'min_child_weight': 1, 'learning_rate': 0.01, 'subsample': 0.5}
fold : 1, n_estimators : 100, max_depth : 3, min_child_weight : 1, learning_rate : 0.01, subsample : 0.6
fold : 1, n_estimators : 100, max_depth : 3, min_child_weight : 1, learning_rate : 0.01, subsample : 0.7
fold : 1, n_estimators : 100, max_depth : 3, min_child_weight : 1, learning_rate : 0.01, subsample : 0.8
fold : 1, n_estimators : 100, max_depth : 3, min_child_weight : 1, learning_rate : 0.01, subsample : 0.9
fold : 1, n_estimators : 100, max_depth : 3, min_child_weight : 1, learning_rate : 0.1, subsample : 0.5
best rmse : 2.851791205269709, best param : {'n_estimators': 100, 'max_depth': 3, 'min_child_weight': 1, 'learning_rate': 0.1, 'subsample': 0.5}
fold : 1, n_estimators : 100, max_depth : 3, min_child_weight : 1, learning_rate 