In [1]:
import torch.nn as nn

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# MLP 모델 정의
class MLP(nn.Module):
    def __init__(self, input_size, hidden_layers, output_size):
        super(MLP, self).__init__()
        layers = []
        in_size = input_size
        for h in hidden_layers:
            layers.append(nn.Linear(in_size, h))
            layers.append(nn.ReLU())
            in_size = h
        layers.append(nn.Linear(in_size, output_size))
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        return self.network(x)

In [2]:
import os
import math
import pickle
import numpy as np
import pandas as pd
import torch
import torch.optim as optim
import torch.nn as nn

from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
import xgboost

from torch.utils.data import DataLoader, TensorDataset

from utils.weather_api import WeatherApi
from utils.common_function import splitData
from enums.enums import Model, Date, Data, Rmse

weatherApi = WeatherApi();
area = 'Swanton_OH'
# Swanton_OH
X, y = weatherApi.get_weather_data_from_excel(area)
X_train, X_test, y_train, y_test = splitData(X, y, 365)

# MinMaxScaler 적용
scaler = MinMaxScaler()
if 'date' in X_train.columns:
    X_train = X_train.drop(columns=['date'])
    X_test = X_test.drop(columns=['date'])
if 'date' in y_train.columns[0]:
    y_train = y_train.drop(columns=[y_train.columns[0]])
    y_test = y_test.drop(columns=[y_test.columns[0]])

# MinMaxScaler 적용
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

date_range = pd.date_range(start='2023-08-01', end='2024-07-30')
date_df = pd.DataFrame(date_range, columns=['date'])


RandomForest 모델 학습

In [27]:
try:
    params = {
        'n_estimators':(100, 101),
        'max_depth' : (16, 17),
        'min_samples_leaf' : (5,6),
        'min_samples_split' : (5,6)
    }

    rf = RandomForestRegressor(random_state=0)
    rf_model = GridSearchCV(estimator=rf, param_grid=params, cv=10, n_jobs=-1)
    rf_model_result = rf_model.fit(X_train_scaled, y_train)

    rf_best_model = rf_model_result.best_estimator_
    rf_predict = rf_best_model.predict(X_test_scaled)

    data_to_save = {
        Model.MODEL.value: rf_best_model,
        Data.TRAIN_INPUT_DATA.value: X_train_scaled,
        Data.TRAIN_OUTPUT_DATA.value: X_test_scaled,
        Data.VALID_INPUT_DATA.value: [],
        Data.VALID_OUTPUT_DATA.value: [],
        Data.TEST_INPUT_DATA.value: y_train,
        Data.TEST_OUTPUT_DATA.value: y_test,
        Data.PREDICTED_OUTPUT_DATA.value: pd.DataFrame(rf_predict, columns=y_test.columns),
        Rmse.BEST_RMSE.value: math.sqrt(mean_squared_error(rf_predict, y_test.to_numpy())),
        Date.DATE.value: date_df
    }

    path = f'result_model/{area}'
    file_path = f'{path}/RF_model_with_{area}.pkl'

    os.makedirs(path, exist_ok=True)
    with open(file_path, 'wb') as f:
        pickle.dump(data_to_save, f)
except Exception as e:
    print(f"오류 발생: {e}")


MLP 모델 학습

In [3]:
try:
    scaler = MinMaxScaler()
    X_train_mlp_scaled = scaler.fit_transform(X_train)
    X_test_mlp_scaled = scaler.transform(X_test)

    X_train_tensor = torch.tensor(X_train_mlp_scaled, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)
    test_dataset = TensorDataset(torch.tensor(X_test_mlp_scaled, dtype=torch.float32), torch.tensor(y_test.values, dtype=torch.float32))
    test_loader = DataLoader(test_dataset, batch_size=5, shuffle=False)

    min_i_value = 0
    min_j_value = 0
    mlp_best_rmse = 100
    # 최적 모델 구성에서 사용된 데이터 저장 변수 초기화
    best_train_input_data = None
    best_train_output_data = None
    best_valid_input_data = None
    best_valid_output_data = None
    best_test_input_data = None
    best_test_output_data = None
    best_valid_predictions = None
    best_test_predictions = None
    mlp_best_rmse = float('inf')

    moving_valid_rmse = []
    moving_test_rmse = []
    mlp_best_model = None

    kf = KFold(n_splits=10, shuffle=False)
    for fold, (train_index, valid_index) in enumerate(kf.split(X_train_mlp_scaled)):

        # Split the data
        X_train_fold, X_valid_fold = X_train_tensor[train_index], X_train_tensor[valid_index]
        y_train_fold, y_valid_fold = y_train_tensor[train_index], y_train_tensor[valid_index]

        # TensorDataset, DataLoader로 변경
        train_dataset = TensorDataset(X_train_fold, y_train_fold)
        valid_dataset = TensorDataset(X_valid_fold, y_valid_fold)
        train_loader = DataLoader(train_dataset, batch_size=3, shuffle=True)
        valid_loader = DataLoader(valid_dataset, batch_size=3, shuffle=False)

        for i in [128, 256, 512]:
            for j in  [4, 8, 16, 32]:
                if(i > j):
                    model = MLP(input_size=len(X_train.columns), hidden_layers=[i, j], output_size=len(y_train.columns))
                    criterion = nn.MSELoss()
                    optimizer = optim.Adam(model.parameters(), lr=0.001)

                    num_epochs = 1000
                    best_loss = float('inf')
                    epochs_no_improve = 0
                    early_stop = False
                    print(f'processing start with fold in {fold}, i in {i} and j in {j}')
                    for epoch in range(num_epochs):
                        model.train()
                        for inputs, labels in train_loader:
                            optimizer.zero_grad()
                            outputs = model(inputs)
                            loss = criterion(outputs, labels)
                            loss.backward()
                            optimizer.step()

                        # Early stopping
                        model.eval()
                        val_loss = 0
                        with torch.no_grad():
                            for inputs, labels in valid_loader:
                                outputs = model(inputs)
                                val_loss += criterion(outputs, labels).item()

                        val_loss /= len(valid_loader)

                        # print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}, Val Loss: {val_loss}')

                        # 연속적으로 best_loss보다 개선되지 않는 횟수
                        if val_loss < best_loss:
                            best_loss = val_loss
                            epochs_no_improve = 0
                        else:
                            epochs_no_improve += 1
                        # patience만큼 개선이 없으면 종료
                        if epochs_no_improve >= 50:
                            print(f'Early stopping after {epoch+1} epochs')
                            early_stop = True
                            break
                                # Model evaluation

                    model.eval()
                    valid_predictions = []
                    valid_true_values = []
                    with torch.no_grad():
                        for inputs, labels in valid_loader:
                            outputs = model(inputs)
                            valid_predictions.append(outputs.numpy())
                            valid_true_values.append(labels.numpy())

                    valid_predictions = np.vstack(valid_predictions)
                    valid_true_values = np.vstack(valid_true_values)

                    valid_rmse = mean_squared_error(valid_true_values, valid_predictions, squared=True)
                    moving_valid_rmse.append(valid_rmse)
                    if(mlp_best_rmse > valid_rmse):
                        mlp_best_model = model
                        mlp_best_rmse = valid_rmse
                        test_predictions = []
                        test_true_values = []

                        with torch.no_grad():
                            for inputs, labels in test_loader:
                                outputs = model(inputs)
                                test_predictions.append(outputs.numpy())
                                test_true_values.append(labels.numpy())
                        test_predictions = np.vstack(test_predictions)
                        test_true_values = np.vstack(test_true_values)
                        test_rmse = mean_squared_error(test_true_values, test_predictions, squared=False)
                        best_train_input_data = pd.DataFrame(train_dataset.tensors[0].numpy(), columns=[f'{i}' for i in X.columns])
                        best_train_output_data = pd.DataFrame(train_dataset.tensors[1].numpy(), columns=[f'{i}' for i in y.columns])
                        best_valid_input_data = pd.DataFrame(valid_dataset.tensors[0].numpy(), columns=[f'{i}' for i in X.columns])
                        best_valid_output_data = pd.DataFrame(valid_dataset.tensors[1].numpy(), columns=[f'{i}' for i in y.columns])
                        best_test_input_data = pd.DataFrame(test_dataset.tensors[0].numpy(), columns=[f'{i}' for i in X.columns])
                        best_test_output_data = pd.DataFrame(test_dataset.tensors[1].numpy(), columns=[f'{i}' for i in y.columns])
                        best_test_predictions = pd.DataFrame(test_predictions, columns=[f'{i}' for i in y.columns])

                        print(f'first node : {i}, snd node : {j},  fold : {fold}, valid_rmse : {valid_rmse}, test_rmse : {test_rmse}')
                    print(f'processing end with  fold in {fold} i in {i} and j in {j}')

    data_to_save = {
        Model.MODEL.value: mlp_best_model,
        Data.TRAIN_INPUT_DATA.value: best_train_input_data,
        Data.TRAIN_OUTPUT_DATA.value: best_train_output_data,
        Data.VALID_INPUT_DATA.value: best_valid_input_data,
        Data.VALID_OUTPUT_DATA.value: best_valid_output_data,
        Data.TEST_INPUT_DATA.value: best_test_input_data,
        Data.TEST_OUTPUT_DATA.value: best_test_output_data,
        Data.PREDICTED_OUTPUT_DATA.value: best_test_predictions,
        Rmse.BEST_RMSE: mlp_best_rmse,
        Date.DATE.value: date_df,
    }

    path = f'result_model/{area}'
    file_path = f'{path}/MLP_model_with_{area}.pkl'

    os.makedirs(path, exist_ok=True)
    with open(file_path, 'wb') as f:
        pickle.dump(data_to_save, f)
except Exception as e:
    print(f"오류 발생: {e}")

processing start with fold in 0, i in 128 and j in 4


KeyboardInterrupt: 

Adaboost 모델 학습

In [29]:
try:
    param_grid = {
        'estimator__n_estimators': [50],
        'estimator__learning_rate': [0.1],
        'estimator__estimator__max_depth': [5]
    }

    base_ada = AdaBoostRegressor(estimator=DecisionTreeRegressor())
    fit_model = MultiOutputRegressor(base_ada)

    adaboost_model = GridSearchCV(estimator=fit_model, param_grid=param_grid, cv=10)
    adaboost_model_result = adaboost_model.fit(X_train_scaled, y_train)

    Adaboost_best_model = adaboost_model_result.best_estimator_

    Adaboost_predictions = Adaboost_best_model.predict(X_test)

    # Calculate and print RMSE
    Adaboost_best_rmse = math.sqrt(mean_squared_error(y_test, Adaboost_predictions))

    data_to_save = {
        Model.MODEL.value: Adaboost_best_model,
        Data.TRAIN_INPUT_DATA.value: X_train_scaled,
        Data.TRAIN_OUTPUT_DATA.value: X_test_scaled,
        Data.VALID_INPUT_DATA.value: [],
        Data.VALID_OUTPUT_DATA.value: [],
        Data.TEST_INPUT_DATA.value: y_train,
        Data.TEST_OUTPUT_DATA.value: y_test,
        Data.PREDICTED_OUTPUT_DATA.value: Adaboost_predictions,
        Rmse.BEST_RMSE: Adaboost_best_rmse,
        Date.DATE.value: date_df,
    }

    path = f'result_model/{area}'
    file_path = f'{path}/Adaboost_model_with_{area}.pkl'

    os.makedirs(path, exist_ok=True)
    with open(file_path, 'wb') as f:
        pickle.dump(data_to_save, f)
except Exception as e:
    print(f"오류 발생: {e}")

DecisionTree 모델 학습

In [30]:
try:
    pipe_tree = make_pipeline(DecisionTreeRegressor(random_state=2021))
    # 트리의 파라미터 키값 확인
    pipe_tree.get_params().keys()

    param_range1 = [1, 3, 5, 7, 9, 11]
    param_range2 = [5, 10, 15, 20]
    param_range3 = ['friedman_mse', 'absolute_error', 'poisson', 'squared_error'] # 'explained_variance'도 가능

    param_grid = [{'decisiontreeregressor__max_depth': param_range1,
                'decisiontreeregressor__min_samples_leaf': param_range2,
                'decisiontreeregressor__criterion': param_range3}]

    decisionTree_model = GridSearchCV(
        estimator = pipe_tree,
        param_grid = param_grid,
        scoring = 'neg_mean_squared_error',
        n_jobs= -1,
        cv=10
    )

    decisionTree_model_result = decisionTree_model.fit(X_train_scaled, y_train)
    decisionTree_best_model = decisionTree_model_result.best_estimator_

    decisionTree__pred = decisionTree_best_model.predict(X_test_scaled)

    data_to_save = {
        Model.MODEL.value: decisionTree_best_model,
        Data.TRAIN_INPUT_DATA.value: X_train_scaled,
        Data.TRAIN_OUTPUT_DATA.value: X_test_scaled,
        Data.VALID_INPUT_DATA.value: [],
        Data.VALID_OUTPUT_DATA.value: [],
        Data.TEST_INPUT_DATA.value: y_train,
        Data.TEST_OUTPUT_DATA.value: y_test,
        Data.PREDICTED_OUTPUT_DATA.value: decisionTree__pred,
        Rmse.BEST_RMSE: math.sqrt(mean_squared_error(decisionTree__pred, y_test)),
        Date.DATE.value: date_df,
    }

    path = f'result_model/{area}'
    file_path = f'{path}/DecisionTree_model_with_{area}.pkl'

    os.makedirs(path, exist_ok=True)
    with open(file_path, 'wb') as f:
        pickle.dump(data_to_save, f)
except Exception as e:
    print(f"오류 발생: {e}")

120 fits failed out of a total of 480.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
120 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/sw.jin/Documents/projects/wf/workspace/venv/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/sw.jin/Documents/projects/wf/workspace/venv/lib/python3.12/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/sw.jin/Documents/projects/wf/workspace/venv/lib/python3.12/site-packages/sklearn/pipeline.py", line 473, in fit
    self._final_estim

ExtraTree 모델 학습

In [31]:
try:
    tunning_model = ExtraTreesRegressor()
    gsc = GridSearchCV(
        estimator=tunning_model,
        param_grid={
            'n_estimators': range(100, 200, 100),
            'max_features': range(10,20,10),
            'min_samples_leaf': range(5,10,5),
            'min_samples_split': range(5,10,5),
        },
        cv=10
    )
    extra_best_model_result = gsc.fit(X_train_scaled.to_numpy(), y_train.to_numpy())
    extra_best_model = extra_best_model_result.best_estimator_

    extra_best_prediction = extra_best_model.predict(X_test_scaled.to_numpy())

    data_to_save = {
        Model.MODEL.value: extra_best_model,
        Data.TRAIN_INPUT_DATA.value: X_train_scaled,
        Data.TRAIN_OUTPUT_DATA.value: X_test_scaled,
        Data.VALID_INPUT_DATA.value: [],
        Data.VALID_OUTPUT_DATA.value: [],
        Data.TEST_INPUT_DATA.value: y_train,
        Data.TEST_OUTPUT_DATA.value: y_test,
        Data.PREDICTED_OUTPUT_DATA.value: extra_best_prediction,
        Rmse.BEST_RMSE: math.sqrt(mean_squared_error(extra_best_prediction, y_test)),
        Date.DATE.value: date_df,
    }

    path = f'result_model/{area}'
    file_path = f'{path}/ExtraTree_model_with_{area}.pkl'

    os.makedirs(path, exist_ok=True)
    with open(file_path, 'wb') as f:
        pickle.dump(data_to_save, f)
except Exception as e:
    print(f"오류 발생: {e}")

RMSE':3.407599731908794


GradientBoosting 모델 학습

In [32]:
try:
    gradientBoosting = GradientBoostingRegressor()

    param_grid = {
        'n_estimators': [100, 200, 300, 500],
        'max_depth': [3, 4, 5],
        'min_samples_split': [2, 3],
        'min_samples_leaf': [1, 2]
    }
    gradientBoosting_model_result = GridSearchCV(
        MultiOutputRegressor(gradientBoosting),
        param_grid,
        scoring='neg_mean_squared_error',
        cv=10,
        n_jobs=-1
    )
    gradientBoosting_model_result.fit(X_train_scaled, y_train)
    gradientBoosting_best_model = gradientBoosting_model_result.best_estimator_

    gradientBoosting_predictions = gradientBoosting_best_model.predict(X_test_scaled)

    # Save the results
    data_to_save = {
        Model.MODEL.value: gradientBoosting_best_model,
        Data.TRAIN_INPUT_DATA.value: X_train_scaled,
        Data.TRAIN_OUTPUT_DATA.value: y_train,
        Data.VALID_INPUT_DATA.value: [],
        Data.VALID_OUTPUT_DATA.value: [],
        Data.TEST_INPUT_DATA.value: X_test_scaled,
        Data.TEST_OUTPUT_DATA.value: y_test,
        Data.PREDICTED_OUTPUT_DATA.value: gradientBoosting_predictions,
        Rmse.BEST_RMSE: math.sqrt(mean_squared_error(y_test, gradientBoosting_predictions)),
        Date.DATE.value: date_df,
    }

    path = f'result_model/{area}'
    file_path = f'{path}/GradientBoosting_model_with_{area}.pkl'

    os.makedirs(path, exist_ok=True)
    with open(file_path, 'wb') as f:
        pickle.dump(data_to_save, f)

except Exception as e:
    print(f"오류 발생: {e}")

Xgboost 모델 학습

In [33]:
try:
    params = {
        'max_depth':[5,7],
        'min_child_weight':[1,3],
        'colsample_bytree':[0.5,0.75]
    }
    tunning_model = xgboost.XGBRegressor()
    xgboost_model = GridSearchCV(
        estimator=tunning_model,
        param_grid=params,
        scoring='r2',
        cv=10
    )
    xgboost_result = xgboost_model.fit(X_train_scaled, y_train)
    xgboost_best_model = xgboost_result.best_estimator_
    xgboost_predictions = xgboost_best_model.predict(X_test_scaled)

    data_to_save = {
        Model.MODEL.value: xgboost_best_model,
        Data.TRAIN_INPUT_DATA.value: X_train_scaled,
        Data.TRAIN_OUTPUT_DATA.value: X_test_scaled,
        Data.VALID_INPUT_DATA.value: [],
        Data.VALID_OUTPUT_DATA.value: [],
        Data.TEST_INPUT_DATA.value: y_train,
        Data.TEST_OUTPUT_DATA.value: y_test,
        Data.PREDICTED_OUTPUT_DATA.value: xgboost_predictions,
        Rmse.BEST_RMSE: math.sqrt(mean_squared_error(xgboost_predictions, y_test)),
        Date.DATE.value: date_df,
    }

    path = f'result_model/{area}'
    file_path = f'{path}/Xgboost_model_with_{area}.pkl'

    os.makedirs(path, exist_ok=True)
    with open(file_path, 'wb') as f:
        pickle.dump(data_to_save, f)
except Exception as e:
    print(f"오류 발생: {e}")