In [None]:
# 클라우드 런에서 돌려야함

In [19]:
import os
import time
import mlflow
import random
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
from sklearn.neighbors import KNeighborsRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from keras.wrappers.scikit_learn import KerasRegressor
from keras.callbacks import EarlyStopping
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV, KFold, cross_val_predict  
import optuna
from optuna.integration import MLflowCallback
import joblib
import warnings
warnings.filterwarnings('ignore')

In [2]:
seed = 42

In [3]:
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)

In [4]:
# MASE
def mean_absolute_scaled_error(y_true, y_pred):
    n = len(y_true)

    # Calculate MAE of the forecasts
    mae_forecast = np.mean(np.abs(y_true - y_pred))

    # Calculate MAE of the naive forecast
    mae_naive = np.mean(np.abs(np.diff(y_true)))  # Diff calculates y_i - y_{i-1}

    # Ensure denominator is not zero
    if mae_naive == 0:
        return np.inf  # Return infinity if naive MAE is zero

    return mae_forecast / mae_naive

In [5]:
merge_B_ip = pd.read_csv('./data/merge_B_ip.csv', encoding = 'cp949')
merge_C_ip = pd.read_csv('./data/merge_C_ip.csv', encoding = 'cp949')
merge_D_ip = pd.read_csv('./data/merge_D_ip.csv', encoding = 'cp949')
merge_E_ip = pd.read_csv('./data/merge_E_ip.csv', encoding = 'cp949')

In [6]:
df_merged = pd.concat([merge_B_ip,merge_C_ip,merge_D_ip,merge_E_ip])

In [7]:
cols = ['Stem Diameter', 'Petiole Length','Leaf Count', 'Leaf Length', 'Leaf Width', 'Fruit Count',
       'Plant Height', 'Final Inflorescence Order','Inflorescence Flower Count', 'supplyEC', 'supplyPH', 'innerCO2',
       'innerHum', 'innerTemp', 'innerSolar', 'Survey Date']

In [8]:
df_merged = df_merged[cols]

In [9]:
df_merged['Survey Date'] = pd.to_datetime(df_merged['Survey Date'], format='%Y-%m-%d %H:%M')
df_merged.set_index('Survey Date', inplace=True)

In [10]:
df_merged.dropna(inplace=True)

In [11]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 171897 entries, 2023-10-06 00:00:00 to 2024-04-26 00:00:00
Data columns (total 15 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   Stem Diameter               171897 non-null  float64
 1   Petiole Length              171897 non-null  float64
 2   Leaf Count                  171897 non-null  float64
 3   Leaf Length                 171897 non-null  float64
 4   Leaf Width                  171897 non-null  float64
 5   Fruit Count                 171897 non-null  int64  
 6   Plant Height                171897 non-null  float64
 7   Final Inflorescence Order   171897 non-null  int64  
 8   Inflorescence Flower Count  171897 non-null  int64  
 9   supplyEC                    171897 non-null  float64
 10  supplyPH                    171897 non-null  float64
 11  innerCO2                    171897 non-null  float64
 12  innerHum                    171897 non

In [12]:
train, test = train_test_split(df_merged, test_size=0.2, shuffle=False)

In [13]:
scaler = StandardScaler()
train = scaler.fit_transform(train)
test = scaler.transform(test)

In [14]:
x_train = train[:,9:]
y_train = train[:,:9]
x_test = test[:,9:]
y_test = test[:,:9]

In [None]:
# tuning machine learning models

In [None]:
# bagging regressor

In [None]:
# parameter range from literature reviews
params = {
    'n_estimators': (100,200,500),
    'max_samples' : (0.7, 0.8, 0.9, 1.0)
}

In [None]:
model = BaggingRegressor()

finder = RandomizedSearchCV(
    estimator=model,
    param_distributions=params,
    scoring='r2',
    refit=True,
    cv=TimeSeriesSplit(n_splits=5),  # change this to the splitter subject to test
    return_train_score=True
    )

finder.fit(X_train, Y_train)

best_params = finder.best_params_
best_score = round(finder.best_score_,4)

In [None]:
# xgb regressor

In [None]:
# parameter range from literature reviews
params = {
    'n_estimators': (50, 100, 150),
    'max_depth': (3, 6, 9),
    'learning_rate':(0.01, 0.1, 0.3, 0.5),
    'gamma' :(5, 7, 10)
}

model = XGBRegressor()

finder = RandomizedSearchCV(
    estimator=model,
    param_distributions=params,
    scoring='r2',
    refit=True,
    cv=TimeSeriesSplit(n_splits=5),  # change this to the splitter subject to test
    return_train_score=True
    )

start_time = time.time()
finder.fit(X_train, Y_train)
print("--- %s seconds ---" %(time.time()- start_time))

best_params = finder.best_params_
best_score = round(finder.best_score_,4)

In [None]:
# --- Set MLflow Tracking URI ---
mlflow.set_tracking_uri("https://spacefarm:HeetsCoffe1!@mlflow-izqyq2ng5q-du.a.run.app")  # Replace with your server details
# http://<username>:<password>@<your_mlflow_server_host>:<port>

def optimize_xgboost(x_train, y_train, n_trials=100, experiment_name="XGBoost_Hyperparam_Tuning", n_splits=5):
    """Optimize XGBoost hyperparameters using Optuna, log results to MLflow server, and register the best model."""

    mlflow.set_experiment(experiment_name)  
    mlflow_callback = MLflowCallback(tracking_uri=mlflow.get_tracking_uri())

    def objective(trial):
        with mlflow.start_run(nested=True):  # Create nested runs for each Optuna trial
            params = {
                'n_estimators': trial.suggest_int('n_estimators', 50, 200, step=50),
                # 'max_depth': trial.suggest_int('max_depth', 3, 11, step=2),
                # 'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True), # Log scale for learning rate
                # 'gamma': trial.suggest_float('gamma', 0, 2),
                # 'subsample': trial.suggest_float('subsample', 0.6, 1.0),
                # 'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
            }

            model = xgb.XGBRegressor(**params)
            cv = KFold(n_splits=n_splits, shuffle=False)  # Blocked cross-validation
            y_pred = cross_val_predict(model, x_train, y_train, cv=cv, n_jobs=-1) # <-- Use cross_val_predict

            mse = mean_squared_error(y_train, y_pred)
            rmse = np.sqrt(mse)
            r2 = r2_score(y_train, y_pred)
            mase = mean_absolute_scaled_error(y_train, y_pred)

            

            # Log params and metrics to MLflow
            mlflow.log_params(params)
            mlflow.log_metrics({
                "mse": mse,
                "rmse": rmse,
                "r2": r2,
                "mase": mase
            })

            return rmse
        
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=n_trials, callbacks=[mlflow_callback])

    # Get best parameters and train the final model
    with mlflow.start_run(run_name="best_model"):
        best_params = study.best_params
        best_model = xgb.XGBRegressor(**best_params)
        best_model.fit(x_train, y_train)
        
        y_pred_best = best_model.predict(x_train)
        mse_best = mean_squared_error(y_train, y_pred_best)
        rmse_best = np.sqrt(mse_best)
        r2_best = r2_score(y_train, y_pred_best)
        mase_best = mean_absolute_scaled_error(y_train, y_pred_best)
        
        # Log metrics for the best model
        mlflow.log_params(best_params)
        mlflow.log_metrics({
            "best_mse": mse_best,
            "best_rmse": rmse_best,
            "best_r2": r2_best,
            "best_mase": mase_best
        })

        # Log the best model with MLflow
        mlflow.xgboost.log_model(best_model, "xgboost_model")

        return best_params, study.best_value


# --- Call the function to start the tuning process ---
best_params, best_score = optimize_xgboost(x_train, y_train)

[I 2024-08-07 03:30:05,523] A new study created in memory with name: no-name-5a255508-6cac-4a7d-87c3-c61b1b8ef597
2024/08/07 03:30:22 INFO mlflow.tracking._tracking_service.client: 🏃 View run mercurial-ray-979 at: https://spacefarm:HeetsCoffe1!@mlflow-izqyq2ng5q-du.a.run.app/#/experiments/1/runs/bc22dc6d79bd402db760f6437e091460.
2024/08/07 03:30:22 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://spacefarm:HeetsCoffe1!@mlflow-izqyq2ng5q-du.a.run.app/#/experiments/1.
[I 2024-08-07 03:30:22,762] Trial 0 finished with value: 0.7727764267784701 and parameters: {'n_estimators': 50, 'max_depth': 9, 'learning_rate': 0.18040378993347958, 'gamma': 0.6992627100810342, 'subsample': 0.784844038680522, 'colsample_bytree': 0.9324119781595585}. Best is trial 0 with value: 0.7727764267784701.
2024/08/07 03:30:22 INFO mlflow.tracking.fluent: Experiment with name 'no-name-5a255508-6cac-4a7d-87c3-c61b1b8ef597' does not exist. Creating a new experiment.
2024/08/07 03:30:22 INFO

KeyboardInterrupt: 

In [None]:
# tuning deep learning models