# Best Regression

In [1]:
import os
import time
import random
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from keras.wrappers.scikit_learn import KerasRegressor
from keras.callbacks import EarlyStopping
import keras
import joblib
import warnings
warnings.filterwarnings('ignore')

2024-08-08 01:07:15.854465: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-08-08 01:07:16.053824: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-08-08 01:07:16.087117: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2024-08-08 01:07:16.087131: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudar

In [2]:
seed = 42

In [3]:
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)

In [4]:
# MASE
def mean_absolute_scaled_error(y_true, y_pred):
    n = len(y_true)

    # Calculate MAE of the forecasts
    mae_forecast = np.mean(np.abs(y_true - y_pred))

    # Calculate MAE of the naive forecast
    mae_naive = np.mean(np.abs(np.diff(y_true)))  # Diff calculates y_i - y_{i-1}

    # Ensure denominator is not zero
    if mae_naive == 0:
        return np.inf  # Return infinity if naive MAE is zero

    return mae_forecast / mae_naive

In [6]:
merge_B_ip = pd.read_csv('./data/merge_B_ip.csv', encoding = 'cp949')
merge_C_ip = pd.read_csv('./data/merge_C_ip.csv', encoding = 'cp949')
merge_D_ip = pd.read_csv('./data/merge_D_ip.csv', encoding = 'cp949')
merge_E_ip = pd.read_csv('./data/merge_E_ip.csv', encoding = 'cp949')

In [7]:
df_merged = pd.concat([merge_B_ip,merge_C_ip,merge_D_ip,merge_E_ip])

In [8]:
cols = ['Stem Diameter', 'Petiole Length','Leaf Count', 'Leaf Length', 'Leaf Width', 'Fruit Count',
       'Plant Height', 'Final Inflorescence Order','Inflorescence Flower Count', 'supplyEC', 'supplyPH', 'innerCO2',
       'innerHum', 'innerTemp', 'innerSolar', 'Survey Date']

In [9]:
df_merged = df_merged[cols]

In [10]:
df_merged['Survey Date'] = pd.to_datetime(df_merged['Survey Date'], format='%Y-%m-%d %H:%M')
df_merged.set_index('Survey Date', inplace=True)

In [11]:
df_merged.dropna(inplace=True)

In [12]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 171897 entries, 2023-10-06 00:00:00 to 2024-04-26 00:00:00
Data columns (total 15 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   Stem Diameter               171897 non-null  float64
 1   Petiole Length              171897 non-null  float64
 2   Leaf Count                  171897 non-null  float64
 3   Leaf Length                 171897 non-null  float64
 4   Leaf Width                  171897 non-null  float64
 5   Fruit Count                 171897 non-null  int64  
 6   Plant Height                171897 non-null  float64
 7   Final Inflorescence Order   171897 non-null  int64  
 8   Inflorescence Flower Count  171897 non-null  int64  
 9   supplyEC                    171897 non-null  float64
 10  supplyPH                    171897 non-null  float64
 11  innerCO2                    171897 non-null  float64
 12  innerHum                    171897 non

## train, validation, test split

In [13]:
train, test = train_test_split(df_merged, test_size=0.2, shuffle=False)

## normalization

In [14]:
scaler = StandardScaler()
train = scaler.fit_transform(train)
test = scaler.transform(test)

## x,y split

In [15]:
train.shape, test.shape

((137517, 15), (34380, 15))

In [16]:
x_train = train[:,9:]
y_train = train[:,:9]
x_test = test[:,9:]
y_test = test[:,:9]

In [17]:
x_train.shape, y_train.shape

((137517, 6), (137517, 9))

In [18]:
x_test.shape, y_test.shape

((34380, 6), (34380, 9))

## best model selection (Machine Learning Models)

In [19]:
models = {
    'LinearRegression': LinearRegression(fit_intercept=False),
    'RandomForestRegressor': RandomForestRegressor(n_estimators=278, min_samples_split=3, min_samples_leaf=7, max_features="sqrt", max_depth=21, bootstrap=True),
    'BaggingRegressor': BaggingRegressor(n_estimators=500,max_samples=1.0,max_features=1.0,bootstrap_features=True,bootstrap=False),
    'XGBoostRegressor': XGBRegressor(n_estimators=500,objective="reg:squarederror",subsample=1.0,max_depth=6,learning_rate=0.09176042457246529,colsample_bytree=0.9),
    'ElasticNetRegressor': ElasticNet(tol=0.07427333578580918, max_iter=5804,l1_ratio=0.1008807639223882,fit_intercept=True,alpha=
1.1162875572947195e-05),
    'KNNRegressor': KNeighborsRegressor(weights="distance", p=2,n_neighbors=17,leaf_size=12,algorithm="ball_tree")
}



In [19]:
# MLflow Tracking Setup
mlflow.set_tracking_uri("https://spacefarm:coolguyisyou@mlflow-izqyq2ng5q-du.a.run.app")
mlflow.set_experiment("ML_model_comparison")

# Dictionary to store the performance metrics for each model
metrics = {
    'Model': [],
    'MSE': [],
    'MASE': [],
    'R2': [],
    'RMSE': [],
    'TrainingTime': []
}

# Directory to save model weights
save_model_path = './save_models_all'
os.makedirs(save_model_path, exist_ok=True)  # Create the directory if it does not exist

# Train and evaluate each model
for name, model in models.items():
    with mlflow.start_run(run_name=name):
        # Record the start time
        start_time = time.time()
    
        # Fit the model
        model.fit(x_train, y_train)
    
        # Record the end time
        end_time = time.time()

        # Calculate training time
        training_time = end_time - start_time

        # Make predictions
        y_pred = model.predict(x_test)

        # Calculate performance metrics
        mse = mean_squared_error(y_test, y_pred)
        mase = mean_absolute_scaled_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        rmse = np.sqrt(mse)
        
        # Log parameters to MLflow
        mlflow.log_params(params)

        # Log metrics to MLflow
        mlflow.log_metrics({
            "mse": mse,
            "rmse": rmse,
            "r2": r2,
            "mase": mase,
            "training_time": training_time
        })

        # Store the metrics
        metrics['Model'].append(name)
        metrics['MSE'].append(mse)
        metrics['MASE'].append(mase)
        metrics['R2'].append(r2)
        metrics['RMSE'].append(rmse)
        metrics['TrainingTime'].append(training_time)
        
        # Save the model using MLflow
        mlflow.sklearn.log_model(model, artifact_path="models")

        # Print metrics to console
        print(f'{name} - MSE: {mse:.4f}, RMSE: {rmse:.4f}, R2: {r2:.4f}, MASE: {mase:.4f}, Training Time: {training_time:.2f} seconds')
        

# Convert the metrics dictionary to a DataFrame for a cleaner display
metrics_df = pd.DataFrame(metrics)
print(metrics_df)

# Save the metrics to a CSV file
metrics_df.to_csv('result_all.csv', index=False)

LinearRegression - MSE: 1.167496946890475
LinearRegression - MASE: 0.8037111005905347
LinearRegression - R2: -0.07158540449219493
LinearRegression - RMSE: 1.0805077264371945
LinearRegression - TrainingTime: 0.04133129119873047
Model saved as ./save_models_all/LinearRegression_model.h5
RandomForestRegressor - MSE: 0.7051196035196152
RandomForestRegressor - MASE: 0.596625038082696
RandomForestRegressor - R2: 0.2730233142003069
RandomForestRegressor - RMSE: 0.8397140010263109
RandomForestRegressor - TrainingTime: 68.46667718887329
Model saved as ./save_models_all/RandomForestRegressor_model.h5
BaggingRegressor - MSE: 0.8331340870193027
BaggingRegressor - MASE: 0.5892370046434061
BaggingRegressor - R2: 0.08201124970787269
BaggingRegressor - RMSE: 0.9127617909505759
BaggingRegressor - TrainingTime: 433.8897638320923
Model saved as ./save_models_all/BaggingRegressor_model.h5
XGBoostRegressor - MSE: 0.7843725884591342
XGBoostRegressor - MASE: 0.6352296251834003
XGBoostRegressor - R2: 0.192194

In [109]:
x_train.shape

(137517, 6)

## baseline model selection (deep learning models)

In [114]:
# 1. Model Definitions (with EarlyStopping)
def create_model_1():
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(100, activation='tanh', input_shape=(x_train.shape[1],)),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(100, activation='tanh'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(100, activation='tanh'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(y_train.shape[1])
    ])
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001), loss='mse')
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    return model, early_stopping  # Return both the model and the callback

def create_model_2():
    model = tf.keras.Sequential([
        tf.keras.layers.SimpleRNN(100, activation='tanh', return_sequences=True),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.SimpleRNN(100, activation='tanh'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(y_train.shape[1])
    ])
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001), loss='mse')
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    return model, early_stopping

def create_model_3():
    model = tf.keras.Sequential([
        tf.keras.layers.LSTM(100, activation='tanh', return_sequences=True),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.LSTM(100, activation='tanh'),
        tf.keras.layers.Dense(100, activation='sigmoid'), 
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(y_train.shape[1])  
    ])
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001), loss='mse')
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    return model, early_stopping

def create_model_4():
    model = tf.keras.Sequential([
        tf.keras.layers.GRU(100, activation='tanh', return_sequences=True),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.GRU(100, activation='tanh'),
        tf.keras.layers.Dense(100, activation='sigmoid'), 
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(y_train.shape[1])  
    ])
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001), loss='mse')
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    return model, early_stopping


def reshape_for_rnn(X):
    # Assuming X has shape (num_samples, num_features) for ANN
    num_samples = X.shape[0]
    time_steps = 1  # If each sample is a single time step
    num_features = X.shape[1]
    return X.reshape(num_samples, time_steps, num_features)


# Create KerasRegressor Objects
model_1, early_stopping_1 = create_model_1()
model_1_regressor = KerasRegressor(
    build_fn=lambda: model_1,  epochs=100,  batch_size=32, verbose=0, callbacks=[early_stopping_1] 
)

model_2, early_stopping_2 = create_model_2()
model_2_regressor = KerasRegressor(
    build_fn=lambda: model_2, epochs=100, batch_size=32, verbose=0, callbacks=[early_stopping_2]
)

model_3, early_stopping_3 = create_model_3()
model_3_regressor = KerasRegressor(
    build_fn=lambda: model_3, epochs=100, batch_size=32, verbose=0, callbacks=[early_stopping_3]
)

model_4, early_stopping_4 = create_model_4()
model_4_regressor = KerasRegressor(
    build_fn=lambda: model_4, epochs=100, batch_size=32, verbose=0, callbacks=[early_stopping_4]
)




# Pipeline (Updated)
pipe_1 = Pipeline([
    ('scaler', StandardScaler()),
    ('model', model_1_regressor)
])

    
pipe_2 = Pipeline([
    ('scaler', StandardScaler()),
    ('reshape', FunctionTransformer(reshape_for_rnn)),  # Add reshaping step
    ('model', model_2_regressor)
])

pipe_3 = Pipeline([
    ('scaler', StandardScaler()),
    ('reshape', FunctionTransformer(reshape_for_rnn)), 
    ('model', model_3_regressor)
])

pipe_4 = Pipeline([
    ('scaler', StandardScaler()),
    ('reshape', FunctionTransformer(reshape_for_rnn)), 
    ('model', model_4_regressor)
])


print('pipeline completed')

pipeline completed


In [115]:
# Dictionary to store the performance metrics for each model
metrics = {
    'Model': [],
    'MSE': [],
    'MASE': [],
    'R2': [],
    'RMSE': [],
    'TrainingTime': []
}

In [116]:
# Evaluation (Enhanced)
for pipe, model_name in zip([pipe_1, pipe_2, pipe_3, pipe_4], 
                            ["ANN", "RNN", "LSTM", "GRU"]):
    # Record the start time
    start_time = time.time()
    
    pipe.fit(x_train, y_train, model__validation_split=0.2)  
    
    # Record the end time
    end_time = time.time()
    
    # Calculate training time
    training_time = end_time - start_time
    
    y_pred = pipe.predict(x_test)
    
    mse = mean_squared_error(y_test, y_pred)
    mase = mean_absolute_scaled_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    
    
    # Store the metrics
    metrics['Model'].append(model_name)
    metrics['MSE'].append(mse)
    metrics['MASE'].append(mase)
    metrics['R2'].append(r2)
    metrics['RMSE'].append(rmse)
    metrics['TrainingTime'].append(training_time)
    
    
    print(f"{model_name} Metrics:")
    print("  MSE:", mse)
    print("  MASE:", mase)
    print("  RMSE:", rmse)
    print("  R-squared:", r2)
    print(f'{model_name} - TrainingTime: {training_time}')
    
    
# Convert the metrics dictionary to a DataFrame for a cleaner display
metrics_deep_learning = pd.DataFrame(metrics)
print(metrics_deep_learning)

# Save the metrics to a CSV file
metrics_deep_learning.to_csv('result_deep_learning.csv', index=False)

ANN Metrics:
  MSE: 1.1208458595037782
  MASE: 0.7913080801793806
  RMSE: 1.0587000800527873
  R-squared: -0.05511522645115138
KNNRegressor - TrainingTime: 78.44186568260193
RNN Metrics:
  MSE: 1.1385370133781725
  MASE: 0.7989043728513088
  RMSE: 1.0670224990027963
  R-squared: -0.07600072034588072
KNNRegressor - TrainingTime: 109.32416939735413
LSTM Metrics:
  MSE: 1.1124911297043947
  MASE: 0.7880598289018206
  RMSE: 1.0547469505546792
  R-squared: -0.052284700872649664
KNNRegressor - TrainingTime: 104.88369822502136
GRU Metrics:
  MSE: 1.1298594894385028
  MASE: 0.7926577396418715
  RMSE: 1.0629484886101033
  R-squared: -0.06514989035315918
KNNRegressor - TrainingTime: 101.07052063941956
  Model       MSE      MASE        R2      RMSE  TrainingTime
0   ANN  1.120846  0.791308 -0.055115  1.058700     78.441866
1   RNN  1.138537  0.798904 -0.076001  1.067022    109.324169
2  LSTM  1.112491  0.788060 -0.052285  1.054747    104.883698
3   GRU  1.129859  0.792658 -0.065150  1.062948    

In [74]:
# 데이터 변환 
X_train = x_train.reshape((-1, 1, 6))
X_test = x_test.reshape((-1, 1, 6))