# Baseline Regression

In [111]:
import os
import time
import random
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from keras.wrappers.scikit_learn import KerasRegressor
from keras.callbacks import EarlyStopping

import joblib
import warnings
warnings.filterwarnings('ignore')

In [2]:
seed = 42

In [3]:
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)

In [28]:
# MASE
def mean_absolute_scaled_error(y_true, y_pred):
    n = len(y_true)

    # Calculate MAE of the forecasts
    mae_forecast = np.mean(np.abs(y_true - y_pred))

    # Calculate MAE of the naive forecast
    mae_naive = np.mean(np.abs(np.diff(y_true)))  # Diff calculates y_i - y_{i-1}

    # Ensure denominator is not zero
    if mae_naive == 0:
        return np.inf  # Return infinity if naive MAE is zero

    return mae_forecast / mae_naive

In [4]:
merge_B_ip = pd.read_csv('./data/merge_B_ip.csv', encoding = 'cp949')
merge_C_ip = pd.read_csv('./data/merge_C_ip.csv', encoding = 'cp949')
merge_D_ip = pd.read_csv('./data/merge_D_ip.csv', encoding = 'cp949')
merge_E_ip = pd.read_csv('./data/merge_E_ip.csv', encoding = 'cp949')

In [5]:
df_merged = pd.concat([merge_B_ip,merge_C_ip,merge_D_ip,merge_E_ip])

In [6]:
cols = ['Stem Diameter', 'Petiole Length','Leaf Count', 'Leaf Length', 'Leaf Width', 'Fruit Count',
       'Plant Height', 'Final Inflorescence Order','Inflorescence Flower Count', 'supplyEC', 'supplyPH', 'innerCO2',
       'innerHum', 'innerTemp', 'innerSolar', 'Survey Date']

In [7]:
df_merged = df_merged[cols]

In [8]:
df_merged['Survey Date'] = pd.to_datetime(df_merged['Survey Date'], format='%Y-%m-%d %H:%M')
df_merged.set_index('Survey Date', inplace=True)

In [9]:
df_merged.dropna(inplace=True)

In [10]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 171897 entries, 2023-10-06 00:00:00 to 2024-04-26 00:00:00
Data columns (total 15 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   Stem Diameter               171897 non-null  float64
 1   Petiole Length              171897 non-null  float64
 2   Leaf Count                  171897 non-null  float64
 3   Leaf Length                 171897 non-null  float64
 4   Leaf Width                  171897 non-null  float64
 5   Fruit Count                 171897 non-null  int64  
 6   Plant Height                171897 non-null  float64
 7   Final Inflorescence Order   171897 non-null  int64  
 8   Inflorescence Flower Count  171897 non-null  int64  
 9   supplyEC                    171897 non-null  float64
 10  supplyPH                    171897 non-null  float64
 11  innerCO2                    171897 non-null  float64
 12  innerHum                    171897 non

## train, validation, test split

In [11]:
train, test = train_test_split(df_merged, test_size=0.2, shuffle=False)

## normalization

In [12]:
scaler = StandardScaler()
train = scaler.fit_transform(train)
test = scaler.transform(test)

## x,y split

In [13]:
train.shape, test.shape

((137517, 15), (34380, 15))

In [19]:
x_train = train[:,9:]
y_train = train[:,:9]
x_test = test[:,9:]
y_test = test[:,:9]

In [21]:
x_train.shape, y_train.shape

((137517, 6), (137517, 9))

In [22]:
x_test.shape, y_test.shape

((34380, 6), (34380, 9))

## baseline model selection (Machine Learning Models)

In [30]:
models = {
    'LinearRegression': LinearRegression(),
    'RandomForestRegressor': RandomForestRegressor(),
    'BaggingRegressor': BaggingRegressor(),
    'XGBoostRegressor': XGBRegressor(),
    'ElasticNetRegressor': ElasticNet(),
    'KNNRegressor': KNeighborsRegressor()
}

# Dictionary to store the performance metrics for each model
metrics = {
    'Model': [],
    'MSE': [],
    'MASE': [],
    'R2': [],
    'RMSE': [],
    'TrainingTime': []
}

In [31]:
# Directory to save model weights
save_model_path = './save_models_all'
os.makedirs(save_model_path, exist_ok=True)  # Create the directory if it does not exist

# Train and evaluate each model
for name, model in models.items():
    # Record the start time
    start_time = time.time()
    
    # Fit the model
    model.fit(x_train, y_train)
    
    # Record the end time
    end_time = time.time()

    # Calculate training time
    training_time = end_time - start_time

    # Make predictions
    y_pred = model.predict(x_test)

    # Calculate performance metrics
    mse = mean_squared_error(y_test, y_pred)
    mase = mean_absolute_scaled_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mse)

    # Store the metrics
    metrics['Model'].append(name)
    metrics['MSE'].append(mse)
    metrics['MASE'].append(mase)
    metrics['R2'].append(r2)
    metrics['RMSE'].append(rmse)
    metrics['TrainingTime'].append(training_time)

    # Print the metrics
    print(f'{name} - MSE: {mse}')
    print(f'{name} - MASE: {mase}')
    print(f'{name} - R2: {r2}')
    print(f'{name} - RMSE: {rmse}')
    print(f'{name} - TrainingTime: {training_time}')

    # Save the model weights
    model_filename = os.path.join(save_model_path, f'{name}_model.h5')  # Changed extension to .pkl for compatibility
    joblib.dump(model, model_filename)
    print(f'Model saved as {model_filename}')

# Convert the metrics dictionary to a DataFrame for a cleaner display
metrics_df = pd.DataFrame(metrics)
print(metrics_df)

# Save the metrics to a CSV file
metrics_df.to_csv('result_all.csv', index=False)

LinearRegression - MSE: 1.1674969468904748
LinearRegression - MASE: 0.8037111005905345
LinearRegression - R2: -0.07158540449219485
LinearRegression - RMSE: 1.0805077264371943
LinearRegression - TrainingTime: 0.055400848388671875
Model saved as ./save_models_all/LinearRegression_model.h5
RandomForestRegressor - MSE: 0.7523184330678965
RandomForestRegressor - MASE: 0.574745258365784
RandomForestRegressor - R2: 0.18481869657942898
RandomForestRegressor - RMSE: 0.8673629188914502
RandomForestRegressor - TrainingTime: 56.91299891471863
Model saved as ./save_models_all/RandomForestRegressor_model.h5
BaggingRegressor - MSE: 0.7705234214978426
BaggingRegressor - MASE: 0.5806203670658566
BaggingRegressor - R2: 0.16650862182938664
BaggingRegressor - RMSE: 0.877794635149841
BaggingRegressor - TrainingTime: 5.741190433502197
Model saved as ./save_models_all/BaggingRegressor_model.h5
XGBoostRegressor - MSE: 0.810717847454322
XGBoostRegressor - MASE: 0.6480363974830788
XGBoostRegressor - R2: 0.17396

In [109]:
x_train.shape

(137517, 6)

## baseline model selection (deep learning models)

In [None]:
# 1. Model Definitions (with EarlyStopping)
def create_model_1():
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(100, activation='tanh', input_shape=(x_train.shape[1],)),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(100, activation='tanh'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(100, activation='tanh'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(y_train.shape[1])
    ])
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001), loss='mse')
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    return model, early_stopping  # Return both the model and the callback

def create_model_2():
    model = tf.keras.Sequential([
        tf.keras.layers.SimpleRNN(100, activation='tanh', return_sequences=True),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.SimpleRNN(100, activation='tanh'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(y_train.shape[1])
    ])
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001), loss='mse')
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    return model, early_stopping

def create_model_3():
    model = tf.keras.Sequential([
        tf.keras.layers.LSTM(100, activation='tanh', return_sequences=True),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.LSTM(100, activation='tanh'),
        tf.keras.layers.Dense(100, activation='sigmoid'), 
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(y_train.shape[1])  
    ])
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001), loss='mse')
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    return model, early_stopping

def create_model_4():
    model = tf.keras.Sequential([
        tf.keras.layers.GRU(100, activation='tanh', return_sequences=True),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.GRU(100, activation='tanh'),
        tf.keras.layers.Dense(100, activation='sigmoid'), 
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(y_train.shape[1])  
    ])
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001), loss='mse')
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    return model, early_stopping


def reshape_for_rnn(X):
    # Assuming X has shape (num_samples, num_features) for ANN
    num_samples = X.shape[0]
    time_steps = 1  # If each sample is a single time step
    num_features = X.shape[1]
    return X.reshape(num_samples, time_steps, num_features)


# Create KerasRegressor Objects
model_1, early_stopping_1 = create_model_1()
model_1_regressor = KerasRegressor(
    build_fn=lambda: model_1,  epochs=100,  batch_size=32, verbose=0, callbacks=[early_stopping_1] 
)

model_2, early_stopping_2 = create_model_2()
model_2_regressor = KerasRegressor(
    build_fn=lambda: model_2, epochs=100, batch_size=32, verbose=0, callbacks=[early_stopping_2]
)

model_3, early_stopping_3 = create_model_3()
model_3_regressor = KerasRegressor(
    build_fn=lambda: model_3, epochs=100, batch_size=32, verbose=0, callbacks=[early_stopping_3]
)

model_4, early_stopping_4 = create_model_4()
model_4_regressor = KerasRegressor(
    build_fn=lambda: model_4, epochs=100, batch_size=32, verbose=0, callbacks=[early_stopping_4]
)




# Pipeline (Updated)
pipe_1 = Pipeline([
    ('scaler', StandardScaler()),
    ('model', model_1_regressor)
])

    
pipe_2 = Pipeline([
    ('scaler', StandardScaler()),
    ('reshape', FunctionTransformer(reshape_for_rnn)),  # Add reshaping step
    ('model', model_2_regressor)
])

pipe_3 = Pipeline([
    ('scaler', StandardScaler()),
    ('reshape', FunctionTransformer(reshape_for_rnn)), 
    ('model', model_3_regressor)
])

pipe_4 = Pipeline([
    ('scaler', StandardScaler()),
    ('reshape', FunctionTransformer(reshape_for_rnn)), 
    ('model', model_4_regressor)
])


# Training
pipe_1.fit(x_train, y_train, model__validation_split=0.2)  
pipe_2.fit(x_train, y_train, model__validation_split=0.2)
pipe_3.fit(x_train, y_train, model__validation_split=0.2)
pipe_4.fit(x_train, y_train, model__validation_split=0.2)

print('pipeline completed')

In [113]:
# Evaluation (Enhanced)
for pipe, model_name in zip([pipe_1, pipe_2, pipe_3, pipe_4], 
                            ["ANN", "RNN", "LSTM", "GRU"]):
    y_pred = pipe.predict(x_test)
    mse = mean_squared_error(y_test, y_pred)
    mase = mean_absolute_scaled_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    
    print(f"{model_name} Metrics:")
    print("  MSE:", mse)
    print("  MASE:", mase)
    print("  RMSE:", rmse)
    print("  R-squared:", r2)

ANN Metrics:
  MSE: 1.1252588081922852
  MASE: 0.793344423647188
  RMSE: 1.0607821681157188
  R-squared: -0.06233056799849257
RNN Metrics:
  MSE: 1.12076811655544
  MASE: 0.7893657525734016
  RMSE: 1.0586633631874864
  R-squared: -0.068889783542309
LSTM Metrics:
  MSE: 1.1049960433796273
  MASE: 0.7843311443722112
  RMSE: 1.051187920107355
  R-squared: -0.0604281530201431
GRU Metrics:
  MSE: 1.1244375898126866
  MASE: 0.7932428573250438
  RMSE: 1.060395015931651
  R-squared: -0.061995264495718905


In [74]:
# 데이터 변환 
X_train = x_train.reshape((-1, 1, 6))
X_test = x_test.reshape((-1, 1, 6))