In [141]:
# %pip install scikit-learn xgboost
# %pip install torch
# %pip install keras
# %pip install tensorflow
# %pip install xgboost
# %pip install pandas
# %pip install joblib

In [142]:
# from supervised.automl import AutoML
# from sklearn.datasets import make_moons
# import os  # Import os to handle directories
# from keras.models import load_model
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.decomposition import PCA

# 회귀 MODEL
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, VotingRegressor
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR

import random
import torch
import time
import joblib  # Import joblib for saving the models


import warnings
warnings.filterwarnings('ignore')

In [143]:
# Seed Value Setting
seed = len("TEAMSPACEFARM_AND_BUSAN")

# Fixing Python Random Seed
random.seed(seed)

# Fixing NumPy Random Seed
np.random.seed(seed)

# Fixing PyTorch, CUDA, and cuDNN Random Seed
device = 'cuda' if torch.cuda.is_available() else 'cpu'
torch.manual_seed(seed)

if device == 'cuda':
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [144]:
# # MASE
# def mean_absolute_scaled_error(y_true, y_pred):
#     n = len(y_true)

#     # Calculate MAE of the forecasts
#     mae_forecast = np.mean(np.abs(y_true - y_pred))

#     # Calculate MAE of the naive forecast
#     mae_naive = np.mean(np.abs(np.diff(y_true)))  # Diff calculates y_i - y_{i-1}

#     # Ensure denominator is not zero
#     if mae_naive == 0:
#         return np.inf  # Return infinity if naive MAE is zero

#     return mae_forecast / mae_naive

In [145]:
import numpy as np

def mean_absolute_scaled_error(y_true, y_pred):
    """
    Calculates the Mean Absolute Scaled Error (MASE).

    Args:
        y_true (np.array): The true time series values.
        y_pred (np.array): The predicted time series values.

    Returns:
        float: The MASE value.
    """
    n = len(y_true)
    
    # Calculate MAE of the forecasts
    mae_forecast = np.mean(np.abs(y_true - y_pred))
    
    # Calculate MAE of the naive forecast
    mae_naive = np.mean(np.abs(np.diff(y_true)))  # Diff calculates y_i - y_{i-1}

    # Ensure denominator is not zero
    if mae_naive == 0:
        return np.inf  # Return infinity if naive MAE is zero

    return mae_forecast / mae_naive


In [146]:
train_data = pd.read_csv('miribul-2024-main/data/environmentsB.csv', index_col = 0)

# 시간 타입 변경
# data['datetime'] = pd.to_datetime(data['datetime'], format='%Y-%m-%d %H:%M')

In [147]:
# 데이터 타입 형식 확인
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4740 entries, B농가 to B농가
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   datetime    4740 non-null   object 
 1   supplyEC    4740 non-null   float64
 2   supplyPH    4740 non-null   float64
 3   innerCO2    4740 non-null   float64
 4   innerHum    4740 non-null   float64
 5   innerTemp   4740 non-null   float64
 6   innerSolar  4740 non-null   float64
dtypes: float64(6), object(1)
memory usage: 296.2+ KB


In [148]:
# 널 값 확인
train_data.isnull()

Unnamed: 0_level_0,datetime,supplyEC,supplyPH,innerCO2,innerHum,innerTemp,innerSolar
farm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
B농가,False,False,False,False,False,False,False
B농가,False,False,False,False,False,False,False
B농가,False,False,False,False,False,False,False
B농가,False,False,False,False,False,False,False
B농가,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...
B농가,False,False,False,False,False,False,False
B농가,False,False,False,False,False,False,False
B농가,False,False,False,False,False,False,False
B농가,False,False,False,False,False,False,False


In [149]:
# 널값 드랍
train_data = train_data.dropna()

In [150]:
# 행 확인
print(train_data.shape)

(4740, 7)


In [151]:
# 데이터 컬럼 확인
print(train_data.columns)

Index(['datetime', 'supplyEC', 'supplyPH', 'innerCO2', 'innerHum', 'innerTemp',
       'innerSolar'],
      dtype='object')


In [175]:
# 불필요한 컬럼 드랍 => 사용할 컬럼만 추가해서 사용
# x = train_data.drop(['Stem Diameter'], axis=1)

x = train_data[['supplyPH', 'innerCO2', 'supplyEC']]
y = train_data[['innerHum', 'innerTemp']]



In [176]:
x.shape
# (137537, 6)

(4740, 3)

In [177]:
x.columns

Index(['supplyPH', 'innerCO2', 'supplyEC'], dtype='object')

In [178]:
x.head(10)

Unnamed: 0_level_0,supplyPH,innerCO2,supplyEC
farm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
B농가,6.0,511.6,0.4
B농가,6.0,517.83,1.0
B농가,6.0,526.83,1.0
B농가,6.0,532.67,1.0
B농가,6.0,540.17,1.0
B농가,6.0,548.33,1.0
B농가,6.0,543.17,1.0
B농가,6.0,543.33,1.0
B농가,6.0,544.0,1.0
B농가,6.0,546.5,1.0


In [179]:
# y = train_data[['Stem Diameter',
#        'Petiole Length', 'Leaf Count', 'Leaf Length', 'Leaf Width', 'Fruit Count',
#        'Plant Height', 'Final Inflorescence Order', 'Inflorescence Flower Count']]

In [180]:
y.shape
# (137537, 9)

(4740, 2)

In [181]:
y.columns
# Index(['Stem Diameter', 'Petiole Length', 'Leaf Count', 'Leaf Length',
#        'Leaf Width', 'Fruit Count', 'Plant Height',
#        'Final Inflorescence Order', 'Inflorescence Flower Count'],
#       dtype='object')

Index(['innerHum', 'innerTemp'], dtype='object')

In [182]:
y.head(10)

Unnamed: 0_level_0,innerHum,innerTemp
farm,Unnamed: 1_level_1,Unnamed: 2_level_1
B농가,92.2,18.0
B농가,94.67,17.67
B농가,97.67,16.67
B농가,98.17,16.0
B농가,97.5,15.33
B농가,97.33,14.67
B농가,97.0,15.0
B농가,97.5,15.0
B농가,96.83,15.0
B농가,97.33,14.67


In [183]:
print(x.shape, y.shape)
# (171897, 52) (171897, 9)

(4740, 3) (4740, 2)


In [184]:
# x.isnull()
x.isnull().sum()
# Live Injections    364
# Num                364
# supplyEC             0
# supplyPH             0
# innerCO2             0
# innerHum             0
# innerTemp            0
# innerSolar           0
# dtype: int64

supplyPH    0
innerCO2    0
supplyEC    0
dtype: int64

In [185]:
y.isnull().sum()
# Stem Diameter                 364
# Petiole Length                364
# Leaf Count                    364
# Leaf Length                   364
# Leaf Width                    364
# Fruit Count                   364
# Plant Height                  364
# Final Inflorescence Order     364
# Inflorescence Flower Count    364
# Facility ID                     0
# Survey Date                     0
# dtype: int64

innerHum     0
innerTemp    0
dtype: int64

In [186]:
# x = x.dropna()
# y = y.dropna()

In [187]:
x.isnull().sum()
# Live Injections    0
# Num                0
# supplyEC           0
# supplyPH           0
# innerCO2           0
# innerHum           0
# innerTemp          0
# innerSolar         0
# dtype: int64

supplyPH    0
innerCO2    0
supplyEC    0
dtype: int64

In [188]:
y.isnull().sum()
# Stem Diameter                 0
# Petiole Length                0
# Leaf Count                    0
# Leaf Length                   0
# Leaf Width                    0
# Fruit Count                   0
# Plant Height                  0
# Final Inflorescence Order     0
# Inflorescence Flower Count    0
# Facility ID                   0
# Survey Date                   0
# dtype: int64

innerHum     0
innerTemp    0
dtype: int64

In [189]:
print(x.shape, y.shape)
# (137537, 6) (137537, 9)

(4740, 3) (4740, 2)


In [190]:
y.info()
# <class 'pandas.core.frame.DataFrame'>
# Index: 171897 entries, 2023-10-06 00:00:00 to 2024-04-26 00:00:00
# Data columns (total 9 columns):
#  #   Column                      Non-Null Count   Dtype  
# ---  ------                      --------------   -----  
#  0   Stem Diameter               171897 non-null  float64
#  1   Petiole Length              171897 non-null  float64
#  2   Leaf Count                  171897 non-null  float64
#  3   Leaf Length                 171897 non-null  float64
#  4   Leaf Width                  171897 non-null  float64
#  5   Fruit Count                 171897 non-null  int64  
#  6   Plant Height                171897 non-null  float64
#  7   Final Inflorescence Order   171897 non-null  int64  
#  8   Inflorescence Flower Count  171897 non-null  int64  
# dtypes: float64(6), int64(3)
# memory usage: 13.1+ MB

<class 'pandas.core.frame.DataFrame'>
Index: 4740 entries, B농가 to B농가
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   innerHum   4740 non-null   float64
 1   innerTemp  4740 non-null   float64
dtypes: float64(2)
memory usage: 111.1+ KB


In [191]:
y.info()
# <class 'pandas.core.frame.DataFrame'>
# Index: 171897 entries, 2023-10-06 00:00:00 to 2024-04-26 00:00:00
# Data columns (total 9 columns):
#  #   Column                      Non-Null Count   Dtype  
# ---  ------                      --------------   -----  
#  0   Stem Diameter               171897 non-null  float64
#  1   Petiole Length              171897 non-null  float64
#  2   Leaf Count                  171897 non-null  float64
#  3   Leaf Length                 171897 non-null  float64
#  4   Leaf Width                  171897 non-null  float64
#  5   Fruit Count                 171897 non-null  int64  
#  6   Plant Height                171897 non-null  float64
#  7   Final Inflorescence Order   171897 non-null  int64  
#  8   Inflorescence Flower Count  171897 non-null  int64  
# dtypes: float64(6), int64(3)
# memory usage: 13.1+ MB


<class 'pandas.core.frame.DataFrame'>
Index: 4740 entries, B농가 to B농가
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   innerHum   4740 non-null   float64
 1   innerTemp  4740 non-null   float64
dtypes: float64(2)
memory usage: 111.1+ KB


In [192]:
x.columns, y.columns
# (Index(['supplyEC', 'supplyPH', 'innerCO2', 'innerHum', 'innerTemp',
#         'innerSolar'],
#        dtype='object'),
#  Index(['Stem Diameter', 'Petiole Length', 'Leaf Count', 'Leaf Length',
#         'Leaf Width', 'Fruit Count', 'Plant Height',
#         'Final Inflorescence Order', 'Inflorescence Flower Count'],
#        dtype='object'))

(Index(['supplyPH', 'innerCO2', 'supplyEC'], dtype='object'),
 Index(['innerHum', 'innerTemp'], dtype='object'))

In [193]:
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3
)

In [194]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape
# ((82522, 6), (82522, 9), (55015, 6), (55015, 9))

((3318, 3), (3318, 2), (1422, 3), (1422, 2))

In [195]:
# Scaling the data
scaler = StandardScaler()
x_train = scaler.fit_transform(X_train)
x_test = scaler.transform(X_test)

In [196]:
str_time = time.time()

In [198]:
# Initialize individual regressors
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest Regressor': RandomForestRegressor(n_estimators=100, random_state=42),
    'Bagging Regressor': BaggingRegressor(n_estimators=10, random_state=42),
    'XGBoost Regressor': XGBRegressor(n_estimators=100, random_state=42),
    'ElasticNet Regressor': ElasticNet(random_state=42),
    'K-Nearest Neighbors Regressor': KNeighborsRegressor()
}

# Dictionary to store the performance metrics for each model
metrics = {
    'Model': [],
    'MSE': [],
    'MASE': [],
    'R2': [],
    'RMSE': [],
    'Training Time': []
}

# Directory to save model weights
save_model_path = './save_models_all'
os.makedirs(save_model_path, exist_ok=True)  # Create the directory if it does not exist

# Train and evaluate each model
for name, model in models.items():
    # Record the start time
    start_time = time.time()
    
    # Fit the model
    model.fit(X_train, y_train)
    
    # Record the end time
    end_time = time.time()

    # Calculate training time
    training_time = end_time - start_time

    # Make predictions
    y_pred = model.predict(X_test)

    # Calculate performance metrics
    mse = mean_squared_error(y_test, y_pred)
    mase = mean_absolute_scaled_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mse)

    # Store the metrics
    metrics['Model'].append(name)
    metrics['MSE'].append(mse)
    metrics['MASE'].append(mase)
    metrics['R2'].append(r2)
    metrics['RMSE'].append(rmse)
    metrics['Training Time'].append(training_time)

    # Print the metrics
    print(f'{name} - MSE: {mse}')
    print(f'{name} - MASE: {mase}')
    print(f'{name} - R2: {r2}')
    print(f'{name} - RMSE: {rmse}')
    print(f'{name} - Training Time: {training_time}')

    # Save the model weights
    model_filename = os.path.join(save_model_path, f'{name}_model.h5')  # Changed extension to .pkl for compatibility
    joblib.dump(model, model_filename)
    print(f'Model saved as {model_filename}')

# Convert the metrics dictionary to a DataFrame for a cleaner display
metrics_df = pd.DataFrame(metrics)
print(metrics_df)

# Save the metrics to a CSV file
metrics_df.to_csv('result_all.csv', index=False)

Linear Regression - MSE: 158.0711696660862
Linear Regression - MASE: 0.12257865520531233
Linear Regression - R2: -0.0019070527968546025
Linear Regression - RMSE: 12.572635748564666
Linear Regression - Training Time: 0.001316070556640625
Model saved as ./save_models_all/Linear Regression_model.h5
Random Forest Regressor - MSE: 135.87182718978562
Random Forest Regressor - MASE: 0.10079159901038327
Random Forest Regressor - R2: 0.16230752261724124
Random Forest Regressor - RMSE: 11.656407130406247
Random Forest Regressor - Training Time: 0.27129602432250977
Model saved as ./save_models_all/Random Forest Regressor_model.h5
Bagging Regressor - MSE: 139.43492339431322
Bagging Regressor - MASE: 0.10163971938293039
Bagging Regressor - R2: 0.14357620839012142
Bagging Regressor - RMSE: 11.808256577256154
Bagging Regressor - Training Time: 0.026384830474853516
Model saved as ./save_models_all/Bagging Regressor_model.h5
XGBoost Regressor - MSE: 117.73843535338376
XGBoost Regressor - MASE: 0.095114