In [6]:
###  INPUT ###
import pandas as pd
input_data = pd.read_csv('2023_smartFarm_AI_hackathon_dataset.csv')
pd.set_option('display.max_columns', 1000)

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler


def preprocessing(input_data):
    input_data = input_data.sort_values(["frmDist","date"])
    input_data["date"] = pd.to_datetime(input_data["date"], format='%Y%m%d')

    input_data = input_data.copy()
    input_data["frmYearWeek"] = input_data.date.dt.strftime('%Y-%W')
    input_data["frmYear"] = input_data.date.dt.strftime('%Y')
    input_data["frmWeek"] = input_data.date.dt.strftime('%W').astype(int)
    input_data = input_data.sort_values("frmYearWeek")


    tmp_df = input_data.groupby(["frmDist", 'frmYearWeek']).max().reset_index()

    # 16주 이상 농장만
    frmDist_list = (tmp_df.groupby("frmDist").frmYearWeek.count() >= 16).reset_index()

    tmp_df = pd.merge(frmDist_list.loc[frmDist_list.frmYearWeek,["frmDist"]], tmp_df, on="frmDist")

    start_point = ((tmp_df['outtrn_cumsum'] == 0)&
    (tmp_df['outtrn_cumsum'].shift(-1, fill_value=False) != 0) & 
    (tmp_df['outtrn_cumsum'].shift(1, fill_value=False) == 0))

    x_cols = ['inTp', 'inHd', 'otmsuplyqy', 'acSlrdQy', 'cunt', 
               'ph', 'outTp', 'outWs', 'daysuplyqy', 'inCo2', 'ec', 
               'frtstGrupp', 'lefstalklt', 'frtstSetCo', 'pllnLt',
               'flanGrupp', 'frtstCo', 'flanJnt', 'tcdmt', 'frmhsFclu', 'hvstGrupp',
               'hvstJnt', 'grwtLt', 'fcluHg', 'lefLt', 'flwrCo', 'hvstCo', 'lefCunt',
               'frtstJnt', 'lefBt', 'stemThck', 'frmAr', 'frmDov', 'outtrn_cumsum',
               'WaterUsage', 'WaterCost', 'FertilizerUsage', 'FertilizerCost',
               'CO2Usage', 'CO2Cost', 'MistUsageTime', 'Mist Cost',
               'HeatingEnergyUsage_cumsum'] 


    agg_dict = {i:'mean' for i in x_cols}
    agg_dict["HeatingEnergyUsage_cumsum"] = 'max'
    agg_dict["outtrn_cumsum"] = 'max'
    agg_dict["frmAr"] = 'mean'
    agg_dict["frmDist"] = 'first'
    agg_dict["frmWeek"] = 'first'
    agg_dict["date"] = 'first'

    
    
    result_list = []
    for i in tmp_df[start_point].index:
        sub_df = tmp_df[(i-7):(i+9)]
        if sub_df.shape[0] != 16:
            continue
        result = sub_df.groupby('frmDist').agg(agg_dict)
        result_list.append(result)

    result_df = pd.concat(result_list, ignore_index=True)


    result_df["frmDov*frmAr"] = result_df["frmDov"] * result_df['frmAr']  
    result_df['inHd_inCo2_interaction'] = result_df['inHd'] * result_df['inCo2'] 
    result_df['ph_ec_interaction'] = result_df['ph'] * result_df['ec'] 
    result_df['inTp_range'] = result_df["inTp"].max()-result_df["inTp"].min()
    result_df['inHd_range'] = result_df["inHd"].max()-result_df["inHd"].min()
    
    
    return result_df


# ... (Data preprocessing code here)
# input_data = input_data.drop(columns=['frmDist'])

input_data = preprocessing(input_data)

# Split the data into training and testing sets
X = input_data[input_data.drop(columns=['outtrn_cumsum','HeatingEnergyUsage_cumsum', "frmDist", "date"]).columns]
Y = input_data[['outtrn_cumsum','HeatingEnergyUsage_cumsum']] 

X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

# Initialize and train the LinearRegression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict 'y' values using the trained model
y_pred = model.predict(X_test)


# Calculate RMSE between the predictions and actual 'y' values
def calculate_rmse(targets, predictions):
    """
    Calculate the Root Mean Squared Error (RMSE) between predicted and target values.

    :param predictions: Predicted values.
    :type predictions: array-like
    :param targets: Target values.
    :type targets: array-like
    :return: RMSE value.
    :rtype: float
    """
    from sklearn.metrics import mean_squared_error
    return np.sqrt(mean_squared_error(targets, predictions))


# Calculate r2_score between the predictions and actual 'y' values
def calculate_R2_score(y_test, y_pred):
    from sklearn.metrics import r2_score
    return r2_score(y_test, y_pred)


rmse = calculate_rmse(y_test, y_pred)
r2score = calculate_R2_score(y_test, y_pred)

### OUTPUT ###
print("RMSE:", rmse)
print("R2_score:", r2score)

RMSE: 507.309028046324
R2_score: 0.9980940391798985
