In [53]:
#  -------------------------------------------


###  INPUT ###
import pandas as pd
input_data = pd.read_csv('2023_smartFarm_AI_hackathon_dataset.csv')


import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler


def preprocessing(input_data):
    input_data = input_data.sort_values(["frmDist","date"])
    input_data["date"] = pd.to_datetime(input_data["date"], format='%Y%m%d')

    input_data = input_data.copy()
    input_data["frmYearWeek"] = input_data.date.dt.strftime('%Y-%W')
    input_data = input_data.sort_values("frmYearWeek")


    tmp_df = input_data.groupby(["frmDist", 'frmYearWeek']).max().reset_index()

    # 16주 이상 농장만
    frmDist_list = (tmp_df.groupby("frmDist").frmYearWeek.count() >= 16).reset_index()

    tmp_df = pd.merge(frmDist_list.loc[frmDist_list.frmYearWeek,["frmDist"]], tmp_df, on="frmDist")

    start_point = ((tmp_df['outtrn_cumsum'] == 0)&
    (tmp_df['outtrn_cumsum'].shift(-1, fill_value=False) != 0) & 
    (tmp_df['outtrn_cumsum'].shift(1, fill_value=False) == 0))

    x_cols = ['inTp', 'inHd', 'otmsuplyqy', 'cunt', 'ph', 'outTp', 'outWs', 'daysuplyqy', 'inCo2', 'ec',
    'frtstGrupp', 'lefstalklt', 'frtstSetCo',
    'pllnLt', 'flanGrupp', 'frtstCo', 'flanJnt', 'tcdmt', 'frmhsFclu',
    'hvstGrupp', 'hvstJnt', 'grwtLt', 'fcluHg', 'lefLt', 'flwrCo', 'hvstCo',
    'lefCunt', 'frtstJnt', 'lefBt', 'stemThck', 'frmAr', 'frmDov',
     'WaterUsage', 'WaterCost', 'FertilizerUsage',
    'FertilizerCost', 'CO2Usage', 'CO2Cost', 'MistUsageTime', 'Mist Cost'] 


    agg_dict = {i:'mean' for i in x_cols}
    agg_dict["HeatingEnergyUsage_cumsum"] = 'max'
    agg_dict["outtrn_cumsum"] = 'max'
    agg_dict["frmAr"] = 'mean'
    agg_dict["frmDist"] = 'first'
    agg_dict["date"] = 'first'
    
    


    result_list = []
    for i in tmp_df[start_point].index:
        sub_df = tmp_df[(i-7):(i+9)]
#         display(sub_df.iloc[12:13, :])
        if sub_df.shape[0] != 16:
            print("NONONO", sub_df.shape[0], sub_df.frmDist.first)
            continue
        if sub_df.frmDist.values[0] == "DBSF1673":
            display(sub_df)
        result = sub_df.groupby('frmDist').agg(agg_dict)
        result_list.append(result)

    result_df = pd.concat(result_list, ignore_index=True)
    result_df

    result_df["outtrn_cumsum"] = result_df["outtrn_cumsum"] /  result_df["frmAr"]
    result_df["outtrn_cumsum"] = result_df["outtrn_cumsum"] /  result_df["frmAr"]
    
    return result_df


# ... (Data preprocessing code here)
# input_data = input_data.drop(columns=['frmDist'])

input_data = preprocessing(input_data)

# Split the data into training and testing sets
X = input_data[input_data.drop(columns=['outtrn_cumsum','HeatingEnergyUsage_cumsum', "frmDist", "date"]).columns]
Y = input_data[['outtrn_cumsum','HeatingEnergyUsage_cumsum']]

X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)

# Initialize and train the LinearRegression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict 'y' values using the trained model
X_test= scaler.transform(X_test)
y_pred = model.predict(X_test)


# Calculate RMSE between the predictions and actual 'y' values
def calculate_rmse(targets, predictions):
    """
    Calculate the Root Mean Squared Error (RMSE) between predicted and target values.

    :param predictions: Predicted values.
    :type predictions: array-like
    :param targets: Target values.
    :type targets: array-like
    :return: RMSE value.
    :rtype: float
    """
    from sklearn.metrics import mean_squared_error
    return np.sqrt(mean_squared_error(targets, predictions))


# Calculate r2_score between the predictions and actual 'y' values
def calculate_R2_score(y_test,y_pred):
    from sklearn.metrics import r2_score
    return r2_score(y_test, y_pred)


rmse = calculate_rmse(y_test, y_pred)
r2score = calculate_R2_score(y_test, y_pred)

### OUTPUT ###
print("RMSE:", rmse)
print("R2_score:", r2score)


Unnamed: 0,frmDist,frmYearWeek,date,inTp,inHd,otmsuplyqy,acSlrdQy,cunt,ph,outTp,...,outtrn_cumsum,WaterUsage,WaterCost,FertilizerUsage,FertilizerCost,CO2Usage,CO2Cost,MistUsageTime,Mist Cost,HeatingEnergyUsage_cumsum
533,DBSF1673,2017-27,2017-07-09,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
534,DBSF1673,2017-28,2017-07-16,30.952915,102.34631,0.0,2753.437483,0.0,0.0,31.165141,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
535,DBSF1673,2017-29,2017-07-23,35.0052,110.479128,0.0,1955.030068,0.0,0.0,34.566342,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
536,DBSF1673,2017-30,2017-07-30,34.082708,112.268784,0.0,2749.905449,0.0,0.0,31.657987,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
537,DBSF1673,2017-31,2017-08-06,33.188629,103.476438,0.0,2029.831229,0.0,0.0,33.805892,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
538,DBSF1673,2017-32,2017-08-13,32.911052,101.835835,0.0,2526.885336,0.0,0.0,29.660139,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
539,DBSF1673,2017-33,2017-08-20,30.389206,97.599252,0.0,1884.112645,0.0,0.0,27.909882,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
540,DBSF1673,2017-34,2017-08-27,26.004795,100.515424,0.0,2583.795008,0.0,0.0,27.933784,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
541,DBSF1673,2017-35,2017-09-03,23.704668,93.614949,0.0,2379.020506,0.0,0.0,25.51404,...,634.2181,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
542,DBSF1673,2017-36,2017-09-10,25.266218,100.784515,0.0,1740.371722,0.0,0.0,22.989899,...,1169.791432,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


RMSE: 0.32368979755611454
R2_score: 0.996082657242684


In [42]:
X_test

array([[ 0.99797411, -0.41989647, -0.14982733, ...,  0.        ,
         0.        ,  0.        ],
       [-0.13228578,  1.04968369,  1.0228173 , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.78679518, -1.11290304, -0.08717546, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.7848288 , -0.73817818, -0.07878011, ...,  0.        ,
         0.        ,  0.        ],
       [-0.14384373,  1.36524958,  2.01156548, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.44175228,  0.04443408, -0.32368606, ...,  0.        ,
         0.        ,  0.        ]])