In [1]:
#  -------------------------------------------


###  INPUT ###
import pandas as pd
input_data = pd.read_csv('2023_smartFarm_AI_hackathon_dataset.csv')
pd.set_option('display.max_columns', 1000)

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
# from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler


def preprocessing(input_data):
    input_data = input_data.sort_values(["frmDist","date"])
    input_data["date"] = pd.to_datetime(input_data["date"], format='%Y%m%d')

    input_data = input_data.copy()
    input_data["frmYearWeek"] = input_data.date.dt.strftime('%Y-%W')
    input_data["frmYear"] = input_data.date.dt.strftime('%Y')
    input_data["frmWeek"] = input_data.date.dt.strftime('%W').astype(int)
    
    input_data = input_data.sort_values("frmYearWeek")


    tmp_df = input_data.groupby(["frmDist", 'frmYearWeek']).max().reset_index()

    # 16주 이상 농장만
    frmDist_list = (tmp_df.groupby("frmDist").frmYearWeek.count() >= 16).reset_index()

    tmp_df = pd.merge(frmDist_list.loc[frmDist_list.frmYearWeek,["frmDist"]], tmp_df, on="frmDist")

    start_point = ((tmp_df['outtrn_cumsum'] == 0)&
    (tmp_df['outtrn_cumsum'].shift(-1, fill_value=False) != 0) & 
    (tmp_df['outtrn_cumsum'].shift(1, fill_value=False) == 0))

    x_cols = ['inTp', 'inHd', 'otmsuplyqy', 'acSlrdQy', 'cunt', 
               'ph', 'outTp', 'outWs', 'daysuplyqy', 'inCo2', 'ec', 
               'frtstGrupp', 'lefstalklt', 'frtstSetCo', 'pllnLt',
               'flanGrupp', 'frtstCo', 'flanJnt', 'tcdmt', 'frmhsFclu', 'hvstGrupp',
               'hvstJnt', 'grwtLt', 'fcluHg', 'lefLt', 'flwrCo', 'hvstCo', 'lefCunt',
               'frtstJnt', 'lefBt', 'stemThck', 'frmAr', 'frmDov', 'outtrn_cumsum',
               'WaterUsage', 'WaterCost', 'FertilizerUsage', 'FertilizerCost',
               'CO2Usage', 'CO2Cost', 'MistUsageTime', 'Mist Cost',
               'HeatingEnergyUsage_cumsum'] 


    agg_dict = {i:'mean' for i in x_cols}
    agg_dict["HeatingEnergyUsage_cumsum"] = 'max'
    agg_dict["outtrn_cumsum"] = 'max'
    agg_dict["frmAr"] = 'mean'
    agg_dict["frmDist"] = 'first'
    agg_dict["frmWeek"] = 'first'
    agg_dict["date"] = 'first'
    
    result_list = []
    for i in tmp_df[start_point].index:
        sub_df = tmp_df[(i-7):(i+9)]
#         display(sub_df.iloc[12:13, :])
        if sub_df.shape[0] != 16:
#             print("NONONO", sub_df.shape[0], sub_df.frmDist.first)
            continue
#         if sub_df.frmDist.values[0] == "DBSF1673":
#             display(sub_df)
        result = sub_df.groupby('frmDist').agg(agg_dict)
        result_list.append(result)

    result_df = pd.concat(result_list, ignore_index=True)
    


    result_df["outtrn_cumsum"] = result_df["outtrn_cumsum"] /  result_df["frmAr"]
    result_df["outtrn_cumsum"] = result_df["outtrn_cumsum"] /  result_df["frmAr"]
    result_df["frmDov*frmAr"] = result_df["frmDov"] * result_df['frmAr']
    
    
    return result_df


# ... (Data preprocessing code here)
# input_data = input_data.drop(columns=['frmDist'])

input_data = preprocessing(input_data)


# Split the data into training and testing sets
X = input_data[input_data.drop(columns=['outtrn_cumsum','HeatingEnergyUsage_cumsum', "frmDist", "date"]).columns]
Y = input_data[['outtrn_cumsum','HeatingEnergyUsage_cumsum']]

X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

# scaler = StandardScaler()
# scaler.fit(X_train)
# X_train = scaler.transform(X_train)

# Initialize and train the LinearRegression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict 'y' values using the trained model
# X_test= scaler.transform(X_test)
y_pred = model.predict(X_test)


# Calculate RMSE between the predictions and actual 'y' values
def calculate_rmse(targets, predictions):
    """
    Calculate the Root Mean Squared Error (RMSE) between predicted and target values.

    :param predictions: Predicted values.
    :type predictions: array-like
    :param targets: Target values.
    :type targets: array-like
    :return: RMSE value.
    :rtype: float
    """
    from sklearn.metrics import mean_squared_error
    return np.sqrt(mean_squared_error(targets, predictions))


# Calculate r2_score between the predictions and actual 'y' values
def calculate_R2_score(y_test,y_pred):
    from sklearn.metrics import r2_score
    return r2_score(y_test, y_pred)


rmse = calculate_rmse(y_test, y_pred)
r2score = calculate_R2_score(y_test, y_pred)

### OUTPUT ###
print("RMSE:", rmse)
print("R2_score:", r2score)


RMSE: 0.00035551693566831486
R2_score: 0.9983620274461023


In [362]:
input_data.frmDist.value_counts()

DBSF5299    2
DBSF5644    2
DBSF4964    2
DBSF8557    2
DBSF8545    2
DBSF8512    2
DBSF5121    2
DBSF5170    2
DBSF5193    2
DBSF5227    2
DBSF8440    2
DBSF5351    2
DBSF8405    2
DBSF8383    2
DBSF5534    2
DBSF5549    2
DBSF5566    2
DBSF5575    2
DBSF5587    2
DBSF4907    2
DBSF4885    2
DBSF8665    2
DBSF4371    2
DBSF4101    2
DBSF4111    2
DBSF4121    2
DBSF4181    2
DBSF4212    2
DBSF4250    2
DBSF4283    2
DBSF4466    2
DBSF8670    2
DBSF4554    2
DBSF8924    2
DBSF8830    2
DBSF8764    2
DBSF4720    2
DBSF4800    2
DBSF4812    2
DBSF5607    2
DBSF5661    2
DBSF4049    2
DBSF5686    2
DBSF6502    2
DBSF7863    2
DBSF7828    2
DBSF7740    2
DBSF6618    2
DBSF6641    2
DBSF7722    2
DBSF7705    2
DBSF6892    2
DBSF7130    2
DBSF7151    2
DBSF7673    2
DBSF7186    2
DBSF7198    2
DBSF7221    2
DBSF7419    2
DBSF7476    2
DBSF7897    2
DBSF6441    2
DBSF6318    2
DBSF8205    2
DBSF8318    2
DBSF8293    2
DBSF5764    2
DBSF5769    2
DBSF5772    2
DBSF8285    2
DBSF8242    2
DBSF60

In [122]:
# RMSE: 0.00035551693566831486
# R2_score: 0.9983620274461023

In [123]:
# pd.DataFrame({"cols":X_train.columns, "coef":model.coef_[0]}).sort_values('coef')

In [124]:
# input_data.hvstGrupp