# In [1]:
#  -------------------------------------------
###  INPUT ###
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb

# Raise the column display limit so wide frames are printed without truncation.
pd.set_option('display.max_columns', 1000)
# Raw dataset, read from the current working directory. The name is rebound
# to the preprocessed table further down in the script.
input_data = pd.read_csv('2023_smartFarm_AI_hackathon_dataset.csv')

def preprocessing(input_data):
    """Collapse daily farm records into one 16-week training row per farm.

    The daily rows are aggregated per farm and week (week key ``%Y-%W``).
    Farms with fewer than 16 distinct weeks are dropped. For each remaining
    farm a 16-week window is cut so that the first week whose weekly maximum
    of ``outtrn_cumsum`` is non-zero sits at window position 8. Zero values
    at window positions 9..14 are replaced by the mean of their two
    neighbours.

    Farms for which that window does not fit inside the available weeks
    (first non-zero week too early, too late, or no non-zero week at all)
    are skipped. Previously such a farm raised ``IndexError`` and aborted
    the whole run.

    Args:
        input_data: DataFrame with the columns ``frmDist``, ``date``
            (parseable with ``%Y%m%d``), ``outtrn_cumsum``, ``acSlrdQy``,
            ``inTp``, ``outTp``, ``inHd``, ``inCo2``, ``lefCunt``, ``lefLt``,
            ``lefBt``, ``frmAr`` and ``frmDov``. The frame is not modified.

    Returns:
        DataFrame with the columns ``Y`` (target: value of the last window
        week), ``X1``..``X27`` (aggregated features of that same last week,
        in aggregation order) and ``Y1``..``Y7`` (target series at window
        positions 8..14). Only rows with ``Y >= 10`` are kept. The frame is
        empty, with the same columns, when no farm qualifies.
    """
    lead_weeks = 8     # weeks kept in front of the first non-zero week
    window_weeks = 16  # total window length; the target is its last week
    min_target = 10    # rows with a smaller target are discarded

    # Derive the week key on a new frame so the caller's data stays untouched.
    dates = pd.to_datetime(input_data["date"], format='%Y%m%d')
    daily = input_data.assign(frmYearWeek=dates.dt.strftime('%Y-%W'))

    # groupby sorts its keys, so every farm's weeks come out in chronological
    # order (the zero-padded '%Y-%W' strings sort correctly as text).
    dd = daily.groupby(['frmDist', 'frmYearWeek']).agg(
        Y=('outtrn_cumsum', 'max'),
        Sun_Min=('acSlrdQy', 'min'),
        Sun_Max=('acSlrdQy', 'max'),
        Sun_Mean=('acSlrdQy', 'mean'),
        INTP_Min=('inTp', 'min'),
        INTP_Max=('inTp', 'max'),
        INTP_Mean=('inTp', 'mean'),
        OUTTP_Min=('outTp', 'min'),
        OUTTP_Max=('outTp', 'max'),
        OUTTP_Mean=('outTp', 'mean'),
        minINHD=('inHd', 'min'),
        maxINHD=('inHd', 'max'),
        meanINHD=('inHd', 'mean'),
        minCO2=('inCo2', 'min'),
        maxCO2=('inCo2', 'max'),
        meanCO2=('inCo2', 'mean'),
        minlC=('lefCunt', 'min'),
        maxlC=('lefCunt', 'max'),
        meanlC=('lefCunt', 'mean'),
        minLT=('lefLt', 'min'),
        maxLT=('lefLt', 'max'),
        meanLT=('lefLt', 'mean'),
        minBT=('lefBt', 'min'),
        maxBT=('lefBt', 'max'),
        meanBT=('lefBt', 'mean'),
        meanFrmAr=('frmAr', 'mean'),
        meanFRM=('frmDov', 'mean'),
    ).reset_index()
    dd['meanFRMMul'] = dd['meanFrmAr'] * dd['meanFRM']

    # Keep only farms that have at least one full window's worth of weeks.
    weeks_per_farm = dd.groupby('frmDist')['frmYearWeek'].transform('size')
    dd = dd[weeks_per_farm >= window_weeks]

    # Everything after frmDist, frmYearWeek and Y is a feature column.
    feature_cols = list(dd.columns[3:])
    n_history = window_weeks - lead_weeks - 1
    columns = (
        ['Y']
        + [f'X{i}' for i in range(1, len(feature_cols) + 1)]
        + [f'Y{i}' for i in range(1, n_history + 1)]
    )

    records = []
    for _, farm_weeks in dd.groupby('frmDist', sort=False):
        # Work on a private float copy: the gap filling below writes into the
        # array, and an integer array would truncate the averaged values.
        weekly_y = farm_weeks['Y'].to_numpy(dtype=float, copy=True)

        nonzero = weekly_y != 0
        if not nonzero.any():
            continue
        start = int(nonzero.argmax())
        lo = start - lead_weeks
        hi = lo + window_weeks
        if lo < 0 or hi > len(weekly_y):
            # The window would run off either end of this farm's data.
            continue

        arr = weekly_y[lo:hi]
        # Fill zero weeks strictly inside the post-start part of the window.
        for j in range(lead_weeks + 1, window_weeks - 1):
            if arr[j] == 0:
                arr[j] = (arr[j - 1] + arr[j + 1]) / 2

        # Features are taken from the last window week, i.e. the target week.
        features = farm_weeks[feature_cols].iloc[hi - 1].to_numpy()

        record = {'Y': arr[-1]}
        record.update(
            {f'X{i}': value for i, value in enumerate(features, start=1)}
        )
        record.update(
            {f'Y{i}': arr[lead_weeks - 1 + i] for i in range(1, n_history + 1)}
        )
        records.append(record)

    if not records:
        return pd.DataFrame(columns=columns)

    result = pd.DataFrame(records, columns=columns)

    # Filtering based on 'Y' column
    return result[result['Y'] >= min_target]


# ... (Data preprocessing code here)

# Replace the raw records with the per-farm training table.
input_data = preprocessing(input_data)

# Features are every column except the target. The target is kept as a
# single-column frame because the metric calls below receive it in that form.
X = input_data.drop(columns=['Y'])
Y = input_data[['Y']]

# NOTE(review): the model is fitted and scored on the same rows, so the
# metrics printed below are in-sample figures and do not measure how well the
# model generalises. For an honest estimate, split the rows with
# train_test_split (imported above), fit on the training part and score on
# the held-out part.
model = lgb.LGBMRegressor(n_estimators=1000, random_state=0)
model.fit(X, Y.values.ravel())

# In-sample predictions, consumed by the metric calculation further down.
y_pred = model.predict(X)


# Calculate RMSE between the predictions and actual 'y' values
def calculate_rmse(targets, predictions):
    """Return the root-mean-squared error between targets and predictions.

    Both arguments are handed unchanged to scikit-learn's
    ``mean_squared_error``; the square root of its result is returned.
    """
    from sklearn import metrics

    mse = metrics.mean_squared_error(targets, predictions)
    return np.sqrt(mse)



# Calculate r2_score between the predictions and actual 'y' values
def calculate_R2_score(y_test, y_pred):
    """Return the R^2 score of ``y_pred`` against ``y_test``.

    Thin wrapper around scikit-learn's ``r2_score``; the arguments are passed
    through in the same order (true values first, predictions second).
    """
    from sklearn import metrics

    score = metrics.r2_score(y_test, y_pred)
    return score

# Score the predictions against the targets with both metrics.
r2score = calculate_R2_score(Y, y_pred)
rmse = calculate_rmse(Y, y_pred)

### OUTPUT ###
# One line per metric: label first, then the value.
for label, value in (("RMSE:", rmse), ("R2_score:", r2score)):
    print(label, value)

# Recorded output of the run above:
# RMSE: 5.579184255381722
# R2_score: 0.9999997177682968
