In [1]:
# Mount Google Drive at /content/drive so files stored there can be read by path
# (Colab-only: relies on the google.colab package).
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler, normalize

from sklearn.ensemble import RandomForestRegressor

# Load the raw dataset CSV from the mounted Google Drive into a DataFrame.
# NOTE: the name `data` is rebound further down to the model's feature list,
# so this cell must be re-run before re-running the column-selection cell.
data = pd.read_csv('/content/drive/MyDrive/preliminary/main/2023_smartFarm_AI_hackathon_dataset.csv')

In [3]:
# Select the important columns and organize them as a dictionary keyed by farm.
# Each value is a 2-D object array with one row per record of that farm and the
# columns [date, outtrn_cumsum, HeatingEnergyUsage_cumsum], in file order.
#
# A single groupby keeps every farm's array 2-D (even for a farm with only one
# record, which the `[:, k]` indexing in later cells requires), avoids
# re-stacking the array for each row, and lets a missing column raise a
# KeyError instead of silently producing an empty dictionary.
# NOTE(review): groupby skips rows whose 'frmDist' is missing — confirm that
# such rows are not wanted.
value_columns = ['date', 'outtrn_cumsum', 'HeatingEnergyUsage_cumsum']
select_dict = {
  # dtype=object keeps the mixed date/number columns together; copy=True makes
  # the array writable so later cells can fill values in place.
  farm: records[value_columns].to_numpy(dtype=object, copy=True)
  # sort=False keeps farms in order of first appearance.
  for farm, records in data.groupby('frmDist', sort=False)
}

In [4]:
# Clean the heating-usage values: keep only the farms whose last column
# (HeatingEnergyUsage_cumsum) is zero in fewer than 30% of their records.
select_dict = {
  farm: records
  for farm, records in select_dict.items()
  if list(records[:, -1]).count(0) < len(records[:, -1]) * 0.3
}

In [5]:
# Interpolation (fill the missing data): in columns 1 and 2 of every farm's
# array (outtrn_cumsum and HeatingEnergyUsage_cumsum), zeros are treated as
# missing values and filled using pandas' default interpolation. The arrays
# are updated in place.
for farm in select_dict:
  records = select_dict[farm]
  for col in (1, 2):
    with_gaps = pd.Series([np.nan if value == 0 else value for value in records[:, col]])
    records[:, col] = with_gaps.interpolate()

In [6]:
# Finalize dataset.
# Features: each farm's column-2 series (HeatingEnergyUsage_cumsum), with any
# NaN still present after interpolation replaced by 0.
# Target: the last value of each farm's column 1 (outtrn_cumsum).
data = []
target = []

for d in select_dict:
  heating = select_dict[d][:, 2].astype(float)
  heating[np.isnan(heating)] = 0.
  select_dict[d][:, 2] = heating  # keep the stored series NaN-free as well
  data.append(heating)
  target.append(select_dict[d][:, 1][-1])

# fit_transform returns the scaled array rather than scaling its argument in
# place, so the result has to be assigned back; otherwise the features stay
# unscaled.
scaler = MinMaxScaler()
data = scaler.fit_transform(data)

# Scale the targets relative to the largest one.
target = np.asarray(target, dtype=float)
target = target / np.max(target)

In [7]:
# Model: hold out 20% of the samples, fit a random forest on the rest and
# score its predictions on the held-out part.
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=42)

# Seed the forest as well: the split is already seeded, but an unseeded forest
# would still give different scores on every run.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Root-mean-squared error and coefficient of determination on the test split.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2score = r2_score(y_test, y_pred)

### OUTPUT ###
print("RMSE:", rmse)
print("R2_score:", r2score)

RMSE: 0.05639216080856112
R2_score: 0.9223517125071607
