In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
# Load the plant 1 generation file and the plant 1 weather-sensor file.
plant_data = pd.read_csv(
    filepath_or_buffer="/kaggle/input/solar-power-generation-data/Plant_1_Generation_Data.csv"
)
plant_sensor = pd.read_csv(
    filepath_or_buffer="/kaggle/input/solar-power-generation-data/Plant_1_Weather_Sensor_Data.csv"
)

In [None]:
# Convert the DATE_TIME column of both tables to pandas datetimes so they can be joined on it.
# NOTE(review): no explicit format is given, so parsing relies on pandas' inference.
# Confirm both files use the same day/month order; otherwise pass `format=`.
for frame in (plant_data, plant_sensor):
    frame["DATE_TIME"] = pd.to_datetime(frame["DATE_TIME"])

## จัดการกับข้อมูล

In [None]:
# Inner-join generation and sensor readings on their timestamp, then drop both plant-id
# columns and the sensor-side source key (the `_y` suffix marks the right-hand table).
df = (
    plant_data
    .merge(plant_sensor, on="DATE_TIME", how="inner")
    .drop(columns=["PLANT_ID_x", "PLANT_ID_y", "SOURCE_KEY_y"])
)

df

In [None]:
# Scatter each quantity against time (one figure per column, dot markers only).
df.plot(x="DATE_TIME", y="AC_POWER", style=".")
df.plot(x="DATE_TIME", y="DC_POWER", style=".")
df.plot(x="DATE_TIME", y="TOTAL_YIELD", style=".")

## สร้างการทำนาย
นำทุกส่วนมาเขียนใหม่ในรูปฟังก์ชันเพื่อให้ง่ายต่อการใช้งาน
1. รวบรวมวัน
2. เทรน
3. วัดค่า rmse

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, LassoLars

from sklearn.metrics import mean_squared_error as MSE

In [None]:
# group range of day that we interested in (day = 3, 7)
def group_date(day, data=None):
    """Build a lagged feature table by joining each reading to the same source's earlier days.

    For every lag ``i`` in ``0 .. day-1`` a copy of the table gets its ``DATE_TIME`` moved
    forward by ``i + 1`` days and its value columns suffixed with ``_{i}``. The copies are
    inner-joined on ``DATE_TIME`` and ``SOURCE_KEY_x``, so a row stamped ``t`` holds the
    readings at ``t`` (no suffix) and at ``t - (i + 1)`` days (suffix ``_{i}``).

    Parameters
    ----------
    day : int
        Number of one-day lags to join; must be at least 1.
    data : pandas.DataFrame, optional
        Table to lag. It must contain ``DATE_TIME``, ``SOURCE_KEY_x`` and ``TOTAL_YIELD``.
        Defaults to the notebook-level ``df``.

    Returns
    -------
    pandas.DataFrame
        ``DATE_TIME``, ``SOURCE_KEY_x``, then the alphabetically sorted feature columns,
        and finally ``TOTAL_YIELD_{day-1}`` as the last (target) column. Yield columns
        (``DAILY_YIELD*`` / ``TOTAL_YIELD*``) and every column of the last lag are
        excluded from the features.

    Raises
    ------
    ValueError
        If ``day`` is smaller than 1 (no target column would exist).
    """
    if day < 1:
        raise ValueError("day must be at least 1")

    source = df if data is None else data
    keys = ["DATE_TIME", "SOURCE_KEY_x"]
    value_cols = [c for c in source.columns if c not in keys]

    cdf = source.copy()
    for lag in range(day):
        # Suffix the value columns with the lag index and move the timestamp forward so
        # that readings taken `lag + 1` days earlier line up with the current row.
        lagged = source.rename(columns={c: f"{c}_{lag}" for c in value_cols})
        lagged["DATE_TIME"] = source["DATE_TIME"] + pd.Timedelta(days=lag + 1)
        cdf = pd.merge(cdf, lagged, on=keys, how="inner")

    # NOTE(review): the target is the *oldest* reading (t - day days) while the features are
    # more recent ones; confirm this direction is intended for a forecasting setup.
    target = f"TOTAL_YIELD_{day - 1}"
    last_suffix = f"_{day - 1}"  # full-suffix match, so two-digit lags are handled too

    features = sorted(
        c for c in cdf.columns
        if c not in keys
        and not c.endswith(last_suffix)
        and "DAILY" not in c
        and "TOTAL" not in c
    )

    return cdf[keys + features + [target]]

# train model and return record of rmse for all model
def fit_and_evaluate(cdf):
    """Cross-validate three linear models with a shuffled 10-fold split and report RMSE.

    Parameters
    ----------
    cdf : pandas.DataFrame
        Table whose first two columns are skipped, whose last column is the regression
        target and whose remaining columns are the (numeric) features.

    Returns
    -------
    pandas.DataFrame
        One row per fold (index 0-9) and one column per model
        (``LinearRegression``, ``Ridge``, ``LARS Lasso``) holding the test-fold RMSE.

    Raises
    ------
    ValueError
        If ``cdf`` has fewer than 10 rows, which would leave every test fold empty.
    """
    n_folds = 10

    # Fixed seed so the shuffle, and therefore the folds, are reproducible.
    np.random.seed(281)
    rows = cdf.to_numpy()
    np.random.shuffle(rows)

    fold_size = len(rows) // n_folds
    if fold_size == 0:
        raise ValueError(f"need at least {n_folds} rows, got {len(rows)}")

    LM = LinearRegression()
    R = Ridge(alpha=0.5)
    LL = LassoLars(alpha=0.5)

    # Columns 0-1 are skipped, the last column is the target, the rest are features.
    x = rows[:, 2:-1].astype(np.float64)
    y = rows[:, -1].astype(np.float64)

    record = pd.DataFrame(dtype=np.float64, columns=["LinearRegression", "Ridge", "LARS Lasso"])

    for fold in range(n_folds):
        start = fold_size * fold
        stop = fold_size * (fold + 1)

        x_test = x[start:stop]
        y_test = y[start:stop]

        # Train on every row outside [start, stop) so that no test row leaks into training
        # and no other fold is dropped. Rows past the last full fold are always trained on.
        x_train = np.concatenate((x[:start], x[stop:]), axis=0)
        y_train = np.concatenate((y[:start], y[stop:]), axis=0)

        record.loc[fold] = rmse_model(LM, x_train, y_train, x_test, y_test), \
                           rmse_model(R, x_train, y_train, x_test, y_test), \
                           rmse_model(LL, x_train, y_train, x_test, y_test)
    return record

# rmse 
def rmse_model(model, x_train, y_train, x_test, y_test):
    """Fit ``model`` on the training split and return its RMSE on the test split.

    ``model`` is fitted in place via ``model.fit`` and then queried with ``model.predict``.
    The result is the square root of the mean squared error between ``y_test`` and the
    predictions.
    """
    model.fit(x_train, y_train)
    predictions = model.predict(x_test)
    mean_squared = MSE(y_test, predictions)
    return mean_squared ** 0.5

In [None]:
# main model
# Single unshuffled hold-out: fit on the first 9/10 of the rows, score on the remainder.
a = group_date(3).to_numpy()

# Columns 0-1 are skipped, the last column is the target, everything between is a feature.
x = np.array([row[2:-1] for row in a])
y = np.array([row[-1] for row in a])

split = len(a) // 10 * 9

LM = LinearRegression()
LM.fit(x[:split], y[:split])
y_hat = LM.predict(x[split:])

# RMSE on the held-out rows.
MSE(y_hat, y[split:]) ** 0.5

## คำนวณจาก 3 วันย้อนหลัง

In [None]:
# Empty summary table with one column per model; later cells add one row per look-back window.
summary_columns = ["LinearRegression", "Ridge", "LARS Lasso"]
rec = pd.DataFrame(columns=summary_columns)

In [None]:
# Build the table for day=3 and cross-validate the three models on it.
cdf3 = group_date(day=3)
rec3 = fit_and_evaluate(cdf=cdf3)

print("RMSE FROM PAST 3 DAY")

# Store the minimum per-fold RMSE of each model in the summary table.
des = rec3.describe()
rec.loc["3day"] = des.loc["min"]

rec3

จะเห็นได้ว่าโมเดลทั้งสามมีค่า RMSE ใกล้เคียงกัน โดย LARS Lasso มีค่าต่ำที่สุด

## คำนวณจาก 7 วันย้อนหลัง

In [None]:
# Build the table for day=7 and cross-validate the three models on it.
cdf7 = group_date(day=7)
rec7 = fit_and_evaluate(cdf=cdf7)

print("RMSE FROM PAST 7 DAY")

# Store the minimum per-fold RMSE of each model in the summary table.
des = rec7.describe()
rec.loc["7day"] = des.loc["min"]

rec7

In [None]:
# Display the summary table (one row per look-back window, one column per model).
rec

จะเห็นได้ว่า model ที่มีค่า RMSE น้อยที่สุดของทั้งคู่ คือ LARS Lasso Regression โดยมีค่าใกล้เคียงกันที่ 407473.282325 และ 400180.478359 ตามลำดับ