In [4]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

In [5]:
# Load the complete per-station datasets; each CSV becomes one DataFrame,
# bound to df1..df5 in the order listed below.
station_csv_paths = [
    "complete data/ACME(Ramnagar) 250MW Complete Data with Metadata.csv",
    "complete data/ARINSUN SOLAR (BARSAITADESH) 250MW Complete Data.csv",
    "complete data/Dadri Solar 5MW Complete Data.csv",
    "complete data/MAHINDRA SOLAR (BADWAR) 250MW Complete Data.csv",
    "complete data/Unchahar Solar 10MW Complete Data.csv",
]
df1, df2, df3, df4, df5 = [pd.read_csv(csv_path) for csv_path in station_csv_paths]

In [None]:
# Separate the late-December rows (Month == 12, Day >= 24) of every station
# for forecasting purposes and write them to one CSV per station.
# NOTE(review): originally described as "7 days of data" - confirm that the
# Day >= 24 cutoff really yields 7 rows per station.
for df in [df1, df2, df3, df4, df5]:
    # Build a single mask against the full frame. Chaining two boolean
    # selections (df[a][b]) applies a full-length mask to an already filtered
    # frame, which pandas has to reindex and warns about.
    holdout_mask = (df["Month"] == 12) & (df["Day"] >= 24)
    api_data = df[holdout_mask]
    # Resolve the station name first: reusing double quotes inside a
    # double-quoted f-string is a syntax error before Python 3.12.
    station_name = df["Station"].iloc[0]
    api_data.to_csv(f"API data/daily/{station_name}_daily_api_data.csv", index=False)

In [6]:
# Sanity check: (rows, columns) of the first loaded station frame.
df1.shape

(351, 17)

In [7]:
# Hold out December (Month == 12) as the test period; every other month of
# every station goes into the training set.
# The per-station pieces are collected first and concatenated once, rather
# than growing a DataFrame inside the loop, which re-copies all rows
# accumulated so far on every iteration.
station_frames = [df1, df2, df3, df4, df5]
train_df = pd.concat([frame[frame["Month"] != 12] for frame in station_frames])
test_df = pd.concat([frame[frame["Month"] == 12] for frame in station_frames])
# Displayed as the cell output: shapes of the combined train and test frames.
train_df.shape, test_df.shape

((1606, 17), (150, 17))

In [8]:
# One line chart per station: actual generation over the training rows,
# with a separate colored line for each month.
for station_name in train_df["Station"].unique():
    station_rows = train_df[train_df["Station"] == station_name]
    chart_title = f"Actual Generation for {station_name}"
    generation_fig = px.line(
        station_rows,
        y="Actual Generation(Mu)",
        color="Month",
        title=chart_title,
    )
    generation_fig.show()

#### Preprocessing

In [9]:
# Features are every column except the target and the station label;
# the target is the actual generation column.
non_feature_columns = ["Actual Generation(Mu)", "Station"]

X_train, y_train = (
    train_df.drop(columns=non_feature_columns),
    train_df["Actual Generation(Mu)"],
)
X_test, y_test = (
    test_df.drop(columns=non_feature_columns),
    test_df["Actual Generation(Mu)"],
)

In [10]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [11]:
# Candidate regressors, keyed by the label used when their scores are printed.
# The tree, forest and boosting estimators use randomness during fitting, so
# they get a fixed seed; without it the reported scores change on every
# Restart & Run All.
RANDOM_STATE = 42
models = {
    "linear" : LinearRegression(),
    "SVR" : SVR(),
    "knn" : KNeighborsRegressor(),
    "decision_tree" : DecisionTreeRegressor(random_state=RANDOM_STATE),
    "Random_Forest" : RandomForestRegressor(random_state=RANDOM_STATE),
    "XGB" : XGBRegressor(random_state=RANDOM_STATE)
}

In [12]:
# Fit every candidate on the training set and report its error metrics on
# the held-out test set, one line per model.
for label, estimator in models.items():
    estimator.fit(X_train, y_train)
    predictions = estimator.predict(X_test)

    mse_score, mae_score, r2_value = (
        mean_squared_error(y_test, predictions),
        mean_absolute_error(y_test, predictions),
        r2_score(y_test, predictions),
    )

    print(f"{label} - MSE: {mse_score}, MAE: {mae_score}, R^2: {r2_value}")

linear - MSE: 0.0252303541686637, MAE: 0.11264726505399686, R^2: 0.937185781860651
SVR - MSE: 0.011447930773180933, MAE: 0.0817098190643382, R^2: 0.9714989010450011
knn - MSE: 0.013242453333333334, MAE: 0.0756, R^2: 0.9670312058713283
decision_tree - MSE: 0.01853366666666666, MAE: 0.08413333333333332, R^2: 0.9538580483991819
Random_Forest - MSE: 0.011926501976085361, MAE: 0.06903198571428568, R^2: 0.970307436361886
XGB - MSE: 0.011374106765077462, MAE: 0.06655444178740338, R^2: 0.9716826954268768


In [13]:
# Fit a random forest on the training set for the per-station breakdown.
# random_state is fixed so the fitted forest, and every score derived from
# it, is the same on each run.
RFR = RandomForestRegressor(random_state=42)
RFR.fit(X_train, y_train)

In [14]:
# Score the fitted forest separately on each station's test rows and plot
# actual vs. predicted generation for that station.
for station in test_df["Station"].unique():
    station_test_df = test_df[test_df["Station"] == station]
    station_y_test = station_test_df["Actual Generation(Mu)"]
    # Same feature set as training: drop the target and the station label.
    station_y_pred = RFR.predict(station_test_df.drop(columns=["Actual Generation(Mu)", "Station"]))

    mse = mean_squared_error(station_y_test, station_y_pred)
    mae = mean_absolute_error(station_y_test, station_y_pred)
    r2 = r2_score(station_y_test, station_y_pred)

    print(f"{station} - MSE: {mse}, MAE: {mae}, R^2: {r2}")
    fig = go.Figure()
    fig.add_trace(go.Scatter(y=station_y_test, mode='lines', name='Actual'))
    fig.add_trace(go.Scatter(y=station_y_pred, mode='lines', name='Predicted'))
    # One figure is drawn per station; label each so it can be identified
    # and read on its own. Only y is passed to the traces, so the x axis is
    # the position of the row within the station's test rows.
    fig.update_layout(
        title=f"Actual vs Predicted Generation for {station}",
        xaxis_title="Test row (in order)",
        yaxis_title="Generation (Mu)",
    )
    fig.show()

ACME(Ramnagar) - MSE: 0.016050375566470158, MAE: 0.09821371428571443, R^2: 0.8687407951711632


ARINSUN SOLAR (BARSAITADESH) - MSE: 0.021099462978807642, MAE: 0.10992034126984133, R^2: 0.8261461537320305


DADRI SOLAR - MSE: 2.333966666666666e-05, MAE: 0.0033699999999999932, R^2: 0.21911895910780688


MAHINDRA SOLAR (BADWAR) - MSE: 0.01902205329626793, MAE: 0.1036195158730159, R^2: 0.8478554901670304


UNCHAHAR SOLAR - MSE: 4.66953333333333e-05, MAE: 0.005906666666666668, R^2: 0.530962053571429


#### Export Model

In [15]:
import joblib

In [16]:
# Train the export model on as much data as possible while keeping the
# December rows that are exported for forecasting (Day >= 24) out of the
# training set. The previous cutoff (Day < 25) let day 24 appear in both the
# training data and the forecasting data.
# NOTE(review): if the forecasting export is meant to start on day 25
# instead, change the export cutoff rather than this one - confirm.
API_HOLDOUT_START_DAY = 24
test_df2 = test_df[test_df["Day"] < API_HOLDOUT_START_DAY]
train_df2 = pd.concat([train_df, test_df2])

In [17]:
# Inspect the column names of the extended training frame before fitting.
train_df2.columns

Index(['Station', 'Operational Capacity(MW)', 'Latitude', 'Longitude', 'Month',
       'Day', 'Temperature', 'Cloud Type', 'Relative Humidity',
       'Aerosol Optical Depth', 'Clearsky DHI', 'Clearsky DNI', 'Clearsky GHI',
       'DHI', 'DNI', 'GHI', 'Actual Generation(Mu)'],
      dtype='object')

In [18]:
# Refit the forest on the extended training frame. Features are every column
# except the target and the station label, matching the earlier experiments.
# random_state is fixed so that re-running the notebook rebuilds the same
# model.
export_features = train_df2.drop(columns=["Actual Generation(Mu)", "Station"])
export_target = train_df2["Actual Generation(Mu)"]
RFR = RandomForestRegressor(random_state=42)
RFR.fit(export_features, export_target)

In [19]:
# Persist the fitted forest to disk; the call's return value (shown as the
# cell output) is the list of file names written.
model_path = "solar_generation_model.pkl"
joblib.dump(RFR, model_path)

['solar_generation_model.pkl']