In [1]:
import pandas as pd

# Load cleaned dataset
df = pd.read_csv("solar_weather_cleaned.csv")

# Convert Date column to datetime (important)
df["Date"] = pd.to_datetime(df["Date"])

# Basic inspection
print("Shape:", df.shape)
print("\nColumns:", df.columns)
print("\nData Types:\n", df.dtypes)

df.head()


Shape: (3653, 9)

Columns: Index(['Date', 'Solar_Radiation', 'Temperature', 'Humidity', 'Wind_Speed',
       'Year', 'Month', 'Day', 'Season'],
      dtype='object')

Data Types:
 Date               datetime64[ns]
Solar_Radiation           float64
Temperature               float64
Humidity                  float64
Wind_Speed                float64
Year                        int64
Month                       int64
Day                         int64
Season                      int64
dtype: object


Unnamed: 0,Date,Solar_Radiation,Temperature,Humidity,Wind_Speed,Year,Month,Day,Season
0,2014-01-01,5.8342,26.1,65.51,2.05,2014,1,1,1
1,2014-01-02,5.6846,26.11,66.76,1.77,2014,1,2,1
2,2014-01-03,5.2409,26.66,66.25,1.55,2014,1,3,1
3,2014-01-04,5.0405,27.19,66.5,1.02,2014,1,4,1
4,2014-01-05,5.6196,26.57,65.66,1.42,2014,1,5,1


In [2]:
# Check missing values
print("Missing Values:\n", df.isnull().sum())

# Check duplicates
print("\nDuplicate Rows:", df.duplicated().sum())

# Basic statistics
print("\nBasic Statistics:\n")
print(df.describe())


Missing Values:
 Date               0
Solar_Radiation    0
Temperature        0
Humidity           0
Wind_Speed         0
Year               0
Month              0
Day                0
Season             0
dtype: int64

Duplicate Rows: 0

Basic Statistics:

                      Date  Solar_Radiation  Temperature     Humidity  \
count                 3653      3653.000000  3653.000000  3653.000000   
mean   2019-01-01 00:00:00         5.272401    27.566797    78.720610   
min    2014-01-01 00:00:00         0.654000    24.530000    40.930000   
25%    2016-07-02 00:00:00         4.613500    26.500000    71.710000   
50%    2019-01-01 00:00:00         5.491900    27.260000    81.500000   
75%    2021-07-02 00:00:00         6.130600    28.480000    87.160000   
max    2024-01-01 00:00:00         7.413800    31.850000    94.370000   
std                    NaN         1.185252     1.379395    10.390302   

        Wind_Speed         Year        Month          Day       Season  
count  3653

In [3]:
import numpy as np

# 1️⃣ Create Lag Feature (Yesterday's Solar)
df["Solar_Lag1"] = df["Solar_Radiation"].shift(1)

# 2️⃣ Create Cyclical Month Features
df["Month_sin"] = np.sin(2 * np.pi * df["Month"] / 12)
df["Month_cos"] = np.cos(2 * np.pi * df["Month"] / 12)

# 3️⃣ Drop first row (because lag creates NaN)
df = df.dropna()

# 4️⃣ Drop unnecessary columns
df_model = df.drop(columns=["Date", "Year"])

print("New Shape:", df_model.shape)
df_model.head()


New Shape: (3652, 10)


Unnamed: 0,Solar_Radiation,Temperature,Humidity,Wind_Speed,Month,Day,Season,Solar_Lag1,Month_sin,Month_cos
1,5.6846,26.11,66.76,1.77,1,2,1,5.8342,0.5,0.866025
2,5.2409,26.66,66.25,1.55,1,3,1,5.6846,0.5,0.866025
3,5.0405,27.19,66.5,1.02,1,4,1,5.2409,0.5,0.866025
4,5.6196,26.57,65.66,1.42,1,5,1,5.0405,0.5,0.866025
5,5.5361,27.09,67.66,1.56,1,6,1,5.6196,0.5,0.866025


In [4]:
# Use original df (which still has Date)
df_sorted = df.sort_values("Date")

# Train: 2014–2022
train = df_sorted[df_sorted["Date"] < "2023-01-01"]

# Test: 2023–2024
test = df_sorted[df_sorted["Date"] >= "2023-01-01"]

# Drop Date and Year from both
train = train.drop(columns=["Date", "Year"])
test = test.drop(columns=["Date", "Year"])

print("Train Shape:", train.shape)
print("Test Shape:", test.shape)

train.head()


Train Shape: (3286, 10)
Test Shape: (366, 10)


Unnamed: 0,Solar_Radiation,Temperature,Humidity,Wind_Speed,Month,Day,Season,Solar_Lag1,Month_sin,Month_cos
1,5.6846,26.11,66.76,1.77,1,2,1,5.8342,0.5,0.866025
2,5.2409,26.66,66.25,1.55,1,3,1,5.6846,0.5,0.866025
3,5.0405,27.19,66.5,1.02,1,4,1,5.2409,0.5,0.866025
4,5.6196,26.57,65.66,1.42,1,5,1,5.0405,0.5,0.866025
5,5.5361,27.09,67.66,1.56,1,6,1,5.6196,0.5,0.866025


In [5]:
# Separate features and target

X_train = train.drop(columns=["Solar_Radiation"])
y_train = train["Solar_Radiation"]

X_test = test.drop(columns=["Solar_Radiation"])
y_test = test["Solar_Radiation"]

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

X_train.head()


X_train shape: (3286, 9)
y_train shape: (3286,)
X_test shape: (366, 9)
y_test shape: (366,)


Unnamed: 0,Temperature,Humidity,Wind_Speed,Month,Day,Season,Solar_Lag1,Month_sin,Month_cos
1,26.11,66.76,1.77,1,2,1,5.8342,0.5,0.866025
2,26.66,66.25,1.55,1,3,1,5.6846,0.5,0.866025
3,27.19,66.5,1.02,1,4,1,5.2409,0.5,0.866025
4,26.57,65.66,1.42,1,5,1,5.0405,0.5,0.866025
5,27.09,67.66,1.56,1,6,1,5.6196,0.5,0.866025


In [7]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

# Initialize model
model = RandomForestRegressor(
    n_estimators=200,
    max_depth=15,
    random_state=42
)

# Train
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluation
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("R2 Score:", r2)
print("MAE:", mae)
print("RMSE:", rmse)


R2 Score: 0.5062729278779169
MAE: 0.6110277657113437
RMSE: 0.8280581372444848


In [8]:
# Create monthly dataset
df_monthly = df.copy()

df_monthly["YearMonth"] = df_monthly["Date"].dt.to_period("M")

monthly_data = df_monthly.groupby("YearMonth").agg({
    "Solar_Radiation": "mean",
    "Temperature": "mean",
    "Humidity": "mean",
    "Wind_Speed": "mean"
}).reset_index()

print("Monthly Shape:", monthly_data.shape)
monthly_data.head()


Monthly Shape: (121, 5)


Unnamed: 0,YearMonth,Solar_Radiation,Temperature,Humidity,Wind_Speed
0,2014-01,5.644547,27.443667,65.211,1.585
1,2014-02,6.119886,28.527143,64.121429,2.029286
2,2014-03,6.450461,30.044839,60.388065,1.96871
3,2014-04,6.001397,29.232667,76.288667,2.338667
4,2014-05,5.426194,28.453226,82.331613,2.474839


In [9]:
# Convert YearMonth back to datetime
monthly_data["YearMonth"] = monthly_data["YearMonth"].astype(str)
monthly_data["YearMonth"] = pd.to_datetime(monthly_data["YearMonth"])

# Extract Month
monthly_data["Month"] = monthly_data["YearMonth"].dt.month

# Cyclical encoding
import numpy as np
monthly_data["Month_sin"] = np.sin(2 * np.pi * monthly_data["Month"] / 12)
monthly_data["Month_cos"] = np.cos(2 * np.pi * monthly_data["Month"] / 12)

monthly_data.head()


Unnamed: 0,YearMonth,Solar_Radiation,Temperature,Humidity,Wind_Speed,Month,Month_sin,Month_cos
0,2014-01-01,5.644547,27.443667,65.211,1.585,1,0.5,0.8660254
1,2014-02-01,6.119886,28.527143,64.121429,2.029286,2,0.866025,0.5
2,2014-03-01,6.450461,30.044839,60.388065,1.96871,3,1.0,6.123234000000001e-17
3,2014-04-01,6.001397,29.232667,76.288667,2.338667,4,0.866025,-0.5
4,2014-05-01,5.426194,28.453226,82.331613,2.474839,5,0.5,-0.8660254


In [10]:
# Sort by date
monthly_data = monthly_data.sort_values("YearMonth")

# Train/Test split
train_m = monthly_data[monthly_data["YearMonth"] < "2023-01-01"]
test_m = monthly_data[monthly_data["YearMonth"] >= "2023-01-01"]

print("Train Monthly Shape:", train_m.shape)
print("Test Monthly Shape:", test_m.shape)

train_m.head()


Train Monthly Shape: (108, 8)
Test Monthly Shape: (13, 8)


Unnamed: 0,YearMonth,Solar_Radiation,Temperature,Humidity,Wind_Speed,Month,Month_sin,Month_cos
0,2014-01-01,5.644547,27.443667,65.211,1.585,1,0.5,0.8660254
1,2014-02-01,6.119886,28.527143,64.121429,2.029286,2,0.866025,0.5
2,2014-03-01,6.450461,30.044839,60.388065,1.96871,3,1.0,6.123234000000001e-17
3,2014-04-01,6.001397,29.232667,76.288667,2.338667,4,0.866025,-0.5
4,2014-05-01,5.426194,28.453226,82.331613,2.474839,5,0.5,-0.8660254


In [11]:
# Prepare training data
X_train_m = train_m.drop(columns=["Solar_Radiation", "YearMonth", "Month"])
y_train_m = train_m["Solar_Radiation"]

# Prepare testing data
X_test_m = test_m.drop(columns=["Solar_Radiation", "YearMonth", "Month"])
y_test_m = test_m["Solar_Radiation"]

print("X_train shape:", X_train_m.shape)
print("X_test shape:", X_test_m.shape)

X_train_m.head()


X_train shape: (108, 5)
X_test shape: (13, 5)


Unnamed: 0,Temperature,Humidity,Wind_Speed,Month_sin,Month_cos
0,27.443667,65.211,1.585,0.5,0.8660254
1,28.527143,64.121429,2.029286,0.866025,0.5
2,30.044839,60.388065,1.96871,1.0,6.123234000000001e-17
3,29.232667,76.288667,2.338667,0.866025,-0.5
4,28.453226,82.331613,2.474839,0.5,-0.8660254


In [12]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

# Initialize model
model_m = RandomForestRegressor(
    n_estimators=300,
    max_depth=10,
    random_state=42
)

# Train
model_m.fit(X_train_m, y_train_m)

# Predict
y_pred_m = model_m.predict(X_test_m)

# Evaluate
r2_m = r2_score(y_test_m, y_pred_m)
mae_m = mean_absolute_error(y_test_m, y_pred_m)
rmse_m = np.sqrt(mean_squared_error(y_test_m, y_pred_m))

print("Monthly Model Results")
print("R2 Score:", r2_m)
print("MAE:", mae_m)
print("RMSE:", rmse_m)


Monthly Model Results
R2 Score: 0.5557250353895198
MAE: 0.4212756707571971
RMSE: 0.5654742324429396


In [13]:
# Assumptions
system_size = 5   # kW
PR = 0.8
tariff = 6        # ₹ per kWh
cost_per_kw = 60000

# Add number of days in month
monthly_data["Days"] = monthly_data["YearMonth"].dt.days_in_month

# Calculate Monthly Energy
monthly_data["Monthly_Energy_kWh"] = (
    system_size *
    monthly_data["Solar_Radiation"] *
    monthly_data["Days"] *
    PR
)

# Yearly energy
yearly_energy = monthly_data["Monthly_Energy_kWh"].sum()

# Financial calculations
annual_savings = yearly_energy * tariff
total_system_cost = system_size * cost_per_kw
payback_years = total_system_cost / annual_savings

# CO2 Offset (0.82 kg per kWh)
co2_offset = yearly_energy * 0.82 / 1000  # convert to tons

print("===== SOLAR INTELLIGENCE REPORT =====")
print("System Size:", system_size, "kW")
print("Estimated Yearly Energy (kWh):", round(yearly_energy, 2))
print("Estimated Annual Savings (₹):", round(annual_savings, 2))
print("System Cost (₹):", total_system_cost)
print("Payback Period (Years):", round(payback_years, 2))
print("CO2 Offset (Tons per year):", round(co2_offset, 2))


===== SOLAR INTELLIGENCE REPORT =====
System Size: 5 kW
Estimated Yearly Energy (kWh): 77515.86
Estimated Annual Savings (₹): 465095.15
System Cost (₹): 300000
Payback Period (Years): 0.65
CO2 Offset (Tons per year): 63.56


In [14]:
# Average yearly energy (divide by number of years)
number_of_years = 10
average_yearly_energy = yearly_energy / number_of_years

annual_savings = average_yearly_energy * tariff
payback_years = total_system_cost / annual_savings
co2_offset = average_yearly_energy * 0.82 / 1000

print("===== CORRECTED SOLAR REPORT =====")
print("Average Yearly Energy (kWh):", round(average_yearly_energy, 2))
print("Annual Savings (₹):", round(annual_savings, 2))
print("Payback Period (Years):", round(payback_years, 2))
print("CO2 Offset (Tons per year):", round(co2_offset, 2))


===== CORRECTED SOLAR REPORT =====
Average Yearly Energy (kWh): 7751.59
Annual Savings (₹): 46509.52
Payback Period (Years): 6.45
CO2 Offset (Tons per year): 6.36


In [15]:
import joblib

# Save model
joblib.dump(model_m, "solar_monthly_model.pkl")

# Save feature column order
feature_columns = X_train_m.columns.tolist()
joblib.dump(feature_columns, "model_features.pkl")

print("Model and feature list saved successfully ✅")
print("Features:", feature_columns)


Model and feature list saved successfully ✅
Features: ['Temperature', 'Humidity', 'Wind_Speed', 'Month_sin', 'Month_cos']
