<h3>Data Preparation</h3>

In [9]:
import os
from pathlib import Path

import numpy as np
import pandas as pd

# Load and clean data
#
# Location of the raw CSV. Set the SOLAR_DATA_CSV environment variable to point
# at a local copy of the file; when it is unset, the path this notebook was
# originally run with is used, so existing setups keep working unchanged.
DATA_CSV = Path(
    os.environ.get(
        "SOLAR_DATA_CSV",
        r"C:\Users\HP\OneDrive\Desktop\Project3\Solar_Panel_Regression-\solarpowergeneration.csv",
    )
)

# Rename columns for better readability: hyphenated / parenthesised CSV headers
# become snake_case names. Columns not listed here keep their original name.
COLUMN_RENAMES = {
    'distance-to-solar-noon': 'distance_to_solar_noon',
    'wind-direction': 'wind_direction',
    'wind-speed': 'wind_speed',
    'sky-cover': 'sky_cover',
    'average-wind-speed-(period)': 'average_wind_speed',
    'average-pressure-(period)': 'average_pressure',
    'power-generated': 'power_generated'
}

# One chain instead of in-place mutation, so re-running the cell always rebuilds
# `df` from the file. Infinite values are converted to NaN first, which lets the
# final dropna() remove every row holding either a missing or an infinite value.
df = (
    pd.read_csv(DATA_CSV)
    .rename(columns=COLUMN_RENAMES)
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)


<h3>Feature Engineering</h3>

In [12]:
# Derive two extra predictors from the existing weather columns:
#   wind_power          - wind_speed squared
#   temp_humidity_ratio - temperature divided by (humidity + 1)
#                         (the +1 presumably guards against a zero denominator)
df = df.assign(
    wind_power=lambda frame: frame["wind_speed"] ** 2,
    temp_humidity_ratio=lambda frame: frame["temperature"] / (frame["humidity"] + 1),
)


In [38]:
# Render the frame as the cell output; the two derived columns
# (wind_power, temp_humidity_ratio) show up as the right-most columns.
df

Unnamed: 0,distance_to_solar_noon,temperature,wind_direction,wind_speed,sky_cover,visibility,humidity,average_wind_speed,average_pressure,power_generated,wind_power,temp_humidity_ratio
0,0.859897,69,28,7.5,0,10.0,75,8.0,29.82,0,56.25,0.907895
1,0.628535,69,28,7.5,0,10.0,77,5.0,29.85,0,56.25,0.884615
2,0.397172,69,28,7.5,0,10.0,70,0.0,29.89,5418,56.25,0.971831
3,0.165810,69,28,7.5,0,10.0,33,0.0,29.91,25477,56.25,2.029412
4,0.065553,69,28,7.5,0,10.0,21,3.0,29.89,30069,56.25,3.136364
...,...,...,...,...,...,...,...,...,...,...,...,...
2915,0.166453,63,27,13.9,4,10.0,75,10.0,29.93,6995,193.21,0.828947
2916,0.064020,63,27,13.9,1,10.0,66,15.0,29.91,29490,193.21,0.940299
2917,0.294494,63,27,13.9,2,10.0,68,21.0,29.88,17257,193.21,0.913043
2918,0.524968,63,27,13.9,2,10.0,81,17.0,29.87,677,193.21,0.768293


<h3>Define Features and Target</h3>

In [15]:
# Target is the generated power; every other column is used as a predictor.
y = df["power_generated"]
X = df.drop(columns=["power_generated"])


<h3>Train-Test Split</h3>

In [18]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Show the shape of each piece as the cell output.
tuple(part.shape for part in split_parts)

((2335, 11), (584, 11), (2335,), (584,))

<h3>Feature Scaling</h3>

In [20]:
from sklearn.preprocessing import StandardScaler

# Standardize the features. The scaler is fitted on the training split only and
# then applied unchanged to the test split, so no test-set statistics are used
# when computing the scaling parameters.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# A cell renders only its final expression, so the former bare `X_train_scaled`
# line displayed nothing and has been dropped. The scaled test matrix is shown.
X_test_scaled

array([[-0.14980549,  0.8177946 ,  0.5937747 , ..., -0.96858918,
         0.23937467,  0.01644763],
       [ 0.53265376,  0.52530009,  0.30716717, ..., -0.26562218,
         1.12963648, -0.21071986],
       [ 0.84827168,  1.11028911,  0.73707846, ..., -2.02303969,
         0.06702505,  0.02844278],
       ...,
       [ 0.5592057 ,  0.8177946 ,  0.16386341, ..., -0.40621558,
        -0.55077229, -0.07829891],
       [-1.23963227, -0.35218343, -1.69908553, ...,  1.07001513,
        -1.0374065 ,  0.71216657],
       [ 0.93012183,  0.52530009,  0.73707846, ...,  0.71853163,
        -0.88533331, -0.391115  ]])

<h3>Train All Models</h3>

In [24]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error

# Candidate regressors, keyed by the display name used in the printed report
# and in the results table. Seeds are fixed wherever the estimator accepts one.
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso(),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "SVR": SVR(),
    "XGBoost": XGBRegressor(objective='reg:squarederror', random_state=42),
}

# One (name, R2, RMSE) tuple per model, in the order the models are evaluated.
results = []

for name, model in models.items():
    print(f"\nModel: {name}")

    # SVR is the only candidate trained on the standardized matrices; every
    # other model sees the raw feature frames.
    # NOTE(review): the target is never scaled for SVR, which is a likely cause
    # of its poor score -- consider wrapping it in a target transformer. TODO confirm.
    if name != "SVR":
        fit_inputs, eval_inputs = X_train, X_test
    else:
        fit_inputs, eval_inputs = X_train_scaled, X_test_scaled

    model.fit(fit_inputs, y_train)
    y_pred = model.predict(eval_inputs)

    # Score against the held-out test split.
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    print(f"R2 Score: {r2:.4f}, RMSE: {rmse:.2f}")
    results.append((name, r2, rmse))



Model: Linear Regression
R2 Score: 0.6350, RMSE: 6336.83

Model: Ridge Regression
R2 Score: 0.6345, RMSE: 6341.16

Model: Lasso Regression
R2 Score: 0.6349, RMSE: 6337.93

Model: Random Forest
R2 Score: 0.9054, RMSE: 3225.33

Model: Gradient Boosting
R2 Score: 0.9188, RMSE: 2989.59

Model: SVR
R2 Score: -0.3983, RMSE: 12403.18

Model: XGBoost
R2 Score: 0.9052, RMSE: 3229.84


<h3>Hyperparameter Tuning</h3>

In [32]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Preprocessing + model bundled into one estimator: median imputation, then
# standardization, then a seeded random forest.
pipeline_steps = [
    ("impute", SimpleImputer(strategy="median")),
    ("scale", StandardScaler()),
    ("model", RandomForestRegressor(random_state=42)),
]
pipe = Pipeline(pipeline_steps)

# Search space for the forest; the "model__" prefix routes each parameter to
# the pipeline step named "model".
param_grid = {
    "model__n_estimators": [100, 200],
    "model__max_depth": [None, 10, 20],
}

# Exhaustive search over the grid, scored by R2 with 5-fold cross-validation
# on the training split.
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, scoring="r2", cv=5)
grid.fit(X_train, y_train)

# Keep the best pipeline found by the search for later use.
best_pipe = grid.best_estimator_

print("Best params:", grid.best_params_)
print("CV R2 score:", grid.best_score_)


Best params: {'model__max_depth': 20, 'model__n_estimators': 200}
CV R2 score: 0.9052441266869689


<h3>Compare All Models</h3>

In [27]:
# Tabulate the per-model scores and rank them, highest R2 first.
results_df = (
    pd.DataFrame(results, columns=["Model", "R2 Score", "RMSE"])
    .sort_values(by="R2 Score", ascending=False)
)
print("\n Model Comparison:\n", results_df)



 Model Comparison:
                Model  R2 Score          RMSE
4  Gradient Boosting  0.918760   2989.589613
3      Random Forest  0.905443   3225.334357
6            XGBoost  0.905178   3229.841319
0  Linear Regression  0.635003   6336.825973
2   Lasso Regression  0.634875   6337.933047
1   Ridge Regression  0.634504   6341.158981
5                SVR -0.398338  12403.184211


<h3>Select and Save the Best Model</h3>

In [30]:
import pickle

# First row of results_df is taken as the winner; look its estimator up by name.
best_model_name = results_df.iloc[0]["Model"]
best_model = models[best_model_name]

# SVR is the one model trained on standardized features, so it must be refit on
# the scaled matrix and saved together with the fitted scaler. Every other
# model uses the raw features and stores None in the scaler slot.
needs_scaler = best_model_name == "SVR"

# Refit on the training split before saving.
best_model.fit(X_train_scaled if needs_scaler else X_train, y_train)

# Saved payload: (fitted model, scaler or None, ordered feature names).
artifact = (best_model, scaler if needs_scaler else None, X.columns.tolist())

# Use a context manager so the file handle is always flushed and closed; the
# previous inline open(...) left the handle open.
with open("best_model.pkl", "wb") as model_file:
    pickle.dump(artifact, model_file)

print(f"\n Best model is '{best_model_name}' and saved as 'best_model.pkl'")



 Best model is 'Gradient Boosting' and saved as 'best_model.pkl'


<h3>Saving the Tuned Pipeline for Deployment</h3>

In [36]:
import joblib

# Persist the tuned pipeline (preprocessing steps and model together) as a
# single compressed joblib file.
pipeline_file = "best_pipeline.joblib"
joblib.dump(value=best_pipe, filename=pipeline_file, compress=3)
print(" Best pipeline saved as 'best_pipeline.joblib'")


 Best pipeline saved as 'best_pipeline.joblib'
