In [45]:
import pandas as pd 
import numpy as np 


In [46]:
# Read the preprocessed solar dataset from the working directory.
df = pd.read_csv("Solar_Preprocessed.csv")

# Plain-text preview of the first five rows.
print(df.head(n=5))
 

         Date                        State  Latitude  Longitude  CLOUD_AMT  \
0  01-01-2015  Andaman and Nicobar Islands   11.6234    92.7265      47.43   
1  02-01-2015  Andaman and Nicobar Islands   11.6234    92.7265      34.39   
2  03-01-2015  Andaman and Nicobar Islands   11.6234    92.7265      26.37   
3  04-01-2015  Andaman and Nicobar Islands   11.6234    92.7265      28.68   
4  05-01-2015  Andaman and Nicobar Islands   11.6234    92.7265      14.47   

   AOD_55    PW   QV2M    T2M   RH2M  WS50M      PS  TOA_SW_DWN  Month  \
0    0.20  5.29  18.15  27.87  77.74   9.18  101.09      8.4209      1   
1    0.23  5.00  18.01  27.59  78.50   7.11  101.22      8.4300      1   
2    0.27  3.92  16.08  27.48  70.64   5.16  101.24      8.4408      1   
3    0.30  3.07  15.72  26.89  71.45   3.84  101.12      8.4538      1   
4    0.21  3.02  14.94  27.03  67.34   2.81  101.07      8.4684      1   

   Season  Zenith_Angle  DayLength_hours  Solar_Potential_GHI  
0  Winter     34.63503

In [47]:
# Remove the 'Date' and 'State' columns so they are not carried into the features.
# NOTE: this rebinds `df`, so running the cell a second time without reloading
# the CSV raises KeyError because the columns are already gone.
df = df.drop(['Date', 'State'], axis=1)

In [48]:
# Target: the 'Solar_Potential_GHI' column.
y = df['Solar_Potential_GHI']
# Features: every remaining column of the frame.
X = df.drop('Solar_Potential_GHI', axis=1)


In [49]:
# Split the feature columns by dtype so each group can get its own preprocessing.
# 'number' matches every numeric dtype (e.g. int32, float32, nullable integers),
# not only int64/float64.
numeric_cols = X.select_dtypes(include='number').columns.tolist()
# Every non-numeric column is treated as categorical. Taking the complement
# guarantees each column lands in exactly one of the two lists; a column that
# appeared in neither would be silently discarded by a ColumnTransformer whose
# remainder is 'drop'.
categorical_cols = X.select_dtypes(exclude='number').columns.tolist()


In [50]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Numeric columns: fill missing values with the column mean, then standardize.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. Categories not seen during fit are ignored rather than
# raising an error (handle_unknown='ignore').
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its transformer. Columns listed in neither group
# are not passed through (no `remainder` is configured).
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols),
])


In [51]:
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

# Base learners for the stack: one linear model and two tree ensembles,
# each registered under a short name.
base_models = [
    ('ridge', Ridge(alpha=1.0)),
    ('rf', RandomForestRegressor(random_state=42, n_estimators=200)),
    ('xgb', XGBRegressor(
        n_estimators=300,
        max_depth=6,
        learning_rate=0.05,
        random_state=42,
    )),
]

# A default Ridge regressor combines the base learners' predictions.
stack_model = StackingRegressor(base_models, final_estimator=Ridge())



In [52]:
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, VotingRegressor
from xgboost import XGBRegressor
 
# Three single-threaded regressors sharing the same seed.
# BaggingRegressor is given no base estimator, so the library default is used.
bag = BaggingRegressor(n_estimators=50, n_jobs=1, random_state=42)
rf = RandomForestRegressor(n_estimators=200, n_jobs=1, random_state=42)
xgb = XGBRegressor(n_estimators=200, n_jobs=1, random_state=42, verbosity=0)

# No weights are supplied, so every member contributes equally to the
# averaged prediction.
voting_model = VotingRegressor(
    [('bag', bag), ('rf', rf), ('xgb', xgb)],
    n_jobs=1,
)


In [53]:
from sklearn.base import clone

# Each pipeline gets its own unfitted copy of the preprocessing step.
# Pipeline.fit fits its steps in place, so handing the very same
# ColumnTransformer object to both pipelines would let fitting one pipeline
# overwrite the fitted preprocessing state the other one relies on.
# After fitting, the fitted transformer is available through
# `<pipeline>.named_steps['preprocess']`.

# Preprocessing followed by the voting ensemble.
model_pipeline = Pipeline(steps=[
    ('preprocess', clone(preprocessor)),
    ('model', voting_model)
])

# Preprocessing followed by the stacking ensemble.
model_2_pipeline = Pipeline(steps=[
    ('preprocess', clone(preprocessor)),
    ('model', stack_model)
])

In [54]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)


In [55]:

# Fit the stacking pipeline on the training split and report its score on the
# held-out split. fit() returns the fitted pipeline, so the calls are chained.
print("Stacking Model R²:", model_2_pipeline.fit(X_train, y_train).score(X_test, y_test))

Stacking Model R²: 0.8861104801495117


In [56]:
from sklearn.metrics import r2_score, mean_absolute_error
# Train the voting pipeline and predict the held-out rows in one chained call.
y_pred = model_pipeline.fit(X_train, y_train).predict(X_test)

# Report R² and mean absolute error on the held-out split.
print("Voting Ensemble R²:", r2_score(y_test, y_pred))
print("MAE:", mean_absolute_error(y_test, y_pred))


Voting Ensemble R²: 0.8880624160857005
MAE: 0.3292603176470692


# With Hyperparameter Tuning

In [61]:
# Candidate values for the randomized search. Each key is a parameter path:
# pipeline step 'model' -> named member of the voting ensemble -> parameter.
# The order of values inside each list is kept as-is because seeded sampling
# picks values by position.
param_distributions = {
    # Random forest member
    'model__rf__n_estimators': [80, 110],
    'model__rf__max_depth': [10, 16, 15, None],

    # XGBoost member
    'model__xgb__n_estimators': [100, 200, 300],
    'model__xgb__max_depth': [5, 7, 10],
    'model__xgb__learning_rate': [0.1, 0.05, 0.03],
    'model__xgb__subsample': [0.8, 1.0],

    # Bagging member
    'model__bag__n_estimators': [30, 40],
}


In [62]:
from sklearn.model_selection import RandomizedSearchCV
# Randomized search over the voting pipeline: 20 sampled parameter settings,
# each evaluated with 3-fold cross-validation and scored by R², run on a
# single worker with a fixed seed and per-fit progress output.
random_search = RandomizedSearchCV(
    model_pipeline,
    param_distributions,
    n_iter=20,
    scoring='r2',
    cv=3,
    n_jobs=1,
    random_state=42,
    verbose=2,
)

In [64]:
# Run the search on the training split only (20 candidates x 3 folds);
# the held-out split is not touched here.
random_search.fit(X=X_train, y=y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] END model__bag__n_estimators=30, model__rf__max_depth=16, model__rf__n_estimators=110, model__xgb__learning_rate=0.03, model__xgb__max_depth=5, model__xgb__n_estimators=100, model__xgb__subsample=0.8; total time= 1.4min
[CV] END model__bag__n_estimators=30, model__rf__max_depth=16, model__rf__n_estimators=110, model__xgb__learning_rate=0.03, model__xgb__max_depth=5, model__xgb__n_estimators=100, model__xgb__subsample=0.8; total time= 1.4min
[CV] END model__bag__n_estimators=30, model__rf__max_depth=16, model__rf__n_estimators=110, model__xgb__learning_rate=0.03, model__xgb__max_depth=5, model__xgb__n_estimators=100, model__xgb__subsample=0.8; total time= 1.4min
[CV] END model__bag__n_estimators=40, model__rf__max_depth=15, model__rf__n_estimators=110, model__xgb__learning_rate=0.05, model__xgb__max_depth=5, model__xgb__n_estimators=100, model__xgb__subsample=0.8; total time= 1.4min
[CV] END model__bag__n_estimators=40, m

0,1,2
,estimator,Pipeline(step... n_jobs=1))])
,param_distributions,"{'model__bag__n_estimators': [30, 40], 'model__rf__max_depth': [10, 16, ...], 'model__rf__n_estimators': [80, 110], 'model__xgb__learning_rate': [0.1, 0.05, ...], ...}"
,n_iter,20
,scoring,'r2'
,n_jobs,1
,refit,True
,cv,3
,verbose,2
,pre_dispatch,'2*n_jobs'
,random_state,42

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,estimators,"[('bag', ...), ('rf', ...), ...]"
,weights,
,n_jobs,1
,verbose,False

0,1,2
,estimator,
,n_estimators,30
,max_samples,1.0
,max_features,1.0
,bootstrap,True
,bootstrap_features,False
,oob_score,False
,warm_start,False
,n_jobs,1
,random_state,42

0,1,2
,n_estimators,110
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [65]:
# Winning hyperparameter combination found by the search.
print("Best Parameters:", random_search.best_params_)
# best_score_ is the mean cross-validated R² over the training folds. It is not
# measured on the same data as the held-out R² printed for the untuned models.
print("Best Score:", random_search.best_score_)
# Score the refitted best pipeline on the held-out split so the tuned model
# can be compared like-for-like with the untuned results.
print("Tuned Voting Ensemble Test R²:", random_search.score(X_test, y_test))


Best Parameters: {'model__xgb__subsample': 1.0, 'model__xgb__n_estimators': 200, 'model__xgb__max_depth': 10, 'model__xgb__learning_rate': 0.1, 'model__rf__n_estimators': 110, 'model__rf__max_depth': None, 'model__bag__n_estimators': 30}
Best Score: 0.8774876295236584
