In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import plotly.express as px

import warnings
# NOTE(review): this silences *every* warning for the rest of the session,
# including deprecation notices raised by the libraries imported above.
warnings.filterwarnings("ignore")


In [2]:
# Read the dataset from a path relative to the notebook's working directory.
file_path = 'processed_dataset.csv'
df = pd.read_csv(file_path)

In [3]:
# Separate the regression target from the feature matrix.
y = df['Recycling Rate (%)']
X = df.drop('Recycling Rate (%)', axis=1)

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, r2_score,root_mean_squared_error


In [14]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [15]:
# Display the training feature matrix (the notebook renders a truncated preview).
X_train

Unnamed: 0,Waste Type,Waste Generated (Tons/Day),Population Density (People/km²),Municipal Efficiency Score (1-10),Disposal Method,Cost of Waste Management (₹/Ton),Awareness Campaigns Count,Landfill Capacity (Tons),Year,city
332,3,8893,16101,9,1,2435,5,67316,2020,14
383,4,917,18418,7,2,1559,14,30373,2020,16
281,2,5690,18697,7,2,916,14,22690,2020,12
2,3,8162,11191,8,2,3390,13,45575,2019,1
231,2,974,23465,10,3,1791,2,80251,2020,10
...,...,...,...,...,...,...,...,...,...,...
71,2,7135,18195,5,2,3919,17,52609,2023,3
106,2,6472,17335,5,3,3328,14,80985,2020,5
270,1,7045,24032,7,2,4404,13,38471,2023,11
435,1,7724,21598,8,1,509,11,24927,2021,18


In [16]:
# (rows, columns) of the training features.
X_train.shape

(680, 10)

In [17]:
# (rows, columns) of the held-out test features.
X_test.shape

(170, 10)

In [21]:
# Candidate baseline regressors, keyed by the label printed in the comparison.
# The tree-based estimators are seeded with the same value (42) used for the
# train/test split and the boosted models, so their scores are reproducible
# after a kernel restart instead of changing on every run.
model = {
    "LinearRegression": LinearRegression(),
    "RandomForestRegressor": RandomForestRegressor(random_state=42),
    "Tree": DecisionTreeRegressor(random_state=42),
}

In [22]:
# Fit every candidate in the `model` dict and print its hold-out metrics so the
# candidates can be compared side by side.
# The loop variable is deliberately NOT called `model`: reusing that name would
# rebind the dict to the last estimator, and re-running this cell would then
# fail with AttributeError (an estimator has no `.items()`).
for name, estimator in model.items():
    estimator.fit(X_train, y_train)
    y_pred = estimator.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred) * 100  # R2 reported as a percentage
    root_mse = np.sqrt(mse)

    print(f"{name} - MSE: {mse}, R2: {r2} , Root MSE: {root_mse}")

LinearRegression - MSE: 277.79130538548606, R2: -2.1241026725161705 , Root MSE: 16.66707248995714
RandomForestRegressor - MSE: 302.4142929411765, R2: -11.176223673034702 , Root MSE: 17.390063051673405
Tree - MSE: 575.4529411764706, R2: -111.5531123854165 , Root MSE: 23.98860023378752


In [23]:
# XGBoost regressor with a fixed seed.
mod = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
)

In [25]:
# Train the XGBoost model, then score its predictions on the hold-out split.
mod.fit(X_train, y_train)
y_pred = mod.predict(X_test)

# R2 is scaled to a percentage; RMSE is the square root of the MSE.
r2 = 100 * r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
root_mse = np.sqrt(mse)

In [26]:
# Show (MSE, R2 as a percentage, RMSE) for the XGBoost model.
# A negative R2 means the model does worse than a constant prediction of the
# mean target.
mse, r2, root_mse

(350.62896728515625, -28.901326656341553, np.float64(18.72508924638695))

In [27]:
# CatBoost regressor using the same iteration count, learning rate and seed as
# the XGBoost run.
cat_mod = CatBoostRegressor(
    iterations=100,
    learning_rate=0.1,
    random_state=42,
    verbose=0,
)
cat_mod.fit(X_train, y_train)
y_pred = cat_mod.predict(X_test)

# R2 is scaled to a percentage; RMSE is the square root of the MSE.
r2 = 100 * r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
root_mse = np.sqrt(mse)

In [28]:
# Show (MSE, R2 as a percentage, RMSE) for the CatBoost model.
# A negative R2 means the model does worse than a constant prediction of the
# mean target.
mse, r2, root_mse

(297.53285871589463, -9.381667542776984, np.float64(17.2491408109475))

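# Based on the test-set MSE, CatBoost (≈297.5) is the best of the tree-based/boosted models, so we tune it next; note that LinearRegression achieved a slightly lower MSE (≈277.8) and every model has a negative R².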

# Next, tune the CatBoost regressor's hyperparameters with GridSearchCV.

In [29]:
# Hyperparameter grid for the search: 2 * 3 * 3 * 3 = 54 combinations.
params = dict(
    iterations=[100, 200],
    learning_rate=[0.01, 0.1, 0.2],
    depth=[4, 6, 8],
    l2_leaf_reg=[1, 3, 5],
)

In [30]:
# Exhaustive 3-fold grid search over `params`; candidates are ranked by
# negative MSE, so a higher score means a lower error.
hy_model = GridSearchCV(
    estimator=cat_mod,
    param_grid=params,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=1,
)
hy_model.fit(X_train, y_train)

Fitting 3 folds for each of 54 candidates, totalling 162 fits


In [31]:
# Keep the parameter combination that scored best in the grid search and
# display it.
best_params = hy_model.best_params_
best_params

{'depth': 4, 'iterations': 100, 'l2_leaf_reg': 5, 'learning_rate': 0.01}