<a href="https://www.kaggle.com/code/vijayshindedsml/cleanest-route-random-forest?scriptVersionId=153296599" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Import Neccessary libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)





# Load the data

In [2]:

data = pd.read_csv('/kaggle/input/10times-clean-route/clean_route.csv')


# Data info

In [3]:
data.shape

(67000, 10)

In [4]:
data.head()


Unnamed: 0,Traffic,AQI_Score,Green_Cover,Total_Distance,Safety_Rating,Time_Taken,Route_Name,Temperature_Celsius,Atmosphere,Cleanliness_Score
0,4,17.796368,0.666718,22.242334,4.0,57.7444,Route B,24.02964,Rainy,5.516678
1,5,33.030025,0.452534,6.570725,4.0,80.473382,Route A,27.819676,Rainy,5.371988
2,3,16.110968,0.322666,20.689917,,53.483679,Route A,27.68856,,0.0
3,5,78.150916,0.474338,11.50788,1.0,47.39559,Route C,26.816281,Rainy,5.317906
4,5,26.228707,0.755373,8.694903,2.0,96.196222,Route A,23.349831,Rainy,5.867607


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67000 entries, 0 to 66999
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Traffic              67000 non-null  int64  
 1   AQI_Score            67000 non-null  float64
 2   Green_Cover          67000 non-null  float64
 3   Total_Distance       67000 non-null  float64
 4   Safety_Rating        60300 non-null  float64
 5   Time_Taken           67000 non-null  float64
 6   Route_Name           67000 non-null  object 
 7   Temperature_Celsius  67000 non-null  float64
 8   Atmosphere           60300 non-null  object 
 9   Cleanliness_Score    67000 non-null  float64
dtypes: float64(7), int64(1), object(2)
memory usage: 5.1+ MB


# Check for Null Values

In [6]:
data.isnull().sum()

Traffic                   0
AQI_Score                 0
Green_Cover               0
Total_Distance            0
Safety_Rating          6700
Time_Taken                0
Route_Name                0
Temperature_Celsius       0
Atmosphere             6700
Cleanliness_Score         0
dtype: int64

In [7]:
# Seperating Cat and Num cols ,Handle Null Values and encoding
from sklearn.impute import SimpleImputer 
cat_cols=list(data.select_dtypes(include=['object']).columns)
num_cols=list(data.select_dtypes(exclude=['object']).columns)

num_imputer=SimpleImputer(strategy='median')
cat_imputer=SimpleImputer(strategy='most_frequent')

num_imputer.fit(data[num_cols])           
cat_imputer.fit(data[cat_cols])


data[num_cols]=num_imputer.transform(data[num_cols])              
data[cat_cols]=cat_imputer.transform(data[cat_cols]) 

# Detecting outliers in data using Zscore

In [8]:
from scipy.stats import zscore

z_scores = zscore(data[num_cols])

threshold = 3 # Set a Z-score threshold (e.g., 3)
outliers = (z_scores > threshold).any(axis=1)

# Display rows with outliers
print("Rows with outliers:")
print(data[outliers])


Rows with outliers:
       Traffic  AQI_Score  Green_Cover  Total_Distance  Safety_Rating  \
208        4.0  20.644610     0.771775       27.631479            2.0   
361        4.0  63.549138     0.068679       18.379211            3.0   
769        1.0  82.975559     0.821536       10.286571            5.0   
1121       1.0  66.151106     0.550558       23.539966            4.0   
4122       3.0  75.561721     0.414486       19.639644            3.0   
...        ...        ...          ...             ...            ...   
61189      2.0  86.419310     0.134485       24.474220            3.0   
61236      5.0  49.712874     0.511238        9.685811            4.0   
62411      1.0  69.892814     0.052071       23.383466            1.0   
63450      5.0  52.556139     0.939395       14.393265            3.0   
66884      3.0  83.901291     0.801581       22.398075            1.0   

       Time_Taken Route_Name  Temperature_Celsius Atmosphere  \
208     56.118249    Route C           

# Removing outliers 

In [9]:
# Remove rows with outliers
df= data[~outliers].copy()

# Seperating Categorical and Numerical coloumns ,Handle Null Values 

In [10]:
# Seperating Cat and Num cols ,Handle Null Values and encoding
from sklearn.impute import SimpleImputer 
cat_cols=list(df.select_dtypes(include=['object']).columns)
num_cols=list(df.select_dtypes(exclude=['object']).columns)

num_imputer=SimpleImputer(strategy='median')
cat_imputer=SimpleImputer(strategy='most_frequent')

num_imputer.fit(df[num_cols])           
cat_imputer.fit(df[cat_cols])


df[num_cols]=num_imputer.transform(df[num_cols])              
df[cat_cols]=cat_imputer.transform(df[cat_cols]) 

# Removing 'Total_Distance' and 'Time_Taken' features

In [11]:
num_cols.remove('Total_Distance')
num_cols.remove('Time_Taken')

In [12]:
num_cols

['Traffic',
 'AQI_Score',
 'Green_Cover',
 'Safety_Rating',
 'Temperature_Celsius',
 'Cleanliness_Score']

In [13]:
cat_cols

['Route_Name', 'Atmosphere']

# Feature Encoding

In [14]:
# One Hot Encoding
ohe=OneHotEncoder(sparse=False)
ohe.fit(df[cat_cols])

# print new features created by ohe

In [15]:
new_cols=list(ohe.get_feature_names_out())
new_cols

['Route_Name_Route A',
 'Route_Name_Route B',
 'Route_Name_Route C',
 'Atmosphere_Cloudy',
 'Atmosphere_Rainy',
 'Atmosphere_Sunny']

In [16]:
ohe.transform(df[cat_cols])

array([[0., 1., 0., 0., 1., 0.],
       [1., 0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0., 1.],
       ...,
       [1., 0., 0., 1., 0., 0.],
       [1., 0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0., 1.]])

# adding new features to df

In [17]:
df[new_cols]=ohe.transform(df[cat_cols])

In [18]:
df

Unnamed: 0,Traffic,AQI_Score,Green_Cover,Total_Distance,Safety_Rating,Time_Taken,Route_Name,Temperature_Celsius,Atmosphere,Cleanliness_Score,Route_Name_Route A,Route_Name_Route B,Route_Name_Route C,Atmosphere_Cloudy,Atmosphere_Rainy,Atmosphere_Sunny
0,4.0,17.796368,0.666718,22.242334,4.0,57.744400,Route B,24.029640,Rainy,5.516678,0.0,1.0,0.0,0.0,1.0,0.0
1,5.0,33.030025,0.452534,6.570725,4.0,80.473382,Route A,27.819676,Rainy,5.371988,1.0,0.0,0.0,0.0,1.0,0.0
2,3.0,16.110968,0.322666,20.689917,3.0,53.483679,Route A,27.688560,Sunny,0.000000,1.0,0.0,0.0,0.0,0.0,1.0
3,5.0,78.150916,0.474338,11.507880,1.0,47.395590,Route C,26.816281,Rainy,5.317906,0.0,0.0,1.0,0.0,1.0,0.0
4,5.0,26.228707,0.755373,8.694903,2.0,96.196222,Route A,23.349831,Rainy,5.867607,1.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66995,3.0,89.351109,0.401308,9.439433,1.0,117.215559,Route C,15.455682,Sunny,3.858451,0.0,0.0,1.0,0.0,0.0,1.0
66996,3.0,50.706574,0.081055,5.561437,5.0,58.539711,Route B,24.031575,Rainy,4.048277,0.0,1.0,0.0,0.0,1.0,0.0
66997,5.0,45.037120,0.111728,26.167179,3.0,105.363259,Route A,24.228873,Cloudy,4.504231,1.0,0.0,0.0,1.0,0.0,0.0
66998,3.0,28.555282,0.351152,23.505227,2.0,27.864247,Route A,29.998717,Cloudy,5.508407,1.0,0.0,0.0,1.0,0.0,0.0


In [19]:
df[num_cols+new_cols]

Unnamed: 0,Traffic,AQI_Score,Green_Cover,Safety_Rating,Temperature_Celsius,Cleanliness_Score,Route_Name_Route A,Route_Name_Route B,Route_Name_Route C,Atmosphere_Cloudy,Atmosphere_Rainy,Atmosphere_Sunny
0,4.0,17.796368,0.666718,4.0,24.029640,5.516678,0.0,1.0,0.0,0.0,1.0,0.0
1,5.0,33.030025,0.452534,4.0,27.819676,5.371988,1.0,0.0,0.0,0.0,1.0,0.0
2,3.0,16.110968,0.322666,3.0,27.688560,0.000000,1.0,0.0,0.0,0.0,0.0,1.0
3,5.0,78.150916,0.474338,1.0,26.816281,5.317906,0.0,0.0,1.0,0.0,1.0,0.0
4,5.0,26.228707,0.755373,2.0,23.349831,5.867607,1.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
66995,3.0,89.351109,0.401308,1.0,15.455682,3.858451,0.0,0.0,1.0,0.0,0.0,1.0
66996,3.0,50.706574,0.081055,5.0,24.031575,4.048277,0.0,1.0,0.0,0.0,1.0,0.0
66997,5.0,45.037120,0.111728,3.0,24.228873,4.504231,1.0,0.0,0.0,1.0,0.0,0.0
66998,3.0,28.555282,0.351152,2.0,29.998717,5.508407,1.0,0.0,0.0,1.0,0.0,0.0


# Separate features and target variable

In [20]:

X = df[num_cols+new_cols].drop('Cleanliness_Score', axis=1)
y = df[num_cols+new_cols]['Cleanliness_Score']


# Split the data into training and testing sets putting 20% data for testing

In [21]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=17)


# Random Forest Regressor(Without HyperParameter Tunning)

In [22]:
model_rf=RandomForestRegressor()
model_rf.fit(X_train,y_train)

# making predictions on test set

In [23]:
y_pred=model_rf.predict(X_test)
y_pred

array([6.26931649, 2.51586049, 4.78142779, ..., 5.44878352, 1.19350315,
       4.59059378])

# Evaluate the model

In [24]:

mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error on Test Set:",mse)

Mean Squared Error on Test Set: 2.0177905637949127


Here without hyperparameter tunning we are getting mse as 2.0166, we need to reduce it using hyperparameter tunning

# RandomForestRegressor(With hyperparameter tunning)

In [25]:
from sklearn.model_selection import RandomizedSearchCV
params = {'n_estimators':[25,50,75,100,125,150,175,200],
          'max_depth':[1,2,3,4,5],
          'min_samples_leaf':[2,4,6,8]}
random_cv = RandomizedSearchCV(RandomForestRegressor(), params, cv=5)
random_cv.fit(X_train,y_train)

# lets find out best estimator

In [26]:
random_cv.best_estimator_

# making final model with these best parameters

In [27]:
final_model=RandomForestRegressor(max_depth= 5, min_samples_leaf= 2, n_estimators= 75)

# Fitting our final model

In [28]:
final_model.fit(X_train,y_train)

# Making Predictions 

In [29]:
ypred=final_model.predict(X_test)

In [30]:
ypred

array([6.03146441, 1.84823602, 5.10420415, ..., 5.4289694 , 1.96052915,
       4.80434521])

# Lets find mse

In [31]:
mse = mean_squared_error(y_test, ypred)
print("Mean Squared Error on Test Set:",mse)

Mean Squared Error on Test Set: 1.880815094597999


After fine tunning model we got mse as 1.8812

# Lets save our trained model

In [32]:
import pickle
pickle.dump(final_model,open('model_rf.pkl','wb'))

# Prediction on own data.
we load the saved model and make prediction
lets take a query point q 
for our problem statement , we can identify cleanest running route by using Cleanliness_Score,
by setting some threshold as 8.0 (because Cleanliness_Score range is between 0.0-10.0),so the route
whose Cleanliness_Score is greater than or equal to 8 is said to be Cleanest Running Route 
otherwise not(it my assumption, you can change threshold value to 6.0 ,7.0 or 9.0 as per your 
             requirement)

# Load saved model

In [33]:
pickle.load(open('model_rf.pkl','rb'))

In [34]:
q=[5.0,63.90,0.54,2.0,25.56,1.000,0.0,0.0,0.0,0.0,1.0]
if model_rf.predict([q])>=8.0:
    print("Route is Cleanest")
else:
    print("Route is not clean")

Route is not clean


