In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
import joblib

In [2]:
df = pd.read_csv("datasets/transformed_data.csv")

In [3]:
df.head()

Unnamed: 0,City,Date,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI
0,Ahmedabad,2015-01-01,67.450578,118.127103,0.92,18.22,17.15,23.483476,0.92,27.64,133.36,0.0,0.02,0.0,166.463581
1,Ahmedabad,2015-01-02,67.450578,118.127103,0.97,15.69,16.46,23.483476,0.97,24.55,34.06,3.68,5.5,3.77,166.463581
2,Ahmedabad,2015-01-03,67.450578,118.127103,17.4,19.3,29.7,23.483476,17.4,29.07,30.7,6.8,16.4,2.25,166.463581
3,Ahmedabad,2015-01-04,67.450578,118.127103,1.7,18.48,17.97,23.483476,1.7,18.59,36.08,4.43,10.14,1.0,166.463581
4,Ahmedabad,2015-01-05,67.450578,118.127103,22.1,21.42,37.76,23.483476,22.1,39.33,39.31,7.01,18.89,2.78,166.463581


In [4]:
# Split the data into features and target variable
X = df.drop(['City', 'Date', 'AQI','Toluene','Xylene','Benzene'], axis=1)  # Features
y = df['AQI']  # Target variable

In [5]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [6]:
# Initialize the Random Forest Regressor
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
rf_regressor.fit(X_train, y_train)

# Make predictions
y_pred = rf_regressor.predict(X_test)

In [7]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 1355.4522595670396


In [8]:
# Calculate the R-squared value
r2 = r2_score(y_test, y_pred)
print("R-squared value:", r2)

R-squared value: 0.8823715314740577


In [9]:
# Define the parameter grid to search
param_grid = {
    'n_estimators': [50, 100, 150, 200],  # Number of trees in the forest
    'max_features': ['auto', 'sqrt','log2'],      # Number of features to consider at every split
    'max_depth': [None, 10, 20, 30],       # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],       # Minimum number of samples required to split a node
    'min_samples_leaf': [1, 2, 4],         # Minimum number of samples required at each leaf node
    'bootstrap': [True, False]             # Method of selecting samples for training each tree
}

In [10]:
# Initialize the Random Forest Regressor
rf_regressor = RandomForestRegressor(random_state=42)

# Randomized Search Cross Validation
rf_random = RandomizedSearchCV(estimator=rf_regressor, param_distributions=param_grid,
                               n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)

# Perform the search
rf_random.fit(X_train, y_train)

# Best parameters found
print("Best parameters found:", rf_random.best_params_)



Fitting 3 folds for each of 100 candidates, totalling 300 fits


99 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
11 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\shrit\anaconda3\envs\mlenv\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\shrit\anaconda3\envs\mlenv\Lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "c:\Users\shrit\anaconda3\envs\mlenv\Lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\shrit\anaconda3\envs\mlenv\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_par

Best parameters found: {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': None, 'bootstrap': False}


In [15]:
# Refit the model with best parameters
best_rf_regressor = rf_random.best_estimator_
best_rf_regressor.fit(X_train, y_train)

# Make predictions
y_pred = best_rf_regressor.predict(X_test)

In [16]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 1296.123453631719


In [18]:
# Calculate the R-squared value
r2 = r2_score(y_test, y_pred)
print("R-squared value:", r2)

R-squared value: 0.8875201868637161


In [19]:
joblib.dump(best_rf_regressor, 'random_forest_reg.pkl')

['random_forest_reg.pkl']