In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from sklearn.metrics import mean_squared_error, r2_score


In [2]:
# Location of the processed CSV, relative to the notebook's working directory.
DATA_PATH = "../data/processed/cleaned_data.csv"

df = pd.read_csv(DATA_PATH)

# Show the first rows as a quick sanity check of the loaded columns.
df.head()


Unnamed: 0,City,Datetime,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI
0,Amaravati,2017-11-25 09:00:00,104.0,148.5,1.93,23.0,13.75,9.8,0.1,15.3,117.62,0.3,10.4,0.23,155.0
1,Amaravati,2017-11-25 10:00:00,94.5,142.0,1.33,16.25,9.75,9.65,0.1,17.0,136.23,0.28,7.1,0.15,159.0
2,Amaravati,2017-11-25 11:00:00,82.75,126.5,1.47,14.83,9.07,9.7,0.1,15.4,149.92,0.2,4.55,0.08,173.0
3,Amaravati,2017-11-25 14:00:00,68.5,117.0,1.35,13.6,8.35,7.4,0.1,21.8,161.7,0.1,2.3,0.0,191.0
4,Amaravati,2017-11-25 15:00:00,69.25,112.25,1.52,11.8,7.55,9.25,0.1,21.38,161.68,0.1,2.35,0.0,191.0


In [3]:
# The city label is removed outright rather than encoded as a feature.
if 'City' in df.columns:
    df = df.drop(columns=['City'])

# Swap the raw timestamp for two calendar features (month and day of month);
# the timestamp column itself is discarded afterwards.
if 'Datetime' in df.columns:
    timestamps = pd.to_datetime(df['Datetime'])
    df = df.assign(
        month=timestamps.dt.month,
        day=timestamps.dt.day,
    ).drop(columns=['Datetime'])


In [4]:
# AQI is the regression target; every other remaining column is a feature.
y = df['AQI']
X = df.drop(columns=['AQI'])


In [5]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle repeatable across runs.
# NOTE(review): the preview rows look like hourly readings, so a random split
# may put neighbouring hours in both train and test — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [6]:
# Seed passed to every estimator that has a stochastic component, so that a
# Restart & Run All reproduces the same scores. Without it the tree-based
# models give slightly different RMSE/R2 on every run even though the
# train/test split itself is seeded. LinearRegression is deterministic and
# takes no seed.
MODEL_SEED = 42

# Candidate regressors, keyed by the display name used when reporting results.
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=MODEL_SEED),
    "Random Forest": RandomForestRegressor(random_state=MODEL_SEED),
    "Gradient Boosting": GradientBoostingRegressor(random_state=MODEL_SEED),
}


In [7]:
# One (name, rmse, r2) tuple per model, in the iteration order of `models`.
results = []

for name, model in models.items():
    # fit() returns the estimator itself, so training and prediction chain.
    pred = model.fit(X_train, y_train).predict(X_test)

    # Score on the held-out rows: RMSE is the square root of the MSE.
    rmse = np.sqrt(mean_squared_error(y_test, pred))
    r2 = r2_score(y_test, pred)
    results.append((name, rmse, r2))

    print(name, "RMSE:", rmse, "R2:", r2)


Linear Regression RMSE: 57.213685467771136 R2: 0.7761336562366394
Decision Tree RMSE: 49.05457392230894 R2: 0.8354310213400927
Random Forest RMSE: 33.569377417044905 R2: 0.9229317439225253
Gradient Boosting RMSE: 45.391486919147034 R2: 0.8590913101214255


In [8]:
# Tabulate the per-model scores collected by the evaluation loop.
results_df = pd.DataFrame(
    data=results,
    columns=["Model", "RMSE", "R2"],
)

results_df


Unnamed: 0,Model,RMSE,R2
0,Linear Regression,57.213685,0.776134
1,Decision Tree,49.054574,0.835431
2,Random Forest,33.569377,0.922932
3,Gradient Boosting,45.391487,0.859091
