# Air Quality Index (AQI) Prediction Model
This notebook loads and processes air quality data to train a machine learning model for predicting AQI.

In [3]:
# Step 1: Import Required Libraries
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
import joblib

In [4]:
# Step 2: Load Data
df = pd.read_csv("city_day.csv")
df.head()

Unnamed: 0,City,Date,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,AQI_Bucket
0,Ahmedabad,01-01-2015,,,0.92,18.22,17.15,,0.92,27.64,133.36,0.0,0.02,0.0,,
1,Ahmedabad,02-01-2015,,,0.97,15.69,16.46,,0.97,24.55,34.06,3.68,5.5,3.77,,
2,Ahmedabad,03-01-2015,,,17.4,19.3,29.7,,17.4,29.07,30.7,6.8,16.4,2.25,,
3,Ahmedabad,04-01-2015,,,1.7,18.48,17.97,,1.7,18.59,36.08,4.43,10.14,1.0,,
4,Ahmedabad,05-01-2015,,,22.1,21.42,37.76,,22.1,39.33,39.31,7.01,18.89,2.78,,


In [5]:
# Step 3: Preprocess Data
# Drop rows where AQI is missing
df = df.dropna(subset=["AQI"])

# Convert Date to datetime
df["Date"] = pd.to_datetime(df["Date"], dayfirst=True)

In [6]:
# Step 4: Feature and Target Selection
features = ["PM2.5", "PM10", "NO", "NO2", "NOx", "NH3", "CO", "SO2", "O3", "Benzene", "Toluene", "Xylene"]
target = "AQI"

X = df[features]
y = df[target]

In [7]:
# Step 5: Handle Missing Values
imputer = SimpleImputer(strategy="mean")
X_imputed = imputer.fit_transform(X)

In [8]:
# Step 6: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.2, random_state=42)

In [9]:
# Step 7: Train Model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
# Step 8: Evaluate Model
y_pred = model.predict(X_test)
print("RMSE:", mean_squared_error(y_test, y_pred, squared=False))
print("R2 Score:", r2_score(y_test, y_pred))

RMSE: 40.49104388326869
R2 Score: 0.9104624138493865


In [10]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("Linear Regression")
print("RMSE:", mean_squared_error(y_test, y_pred, squared=False))
print("R2 Score:", r2_score(y_test, y_pred))


Linear Regression
RMSE: 59.1083615867004
R2 Score: 0.8091969651150288


In [11]:
from sklearn.svm import SVR

model = SVR(kernel='rbf', C=100, gamma=0.1, epsilon=.1)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("SVR")
print("RMSE:", mean_squared_error(y_test, y_pred, squared=False))
print("R2 Score:", r2_score(y_test, y_pred))


SVR
RMSE: 136.03060723765503
R2 Score: -0.010556753891726833


In [12]:
from sklearn.ensemble import GradientBoostingRegressor

model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("Gradient Boosting")
print("RMSE:", mean_squared_error(y_test, y_pred, squared=False))
print("R2 Score:", r2_score(y_test, y_pred))


Gradient Boosting
RMSE: 43.800702307187315
R2 Score: 0.8952269507157379


In [13]:
from sklearn.neighbors import KNeighborsRegressor

model = KNeighborsRegressor(n_neighbors=5)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("KNN Regressor")
print("RMSE:", mean_squared_error(y_test, y_pred, squared=False))
print("R2 Score:", r2_score(y_test, y_pred))


KNN Regressor
RMSE: 48.94382967010918
R2 Score: 0.8691772325724496


In [14]:
from xgboost import XGBRegressor

model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("XGBoost Regressor")
print("RMSE:", mean_squared_error(y_test, y_pred, squared=False))
print("R2 Score:", r2_score(y_test, y_pred))


XGBoost Regressor
RMSE: 42.66947305614461
R2 Score: 0.9005689570872484


In [16]:
# Step 9: Save Model and Imputer
joblib.dump(model, "airquality.joblib")


['airquality.joblib']