In [2]:
import pandas as pd
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# # Load data
# path = 'C:/Users/hp/Urban-Air-Quality-Monitoring/'
# extension = '.csv'
# files = [file for file in os.listdir(path) if file.endswith(extension)]
# dfs = [pd.read_csv(os.path.join(path, file)) for file in files]
# df = pd.concat(dfs, ignore_index=True)
# df.dropna(subset=['AQI'], inplace=True)
# df.drop(columns=['City', 'Date'], inplace=True)

df=pd.read_csv("city_day.csv")
# Impute missing values with the mean of each column
# Select numeric columns for imputation
numeric_columns = df.select_dtypes(include='number').columns

# Impute missing values with the mean of each numeric column
df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].mean())

# Drop non-numeric columns
df = df.drop(columns=['City', 'Date'])
df.dropna(inplace=True)

# Feature engineering
def calculate_si(so2):
    si=0
    if (so2<=40):
     si= so2*(50/40)
    if (so2>40 and so2<=80):
     si= 50+(so2-40)*(50/40)
    if (so2>80 and so2<=380):
     si= 100+(so2-80)*(100/300)
    if (so2>380 and so2<=800):
     si= 200+(so2-380)*(100/800)
    if (so2>800 and so2<=1600):
     si= 300+(so2-800)*(100/800)
    if (so2>1600):
     si= 400+(so2-1600)*(100/800)
    return si
df['si']=df['SO2'].apply(calculate_si)
data= df[['SO2','si']]
data.head()

def calculate_no2_si(no2):
    si = 0
    if no2 <= 40:
        si = no2 * (50 / 40)
    elif no2 <= 80:
        si = 50 + (no2 - 40) * (50 / 40)
    elif no2 <= 180:
        si = 100 + (no2 - 80) * (100 / 100)
    elif no2 <= 280:
        si = 200 + (no2 - 180) * (100 / 100)
    elif no2 <= 400:
        si = 300 + (no2 - 280) * (100 / 120)
    else:
        si = 400 + (no2 - 400) * (100 / 120)
    return si
df['no2i']=df['NO2'].apply(calculate_si)
data= df[['NO2','no2i']]
data.head()



Unnamed: 0,NO2,no2i
28,28.71,35.8875
29,28.68,35.85
30,32.66,40.825
31,42.08,52.6
32,35.31,44.1375


In [3]:
def calculate_o3_si(o3):
    si = 0
    if o3 <= 50:
        si = o3 * (50 / 50)
    elif o3 <= 100:
        si = 50 + (o3 - 50) * (50 / 50)
    elif o3 <= 168:
        si = 100 + (o3 - 100) * (100 / 68)
    elif o3 <= 208:
        si = 200 + (o3 - 168) * (100 / 40)
    elif o3 <= 748:
        si = 300 + (o3 - 208) * (100 / 540)
    else:
        si = 400 + (o3 - 748) * (100 / 540)
    return si
df['o3i']=df['O3'].apply(calculate_si)
data= df[['O3','o3i']]
data.head()

def calculate_pm10_si(pm10):
    si = 0
    if pm10 <= 50:
        si = pm10 * (50 / 50)
    elif pm10 <= 100:
        si = 50 + (pm10 - 50) * (50 / 50)
    elif pm10 <= 250:
        si = 100 + (pm10 - 100) * (100 / 150)
    elif pm10 <= 350:
        si = 200 + (pm10 - 250) * (100 / 100)
    elif pm10 <= 430:
        si = 300 + (pm10 - 350) * (100 / 80)
    else:
        si = 400 + (pm10 - 430) * (100 / 80)
    return si
df['pm10i']=df['PM10'].apply(calculate_si)
data= df[['PM10','pm10i']]
data.head()

def calculate_no_si(no):
    si = 0
    if no <= 40:
        si = no * (50 / 40)
    elif no <= 80:
        si = 50 + (no - 40) * (50 / 40)
    elif no <= 180:
        si = 100 + (no - 80) * (100 / 100)
    elif no <= 280:
        si = 200 + (no - 180) * (100 / 100)
    elif no <= 400:
        si = 300 + (no - 280) * (100 / 120)
    else:
        si = 400 + (no - 400) * (100 / 120)
    return si
df['noi']=df['NO'].apply(calculate_si)
data= df[['NO','noi']]
data.head()


def calculate_nh3_si(nh3):
    si = 0
    if nh3 <= 200:
        si = nh3 * (50 / 200)
    elif nh3 <= 400:
        si = 50 + (nh3 - 200) * (50 / 200)
    elif nh3 <= 800:
        si = 100 + (nh3 - 400) * (100 / 400)
    elif nh3 <= 1200:
        si = 200 + (nh3 - 800) * (100 / 400)
    elif nh3 <= 1800:
        si = 300 + (nh3 - 1200) * (100 / 600)
    else:
        si = 400 + (nh3 - 1800) * (100 / 600)
    return si
df['nh3i']=df['NH3'].apply(calculate_si)
data= df[['NH3','nh3i']]
data.head()

def calculate_nox_si(nox):
    si = 0
    if nox <= 40:
        si = nox * (50 / 40)
    elif 40 < nox <= 80:
        si = 50 + (nox - 40) * (50 / 40)
    elif 80 < nox <= 180:
        si = 100 + (nox - 80) * (100 / 100)
    elif 180 < nox <= 280:
        si = 200 + (nox - 180) * (100 / 100)
    elif 280 < nox <= 400:
        si = 300 + (nox - 280) * (100 / 120)
    elif nox > 400:
        si = 400 + (nox - 400) * (100 / 400)
    return si
df['noxi']=df['NOx'].apply(calculate_si)
data= df[['NOx','noxi']]
data.head()

def calculate_co_si(co):
    si = 0
    if co <= 1:
        si = co * (50 / 1)
    elif 1 < co <= 2:
        si = 50 + (co - 1) * (50 / 1)
    elif 2 < co <= 10:
        si = 100 + (co - 2) * (100 / 8)
    elif 10 < co <= 17:
        si = 200 + (co - 10) * (100 / 7)
    elif 17 < co <= 34:
        si = 300 + (co - 17) * (100 / 17)
    elif co > 34:
        si = 400 + (co - 34) * (100 / 66)
    return si

def calculate_o3_si(o3):
    si = 0
    if o3 <= 50:
        si = o3 * (50 / 50)
    elif 50 < o3 <= 100:
        si = 50 + (o3 - 50) * (50 / 50)
    elif 100 < o3 <= 168:
        si = 100 + (o3 - 100) * (100 / 68)
    elif 168 < o3 <= 208:
        si = 200 + (o3 - 168) * (100 / 40)
    elif 208 < o3 <= 748:
        si = 300 + (o3 - 208) * (100 / 539)
    elif o3 > 748:
        si = 400 + (o3 - 748) * (100 / 254)
    return si

def calculate_benzene_si(benzene):
    si = 0
    if benzene <= 5:
        si = benzene * (50 / 5)
    elif 5 < benzene <= 10:
        si = 50 + (benzene - 5) * (50 / 5)
    elif 10 < benzene <= 15:
        si = 100 + (benzene - 10) * (100 / 5)
    elif 15 < benzene <= 20:
        si = 200 + (benzene - 15) * (100 / 5)
    elif 20 < benzene <= 25:
        si = 300 + (benzene - 20) * (100 / 5)
    elif benzene > 25:
        si = 400 + (benzene - 25) * (100 / 25)
    return si

def calculate_toluene_si(toluene):
    si = 0
    if toluene <= 5:
        si = toluene * (50 / 5) 
    elif 5 < toluene <= 10:
        si = 50 + (toluene - 5) * (50 / 5)
    elif 10 < toluene <= 20:
        si = 100 + (toluene - 10) * (100 / 10)
    elif 20 < toluene <= 30:
        si = 200 + (toluene - 20) * (100 / 10)
    elif 30 < toluene <= 40:
        si = 300 + (toluene - 30) * (100 / 10)
    elif toluene > 40:
        si = 400 + (toluene - 40) * (100 / 10)
    return si

def calculate_xylene_si(xylene):
    si = 0
    if xylene <= 5:
        si = xylene * (50 / 5)
    elif 5 < xylene <= 10:
        si = 50 + (xylene - 5) * (50 / 5)
    elif 10 < xylene <= 15:
        si = 100 + (xylene - 10) * (100 / 5)
    elif 15 < xylene <= 20:
        si = 200 + (xylene - 15) * (100 / 5)
    elif 20 < xylene <= 25:
        si = 300 + (xylene - 20) * (100 / 5)
    elif xylene > 25:
        si = 400 + (xylene - 25) * (100 / 25)
    return si

df['xylenei']=df['Xylene'].apply(calculate_si)
data= df[['Xylene','xylenei']]
data.head()

df['toluenei']=df['Toluene'].apply(calculate_si)
data= df[['Toluene','toluenei']]
data.head()

df['benzenei']=df['Benzene'].apply(calculate_si)
data= df[['Benzene','benzenei']]
data.head()

df['coi']=df['CO'].apply(calculate_si)
data= df[['CO','coi']]
data.head()


Unnamed: 0,CO,coi
28,6.93,8.6625
29,13.85,17.3125
30,24.39,30.4875
31,43.48,54.35
32,54.56,68.2


In [4]:

def calculate_pm25_si(pm25):
    si = 0
    if pm25 <= 30:
        si = pm25 * (50 / 30)
    elif pm25 > 30 and pm25 <= 60:
        si = 50 + (pm25 - 30) * (50 / 30)
    elif pm25 > 60 and pm25 <= 90:
        si = 100 + (pm25 - 60) * (100 / 30)
    elif pm25 > 90 and pm25 <= 120:
        si = 200 + (pm25 - 90) * (100 / 30)
    elif pm25 > 120 and pm25 <= 250:
        si = 300 + (pm25 - 120) * (100 / 130)
    elif pm25 > 250:
        si = 400 + (pm25 - 250) * (100 / 250)
    return si
df['pm25i']=df['PM2.5'].apply(calculate_pm25_si)
data= df[['PM2.5','pm25i']]
data.head()

def AQI_Range(x):
    if x <= 50:
        return "Good"
    elif x <= 100:
        return "Satisfactory"
    elif x <= 200:
        return "Moderate"
    elif x <= 300:
        return "Poor"
    elif x <= 400:
        return "Very Poor"
    elif x > 400:
        return "Severe"
    else:
        return np.NaN
    
df['AQI_Bucket']=df['AQI'].apply(AQI_Range)


df['AQI_Bucket'].value_counts()

# Assuming similar functions for other pollutants

# Select features and target
X = df[['o3i', 'pm10i', 'noi', 'nh3i', 'noxi', 'xylenei', 'toluenei', 'benzenei', 'coi', 'pm25i', 'noxi', 'si', 'no2i']]
y = df['AQI']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=70)

# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Model selection and tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_regressor = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(rf_regressor, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

best_rf_regressor = grid_search.best_estimator_

# Model evaluation
train_preds = best_rf_regressor.predict(X_train_scaled)
test_preds = best_rf_regressor.predict(X_test_scaled)

train_rmse = np.sqrt(mean_squared_error(y_train, train_preds))
test_rmse = np.sqrt(mean_squared_error(y_test, test_preds))

train_r2 = r2_score(y_train, train_preds)
test_r2 = r2_score(y_test, test_preds)

print("Train RMSE:", train_rmse)
print("Test RMSE:", test_rmse)
print("Train R^2 Score:", train_r2)
print("Test R^2 Score:", test_r2)


KeyboardInterrupt: 