# AQI Prediction Model using Python

- PM2.5, PM10
- NO, NO2
- NH3 - Ammonia
- CO
- SO2
- O3
- Benzene, Toluene, Xylene

In [1]:
# pip install numpy pandas matplotlib seaborn scikit-learn

In [2]:
# Importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from warnings import filterwarnings
filterwarnings('ignore')

In [3]:
# Load the raw dataset; the path is relative to the current working directory.
DATA_PATH = 'air quality data.csv'
df = pd.read_csv(DATA_PATH)
df.head()  # Preview the first five rows

FileNotFoundError: [Errno 2] No such file or directory: 'air quality data.csv'

In [None]:
# Shape - rows and cols!
df.shape

In [None]:
# Information
df.info()

In [None]:
# to know the duplicate values
df.duplicated().sum()

In [None]:
# To check missing values
df.isnull().sum()

In [None]:
# Drop the rows where 'AQI' has missing values
df.dropna(subset=['AQI'], inplace = True)

In [None]:
df.isnull().sum().sort_values(ascending=False)

In [None]:
df.shape

In [None]:
# Summary of Statistics in the dataset
df.describe().T

In [None]:
# Share of missing entries per column, in percent, largest first.
# The mean of a boolean mask is the fraction of True values.
missing_fraction = df.isna().mean()
null_values_percentage = (missing_fraction * 100).sort_values(ascending=False)
null_values_percentage

#### Key Considerations:
- Xylene has the highest percentage of missing values: 61.86%.
- PM10 and NH3 follow, with roughly 28% and 26% missing values respectively.

## Week 2 - Visualization

In [None]:
# Univariate analysis: one histogram per pollutant column, then the AQI target.
# A single loop replaces eleven copy-pasted cells that differed only in the
# column name; the figures are drawn in the same order as before.
hist_columns = ['Xylene', 'PM10', 'NH3', 'Toluene', 'Benzene', 'NOx',
                'O3', 'PM2.5', 'SO2', 'CO', 'AQI']
for column in hist_columns:
    df[column].plot(kind='hist', figsize=(10, 5))
    plt.legend()
    plt.show()

In [None]:
# Distribution of AQI from 2015 to 2020
sns.displot(df, x='AQI', color='purple')
plt.show()

In [None]:
# Bivariate
sns.set_theme(style="darkgrid")
graph = sns.catplot(x="City", kind='count', data=df, height=5, aspect=3)
graph.set_xticklabels(rotation=90)

In [None]:
sns.set_theme(style="darkgrid")
graph = sns.catplot(x="City", kind='count', data=df, col="AQI_Bucket", col_wrap=2, 
                    height=3.5, aspect=3)
graph.set_xticklabels(rotation=90)

In [None]:
graph1 = sns.catplot(x='City', y='PM2.5', kind='box', data=df, height=5, aspect=3)
graph1.set_xticklabels(rotation=90)

In [None]:
graph2 = sns.catplot(x='City', y='NO2', kind='box', data=df, height=5, aspect=3)
graph2.set_xticklabels(rotation=90)

In [None]:
graph3 = sns.catplot(x='City', y='O3', kind='box', data=df, height=5, aspect=3)
graph3.set_xticklabels(rotation=90)

In [None]:
graph4 = sns.catplot(x='City', y='SO2', kind='box', data=df, height=5, aspect=3)
graph4.set_xticklabels(rotation=90)

In [None]:
graph5 = sns.catplot(x='AQI_Bucket', data=df, kind='count', height=6, aspect=3)
graph5.set_xticklabels(rotation=90)

In [None]:
# To check the null values
df.isnull().sum().sort_values(ascending=False)

In [None]:
df.describe().loc['mean']

In [None]:
# Fill missing pollutant readings with each column's own mean.
# The means are computed from the data instead of being pasted in as rounded
# constants from an earlier run, so the imputation stays correct if the input
# file or the preceding filtering steps change.
pollutant_columns = ['PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'NH3', 'CO', 'SO2',
                     'O3', 'Benzene', 'Toluene', 'Xylene']
column_means = df[pollutant_columns].mean()
# fillna with a Series only touches the columns named in its index.
df = df.fillna(column_means)

In [None]:
df.isnull().sum()

In [None]:
df = df.drop(['AQI_Bucket'], axis=1)

In [None]:
df.head()

In [None]:
sns.boxplot(data=df[['PM2.5', 'PM10']])

In [None]:
sns.boxplot(data=df[['NO', 'NO2', 'NOx', 'NH3']])

In [None]:
sns.boxplot(data=df[['O3', 'SO2']])

In [None]:
# IQR Method - Q3 Q1
def replace_outliers(df):
    """Return a copy of *df* with IQR outliers in numeric columns replaced.

    For every numeric column the first and third quartiles (Q1, Q3) and the
    interquartile range (IQR = Q3 - Q1) are computed.  Values below
    ``Q1 - 1.5 * IQR`` are replaced with Q1 and values above
    ``Q3 + 1.5 * IQR`` are replaced with Q3.  Non-numeric columns and
    missing values are left untouched.

    Args:
        df: pandas DataFrame to clean.  It is not modified.

    Returns:
        A new DataFrame with the same columns and index as *df*.
    """
    # Work on a copy so the caller's frame is never mutated as a side effect.
    cleaned = df.copy()
    for column in cleaned.select_dtypes(include=['number']).columns:
        values = cleaned[column]
        Q1 = values.quantile(0.25)
        Q3 = values.quantile(0.75)
        IQR = Q3 - Q1
        lb = Q1 - 1.5 * IQR
        ub = Q3 + 1.5 * IQR
        # Vectorised replacement. Both masks are built from the untouched
        # column; NaN compares False on either side and is therefore kept.
        cleaned[column] = values.mask(values < lb, Q1).mask(values > ub, Q3)
    return cleaned

In [None]:
df = replace_outliers(df)

In [None]:
df.describe().T

In [None]:
sns.boxplot(data=df[['PM2.5', 'PM10']])

In [None]:
sns.boxplot(data=df[['O3', 'SO2']])

In [None]:
sns.displot(df, x='AQI', color='red')
plt.show()

In [None]:
df1 = df.drop(columns=['City'])

In [None]:
# Multivariate Analysis - Heatmap
# Correlation is only defined for numeric columns. df1 was built by dropping
# just 'City', so it can still hold non-numeric columns ('Date' is removed from
# df only later); on pandas >= 2.0 a bare DataFrame.corr() raises on those.
numeric_df1 = df1.select_dtypes(include='number')
plt.figure(figsize=(12, 8))
sns.heatmap(numeric_df1.corr(), annot=True, cmap='Pastel1')
plt.show()

## Week 3 - Data Modeling - 10 March 2025

In [None]:
df.drop(['Date', 'City'], axis = 1, inplace=True)
df.head()

In [None]:
# Scaling - Standard Scaler
# Only the predictor columns are standardised. The 'AQI' target keeps its
# original units so that the RMSE values reported for the models are expressed
# in AQI points rather than in standard deviations of AQI.
# NOTE(review): the scaler is still fit on every row before the train/test
# split, so test-set statistics leak into the scaling -- consider fitting it
# on the training split only.
from sklearn.preprocessing import StandardScaler
is_feature = df.columns != 'AQI'
# Explicit copy: writing into the array must never write through to df.
df1 = df.to_numpy(dtype=float, copy=True)
df1[:, is_feature] = StandardScaler().fit_transform(df1[:, is_feature])
df1

In [None]:
df = pd.DataFrame(df1, columns=df.columns)
df.head()

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
df.columns

In [None]:
# Feature & Target Selection
X = df[['PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'NH3', 'CO', 'SO2', 'O3',
       'Benzene', 'Toluene', 'Xylene']]
y = df['AQI']

In [None]:
X.head()

In [None]:
# Split the data into training and testing data - Training set - 80% | Testing set - 20%
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print('Shape of X Train',X_train.shape)
print('Shape of X Test',X_test.shape)
print('Shape of y Train',y_train.shape)
print('Shape of y Test',y_test.shape)

In [None]:
# Linear Regression Model
LR = LinearRegression()
LR.fit(X_train, y_train)

In [None]:
# Predicting the values:
train_pred = LR.predict(X_train) # Predicting train
test_pred = LR.predict(X_test) # Predicting test

In [None]:
# Linear Regression evaluation: RMSE and R squared on both splits, computed
# from the predictions already held in train_pred / test_pred.
mse_train = mean_squared_error(y_train, train_pred)
mse_test = mean_squared_error(y_test, test_pred)
RMSE_train = np.sqrt(mse_train)
RMSE_test = np.sqrt(mse_test)
print('RMSE Train Data = ', RMSE_train)
print('RMSE Test Data = ', RMSE_test)
print('_' * 60)
# r2_score on the stored predictions is what the estimator's score() computes.
print('R Squared value for Train = ', r2_score(y_train, train_pred))
print('R Squared value on Test = ', r2_score(y_test, test_pred))

In [None]:
# KNN
knn = KNeighborsRegressor()
knn.fit(X_train, y_train)

In [None]:
# Predict once per split and reuse the predictions for every metric below.
train_pred = knn.predict(X_train)
test_pred = knn.predict(X_test)

# KNN evaluation: RMSE and R squared on both splits.
mse_train = mean_squared_error(y_train, train_pred)
mse_test = mean_squared_error(y_test, test_pred)
RMSE_train = np.sqrt(mse_train)
RMSE_test = np.sqrt(mse_test)
print('RMSE Train Data = ', RMSE_train)
print('RMSE Test Data = ', RMSE_test)
print('_' * 60)
# r2_score on the stored predictions is what the estimator's score() computes.
print('R Squared value for Train = ', r2_score(y_train, train_pred))
print('R Squared value on Test = ', r2_score(y_test, test_pred))

In [None]:
# Decision Tree
# A fixed seed makes the fitted tree, and therefore its scores, repeatable
# between runs; 42 matches the random_state used for the train/test split.
dtr = DecisionTreeRegressor(random_state=42)
dtr.fit(X_train, y_train)

In [None]:
# Predict once per split and reuse the predictions for every metric below.
train_pred = dtr.predict(X_train)
test_pred = dtr.predict(X_test)

# Decision Tree Regressor evaluation: RMSE and R squared on both splits.
mse_train = mean_squared_error(y_train, train_pred)
mse_test = mean_squared_error(y_test, test_pred)
RMSE_train = np.sqrt(mse_train)
RMSE_test = np.sqrt(mse_test)
print('RMSE Train Data = ', RMSE_train)
print('RMSE Test Data = ', RMSE_test)
print('_' * 60)
# r2_score on the stored predictions is what the estimator's score() computes.
print('R Squared value for Train = ', r2_score(y_train, train_pred))
print('R Squared value on Test = ', r2_score(y_test, test_pred))

In [None]:
# Random Forest Regressor
# A fixed seed makes the bootstrap samples and feature choices, and therefore
# the scores, repeatable between runs; 42 matches the random_state used for
# the train/test split.
rfr = RandomForestRegressor(random_state=42)
rfr.fit(X_train, y_train)

In [None]:
# Predict once per split and reuse the predictions for every metric below.
train_pred = rfr.predict(X_train)
test_pred = rfr.predict(X_test)

# Random Forest Regressor evaluation: RMSE and R squared on both splits.
mse_train = mean_squared_error(y_train, train_pred)
mse_test = mean_squared_error(y_test, test_pred)
RMSE_train = np.sqrt(mse_train)
RMSE_test = np.sqrt(mse_test)
print('RMSE Train Data = ', RMSE_train)
print('RMSE Test Data = ', RMSE_test)
print('_' * 60)
# r2_score on the stored predictions is what the estimator's score() computes.
print('R Squared value for Train = ', r2_score(y_train, train_pred))
print('R Squared value on Test = ', r2_score(y_test, test_pred))