In [1]:
import pandas as pd
import seaborn as sb
import numpy as np
import matplotlib.pyplot as plt
import missingno as mn
import plotly.express as pex
from sklearn.impute import KNNImputer
# Disable warnings 
import warnings
warnings.filterwarnings('ignore')

ModuleNotFoundError: No module named 'sklearn'

In [None]:
# Read the city-level daily CSV directly from its GitHub raw-download URL.
link = "https://github.com/AminRane/AQI/blob/main/city_day.csv?raw=true"
df = pd.read_csv(link)
# Bare last expression so the notebook renders the loaded frame.
df

In [None]:
# Print the column dtypes and non-null counts of the loaded frame.
df.info()

In [None]:
# Show the column labels available in the raw data.
df.columns

In [None]:
# Summary statistics of the frame's columns as reported by pandas.
df.describe()

In [None]:
def createNullMatrix(city, columns = None):
    """Draw a missingno nullity matrix for the frame ``city``.

    Parameters
    ----------
    city : DataFrame to visualise.
    columns : None to plot every column, or the string 'pollutants'
        (any letter case) to plot only the nine pollutant columns.
        Any other string draws nothing.
    """
    pollutant_cols = ['PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'NH3', 'CO', 'SO2', 'O3']
    if columns is None:
        mn.matrix(city)
        return
    if columns.lower() == 'pollutants':
        mn.matrix(city[pollutant_cols])

In [None]:
# Box plot of the AQI column that ships with the dataset, cast to float first.
fig = pex.box(df['AQI'].astype(float))
fig.show()

In [None]:
# Discard the dataset's own AQI columns; calcAQI later writes a fresh 'AQI' column.
# Not idempotent: re-running this cell after the columns are gone raises a KeyError.
df = df.drop(['AQI', 'AQI_Bucket'], axis=1)

In [None]:
# Nullity matrix of the pollutant columns before imputation.
createNullMatrix(df, columns = 'pollutants')

In [None]:
# Fill the gaps in the pollutant columns with scikit-learn's KNNImputer
# (n_neighbors = 20), fitted on those same columns.
imputing_cols = [ 'PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'NH3', 'CO', 'SO2', 'O3']
knn_imputer = KNNImputer(n_neighbors = 20)
imputed = knn_imputer.fit_transform(df[imputing_cols])
# Write the imputed values back over the original columns.
df.loc[:, imputing_cols] = imputed

In [None]:
# Same nullity matrix again, now that the pollutant columns have been imputed.
createNullMatrix(df, columns = 'pollutants')

In [None]:
def get_PM25_subindex(x):
    """Convert a PM2.5 value to its AQI sub-index.

    The conversion is piecewise linear over fixed concentration bands.
    Values that satisfy none of the comparisons (e.g. NaN) yield 0; a
    negative value falls through to the second band's line.
    """
    if 0 <= x <= 30:
        return (50 / 30) * x
    # (band upper bound, band lower bound, index at lower bound, index at upper bound)
    bands = (
        (60, 31, 51, 100),
        (90, 61, 101, 200),
        (120, 91, 201, 300),
        (250, 121, 301, 400),
    )
    for c_hi, c_lo, i_lo, i_hi in bands:
        if x <= c_hi:
            return ((i_hi - i_lo) / (c_hi - c_lo)) * (x - c_lo) + i_lo
    if x > 250:
        return ((500 - 401) / (1000 - 251)) * (x - 251) + 401
    return 0

In [None]:
def get_CO_subindex(x):
    """Convert a CO value to its AQI sub-index.

    The conversion is piecewise linear over fixed concentration bands.
    Values that satisfy none of the comparisons (e.g. NaN) yield 0; a
    negative value falls through to the second band's line.
    """
    if 0 <= x <= 1:
        return 50 * x
    # (band upper bound, band lower bound, index at lower bound, index at upper bound)
    bands = (
        (2, 1.1, 51, 100),
        (10, 2.1, 101, 200),
        (17, 10.1, 201, 300),
        (34, 17.1, 301, 400),
    )
    for c_hi, c_lo, i_lo, i_hi in bands:
        if x <= c_hi:
            return ((i_hi - i_lo) / (c_hi - c_lo)) * (x - c_lo) + i_lo
    if x > 34:
        return ((500 - 401) / (100 - 34.1)) * (x - 34.1) + 401
    return 0

In [None]:
def get_PM10_subindex(x):
    """Convert a PM10 value to its AQI sub-index.

    The conversion is piecewise linear over fixed concentration bands.
    Values that satisfy none of the comparisons (e.g. NaN) yield 0; a
    negative value falls through to the second band's line.
    """
    if 0 <= x <= 50:
        return x
    # (band upper bound, band lower bound, index at lower bound, index at upper bound)
    bands = (
        (100, 51, 51, 100),
        (250, 101, 101, 200),
        (350, 251, 201, 300),
        (430, 351, 301, 400),
    )
    for c_hi, c_lo, i_lo, i_hi in bands:
        if x <= c_hi:
            return ((i_hi - i_lo) / (c_hi - c_lo)) * (x - c_lo) + i_lo
    if x > 430:
        return ((500 - 401) / (1000 - 431)) * (x - 431) + 401
    return 0

In [None]:
def get_O3_subindex(x):
    """Convert an O3 value to its AQI sub-index.

    The conversion is piecewise linear over fixed concentration bands.
    Values that satisfy none of the comparisons (e.g. NaN) yield 0; a
    negative value falls through to the second band's line.

    The top band (x > 748) is anchored at its own lower breakpoint (749).
    It previously subtracted 209, the lower breakpoint of the band below,
    which made the result jump from about 400 to about 444 at x = 749.
    """
    if 0 <= x <= 50:
        return x
    # (band upper bound, band lower bound, index at lower bound, index at upper bound)
    bands = (
        (100, 51, 51, 100),
        (168, 101, 101, 200),
        (208, 169, 201, 300),
        (748, 209, 301, 400),
    )
    for c_hi, c_lo, i_lo, i_hi in bands:
        if x <= c_hi:
            return ((i_hi - i_lo) / (c_hi - c_lo)) * (x - c_lo) + i_lo
    if x > 748:
        # Same shape as the other pollutants' top band: start at 401 and
        # reach 500 at the nominal ceiling of 2000.
        return ((500 - 401) / (2000 - 749)) * (x - 749) + 401
    return 0

In [None]:
def get_NO2_subindex(x):
    """Convert an NO2 value to its AQI sub-index.

    The conversion is piecewise linear over fixed concentration bands.
    Values that satisfy none of the comparisons (e.g. NaN) yield 0; a
    negative value falls through to the second band's line.

    Two copy-paste errors in the upper bands are corrected here:
    * the 281-400 band was guarded by ``x <= 281``, so it only ever matched
      values in (280, 281];
    * the top band used PM10's constants (431 and 1000-431), so every value
      above 281 was scored on the wrong line.
    """
    if 0 <= x <= 40:
        return (50 / 40) * x
    # (band upper bound, band lower bound, index at lower bound, index at upper bound)
    bands = (
        (80, 41, 51, 100),
        (180, 81, 101, 200),
        (280, 181, 201, 300),
        (400, 281, 301, 400),
    )
    for c_hi, c_lo, i_lo, i_hi in bands:
        if x <= c_hi:
            return ((i_hi - i_lo) / (c_hi - c_lo)) * (x - c_lo) + i_lo
    if x > 400:
        # Top band starts at its own lower breakpoint (401) and reaches 500
        # at the nominal ceiling of 1000.
        return ((500 - 401) / (1000 - 401)) * (x - 401) + 401
    return 0

In [None]:
def get_SO2_subindex(x):
    """Convert an SO2 value to its AQI sub-index.

    The conversion is piecewise linear over fixed concentration bands.
    Values that satisfy none of the comparisons (e.g. NaN) yield 0; a
    negative value falls through to the second band's line.
    """
    if 0 <= x <= 40:
        return (50 / 40) * x
    # (band upper bound, band lower bound, index at lower bound, index at upper bound)
    bands = (
        (80, 41, 51, 100),
        (380, 81, 101, 200),
        (800, 381, 201, 300),
        (1600, 801, 301, 400),
    )
    for c_hi, c_lo, i_lo, i_hi in bands:
        if x <= c_hi:
            return ((i_hi - i_lo) / (c_hi - c_lo)) * (x - c_lo) + i_lo
    if x > 1600:
        # This band is anchored at 1600 (not 1601) and spans up to 3000.
        return ((500 - 401) / (3000 - 1600)) * (x - 1600) + 401
    return 0

In [None]:
def get_NH3_subindex(x):
    """Convert an NH3 value to its AQI sub-index.

    The conversion is piecewise linear over fixed concentration bands.
    Values that satisfy none of the comparisons (e.g. NaN) yield 0; a
    negative value falls through to the second band's line.
    """
    if 0 <= x <= 200:
        return (50 / 200) * x
    # (band upper bound, band lower bound, index at lower bound, index at upper bound)
    bands = (
        (400, 201, 51, 100),
        (800, 401, 101, 200),
        (1200, 801, 201, 300),
        (1800, 1201, 301, 400),
    )
    for c_hi, c_lo, i_lo, i_hi in bands:
        if x <= c_hi:
            return ((i_hi - i_lo) / (c_hi - c_lo)) * (x - c_lo) + i_lo
    if x > 1800:
        return ((500 - 401) / (3000 - 1801)) * (x - 1801) + 401
    return 0

In [None]:
def calcSubIndex(city):
    """Add one ``*_subindex`` column per pollutant to ``city`` (in place).

    Each new column is the element-wise result of the matching
    ``get_*_subindex`` function applied to the pollutant's column.
    """
    # (new column, source column, converter) — kept in the original creation order.
    converters = (
        ("PM25_subindex", "PM2.5", get_PM25_subindex),
        ("CO_subindex", "CO", get_CO_subindex),
        ("PM10_subindex", "PM10", get_PM10_subindex),
        ("O3_subindex", "O3", get_O3_subindex),
        ("NO2_subindex", "NO2", get_NO2_subindex),
        ("SO2_subindex", "SO2", get_SO2_subindex),
        ("NH3_subindex", "NH3", get_NH3_subindex),
    )
    for target, source, convert in converters:
        city[target] = city[source].apply(convert)

In [None]:
def calcAQI(city):
    """Compute an 'AQI' column on ``city`` (in place).

    The sub-index columns are created first via ``calcSubIndex``; the AQI of
    each row is then the largest of its seven sub-indices.
    """
    calcSubIndex(city)
    subindex_cols = [
        'PM25_subindex', 'CO_subindex', 'PM10_subindex', 'O3_subindex',
        'NO2_subindex', 'SO2_subindex', 'NH3_subindex',
    ]
    city['AQI'] = city[subindex_cols].max(axis = 1)

In [None]:
# Adds the seven sub-index columns and the 'AQI' column to df in place.
calcAQI(df)

In [None]:
def dateTime(city):
    """Replace the 'Date' column of ``city`` with parsed date parts (in place).

    Adds 'Datetime' (parsed timestamps), 'Year' and 'Month', then removes the
    original 'Date' column.
    """
    stamps = pd.to_datetime(city['Date'])
    city['Datetime'] = stamps
    city['Year'] = stamps.dt.year
    city['Month'] = stamps.dt.month
    city.drop(columns=['Date'], inplace=True)

In [None]:
# Adds Datetime/Year/Month to df and removes 'Date'; re-running raises KeyError because 'Date' is gone.
dateTime(df)

In [None]:
def get_scale(x):
    """Return the text category for an AQI value.

    0-50 is "Good"; after that the first upper bound that ``x`` does not
    exceed decides the label. Anything that matches no bound (values above
    400, and NaN) is "Hazardous". A negative value skips the "Good" test and
    lands on "Satisfactory".
    """
    if 0 <= x <= 50:
        return "Good"
    upper_bounds = (
        (100, "Satisfactory"),
        (200, "Moderate"),
        (300, "Poor"),
        (400, "Very Poor"),
    )
    for bound, label in upper_bounds:
        if x <= bound:
            return label
    return "Hazardous"

In [None]:
def calcAQIScale(city):
    """Add an 'AQI_Scale' column to ``city`` (in place).

    Each row's label is ``get_scale`` applied to that row's 'AQI' value.
    """
    labels = city["AQI"].apply(get_scale)
    city["AQI_Scale"] = labels

In [None]:
# Label every row of df with its AQI category (column 'AQI_Scale').
calcAQIScale(df)

In [None]:
# Inspect the column labels after the sub-index, AQI, date and scale columns were added.
df.columns

In [None]:
# The per-pollutant sub-indices were only needed to derive 'AQI'; remove them.
subindex_cols = [
    'PM25_subindex', 'CO_subindex', 'PM10_subindex', 'O3_subindex',
    'NO2_subindex', 'SO2_subindex', 'NH3_subindex',
]
df = df.drop(columns = subindex_cols)

In [None]:
# Pairwise correlations of the numeric columns only. The frame also holds text
# columns (e.g. the string labels in 'AQI_Scale'); pandas >= 2.0 raises if
# corr() is asked to convert those, so they are filtered out explicitly.
df.select_dtypes(include='number').corr()

In [None]:
# Annotated heatmap of the correlations between numeric columns. Text columns
# (e.g. the string labels in 'AQI_Scale') are filtered out first because
# pandas >= 2.0 raises when corr() meets non-numeric data.
plt.figure(figsize=(20,10))
sb.heatmap(df.select_dtypes(include='number').corr(), annot = True)
plt.show()

In [None]:
# Pollutant columns to compare against AQI (also reused by later cells).
variables = ['PM2.5','PM10','NO','NO2','NOx','NH3','CO','SO2','O3']

# One scatter plot per pollutant, AQI on the x-axis, points coloured by year.
for variable in variables:
    chart_title = f'Relation between {variable} and AQI'
    fig = pex.scatter(df, x = 'AQI', y = variable, color='Year', title = chart_title)
    fig.show()

In [None]:
# Median of every pollutant and of AQI per city, rounded to two decimals.
median_cols = ['PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'NH3', 'CO', 'SO2',
       'O3', 'AQI']
city_wise = df.groupby(['City'])[median_cols].median().reset_index()
# After reset_index the first column is 'City'; only the value columns are rounded.
for col in median_cols:
    city_wise[col] = city_wise[col].round(2)

In [None]:
# One bar chart per value column of city_wise (column 0, 'City', is skipped):
# cities sorted ascending by that column, bars coloured and labelled by value.
for col in city_wise.columns[1:]:
    fig = pex.bar(city_wise.sort_values(by=col,ascending=True),
                  x="City",y=col,color=col,text=col,title=col,
                 color_continuous_scale=["orange", "red",
                                         "green", "blue",
                                         "purple"])
    # Put the value labels above the bars instead of inside them.
    fig.update_traces(textposition='outside')
    fig.show()

In [None]:
# Median AQI for every (Year, City) pair, flattened back to ordinary columns.
newDf = df.groupby(['Year','City'])['AQI'].median().reset_index()

In [None]:
# Grouped bars of the yearly median AQI per city, one colour per year.
# Six colours are listed in color_discrete_sequence.
fig = pex.histogram(newDf, x = "City", y = "AQI", color = 'Year',
                    color_discrete_sequence=["red", "blue",
                 "orange", "green", "yellow", "purple"], barmode = 'group')
fig.show()

In [None]:
# One line plot per pollutant in `variables`, laid out on a 6 x 2 grid,
# showing the pollutant against Year.
fig = plt.figure(figsize=(20,40))
for slot, col in enumerate(variables, start=1):
    ax = fig.add_subplot(6, 2, slot)
    line_ax = sb.lineplot(x='Year', y=col, data=df, ax=ax)
    line_ax.set(title=col+' over the Years')

In [None]:
def plotCityAQIScale(city):
    """Draw a pie chart of how often each AQI category occurs for ``city``.

    Rows of the global ``df`` whose 'City' equals ``city`` are counted per
    'AQI_Scale' label and plotted largest-first, with the largest wedge
    pulled out of the pie.

    Parameters
    ----------
    city : str
        City name exactly as it appears in ``df['City']``.

    Raises
    ------
    ValueError
        If ``df`` has no rows for ``city``.
    """
    counts = (
        df[df['City'] == city]
        .groupby(['AQI_Scale'])[["AQI"]]
        .count()
        .sort_values("AQI", ascending=False)
    )
    if counts.empty:
        raise ValueError("No rows found for city: " + repr(city))

    # `explode` needs exactly one entry per wedge. A city may have fewer than
    # six categories, so size the list from the data instead of hard-coding it.
    explode = [0.2] + [0] * (len(counts) - 1)

    plt.figure(figsize=(10,8))
    counts['AQI'].plot.pie(shadow=True,autopct='%1.1f%%',
                           explode=explode,
                           wedgeprops={'edgecolor':'black','linewidth':0.3}
                           ).set(title = 'Distribution of Quality of Air in '+city)
    plt.show()
    
# List the city names present in df, then prompt for one and plot its pie chart.
# NOTE(review): input() blocks a non-interactive "Run All".
print(df['City'].unique())
plotCityAQIScale(input('Enter the city: '))

In [None]:
# One panel per listed year: the four cities with the lowest mean AQI.
fig = plt.figure(figsize=(20,20))
for slot, year in enumerate([2015,2016,2017,2018,2019,2020], start=1):
    ax = fig.add_subplot(3, 2, slot)
    yearly_mean = df[df['Year']==year].groupby('City', as_index=False)[['AQI']].mean()
    # Ascending sort puts the cleanest cities first.
    leastPolluted = yearly_mean.sort_values(['AQI']).reset_index().head(4)
    sb.barplot(x = 'City', y = 'AQI', data = leastPolluted, ax = ax).set(title = '4 Least polluted cities in '+str(year))

In [None]:
# One panel per listed year: the four cities with the highest mean AQI.
fig = plt.figure(figsize=(20,20))
for slot, year in enumerate([2015,2016,2017,2018,2019,2020], start=1):
    ax = fig.add_subplot(3, 2, slot)
    yearly_mean = df[df['Year']==year].groupby('City', as_index=False)[['AQI']].mean()
    # Descending sort puts the most polluted cities first.
    mostPolluted = yearly_mean.sort_values(['AQI'], ascending = False).reset_index().head(4)
    sb.barplot(x = 'City', y = 'AQI', data = mostPolluted, ax = ax).set(title = '4 Most polluted cities in '+str(year))

In [None]:
def histAQI(city):
    """Show a 100-bin probability-density histogram of AQI.

    ``city`` is either a city name as stored in the global ``df`` or the word
    'all' (any letter case) to use every row. For a single city the chart
    title gets the suffix 'for <city>'.
    """
    use_everything = city.lower() == 'all'
    data = df if use_everything else df[df['City']==city]
    title = '' if use_everything else 'for ' + city
    fig = pex.histogram(
        data_frame = data,
        x = 'AQI',
        nbins = 100,
        histnorm = 'probability density',
        title = 'Histogram of AQI Distribution '+title,
    )
    fig.show()

In [None]:
# List the available city names, then prompt for a city (or 'All') and draw its AQI histogram.
print(df['City'].unique())
histAQI(input("Enter city name or 'All' for every city: "))

In [None]:
def PollutantsLineOverYears(city, poll = 'AQI'):
    """Plot the monthly mean of column ``poll``, one line per year.

    Parameters
    ----------
    city : str
        City name as stored in the global ``df``, or 'all' (any letter case)
        to average over every row.
    poll : str
        Column of ``df`` to average; defaults to 'AQI'.
    """
    use_everything = city.lower() == 'all'
    data = df if use_everything else df[df['City']==city]
    title = '' if use_everything else city
    # Mean of the chosen column for every (Year, Month) pair.
    monthly = data.groupby(['Year','Month'], as_index=False)[[poll]].mean().reset_index()
    fig = pex.line(
        data_frame = monthly,
        x = 'Month',
        y = poll,
        color = 'Year',
        title = poll+' Distribution over Years: '+title,
    )
    fig.show()

In [None]:
# Prompt for a city (or 'All') and plot its monthly mean AQI per year.
city = input("Enter city name or 'All' for every city: ")
PollutantsLineOverYears(city)

In [None]:
# Same plot for a user-chosen column; the entered text is passed straight through as `poll`.
city = input("Enter city name or 'All' for every city: ")
pollutant = input("Enter the pollutant: ")
PollutantsLineOverYears(city, poll = pollutant)

In [None]:
def cityColTrend(city, col):
    """Line chart of column ``col`` over 'Datetime' for one city of the global ``df``.

    The chart is titled '<col>: <city>'.
    """
    city_rows = df[df['City'] == city]
    chart_title = col + ": " + city
    fig = pex.line(data_frame = city_rows, x = 'Datetime', y = col, title = chart_title)
    fig.show()

In [None]:
# AQI over time for the rows whose City is 'Delhi'.
cityColTrend('Delhi', 'AQI')

In [None]:
# Rows dated after 2020-02-20; the print shows the latest date present in that subset.
dfLockdown = df[df['Datetime']>'2020-02-20']
print(dfLockdown['Datetime'].max())

In [None]:
# Keep months 2 through 6 (February to June) of every year.
new = df[(df['Month'] >= 2) & (df['Month'] < 7)]
new

In [None]:
# Mean of the selected columns for every (Year, Month) pair of the February-June subset.
cols = ['PM2.5', 'PM10', 'NO2', 'NH3', 'CO', 'SO2', 'O3', 'AQI']
groupedData = new.groupby(['Year', 'Month'])[cols].mean().reset_index()
groupedData.head()

In [None]:
# One line chart per column in `cols`: monthly mean against Month, one line per year.
for col in cols:
    fig = pex.line(data_frame = groupedData, x = 'Month', y = col, color = 'Year', title = col)
    fig.show()

In [None]:
def MovingNAverage(rows, df):
    """Add a 'MovingNAverage' column holding the trailing mean of 'AQI'.

    The value at position i is the mean of ``df['AQI']`` at positions
    i-rows .. i-1; the current row is not part of its own average. The first
    ``rows`` positions are NaN because they lack a full window. The frame is
    modified in place and nothing is returned.

    Parameters
    ----------
    rows : int
        Window length; must be at least 1.
    df : pandas.DataFrame
        Frame with an 'AQI' column. Any index is accepted; the window is
        positional.

    Raises
    ------
    ValueError
        If ``rows`` is smaller than 1.
    """
    if rows < 1:
        raise ValueError("rows must be a positive integer")
    # shift(1) moves every AQI value one row down so the window ending at row i
    # covers rows i-rows .. i-1; rolling(rows) then needs a full window, which
    # leaves the first `rows` entries as NaN. This replaces a per-cell .loc
    # loop that was O(n * rows), required a 0..n-1 index and wrote floats into
    # an integer column.
    df['MovingNAverage'] = df['AQI'].shift(1).rolling(rows).mean()

In [None]:
# Delhi rows only; reset_index() renumbers the rows from 0 and keeps the old labels in an 'index' column.
delhi = df[df['City'] == 'Delhi'].reset_index()

In [None]:
# Attach the 3-row trailing average of AQI to the Delhi frame.
MovingNAverage(3, delhi)

plt.figure(figsize = (20,10))
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally. Each line carries its own label so the legend entries cannot
# be matched to the wrong artist.
sb.lineplot(x = delhi.Datetime, y = delhi.AQI, label = 'Actual')

plot = sb.lineplot(x = delhi.Datetime,
                   y = delhi.MovingNAverage,
                   color = 'red',
                   label = 'Prediction By Moving average')

plot.legend(title='legends', loc='upper left')

plot.set_ylabel('AQI index');
plot.set_xlabel('Date');