In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import math
import seaborn as sns
import time 

In [None]:
#read the Pune AQI csv into a dataframe and preview the first rows.
DATA_PATH='../input/pune-air-quality-index/PNQ_AQI.csv'
df=pd.read_csv(DATA_PATH)
df.head()

In [None]:
#summarise the frame: column dtypes and non-null counts (reveals missing values).
df.info()

In [None]:
#the CO2 column is empty, so remove it, and parse the Date column
#into a datetime dtype in the same pass.
df=(
    df.drop(columns=['CO2 µg/m3'])
      .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
)


In [None]:
#the same monitoring locations appear under different names; map the
#alternative names onto one canonical name per location.
rep={'MPCB-KR':'Karve Road','MPCB-SWGT':'Swargate','MPCB-BSRI':'Bhosari','MPCB-NS':'Nal Stop','MPCB-PMPR':'Pimpri','Pimpri Chinchwad':'Chinchwad'}

#assign the result back instead of calling replace(..., inplace=True) on the
#column selection: the chained in-place form warns in pandas 2.x and leaves
#df unchanged once Copy-on-Write is enabled.
df['Location']=df['Location'].replace(rep)

In [None]:
#fill the missing values: order the rows by date, then forward-fill each gap
#from the previous row and back-fill whatever is still empty at the start.
#NOTE(review): the fill runs over the whole date-sorted frame, so a gap can be
#filled from a row belonging to a different location - confirm this is intended.
df=df.sort_values(by='Date')
df=df.ffill()
df=df.bfill()
df.info()

In [None]:
#SO2 and NOx are stored as object dtype; convert both to float.
#entries that cannot be parsed become NaN (errors='coerce') and are then
#filled from the neighbouring rows (forward first, then backward).
for pollutant in ('SO2 µg/m3','Nox µg/m3'):
    as_number=pd.to_numeric(df[pollutant],errors='coerce')
    df[pollutant]=as_number.ffill().bfill()

df.info()

In [None]:
#add a categorical feature that bands each AQI value into a rating.
#the last bin is open-ended (np.inf) instead of ending at df['AQI'].max():
#using the data maximum raises "bins must increase monotonically" whenever
#the maximum is <= 300. include_lowest=True keeps an AQI of exactly 0 in 'good'.
df['AQI Rating']=pd.cut(df['AQI'],bins=[0,50,100,150,200,300,np.inf],include_lowest=True
                        ,labels=['good','moderate','moderately unhealth','unhealthy','very unhealthy','Hazardous'])
df.head()

In [None]:
#count of observations in each AQI rating, one panel per location.
plt.style.use('Solarize_Light2')
plt.rcParams['font.size']=10
locations=list(df['Location'].unique())

#size the grid from the data rather than assuming exactly six locations
#(a fixed 3x2 grid raises IndexError with fewer and drops any beyond six).
n_cols=2
n_rows=max(1,math.ceil(len(locations)/n_cols))
fig,ax=plt.subplots(n_rows,n_cols,figsize=(16,16*n_rows/3),constrained_layout=True,squeeze=False)

for panel,location in zip(ax.flat,locations):
    #pass the series as x=: newer seaborn does not accept it positionally.
    sns.countplot(x=df.loc[df['Location']==location,'AQI Rating'],ax=panel)
    panel.set_title(location,y=1,loc='right')
    panel.set_xlabel('')
    panel.set_ylabel('Num_Observations',size=10)

#hide any panel left over when the number of locations does not fill the grid.
for panel in ax.flat[len(locations):]:
    panel.set_visible(False)
plt.show()


In [None]:
#density curve of AQI for each location, drawn on one shared axes.
plt.style.use('ggplot')
fig,ax=plt.subplots(figsize=(14,6))
for key,grp in df.groupby('Location'):
    #sns.distplot is deprecated; kdeplot is the replacement for
    #distplot(..., hist=False).
    sns.kdeplot(grp['AQI'],label=key,ax=ax)
ax.set_title('Distribution plots of AQI')
ax.set_xlabel('AQI')
ax.set_ylabel('Distribution')
ax.legend();


In [None]:
#SPM had missing values, so show density plots for SO2, RSPM and NOx only.
def plot_density_by_location(column,title,xlabel):
    """Draw one density curve of `column` per location on a new figure.

    column : name of the df column to plot.
    title  : figure title.
    xlabel : x-axis label.
    Returns the matplotlib Axes that was drawn on.
    """
    fig,ax=plt.subplots(figsize=(14,6))
    for key,grp in df.groupby('Location'):
        #sns.distplot is deprecated; kdeplot replaces distplot(..., hist=False).
        sns.kdeplot(grp[column],label=key,ax=ax)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Distribution')
    ax.legend()
    return ax


#(column, title, x label) for each pollutant; one figure per entry.
for column,title,xlabel in [
    ('SO2 µg/m3','Distribution plots of So2','SO2 '),
    ('RSPM µg/m3','Distribution plots of RSPM','RSPM '),
    ('Nox µg/m3','Distribution plots of Nox µg/m3','Nox µg/m3'),
]:
    plot_density_by_location(column,title,xlabel)


In [None]:
#overall AQI time series for each location, with one annotation per year
#marking the month whose mean AQI was highest in that year.

#'seaborn-dark' was renamed 'seaborn-v0_8-dark' in matplotlib 3.6 and the old
#name was later removed; use whichever one this matplotlib provides.
dark_style=next((s for s in ('seaborn-v0_8-dark','seaborn-dark') if s in plt.style.available),'default')
plt.style.use(dark_style)
fig,ax=plt.subplots(figsize=(16,12))
for key,grp in df.groupby('Location'):
    ax.plot(grp['Date'],grp['AQI'],linewidth=.5,label=key)

#annotating the month of max mean AQI in each year
for year,df_y in df.groupby(df['Date'].dt.year):
    #mean AQI per calendar month within this year. The earlier loop reset its
    #dict on every month and averaged a leftover per-location group, so it
    #always picked the last month present in the year.
    monthly_mean=df_y.groupby(df_y['Date'].dt.month)['AQI'].mean()
    worst_month=monthly_mean.idxmax()
    #use monthly_mean.idxmin() instead to mark the month of lowest mean AQI

    #anchor the arrow on the middle observation of that month.
    a=df_y[df_y['Date'].dt.month==worst_month].reset_index(drop=True)
    a=a.iloc[len(a)//2]

    x,y=a['Date'],a['AQI']
    ax.annotate('High m/y = {}/{}'.format(a['Date'].month,a['Date'].year)
                ,size=7,xy=(x,y+10),xytext=(x,y+200)
                ,arrowprops=dict(arrowstyle='-> ,head_width=.7',lw=.7,color='black'),ha='center',va='top')
ax.legend()
ax.set_title('AQI over the years')
ax.set_xlabel('Year')
ax.set_ylabel('AQI');


The plot shows that the worst monthly-average AQI almost always falls in December. AQI follows an upward trend through the winter and peaks in December. It then declines after
the winter months and reaches its best values in the monsoon months.

In [None]:
#mean AQI for each calendar month (rows) and location (columns), over all years.
monthly_avg=pd.DataFrame({
    location:grp.groupby(grp['Date'].dt.month)['AQI'].mean()
    for location,grp in df.groupby('Location')
})
months={1:'Jan',2:'Feb',3:'Mar',4:'Apr',5:'May',6:'June',7:'July',8:'Aug',9:'Sep',10:'Oct',11:'Nov',12:'Dec'}
#relabel the month numbers directly on the index. The earlier
#reset_index / column.replace(inplace=True) / set_index round trip relied on
#a chained in-place call that warns in pandas 2.x and does nothing under
#Copy-on-Write.
monthly_avg=monthly_avg.rename(index=months)
monthly_avg.index.name='Months'
monthly_avg

In [None]:
#line plot of the monthly mean AQI table, one line per location.
plt.style.use('ggplot')
monthly_ax=monthly_avg.plot(figsize=(16,10))
monthly_ax.set_title('Mean AQI for each Location')
monthly_ax.set_ylabel('Mean AQI')
monthly_ax.set_xlabel('Month')


The plot shows how AQI values peak during the winter months, fall during the summer months, and are lowest in the monsoon months.