In [None]:
import numpy as np
import pandas as pd

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib import rcParams
import seaborn as sns
from scipy.stats import skew

#### Analyzing the Indian air quality data for each city-day

In [None]:
# Load the per-city, per-day air quality table; 'Date' is parsed to datetime
# so the .dt accessors used later in the notebook work.
df = pd.read_csv(
    '../input/air-quality-data-in-india/city_day.csv',
    parse_dates=['Date'],
)
df.head()

In [None]:
# Quick statistical summary of df using pandas' default describe() settings.
df.describe()

In [None]:
# Share of null entries per column (null count divided by row count),
# sorted so the most incomplete features come first.
missing_values = (
    df.isna()
      .sum()
      .div(len(df))
      .to_frame('Proportion')
      .sort_values(by='Proportion', ascending=False)
)

# Shade the table so higher proportions stand out.
missing_values.style.background_gradient(cmap='Blues')

In [None]:
# Global matplotlib defaults for every figure in this notebook, applied in one call.
rcParams.update({
    # Drop the top and right axes borders.
    'axes.spines.top': False,
    'axes.spines.right': False,
    # High-resolution rendering with automatic tight layout.
    'figure.dpi': 300,
    'figure.autolayout': True,
    # Small fonts and thin lines (figure sizes used below are small in inches).
    'font.style': 'normal',
    'font.size': 4,
    'lines.linewidth': 0.7,
    'xtick.labelsize': 4,
    'ytick.labelsize': 4,
})


In [None]:
# Average AQI per city, keeping the ten cities with the highest mean.
# groupby(...).mean() already returns a DataFrame, so no extra wrapping is needed;
# reset_index() turns the 'City' index back into a column for plotting.
top_aqi = (
    df.groupby('City')[['AQI']]
      .mean()
      .sort_values(by='AQI', ascending=False)
      .head(10)
      .reset_index()
)

# The bundled seaborn styles were renamed to 'seaborn-v0_8-*' in matplotlib 3.6
# and the old names removed in 3.8; prefer the new name, fall back to the old one.
whitegrid_style = 'seaborn-v0_8-whitegrid'
if whitegrid_style not in plt.style.available:
    whitegrid_style = 'seaborn-whitegrid'
plt.style.use(whitegrid_style)

# Horizontal bar chart of the average AQI per city.
plt.figure(figsize=(3,1.5))
sns.barplot(data=top_aqi,x='AQI',y='City',orient='h',palette='viridis')

In [None]:
# Derive calendar month and year from 'Date'. Both are stored as strings,
# so later filters compare against string values such as '2018'.
date_parts = df['Date'].dt
df['Month'] = date_parts.month.astype(str)
df['Year'] = date_parts.year.astype(str)

In [None]:
# Yearly trend of each particulate-matter / gas measurement, one panel per pollutant.
pollutants = [
    'PM2.5', 'PM10', 'NO2', 'NOx', 'NH3',
    'CO', 'SO2', 'O3', 'Benzene', 'Toluene', 'Xylene',
]

# Skip the first two columns (presumably City and Date - verify against df.head()).
measurements = df.iloc[:, 2:]

fig = plt.figure(figsize=(3.2, 6.5))
# 6x2 grid; subplot positions are 1-based.
for position, pollutant in enumerate(pollutants, start=1):
    panel = fig.add_subplot(6, 2, position)
    sns.lineplot(data=measurements, x='Year', y=pollutant, ax=panel)



In [None]:
# Distribution of each gas / particulate measurement, one histogram per pollutant.
pollutants = [
    'PM2.5', 'PM10', 'NO2', 'NOx', 'NH3',
    'CO', 'SO2', 'O3', 'Benzene', 'Toluene', 'Xylene',
]

fig = plt.figure(figsize=(3, 6.5))

# 6x2 grid; subplot positions are 1-based. Nulls are dropped before binning.
for position, pollutant in enumerate(pollutants, start=1):
    panel = fig.add_subplot(6, 2, position)
    panel.hist(df[pollutant].dropna(), bins=20, edgecolor='white')
    panel.set_xlabel(pollutant)
    panel.set_ylabel('Count')

In [None]:
# Correlation analysis: Pearson correlation between the numeric features.

plt.figure(figsize=(3,2))

# Restrict to numeric columns explicitly. Since pandas 2.0, DataFrame.corr() no
# longer drops non-numeric columns silently and raises on string columns instead.
# The matrix is computed once and reused for both the mask and the heatmap.
corr = df.select_dtypes(include='number').corr(method='pearson')

# Hide the upper triangle (including the diagonal). The mask is built from the
# matrix shape rather than its values, so a correlation of exactly 0 stays masked.
mask = np.triu(np.ones_like(corr, dtype=bool))

sns.heatmap(corr,
            annot=True,fmt='0.1f',
            mask=mask,
            robust=True,
            cmap='pink')
plt.title('Correlation Analysis')

In [None]:
# Analyzing skewness in the data.
# The author observes high skewness in Benzene, Toluene, CO and Xylene.

# Only numeric columns have a skewness; selecting them explicitly avoids the
# TypeError that DataFrame.skew() raises on string columns since pandas 2.0.
# The result is named `skewness` so it does not shadow scipy.stats.skew,
# which is imported at the top of the notebook.
skewness = (
    df.select_dtypes(include='number')
      .skew()
      .sort_values(ascending=False)
      .to_frame('Skewness')
)

plt.figure(figsize=(2.5,1.8))
ax = sns.barplot(x=skewness.index,y='Skewness',data=skewness,
                 edgecolor='black',
                 linewidth=0.2,
                 palette='viridis_r')
# Rotate the feature names after plotting so the rotation applies to the
# categorical ticks that barplot creates.
ax.tick_params(axis='x', rotation=75)
plt.xlabel('Features')
plt.xlabel('Features')

In [None]:
# Plot the average AQI per year for five cities that recorded days with AQI >= 500.

# Number of days per city on which AQI reached 500 or more. The original
# expression was neither the last line of the cell nor assigned, so its result
# was silently discarded; print it so the counts are actually visible.
severe_day_counts = df.loc[df['AQI'] >= 500, 'City'].value_counts()
print(severe_day_counts)

# Cities singled out by the original analysis (hard-coded).
# TODO confirm they still match the top of the counts above if the data changes.
cities = ['Ahmedabad','Delhi','Patna','Gurugram','Lucknow']

for city in cities:
    plt.figure(figsize=(2,1.5))
    # Rows for the selected city; barplot shows the mean AQI per year.
    city_rows = df[df['City'] == city]
    sns.barplot(x='Year',y='AQI',data=city_rows)
    plt.title(city)

#### The pie charts below are easy to anticipate: in the first bar chart above we see a higher AQI for Ahmedabad, followed by Delhi, Gurugram, etc. Thus the amount of particulate matter and of the gases that contribute significantly to air pollution is higher for cities like Delhi and Ahmedabad.

In [None]:
# Pollutants (particulate matter, carbon monoxide and nitrogen oxides) that the
# author reports as having a correlation >= 0.5 with AQI, and which are
# therefore treated as the drivers of high AQI in cities.
cols=['PM2.5','PM10','CO','NO','NO2']

# Number of cities shown per pie; colours and explode offsets are sized from it
# so the three stay in sync.
top_n = 8

cmap=plt.get_cmap('Spectral')
color=[cmap(i) for i in np.linspace(0,1,top_n)]
# Pull the largest slice (the first one) out of the pie.
explode=[0.2]+[0]*(top_n-1)

for col in cols:
    plt.figure(figsize=(2.8,1.8))

    # Sum the pollutant per city and keep the top_n cities with the highest total.
    # (The original also called reset_index() here and discarded the result.)
    city_totals=df.groupby('City')[col].sum().sort_values(ascending=False)
    city_totals.head(top_n).plot.pie(shadow=True,autopct='%1.1f%%',
                                     colors=color,explode=explode,
                                     wedgeprops={'edgecolor':'black','linewidth':0.3}
                                     )

In [None]:
# CO is deliberately left out: taking its square root could change its
# correlation with AQI. Only features that do not have much correlation with
# AQI are square-root transformed.
cols=['O3','NH3','Benzene','Toluene','Xylene']

# The transform overwrites the columns in place, so running this cell twice
# would take the square root of already-transformed values. Record the
# transformed columns on the frame itself (df.attrs) and skip them on a re-run;
# a freshly loaded df has empty attrs and is transformed again as expected.
already_transformed=set(df.attrs.get('sqrt_transformed',()))
for col in cols:
    if col in already_transformed:
        continue
    df[col]=np.sqrt(df[col])
    already_transformed.add(col)
df.attrs['sqrt_transformed']=sorted(already_transformed)

#### Below we observe that November had the highest AQI, which means the air quality was poorest in that month

In [None]:
# Per the yearly bar plots, the author reports that Ahmedabad's highest average
# AQI was in 2018, so plot its monthly AQI for that year.

# 'Year' holds strings, hence the comparison with '2018' rather than 2018.
in_ahmedabad = df['City'] == 'Ahmedabad'
in_2018 = df['Year'] == '2018'
ahmedabad_2018 = df[in_ahmedabad & in_2018]

plt.figure(figsize=(2.5, 2))
sns.barplot(data=ahmedabad_2018, x='Month', y='AQI')

#### From the heatmap above we observe a strong correlation of NO with NOx and of Benzene with Toluene, so let's visualize them with scatter plots

In [None]:
# Scatter plot of NO against NOx across all rows of df.
plt.figure(figsize=(2.3, 1.5))
sns.scatterplot(
    data=df,
    x='NO',
    y='NOx',
    color='green',
    s=8,
)

#### Below we observe that the points are crowded in the range 0 to 10

In [None]:
# Scatter plot of Benzene against Toluene across all rows of df.
plt.figure(figsize=(2.3, 1.5))
sns.scatterplot(
    data=df,
    x='Benzene',
    y='Toluene',
    s=8,
)

In [None]:
# Bar plot of how many rows (city-days) each city has with AQI below 50.
# From this we can infer which cities have relatively good air quality.

# Name the index and the counts explicitly. pandas >= 2.0 calls the
# value_counts() result 'count' (older versions used the source column name),
# so renaming a 'City' column to 'Count' no longer works there.
good_days=(
    df.loc[df['AQI']<50,'City']
      .value_counts()
      .rename_axis('City')
      .reset_index(name='Count')
)

plt.figure(figsize=(3,1.8))
sns.barplot(x='Count',y='City',data=good_days,palette='viridis_r')


#### From the plot above we infer that Amaravati, followed by Hyderabad, has the highest number of days with AQI below 50.

In [None]:

# Compare the yearly AQI trend of Amaravati with that of Ahmedabad.
amaravati=df[df['City']=='Amaravati']
ahmedabad=df[df['City']=='Ahmedabad']

# The bundled seaborn styles were renamed to 'seaborn-v0_8-*' in matplotlib 3.6
# and the old names removed in 3.8; prefer the new name, fall back to the old one.
whitegrid_style='seaborn-v0_8-whitegrid'
if whitegrid_style not in plt.style.available:
    whitegrid_style='seaborn-whitegrid'
plt.style.use(whitegrid_style)

# Two stacked panels: Amaravati on top, Ahmedabad below.
fig,ax=plt.subplots(2,1)
fig.set_size_inches(2.5,2.5)
# lineplot draws on the Axes passed via ax= and returns that same Axes,
# so there is no need to assign the result back into the array.
sns.lineplot(x='Year',y='AQI',data=amaravati,ax=ax[0],color='orange')
sns.lineplot(x='Year',y='AQI',data=ahmedabad,ax=ax[1])
# The title names the city that is actually filtered above ('Amaravati');
# 'Amravati' is a different city.
ax[0].set_title('AQI distribution of Amaravati over Years')
ax[1].set_title('AQI distribution of Ahmedabad over Years')
    

#### From the plots above it is easy to infer that over the years the AQI of Amaravati has decreased compared to that of Ahmedabad. However, for Ahmedabad we see a drastic fall in AQI towards 2020. The primary reason is COVID: cities were less crowded, and as a result almost all cities showed an improvement in air quality in 2020

### Do upvote if you find it useful:)