# Air Quality Index and Viral Load Comparison in the Indian Context:

According to the article at https://scroll.in/article/978454/coronavirus-the-record-high-number-of-new-cases-in-delhi-is-a-direct-effect-of-air-pollution and the Guardian article (https://www.theguardian.com/world/2020/nov/11/delhi-covid-crisis-worsened-by-soaring-pollution-levels), Delhi's current condition in the pandemic is directly related to its air pollution, as the Covid-19 virus can "piggyback" on PM2.5 particles, which can make its effects more severe.
Here, we will look at the Air Quality Index of different Indian cities over the past 5 years and draw conclusions about the expected air pollution in the city of Delhi in particular.
Further, since air pollution was comparatively low in all cities during the lockdown, we will look at the Covid cases in Delhi in particular and check whether they correlate with the Air Quality Index.
Finally, based on the pollution trends in different cities (increasing or decreasing), we can derive useful insights about the third wave, expected this year around October.


In [None]:
import numpy as np
import scipy.stats
import scipy.special
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
from matplotlib import cm
import pandas as pd
from sklearn.pipeline import make_pipeline, make_union, Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import ParameterGrid
from keras.models import Sequential
from keras.models import Model as KerasModel
from keras.layers import Input, Dense, Activation, Reshape
from keras.layers import Concatenate
from keras.layers.embeddings import Embedding
from sklearn.ensemble import RandomForestRegressor
from sklearn import linear_model
import pickle
import csv
from datetime import datetime
from sklearn import preprocessing
from keras.callbacks import ModelCheckpoint
import xgboost as xgb
from sklearn.preprocessing import OneHotEncoder

#Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

#For Missing Value and Feature Engineering
from sklearn.feature_selection import SelectKBest, chi2, f_classif, VarianceThreshold
from sklearn.impute import SimpleImputer, KNNImputer, MissingIndicator
from sklearn.preprocessing import KBinsDiscretizer, LabelEncoder, MinMaxScaler
from sklearn.decomposition import PCA

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier

import time
import seaborn as sns
%matplotlib inline

### Datasets:

In [None]:
#air quality index 
aqi=pd.read_csv("../input/air-quality-data-in-india/city_hour.csv")

In [None]:
aqi.head()

In [None]:
delhitemp=pd.read_csv("../input/delhi-temperature/dataexport_20210608T105529.csv")

In [None]:
delhitemp[9:]

In [None]:
# Delhi temperature table: take each source column from row position 9 onward
# and give it a descriptive name.
Dtmp = pd.DataFrame({
    'timestamp': delhitemp['location'][9:],
    'maximum': delhitemp['New Delhi'][9:],
    'minimum': delhitemp['New Delhi.1'][9:],
    'mean': delhitemp['New Delhi.2'][9:],
})

In [None]:
Dtmp.reset_index(inplace=True)

In [None]:

# Air quality index from 2015-2020 in Delhi.
# Take an explicit copy: this frame is modified in place by later cells, and
# mutating a filtered selection of the full table triggers pandas'
# SettingWithCopyWarning (the write may not land where intended).
aqi=aqi[aqi.City =='Delhi'].copy()

In [None]:
aqi.dropna(how='all',inplace=True)

In [None]:
pd.DatetimeIndex(aqi['Datetime'])

In [None]:
#covid cases in districts
coviddist=pd.read_csv("../input/covid19-corona-virus-india-dataset/district_level_latest.csv")

In [None]:
coviddist.head()

In [None]:
coviddist=coviddist[coviddist.State=='Delhi']

In [None]:
coviddist

In [None]:
#covid cases in states daily
coviddaily=pd.read_csv("../input/covid19-corona-virus-india-dataset/state_level_daily.csv")

In [None]:
coviddaily.head()

In [None]:
coviddaily=coviddaily[coviddaily.State_Name=='Delhi'].dropna(how='any')

In [None]:
coviddaily.head()

In [None]:
#testing in states daily 
covidtests=pd.read_csv("../input/covid19-corona-virus-india-dataset/tests_state_wise.csv")

In [None]:
covidtests.head()

In [None]:
covidtests=covidtests[covidtests.State=='Delhi'].dropna(how='any')

In [None]:
stations=pd.read_csv("../input/air-quality-data-in-india/station_day.csv")

In [None]:
stations.head()

### Data Preprocessing:

In [None]:
#helper function to view the table content details
from IPython.display import display, HTML

def pretty_print(df):
    """Display ``df`` as an HTML table.

    Every literal backslash-n sequence in the generated markup is replaced
    with a ``<br>`` tag so multi-line cell text renders on separate lines.
    Returns whatever ``display`` returns.
    """
    markup = df.to_html()
    markup = markup.replace("\\n", "<br>")
    return display(HTML(markup))
def tbl_report(tbl, cols=None, card=52):
    """Print the shape of ``tbl`` and display a per-column summary table.

    The summary has one row per column of ``tbl`` with these fields:
    ``dtype``, ``nulls`` (count of null entries), ``uniques`` (array of
    distinct values), ``num_uniques`` and ``value_counts``.

    Args:
        tbl: DataFrame to summarise.
        cols: Optional list of summary fields to show; when empty or None
            all five fields are shown.
        card: Cardinality threshold. Columns with fewer than ``card``
            distinct values get a "value:count" listing (one per line,
            nulls included); other columns show 'NA'.

    Returns:
        The result of ``pretty_print`` on the summary frame.
    """
    print("Table Shape", tbl.shape)
    dtypes = tbl.dtypes
    nulls = []
    uniques = []
    numuniques = []
    vcs = []
    for col in dtypes.index:
        column = tbl[col]
        nulls.append(column.isnull().sum())
        uniqs = column.unique()
        uniquenums = uniqs.shape[0]
        if uniquenums < card:  # low cardinality: list every value with its count
            # Series.value_counts replaces the deprecated top-level pd.value_counts.
            valcounts = column.value_counts(dropna=False)
            vc = "\n".join("{}:{}".format(k, v) for k, v in valcounts.items())
        else:
            vc = 'NA'
        uniques.append(uniqs)
        numuniques.append(uniquenums)
        vcs.append(vc)
    nullseries = pd.Series(nulls, index=dtypes.index)
    uniqueseries = pd.Series(uniques, index=dtypes.index)
    numuniqueseries = pd.Series(numuniques, index=dtypes.index)
    vcseries = pd.Series(vcs, index=dtypes.index)
    df = pd.concat([dtypes, nullseries, uniqueseries, numuniqueseries, vcseries], axis=1)
    df.columns = ['dtype', 'nulls', 'uniques', 'num_uniques', 'value_counts']
    if cols:
        return pretty_print(df[cols])
    return pretty_print(df)

In [None]:
# Fill missing values in aqi: KNN imputation for the integer/float columns,
# most-frequent-value imputation for every other column.
numerical_features = [name for name, col_dtype in aqi.dtypes.items()
                      if col_dtype.kind in ('i', 'f')]
categorical_features = [name for name, col_dtype in aqi.dtypes.items()
                        if col_dtype.kind not in ('i', 'f')]

kimput = KNNImputer(n_neighbors=2, weights="uniform")
simput = SimpleImputer(strategy='most_frequent', fill_value='missing')

numeric_filled = kimput.fit_transform(aqi[numerical_features])
aqi[numerical_features] = numeric_filled
categorical_filled = simput.fit_transform(aqi[categorical_features])
aqi[categorical_features] = categorical_filled
    
    

In [None]:
# Datasets for the air quality index of different June-to-June years, plus
# calendar year 2020.
# Windows are half-open (start <= t < end): with strict bounds on both sides the
# reading stamped exactly 10:00 on 1 June would be missing from every window.
_stamps = pd.DatetimeIndex(aqi['Datetime'])  # parse once, reuse for every mask
_june = {year: pd.Timestamp(f'{year}-06-01 10:00:00') for year in range(2015, 2021)}
aqi15 = aqi[(_stamps >= _june[2015]) & (_stamps < _june[2016])]
aqi16 = aqi[(_stamps >= _june[2016]) & (_stamps < _june[2017])]
aqi17 = aqi[(_stamps >= _june[2017]) & (_stamps < _june[2018])]
aqi18 = aqi[(_stamps >= _june[2018]) & (_stamps < _june[2019])]
aqi19 = aqi[(_stamps >= _june[2019]) & (_stamps < _june[2020])]
aqi20 = aqi[_stamps.year == 2020]
# Lists grouping the frames: the yearly AQI frames, and the 2020 AQI frame
# together with the Covid tables.
df_list = [aqi15, aqi16, aqi17, aqi18, aqi19, aqi20]
df_list2 = [aqi20, coviddist, coviddaily, covidtests]

In [None]:
tbl_report(aqi15)

In [None]:
tbl_report(aqi16)

In [None]:
tbl_report(aqi17)

In [None]:
tbl_report(aqi18)

In [None]:
tbl_report(aqi19)

In [None]:
# Write each yearly frame to its own CSV file in the working directory.
_exports = {
    "./AQI15": aqi15,
    "./AQI16": aqi16,
    "./AQI17": aqi17,
    "./AQI18": aqi18,
    "./AQI19": aqi19,
    "./AQI20": aqi20,
}
for _path, _frame in _exports.items():
    _frame.to_csv(_path)

## Air Quality Index Comparison :

In [None]:
aqi15.tail()

In [None]:
# Heatmap of pairwise correlations in the 2019-20 Air Quality Index data.
# Only numeric/boolean columns are correlated: pandas 2.x raises on text
# columns (e.g. City, AQI_Bucket) instead of silently skipping them.
fig,axes=plt.subplots(figsize=(15,10))
sns.heatmap(aqi19.select_dtypes(include=['number','bool']).corr(),annot=True,linewidths=.5,cmap="YlGnBu",ax=axes)
plt.title("Correlation matrix showing correlation of 0.8 between AQI and PM2.5")

In [None]:
aqi19['AQI']

The formula that derives the AQI from the PM2.5 concentration is nonlinear overall (it is linear only between consecutive breakpoints).
It is given in https://forum.airnowtech.org/t/the-aqi-equation/169 :

$$AQI = \frac{AQI_{Hi} - AQI_{Lo}}{Conc_{Hi} - Conc_{Lo}} \times (Conc_i - Conc_{Lo}) + AQI_{Lo}$$

Where
Conci = Input concentration for a given pollutant
ConcLo = The concentration breakpoint that is less than or equal to Conci
ConcHi = The concentration breakpoint that is greater than or equal to Conci
AQILo = The AQI value/breakpoint corresponding to ConcLo
AQIHi = The AQI value/breakpoint corresponding to ConcHi


In [None]:
fig,axes=plt.subplots(figsize=(20,15))
sns.lineplot(data=aqi19,x='PM2.5',y='AQI',hue='AQI_Bucket',ax=axes)
plt.title("Relation between AQI and PM2.5 particles ")

In [None]:
#fig,axes=plt.subplots(figsize=(15,12))
sns.jointplot(data=aqi19,x='PM2.5',y='AQI',hue='AQI_Bucket')
plt.suptitle("Relation between AQI and PM2.5 particles ")

In [None]:
#fig,axes=plt.subplots(figsize=(15,12))
sns.catplot(data=aqi15,x='AQI_Bucket',y='AQI')
plt.title('AQI categories and their corresponding index values')
plt.savefig("./AQItypes")

In [None]:
months=['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sept','Oct','Nov','Dec']

In [None]:
aqi['month']=[months[m-1] for m in pd.DatetimeIndex(aqi['Datetime']).month ]

In [None]:
# Rebuild the yearly datasets from aqi as it stands in this cell: June-to-June
# years, plus calendar year 2020.
# Windows are half-open (start <= t < end): with strict bounds on both sides the
# reading stamped exactly 10:00 on 1 June would be missing from every window.
_stamps = pd.DatetimeIndex(aqi['Datetime'])  # parse once, reuse for every mask
_june = {year: pd.Timestamp(f'{year}-06-01 10:00:00') for year in range(2015, 2021)}
aqi15 = aqi[(_stamps >= _june[2015]) & (_stamps < _june[2016])]
aqi16 = aqi[(_stamps >= _june[2016]) & (_stamps < _june[2017])]
aqi17 = aqi[(_stamps >= _june[2017]) & (_stamps < _june[2018])]
aqi18 = aqi[(_stamps >= _june[2018]) & (_stamps < _june[2019])]
aqi19 = aqi[(_stamps >= _june[2019]) & (_stamps < _june[2020])]
aqi20 = aqi[_stamps.year == 2020]
# Lists grouping the frames: the yearly AQI frames, and the 2020 AQI frame
# together with the Covid tables.
df_list = [aqi15, aqi16, aqi17, aqi18, aqi19, aqi20]
df_list2 = [aqi20, coviddist, coviddaily, covidtests]

In [None]:
np.max(aqi19['AQI'])

In [None]:
fig,axes=plt.subplots(figsize=(15,8))
sns.histplot(data=aqi19, x='month',y='AQI',ax=axes)
plt.title("AQI in Different months of 2019-2020")


In [None]:

# Overlay the monthly AQI curve of every June-to-June year on one set of axes.
fig, axes = plt.subplots(figsize=(12, 8))
_yearly_frames = (
    ('2015-16', aqi15),
    ('2016-17', aqi16),
    ('2017-18', aqi17),
    ('2018-19', aqi18),
    ('2019-20', aqi19),
)
for _label, _frame in _yearly_frames:
    sns.lineplot(data=_frame, x='month', y='AQI', ax=axes, label=_label)
plt.title("AQI in Different months of different years till 2019-20")
plt.legend()
plt.savefig("./AQI5years")

In [None]:
# Overlay the monthly PM2.5 curve of every June-to-June year on one set of axes.
fig, axes = plt.subplots(figsize=(15, 8))
_yearly_frames = (
    ('2015-16', aqi15),
    ('2016-17', aqi16),
    ('2017-18', aqi17),
    ('2018-19', aqi18),
    ('2019-20', aqi19),
)
for _label, _frame in _yearly_frames:
    sns.lineplot(data=_frame, x='month', y='PM2.5', ax=axes, label=_label)
plt.title("PM2.5 in Different months from 2015-2020")
plt.legend()
plt.savefig("./PM2years")

#### As is evident, for the year 2020 the air quality index is available only up to the month of June. The AQI maintains a roughly linear relation with the PM2.5 concentration and with the PM10 concentration; there is a strong positive correlation between the AQI and each of them. The AQI was comparatively low in 2020 because of the lockdown. We will try to derive a correlation between the Covid spread in 2020 and 2021 and the AQI in India during that time.


In [None]:
indirapuram=pd.read_csv("../input/stationdatasets20202021/indirapuram-ghaziabad india-air-quality.csv")

In [None]:
indirapuram.head()

In [None]:
# Tag every row with its station name. A scalar is broadcast to all rows, so
# the cell no longer depends on the file holding exactly 527 records.
indirapuram['city']='indirapuram'

In [None]:
vasundhara=pd.read_csv("../input/stationdatasets2021/vasundhara-ghaziabad india-air-quality.csv")

In [None]:
# Keep the first 422 rows. Copy the slice so that columns added to it later
# are written to an independent frame (avoids SettingWithCopyWarning).
vasundhara=vasundhara[:422].copy()

In [None]:
# Tag every row with its station name. A scalar is broadcast to all rows, so
# the assignment works for whatever number of rows the frame holds.
vasundhara['city']='vasundhara'

In [None]:

Xtest=indirapuram
Xtest2=vasundhara

In [None]:
Xtest.columns

We will predict the AQI for the Indirapuram station with a model trained on the 2015, 2016, 2017, 2018 and 2019 data of Delhi, then look at the corona cases in Indirapuram in 2021 and check whether they have any correlation with the AQI.
Indirapuram is under consideration because, according to https://www.hindustantimes.com/noida/indirapuram-vaishali-and-vasundhara-have-highest-covid-19-caseload-in-december/story-afoURjBdVr0xRLRXIWKRTJ.html , Indirapuram had the highest number of cases in December 2020.

In [None]:
# Training data for the AQI regressors: stack the five June-to-June frames (2015-16 through 2019-20).
df=pd.concat([aqi15,aqi16,aqi17,aqi18,aqi19])
df.head()

In [None]:
# Feature table: the timestamp plus six pollutant columns, renamed to short
# lower-case names.
_feature_names = {
    'Datetime': 'date',
    'PM2.5': 'pm25',
    'PM10': 'pm10',
    'O3': 'o3',
    'NO2': 'no2',
    'SO2': 'so2',
    'CO': 'co',
}
X = df[list(_feature_names)].rename(columns=_feature_names)

In [None]:
y=df['AQI']

In [None]:
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest=train_test_split(X.drop(['date'],axis=1),y,train_size=0.8,random_state=1)

In [None]:
xtrain.shape,ytrain.shape

In [None]:
xtrain.head()

In [None]:
# Random-forest AQI regressor, tuned with a randomized hyper-parameter search
# (80 sampled settings, 5-fold cross-validation, all CPU cores).
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

rf=RandomForestRegressor(random_state=1)
# 'criterion' is deliberately left at its default (squared error): the old
# spelling 'mse' is rejected by newer scikit-learn releases, while the new
# spelling is unknown to older ones. The default is the same loss in both.
params={'max_depth':range(3,9),'n_estimators':range(100,200,20),'max_features':[3,4,5,6],'bootstrap': [True]}
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = params, n_iter = 80, cv = 5, verbose=2, random_state=1, n_jobs = -1)
rf_random.fit(xtrain,ytrain)

In [None]:
bst=rf_random.best_estimator_

In [None]:
from sklearn.metrics import r2_score

In [None]:
r2_score(ytrain,bst.predict(xtrain))

In [None]:
r2_score(ytest,bst.predict(xtest))

In [None]:
param_grid = {'learning_rate': [0.1, 0.01],
              'max_depth': [3, 8],
              'min_samples_leaf': [3, 5],  ## depends on the num of training examples
              'max_features': [0.2, 0.6]
              }

In [None]:
from sklearn.model_selection import GridSearchCV
gb = GradientBoostingRegressor(n_estimators=600, loss='huber')
gb_cv = GridSearchCV(gb, param_grid, cv=3, n_jobs=-1)

In [None]:
gb_cv.fit(xtrain,ytrain)

In [None]:
ytrain.shape

In [None]:
bstg=gb_cv.best_estimator_



In [None]:
r2_score(ytrain,bstg.predict(xtrain))

In [None]:
r2_score(ytest,bstg.predict(xtest))

In [None]:
tbl_report(Xtest)

In [None]:
def _clean_cell(raw):
    """Strip surrounding spaces; a cell holding exactly one space becomes NaN."""
    if raw == " ":
        return np.nan
    return raw.strip(" ")


# Clean every column except the first and the last, in both station frames.
for c in Xtest.columns[1:-1]:
    Xtest[c] = [_clean_cell(raw) for raw in Xtest[c]]
    Xtest2[c] = [_clean_cell(raw) for raw in Xtest2[c]]

In [None]:
#using on Xtest
# Split Xtest's columns by dtype kind: 'i'/'f' -> numerical, everything else
# -> categorical. Only categorical_features is used further down in this cell.
numerical_features = [c for c, dtype in zip(Xtest.columns, Xtest.dtypes)
                 if dtype.kind in ['i','f']]
categorical_features = [c for c, dtype in zip(Xtest.columns, Xtest.dtypes)
                 if dtype.kind not in ['i','f']]
# NOTE(review): fill_value presumably has no effect with strategy='most_frequent'
# (scikit-learn documents it for strategy='constant') — confirm.
simput=SimpleImputer(strategy = 'most_frequent', fill_value = 'missing')
# Fill gaps in the non-numeric columns with each column's most frequent value.
Xtest[categorical_features]=simput.fit_transform(Xtest[categorical_features])
# Convert every column except the first and the last to a numeric dtype.
for c in Xtest.columns[1:-1]:
    Xtest[c]=pd.to_numeric(Xtest[c])
# Predict with bstg on all columns except date and city.
AQI=bstg.predict(Xtest.drop(["date",'city'],axis=1))

In [None]:
# Same clean-up and prediction for the second station frame (Xtest2).
# Column names and dtypes are both read from Xtest2: pairing Xtest's names with
# Xtest2's dtypes mislabels columns whenever the two frames differ.
numerical_features = [c for c, dtype in zip(Xtest2.columns, Xtest2.dtypes)
                 if dtype.kind in ['i','f']]
categorical_features = [c for c, dtype in zip(Xtest2.columns, Xtest2.dtypes)
                 if dtype.kind not in ['i','f']]
simput=SimpleImputer(strategy = 'most_frequent', fill_value = 'missing')
# Fill gaps in the non-numeric columns with each column's most frequent value.
Xtest2[categorical_features]=simput.fit_transform(Xtest2[categorical_features])
# Convert every column except the first and the last to a numeric dtype.
for c in Xtest2.columns[1:-1]:
    Xtest2[c]=pd.to_numeric(Xtest2[c])
# Predict with bstg on all columns except date and city.
AQI2=bstg.predict(Xtest2.drop(["date",'city'],axis=1))

In [None]:
tbl_report(Xtest)

In [None]:
AQI

In [None]:
Xtest['AQI']=AQI
Xtest2['AQI']=AQI2

In [None]:
Xtest.to_csv("./model_result")
Xtest2.to_csv("./model_result2")

In [None]:
Xtest=pd.read_csv("./model_result")
Xtest2=pd.read_csv("./model_result2")

In [None]:
Xtest.drop("Unnamed: 0",axis=1,inplace=True)
Xtest2.drop("Unnamed: 0",axis=1,inplace=True)

In [None]:
Xtest.sort_values(by="date",inplace=True)
Xtest2.sort_values(by="date",inplace=True)

In [None]:
Xtest['month']=[months[m-1]for m in pd.DatetimeIndex(Xtest['date']).month]

In [None]:
Xtest2['month']=[months[m-1]for m in pd.DatetimeIndex(Xtest2['date']).month]

In [None]:
# The title covers both stations, so plot both frames together with one line
# per city (the frames are stacked here because each holds a single station).
_stations = pd.concat([Xtest, Xtest2], ignore_index=True)
fig,axes=plt.subplots(figsize=(15,8))
sns.lineplot(data=_stations, x='month',y='AQI',hue='city',ax=axes)
plt.title("AQI in Different months of 2020-2021(June to June) in Indirapuram and Vasundhara")

In [None]:

# The title covers both stations, so plot both frames together; hue='city'
# only separates the stations when both are present in the plotted data.
_stations = pd.concat([Xtest, Xtest2], ignore_index=True)
fig,axes=plt.subplots(figsize=(15,8))
sns.lineplot(data=_stations, x='month',y=' pm25',ax=axes,hue='city')
plt.title("PM2.5 in Different months of 06/2020-06/2021 in Indirapuram and Vasundhara")

In [None]:
Xtest=pd.concat([Xtest,Xtest2])

https://towardsdatascience.com/easy-steps-to-plot-geographic-data-on-a-map-python-11217859a2db

In [None]:
delhi=pd.read_csv("../input/delhi-weather-data/testset.csv")

In [None]:
delhi.head()

In [None]:
delhi['datetime_utc']=pd.DatetimeIndex(delhi['datetime_utc'])

In [None]:
delhi=delhi[pd.DatetimeIndex(delhi['datetime_utc']).year>2014]

In [None]:
pd.DatetimeIndex(Xtest['date'])

In [None]:
xtrain=pd.concat([aqi15,aqi16,aqi17])

In [None]:
xtrain.columns

In [None]:
# Parse the Datetime strings into a new datetime_utc column.
xtrain['datetime_utc']=pd.DatetimeIndex(xtrain['Datetime'])
# NOTE: the result of this drop is not assigned and inplace is not set, so
# xtrain itself is left unchanged by the statement below.
xtrain.drop(['Datetime','NO', 'NO2', 'NOx', 'NH3', 'CO',
       'SO2', 'O3', 'Benzene', 'Toluene', 'Xylene'],axis=1)

In [None]:
xtrain.drop(['Datetime','NO', 'NOx', 'NH3', 'Benzene', 'Toluene', 'Xylene'],axis=1,inplace=True)

In [None]:
xtrain.set_index("datetime_utc",drop=True,inplace=True)

In [None]:
xtrain

In [None]:
delhi.set_index("datetime_utc",drop=True,inplace=True)

In [None]:
x=xtrain.merge(delhi,left_index=True,right_index=True,how='inner')

In [None]:
tbl_report(x)

In [None]:
x.columns

In [None]:
# Drop these weather columns from x in place (each label listed once;
# ' _pressurem' and ' _rain' were previously repeated in the list).
x.drop([' _dewptm', ' _fog', ' _hail',
       ' _pressurem', ' _rain', ' _snow', ' _thunder', ' _tornado',
       ' _vism', ' _wdird', ' _wdire', ' _wgustm', ' _windchillm', ' _wspdm'],inplace=True,axis=1)

In [None]:
x.drop([' _heatindexm',' _precipm'],axis=1,inplace=True)

In [None]:
# Correlation heatmap of the merged pollution + weather table.
# Only numeric/boolean columns are correlated: pandas 2.x raises on text
# columns (e.g. City, AQI_Bucket, month) instead of silently skipping them.
fig,axes=plt.subplots(figsize=(10,10))
sns.heatmap(x.select_dtypes(include=['number','bool']).corr(),annot=True,linewidths=.5,cmap='ocean',ax=axes)
plt.title("Correlation matrix showing correlation between PM2.5 or AQI with humidity and temperature")
plt.savefig("./correlationmatrix")

In [None]:
# Fill missing values in x: KNN imputation for the integer/float columns,
# most-frequent-value imputation for every other column.
numerical_features = [name for name, col_dtype in x.dtypes.items()
                      if col_dtype.kind in ('i', 'f')]
categorical_features = [name for name, col_dtype in x.dtypes.items()
                        if col_dtype.kind not in ('i', 'f')]

kimput = KNNImputer(n_neighbors=2, weights="uniform")
simput = SimpleImputer(strategy='most_frequent', fill_value='missing')

numeric_filled = kimput.fit_transform(x[numerical_features])
x[numerical_features] = numeric_filled
categorical_filled = simput.fit_transform(x[categorical_features])
x[categorical_features] = categorical_filled

In [None]:
tbl_report(x)

In [None]:
lb=LabelEncoder()
x[' _conds']=lb.fit_transform(x[' _conds'])

In [None]:
x[' _conds']

In [None]:
lb.classes_

In [None]:
covidtests=pd.read_csv("../input/covid19-corona-virus-india-dataset/tests_state_wise.csv")

In [None]:
covidtests=covidtests[covidtests['State']=='Delhi']

In [None]:
pd.DatetimeIndex(covidtests['Updated On'])

In [None]:
x

In [None]:
x.columns

In [None]:
Xtrain=x.drop(['AQI_Bucket','month','City',' _tempm',' _conds',' _hum'],axis=1)
ytrain=x[[' _conds', ' _hum', ' _tempm']]

In [None]:
Xtrain.shape,ytrain.shape

In [None]:
Xtest.shape,ytest.shape

In [None]:
tbl_report(ytrain)

In [None]:
gb = GradientBoostingRegressor(n_estimators=600, loss='huber')
gb_cv = GridSearchCV(gb, param_grid, cv=3, n_jobs=-1)

In [None]:
gb_cv.fit(Xtrain, ytrain[' _tempm'])

In [None]:
Xtest

In [None]:
bstg=gb_cv.best_estimator_
r2_score(ytrain[' _tempm'],bstg.predict(Xtrain))
#Xtest=pd.concat([indirapuram,vasundhara])

In [None]:
Xtrain

In [None]:
gb_cv.fit(Xtrain, ytrain[' _hum'])

In [None]:
bstg=gb_cv.best_estimator_
r2_score(ytrain[' _hum'],bstg.predict(Xtrain))
#Xtest=pd.concat([indirapuram,vasundhara])

In [None]:
gb_cv.fit(Xtrain, ytrain[' _conds'])
bstg=gb_cv.best_estimator_
r2_score(ytrain[' _conds'],bstg.predict(Xtrain))
#Xtest=pd.concat([indirapuram,vasundhara])

In [None]:
covidtests.head()

In [None]:
X=Xtrain.merge(ytrain,left_index=True,right_index=True,how="inner")

In [None]:
X.columns

In [None]:
# Histogram of PM2.5 readings above 300, coloured by weather condition.
# Filter the frame once and pass columns by name so x and hue always come from
# the same rows (previously x was filtered while hue was the full column).
_high_pm = X[X['PM2.5']>300].reset_index(drop=True)
fig,axes=plt.subplots(figsize=(15,8))
sns.histplot(data=_high_pm,x='PM2.5',hue=' _conds',ax=axes)
plt.title("Weather conditions which are associated with high PM2.5 concentration in outdoor air")
plt.savefig("./weatherimage")

In [None]:
# Temperature against PM2.5 for readings above 300.
# Filter the frame once and pass columns by name so x and y always come from
# the same rows (previously x was filtered while y was the full column).
_high_pm = X[X['PM2.5']>300].reset_index(drop=True)
sns.lineplot(data=_high_pm,x='PM2.5',y=' _tempm')

In [None]:
covidtests

In [None]:
covidtests.columns

In [None]:
covidtests['month']=[months[y-1] for y in pd.DatetimeIndex(covidtests['Updated On'],dayfirst=True).month]

In [None]:
#Xtest=Xtest[Xtest.columns[1:]]

In [None]:
#Xtest['date']=pd.DatetimeIndex(Xtest.date)

In [None]:
cf=pd.read_csv("../input/coviddelhi/airfoil.csv")
cf.head()

In [None]:
cf['date']=pd.DatetimeIndex(cf['date'])

In [None]:
cf.head()

In [None]:
X.reset_index(inplace=True)

In [None]:
X.rename(columns={'datetime_utc':'date'},inplace=True)

In [None]:
Xtest.columns

In [None]:
df=pd.DataFrame()

In [None]:
X

In [None]:
df['date']=pd.DatetimeIndex(X['date']).date
df['pm25']=X['PM2.5']
df['pm10']=X['PM10']
df['o3']=X['O3']
df['no2']=X['NO2']
df['so2']=X['SO2']
df['co']=X['CO']
df['AQI']=X['AQI']

In [None]:
df

In [None]:
cf[:-1]

In [None]:
Xtest.date=pd.DatetimeIndex(Xtest['date'])

In [None]:
len(pd.DatetimeIndex(X['date']).date)

In [None]:
final=Xtest.merge(cf,on='date',how='inner')

In [None]:
final[final.month.isin(['Nov'])]

In [None]:
# Correlation heatmap of the merged station + Covid table.
# Only numeric/boolean columns are correlated: pandas 2.x raises on text and
# datetime columns (e.g. city, month, date) instead of silently skipping them.
fig,axes=plt.subplots(figsize=(10,10))
sns.heatmap(final.select_dtypes(include=['number','bool']).corr(),annot=True,ax=axes,cmap="ocean")
plt.title("Correlation between PM2.5 and Covid Cases due to External Factors")
plt.savefig("./Covidcorr")

In [None]:
tbl_report(final)

In [None]:
Xtest['date'][1]

In [None]:
pd.DatetimeIndex(Xtest['date']).month[948]

In [None]:
# Total cases per calendar month (position 0 = January ... 11 = December).
# Stored as monthly_cases rather than `sum` so the built-in sum() is not
# shadowed, and computed in one pass instead of re-parsing every date of
# `final` twice per row.
_case_months = pd.DatetimeIndex(final['date']).month.to_numpy()
monthly_cases = np.bincount(
    _case_months - 1,
    weights=final['cases'].to_numpy(dtype=float),
    minlength=12,
)
    
    

In [None]:
final['cases']=final['cases'].astype("int64")

In [None]:
# Daily cases for the months August through February.
# Filter once so x and y come from the same rows (previously only x was
# filtered), and pass them by keyword as current seaborn requires.
_season = final[final.month.isin(['Aug','Sept','Oct','Nov','Dec','Jan','Feb'])]
sns.lineplot(data=_season,x='date',y='cases')

https://indianexpress.com/article/explained/new-delhi-coronavirus-covid-19-cases-explained-6991070/