In [1]:
# Data manipulation
import datetime as dt
import getpass
import os
from pprint import pprint
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import requests
from sqlalchemy import create_engine


In [2]:
# Load the first 2019 extract of the WAQI air-quality data and preview it.
# low_memory=False makes pandas read the file in one pass instead of in chunks.
q1_2019_csv = "waqi-covid19-airqualitydata-2019Q1_modified.csv"
q1_2019_pd = pd.read_csv(filepath_or_buffer=q1_2019_csv, low_memory=False)
q1_2019_pd.head(5)

Unnamed: 0,Date,Country,City,Specie,median
0,2/18/2019,PH,Baguio,co,0.2
1,2/19/2019,PH,Baguio,co,0.2
2,2/13/2019,PH,Baguio,co,0.2
3,2/14/2019,PH,Baguio,co,0.2
4,2/15/2019,PH,Baguio,co,0.2


In [3]:
# Load the second 2019 extract of the WAQI air-quality data and preview it.
# NOTE(review): the preview rows include February/March dates, so this file
# appears to overlap the Q1 extract -- confirm before treating it as Q2 only.
q2_2019_csv = "waqi-covid19-airqualitydata-2019Q2_modified.csv"
q2_2019_pd = pd.read_csv(filepath_or_buffer=q2_2019_csv, low_memory=False)
q2_2019_pd.head(5)

Unnamed: 0,Date,Country,City,Specie,median
0,4/25/2019,RS,Novi Sad,pm25,78.0
1,5/6/2019,RS,Novi Sad,pm25,29.0
2,5/13/2019,RS,Novi Sad,pm25,53.0
3,2/26/2019,RS,Novi Sad,pm25,88.0
4,3/2/2019,RS,Novi Sad,pm25,79.0


In [4]:
# Load the 2020 extract of the WAQI air-quality data and preview it.
# NOTE(review): despite the `q1_` prefix the file name carries no quarter and
# later outputs show April dates -- the variable name is kept for compatibility.
q1_2020_csv = "waqi-covid19-airqualitydata-2020_modified.csv"
q1_2020_pd = pd.read_csv(filepath_or_buffer=q1_2020_csv, low_memory=False)
q1_2020_pd.head(5)

Unnamed: 0,Date,Country,City,Specie,median
0,1/7/2020,CA,Hamilton,wind speed,1.0
1,1/31/2020,CA,Hamilton,wind speed,0.1
2,1/5/2020,CA,Hamilton,wind speed,0.7
3,1/21/2020,CA,Hamilton,wind speed,1.2
4,1/23/2020,CA,Hamilton,wind speed,0.1


In [5]:
# Keep only the six pollutant species; other species (e.g. "wind speed") are dropped.
pollutant_species = ['co', 'no2', 'o3', 'pm25', 'pm10', 'so2']
is_pollutant_2020 = q1_2020_pd['Specie'].isin(pollutant_species)
filtered_pollutants_Q1_2020 = q1_2020_pd.loc[is_pollutant_2020]
filtered_pollutants_Q1_2020

Unnamed: 0,Date,Country,City,Specie,median
164,2/29/2020,CA,Hamilton,co,1.9
165,3/3/2020,CA,Hamilton,co,3.9
166,1/5/2020,CA,Hamilton,co,1.7
167,3/9/2020,CA,Hamilton,co,2.3
168,3/7/2020,CA,Hamilton,co,1.9
...,...,...,...,...,...
595366,1/10/2020,TJ,Dushanbe,pm25,167.0
595367,1/21/2020,TJ,Dushanbe,pm25,167.0
595368,1/31/2020,TJ,Dushanbe,pm25,112.0
595369,2/9/2020,TJ,Dushanbe,pm25,74.0


In [6]:
# Keep only the six pollutant species from the first 2019 extract.
pollutant_species = ['co', 'no2', 'o3', 'pm25', 'pm10', 'so2']
is_pollutant_q1_2019 = q1_2019_pd['Specie'].isin(pollutant_species)
filtered_pollutants_Q1_2019 = q1_2019_pd.loc[is_pollutant_q1_2019]
filtered_pollutants_Q1_2019

Unnamed: 0,Date,Country,City,Specie,median
0,2/18/2019,PH,Baguio,co,0.2
1,2/19/2019,PH,Baguio,co,0.2
2,2/13/2019,PH,Baguio,co,0.2
3,2/14/2019,PH,Baguio,co,0.2
4,2/15/2019,PH,Baguio,co,0.2
...,...,...,...,...,...
331522,1/1/2019,AT,Graz,co,0.1
331523,1/29/2019,AT,Graz,co,0.1
331524,2/14/2019,AT,Graz,co,0.1
331525,2/21/2019,AT,Graz,co,0.1


In [7]:
# Keep only the six pollutant species from the second 2019 extract.
pollutant_species = ['co', 'no2', 'o3', 'pm25', 'pm10', 'so2']
is_pollutant_q2_2019 = q2_2019_pd['Specie'].isin(pollutant_species)
filtered_pollutants_Q2_2019 = q2_2019_pd.loc[is_pollutant_q2_2019]
filtered_pollutants_Q2_2019

Unnamed: 0,Date,Country,City,Specie,median
0,4/25/2019,RS,Novi Sad,pm25,78.0
1,5/6/2019,RS,Novi Sad,pm25,29.0
2,5/13/2019,RS,Novi Sad,pm25,53.0
3,2/26/2019,RS,Novi Sad,pm25,88.0
4,3/2/2019,RS,Novi Sad,pm25,79.0
...,...,...,...,...,...
557203,3/26/2019,IE,Dublin,o3,23.4
557204,5/6/2019,IE,Dublin,o3,23.3
557205,5/14/2019,IE,Dublin,o3,33.1
557206,3/31/2019,IE,Dublin,o3,22.1


In [8]:
# Restrict the 2020 pollutant rows to the cities under study.
# The original list spelled Bilbao as 'Bilboa' and carried a mis-encoded
# 'Wrocław' ('WrocÅ‚aw'), so those cities could be silently filtered out.
# The correct spellings are added; the legacy spellings are kept so rows that
# matched before still match.
# NOTE(review): confirm which spelling/encoding the CSV actually contains.
# Matching is by city name only, so same-named cities in different countries
# are all kept (the output includes London with Country == 'CA').
cities_of_interest = [
    'Amsterdam', 'Bangkok', 'Beijing', 'Belgrade',
    'Bilbao', 'Bilboa',
    'Budapest', 'Busan', 'Delhi', 'Haarlem', 'Hong Kong', 'Kyoto', 'London',
    'Mumbai', 'Osaka', 'Santiago', 'Seoul', 'Shanghai', 'Taipei', 'Tokyo',
    'Wrocław', 'WrocÅ‚aw',
]
filtered_cities_2020 = filtered_pollutants_Q1_2020.loc[
    filtered_pollutants_Q1_2020['City'].isin(cities_of_interest)
]
filtered_cities_2020

Unnamed: 0,Date,Country,City,Specie,median
13515,1/11/2020,CA,London,pm25,21.0
13516,2/1/2020,CA,London,pm25,55.0
13517,3/4/2020,CA,London,pm25,25.0
13518,3/15/2020,CA,London,pm25,17.0
13519,4/9/2020,CA,London,pm25,13.0
...,...,...,...,...,...
595079,4/9/2020,RS,Belgrade,o3,29.5
595080,3/27/2020,RS,Belgrade,o3,23.1
595081,1/27/2020,RS,Belgrade,o3,1.3
595082,3/16/2020,RS,Belgrade,o3,23.6


In [9]:
# Restrict the first 2019 extract's pollutant rows to the cities under study.
# The original list spelled Bilbao as 'Bilboa' and carried a mis-encoded
# 'Wrocław' ('WrocÅ‚aw'), so those cities could be silently filtered out.
# The correct spellings are added; the legacy spellings are kept so rows that
# matched before still match.
# NOTE(review): confirm which spelling/encoding the CSV actually contains.
cities_of_interest = [
    'Amsterdam', 'Bangkok', 'Beijing', 'Belgrade',
    'Bilbao', 'Bilboa',
    'Budapest', 'Busan', 'Delhi', 'Haarlem', 'Hong Kong', 'Kyoto', 'London',
    'Mumbai', 'Osaka', 'Santiago', 'Seoul', 'Shanghai', 'Taipei', 'Tokyo',
    'Wrocław', 'WrocÅ‚aw',
]
filtered_cities_Q1_2019 = filtered_pollutants_Q1_2019.loc[
    filtered_pollutants_Q1_2019['City'].isin(cities_of_interest)
]
filtered_cities_Q1_2019

Unnamed: 0,Date,Country,City,Specie,median
22034,2/21/2019,NL,Haarlem,o3,14.0
22035,1/11/2019,NL,Haarlem,o3,27.2
22036,2/4/2019,NL,Haarlem,o3,13.7
22037,1/21/2019,NL,Haarlem,o3,0.6
22038,2/1/2019,NL,Haarlem,o3,8.2
...,...,...,...,...,...
328118,1/3/2019,HK,Hong Kong,o3,4.2
328119,1/11/2019,HK,Hong Kong,o3,9.6
328120,2/12/2019,HK,Hong Kong,o3,13.5
328121,1/21/2019,HK,Hong Kong,o3,14.8


In [10]:
# Restrict the second 2019 extract's pollutant rows to the cities under study.
# The original list spelled Bilbao as 'Bilboa' and carried a mis-encoded
# 'Wrocław' ('WrocÅ‚aw'), so those cities could be silently filtered out.
# The correct spellings are added; the legacy spellings are kept so rows that
# matched before still match.
# NOTE(review): confirm which spelling/encoding the CSV actually contains.
cities_of_interest = [
    'Amsterdam', 'Bangkok', 'Beijing', 'Belgrade',
    'Bilbao', 'Bilboa',
    'Budapest', 'Busan', 'Delhi', 'Haarlem', 'Hong Kong', 'Kyoto', 'London',
    'Mumbai', 'Osaka', 'Santiago', 'Seoul', 'Shanghai', 'Taipei', 'Tokyo',
    'Wrocław', 'WrocÅ‚aw',
]
filtered_cities_Q2_2019 = filtered_pollutants_Q2_2019.loc[
    filtered_pollutants_Q2_2019['City'].isin(cities_of_interest)
]
filtered_cities_Q2_2019

Unnamed: 0,Date,Country,City,Specie,median
1901,3/16/2019,RS,Belgrade,so2,5.8
1902,3/18/2019,RS,Belgrade,so2,4.3
1903,5/14/2019,RS,Belgrade,so2,3.2
1904,3/11/2019,RS,Belgrade,so2,3.9
1905,4/21/2019,RS,Belgrade,so2,6.1
...,...,...,...,...,...
509007,3/12/2019,TW,Taipei,no2,15.8
509008,4/21/2019,TW,Taipei,no2,17.6
509009,5/1/2019,TW,Taipei,no2,12.1
509010,5/14/2019,TW,Taipei,no2,15.8


In [11]:
# Stack both 2019 extracts into one frame, keeping each frame's original index.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0.
# NOTE(review): the second extract's preview shows Feb/Mar dates, so the two
# extracts may overlap; any duplicated rows are averaged by the later pivot.
filtered_2019 = pd.concat([filtered_cities_Q1_2019, filtered_cities_Q2_2019])
filtered_2019

Unnamed: 0,Date,Country,City,Specie,median
22034,2/21/2019,NL,Haarlem,o3,14.0
22035,1/11/2019,NL,Haarlem,o3,27.2
22036,2/4/2019,NL,Haarlem,o3,13.7
22037,1/21/2019,NL,Haarlem,o3,0.6
22038,2/1/2019,NL,Haarlem,o3,8.2
...,...,...,...,...,...
509007,3/12/2019,TW,Taipei,no2,15.8
509008,4/21/2019,TW,Taipei,no2,17.6
509009,5/1/2019,TW,Taipei,no2,12.1
509010,5/14/2019,TW,Taipei,no2,15.8


In [12]:
# Reshape 2020 to one row per (Date, City, Country) with one column per
# pollutant species. No `values`/`aggfunc` are given, so pandas aggregates the
# remaining numeric column(s) with its default aggregation (mean).
# Rows with missing pollutants are kept here (no dropna is applied).
pivot_data_2020 = filtered_cities_2020.pivot_table(
    index=['Date', 'City', 'Country'],
    columns=['Specie'],
)
pivot_data_2020

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,median,median,median,median,median,median
Unnamed: 0_level_1,Unnamed: 1_level_1,Specie,co,no2,o3,pm10,pm25,so2
Date,City,Country,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
1/1/2020,Amsterdam,NL,3.6,13.9,5.1,51.0,138.0,0.2
1/1/2020,Bangkok,TH,0.1,7.5,7.5,33.0,68.0,1.1
1/1/2020,Beijing,CN,8.2,25.2,1.7,52.0,104.0,4.1
1/1/2020,Belgrade,RS,7.8,12.1,5.0,49.0,128.0,3.9
1/1/2020,Budapest,HU,4.6,7.1,16.7,19.0,58.0,1.6
...,...,...,...,...,...,...,...,...
4/9/2020,Santiago,CL,5.2,12.4,16.9,67.0,80.0,11.7
4/9/2020,Seoul,KR,4.5,17.6,28.9,45.0,72.0,4.3
4/9/2020,Shanghai,CN,4.6,18.3,31.3,51.0,97.0,3.6
4/9/2020,Taipei,TW,5.0,12.6,37.0,26.0,57.0,2.2


In [None]:
# Flatten the two-level pivot columns (e.g. ('median', 'co')) into single
# names ('median_co') and move Date/City/Country back into ordinary columns.
# The work is done on a copy: the original cell overwrote
# pivot_data_2020.columns in place, so re-running it joined the characters of
# the already-flattened names ('m_e_d_i_a_n___c_o').
flat_2020_df = pivot_data_2020.copy()
if isinstance(flat_2020_df.columns, pd.MultiIndex):
    flat_2020_df.columns = ['_'.join(levels) for levels in flat_2020_df.columns]
flat_2020_df = flat_2020_df.reset_index()
flat_2020_df

In [None]:
# Reshape 2019 to one row per (Date, City, Country) with one column per
# pollutant species (default mean aggregation), then keep only the rows where
# every pollutant has a value.
pivot_data_2019 = (
    filtered_2019
    .pivot_table(index=['Date', 'City', 'Country'], columns=['Specie'])
    .dropna()
)
pivot_data_2019

In [None]:
# Flatten the data table: collapse the two-level pivot columns
# (e.g. ('median', 'co')) into single names ('median_co') and move
# Date/City/Country back into ordinary columns.
# The work is done on a copy: the original cell overwrote
# pivot_data_2019.columns in place, so re-running it joined the characters of
# the already-flattened names ('m_e_d_i_a_n___c_o').
flat_2019_df = pivot_data_2019.copy()
if isinstance(flat_2019_df.columns, pd.MultiIndex):
    flat_2019_df.columns = ['_'.join(levels) for levels in flat_2019_df.columns]
flat_2019_df = flat_2019_df.reset_index()
flat_2019_df

In [None]:
# Print the column dtypes and non-null counts of the flattened 2019 frame.
flat_2019_df.info()

In [None]:
# Stack the flattened 2019 and 2020 frames (2019 first). Each frame keeps its
# own 0..n index, so index labels repeat in the result.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
merged_data = pd.concat([flat_2019_df, flat_2020_df])

In [None]:
# Display the combined 2019 + 2020 frame.
merged_data

## Create tables from dataframe to feed DB

In [None]:
# Date dimension: one row per distinct Date in order of first appearance
# (no sorting), with a 0-based surrogate key `dateid` as the first column.
# NOTE(review): Date values are never parsed to datetimes in this notebook, so
# they are presumably still the CSV's M/D/YYYY strings -- confirm.
dates_df = (
    merged_data['Date']
    .drop_duplicates()
    .reset_index(drop=True)
    .rename_axis('dateid')
    .reset_index()
)
dates_df

In [None]:
# City dimension: one row per distinct City, sorted by name, with a 0-based
# surrogate key `cityid` as the first column.
# The key column was previously misspelled "citiyid", which carried through
# the merge into measurements_df and into the database tables.
# NOTE(review): update any external query that relied on the misspelled name.
cities_df = (
    merged_data['City']
    .drop_duplicates()
    .sort_values()
    .reset_index(drop=True)
    .rename_axis('cityid')
    .reset_index()
)
cities_df

In [None]:
# Country dimension: one row per distinct Country code, sorted, with a 0-based
# surrogate key `countryid` as the first column.
countries_df = (
    merged_data['Country']
    .sort_values()
    .drop_duplicates()
    .reset_index(drop=True)
    .rename_axis('countryid')
    .reset_index()
)
countries_df

In [None]:
# Measurements (fact) table: replace the Date / City / Country text columns of
# merged_data with the surrogate keys from the three dimension frames, keeping
# the recorded pollutant columns.
# Left joins keep every measurement row. validate="many_to_one" makes pandas
# raise if a dimension frame ever contains a duplicate key, which would
# otherwise silently duplicate measurement rows.
# The misspelled, never-used `measuremetns_df = []` placeholder was removed.
# This cell must run before the "Cleanup to match ERD" cell, which renames the
# join columns in the dimension frames.
measurements_df = (
    merged_data
    .merge(dates_df, on="Date", how="left", validate="many_to_one")
    .merge(cities_df, on="City", how="left", validate="many_to_one")
    .merge(countries_df, on="Country", how="left", validate="many_to_one")
    .drop(columns=["Date", "City", "Country"])
)
measurements_df

In [None]:
# Rename the natural-key column of each dimension frame to the name used in
# the ERD. Run this only after measurements_df has been built, because that
# merge joins on the original column names.
dates_df = dates_df.rename({"Date": "date"}, axis="columns")
cities_df = cities_df.rename({"City": "cityname"}, axis="columns")
countries_df = countries_df.rename({"Country": "countryabvr"}, axis="columns")

# Make sure pgAdmin is running for this next step

### Create a DB in pgAdmin called `covid_polluntants_db`, then run the code in the `Proj2Grp3-Covid_Polluntants_DB.sql` file in the query tool to ensure the DB is set up correctly.

In [None]:
# Connect to the local PostgreSQL database.
# Credentials are no longer hardcoded in the notebook. Each part is read from
# the standard libpq environment variables, with the previous non-secret
# values as defaults:
#   PGUSER (postgres), PGHOST (localhost), PGPORT (5432),
#   PGDATABASE (covid_polluntants_db)
# The password comes from PGPASSWORD, or is prompted for when that is unset.
db_user = os.environ.get("PGUSER", "postgres")
db_password = os.environ.get("PGPASSWORD") or getpass.getpass(
    f"PostgreSQL password for {db_user}: "
)
db_host = os.environ.get("PGHOST", "localhost")
db_port = os.environ.get("PGPORT", "5432")
db_name = os.environ.get("PGDATABASE", "covid_polluntants_db")

# quote_plus keeps characters such as '@' or ':' in the user name or password
# from breaking the URL.
rds_connection_string = (
    f"{quote_plus(db_user)}:{quote_plus(db_password)}@{db_host}:{db_port}/{db_name}"
)
engine = create_engine(f'postgresql://{rds_connection_string}')

In [None]:
# Write the four frames into the database, one table per frame.
# if_exists='replace' drops any existing table of that name and recreates it
# from the frame; index=False keeps the DataFrame index out of the table.
# NOTE(review): 'replace' presumably discards the table definitions created by
# the project's .sql setup script -- confirm that is intended.
measurements_df.to_sql(name='measurements', con=engine, if_exists='replace', index = False)
dates_df.to_sql(name='dates', con=engine, if_exists='replace', index = False)
cities_df.to_sql(name='cities', con=engine, if_exists='replace', index = False)
countries_df.to_sql(name='countries', con=engine, if_exists='replace', index = False)


## Test that the data went into the DB correctly

In [None]:
# Read the dates table back, ordered by its `date` column descending, and preview it.
# NOTE(review): dates are never parsed in this notebook, so this ordering is
# presumably on text rather than chronological -- confirm the column type.
pd.read_sql_query('select * from dates order by date desc', con=engine).head()

In [None]:
# Read the cities table back and preview the first rows.
pd.read_sql_query('select * from cities', con=engine).head()

In [None]:
# Read the countries table back and preview the first rows.
pd.read_sql_query('select * from countries', con=engine).head()

In [None]:
# Read the measurements table back and preview the first rows.
pd.read_sql_query('select * from measurements', con=engine).head()

# Pull in new data

In [None]:
# Cities to query from the WAQI feed API.
city = ('Amsterdam','Bangkok','Beijing','Belgrade','Budapest','Busan','Delhi','Haarlem','Hong Kong','Kyoto', 'London', 'Mumbai', 'Osaka', 'Santiago', 'Seoul','Shanghai','Taipei','Tokyo')

# The API token is a secret and is no longer stored in the notebook: it is
# read from the WAQI_TOKEN environment variable, or prompted for when unset.
# NOTE(review): the token previously committed here should be revoked.
token = os.environ.get("WAQI_TOKEN") or getpass.getpass("WAQI API token: ")


In [None]:
# Define base url
# The city name and the token query string are appended to this per request.

url_city = "https://api.waqi.info/feed/"
print(url_city)

In [None]:
# Preview the request URL for each city. The token is masked so the secret is
# not written into the saved cell output.
for x in city:
    request_url = f"{url_city}{x}/?token=<redacted>"
    print(request_url)

In [None]:
# Fetch the current reading for every city from the WAQI feed API and collect
# one value per city in each of the parallel lists below (consumed by the
# DataFrame cell that follows).
#
# Fixes relative to the original loop:
#   * Values are extracted first and appended afterwards, so every list gets
#     exactly one entry per city. Previously a KeyError part-way through left
#     some lists already appended before the except branch appended again,
#     and the except branch never padded the pollutant lists, so the lists
#     could end up with different lengths.
#   * TypeError is handled as well, for the case where `data` is a plain
#     string rather than a mapping.
#   * The pollutant lists are looked up through a dict instead of eval().
#   * The request has a timeout so one stalled call cannot hang the notebook.
#   * A failed city keeps its name in `cities` (with NaN readings) so it is
#     clear which request failed.
cities = []
aqi = []
geo = []
url = []
time = []
co = []
no2 = []
o3 = []
pm10 = []
pm25 = []
so2 = []
polls = ['co', 'no2', 'o3', 'pm10', 'pm25', 'so2']
pollutant_lists = {'co': co, 'no2': no2, 'o3': o3, 'pm10': pm10, 'pm25': pm25, 'so2': so2}

for x in city:
    response = requests.get(
        f"{url_city}{x}/", params={"token": token}, timeout=30
    ).json()
    pprint(response)
    print("----------------------------")
    print(x)

    try:
        data = response['data']
        city_aqi = data['aqi']
        city_geo = data['city']['geo']
        city_url = data['city']['url']
        city_time = data['time']['s']
        iaqi = data['iaqi']
        # A pollutant missing from this station's reading becomes NaN.
        city_polls = {
            poll: iaqi[poll]['v'] if poll in iaqi else np.nan
            for poll in polls
        }
    except (KeyError, TypeError):
        # Unexpected payload shape: record NaN for every measurement.
        city_aqi = city_geo = city_url = city_time = np.nan
        city_polls = dict.fromkeys(polls, np.nan)

    cities.append(x)
    aqi.append(city_aqi)
    geo.append(city_geo)
    url.append(city_url)
    time.append(city_time)
    for poll in polls:
        pollutant_lists[poll].append(city_polls[poll])

In [None]:
# Assemble the per-city lists gathered from the API into one frame (one row
# per city, columns in the order given) and print its dtype / non-null summary.
airquality_df = pd.DataFrame(
    dict(
        CITY=cities,
        AQI=aqi,
        GEO=geo,
        URL=url,
        TIME=time,
        CO=co,
        NO2=no2,
        O3=o3,
        PM10=pm10,
        PM25=pm25,
        SO2=so2,
    )
)
airquality_df.info()

In [None]:
# Show the first 18 rows (the `city` tuple queried above has 18 entries).
airquality_df.head(18)

In [None]:
# Parse the TIME strings and display only their calendar-date part.
# The result is displayed, not stored back into airquality_df.
pd.to_datetime(airquality_df['TIME']).dt.date