### Get Population Data for COVID-19 API

In [1]:
import io
import json
import pandas as pd
import requests

In [2]:
# Get all COVID-19 API data
url = 'https://api.covid19api.com/all'
response = requests.get(url)
data = json.loads(response.content.decode('utf-8'))
covid_api_df = pd.DataFrame(data)    

In [3]:
# Get all CoronaDataScraper data (has populations)
response = requests.get('https://coronadatascraper.com/data.csv')
data = io.StringIO(response.content.decode('utf-8'))
corona_scraper_df = pd.read_csv(data)

In [4]:
# Get country codes (2 and 3 letter) for converting between datasets
response = requests.get('https://pkgstore.datahub.io/core/country-codes/country-codes_csv/data/3b9fd39bdadd7edd7f7dcee708f47e1b/country-codes_csv.csv')
data = io.StringIO(response.content.decode('utf-8'))
country_codes = pd.read_csv(data).set_index('ISO3166-1-Alpha-3')['ISO3166-1-Alpha-2']

In [5]:
# Get unique regions and their populations in CoronaDataScraper data
csdf = corona_scraper_df[['city', 'county', 'state', 'country', 'population']].drop_duplicates()
# Fix USA and convert country codes from 3 letter to 2 letter
csdf['country'] = csdf['country'].replace('United States', 'USA').replace('iso1:US', 'USA').apply(lambda x:country_codes.get(x, x))
# Create a new column summarizing the region; fix some inconsistencies in naming of regions
csdf['name'] = csdf.astype(str).apply(lambda x: ', '.join(x[:4]), axis=1).str.replace('nan, ', '').str.replace(' County', '').str.replace(' Parish', '').str.replace(' County', '').str.replace(' City', '')
csdf.head()

Unnamed: 0,city,county,state,country,population,name
0,,,,AF,34124811.0,AF
1,,,,AL,3047987.0,AL
2,,,,DZ,40969443.0,DZ
3,,,,AD,85702.0,AD
4,,,,AO,29310273.0,AO


In [6]:
# Get unique regions in COVID-19 API data
cadf = covid_api_df[['LocationID', 'City', 'Province', 'CountryCode']].drop_duplicates()
# Create a new column summarizing the region
cadf['name'] = cadf.astype(str).apply(lambda x: ', '.join(x[1:4]), axis=1).str.replace('nan, ', '')
cadf.head()

Unnamed: 0,LocationID,City,Province,CountryCode,name
0,828ca7f3-144f-4732-b659-a60f97755626,,,AF,AF
74,07ef547a-c2a5-4279-8f24-0295e9c7fe7a,,,AL,AL
148,e6bec38b-4eae-4ce4-bbaf-69f532c1bd5f,,,DZ,DZ
222,9f52c049-2008-45d8-8f86-7c009e015d17,,,AD,AD
296,8bba7786-aabb-423a-97a1-7140fa6dfe19,,,AO,AO


In [7]:
# Summarize the overlap between region names; there will be some mismatches (could be fixed later)
a = set(cadf['name'])
s = set(csdf['name'])
print('Overlap=%d; Unique to S=%d; Unique to A=%d' % (len(a.intersection(s)), len(a.difference(s)), len(set(s).difference(a))))

Overlap=3055; Unique to S=452; Unique to A=563


In [8]:
# Join the data frames
joined = cadf.join(csdf.set_index('name'), on='name', how='inner')
joined.head()

Unnamed: 0,LocationID,City,Province,CountryCode,name,city,county,state,country,population
0,828ca7f3-144f-4732-b659-a60f97755626,,,AF,AF,,,,AF,34124811.0
74,07ef547a-c2a5-4279-8f24-0295e9c7fe7a,,,AL,AL,,,,AL,3047987.0
148,e6bec38b-4eae-4ce4-bbaf-69f532c1bd5f,,,DZ,DZ,,,,DZ,40969443.0
222,9f52c049-2008-45d8-8f86-7c009e015d17,,,AD,AD,,,,AD,85702.0
296,8bba7786-aabb-423a-97a1-7140fa6dfe19,,,AO,AO,,,,AO,29310273.0


In [9]:
# Generate a dataframe which just provides populations for most COVID-19 API LocationIDs
result = joined[['LocationID', 'population']].set_index('LocationID')
result.to_csv('population-by-location.csv')