# Population Densities

Here we scrape population densities from Wikipedia for the countries and states/provinces that appear in the training set.

### Sources

World (countries with no states)
https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population_density

USA
https://en.wikipedia.org/wiki/List_of_states_and_territories_of_the_United_States_by_population_density

Australia
https://en.wikipedia.org/wiki/States_and_territories_of_Australia

Canada
https://en.wikipedia.org/wiki/Population_of_Canada_by_province_and_territory

China
https://en.wikipedia.org/wiki/Provinces_of_China

France
https://en.wikipedia.org/wiki/Overseas_France


In [561]:
import requests, pandas as pd, numpy as np
from bs4 import BeautifulSoup

In [562]:
# Load the training data and keep one row per distinct
# (Province_State, Country_Region) pair.
train = pd.read_csv('../covid19/train.csv')
country_state = (
    train.loc[:, ['Province_State', 'Country_Region']]
    .drop_duplicates()
    .reset_index(drop=True)
)
del train  # only the de-duplicated region pairs are kept
country_state.head()

Unnamed: 0,Province_State,Country_Region
0,,Afghanistan
1,,Albania
2,,Algeria
3,,Andorra
4,,Angola


## Countries with states/provinces

In [563]:
# Countries that have at least one row with a non-missing Province_State.
country_state.loc[country_state.Province_State.notna(), 'Country_Region'].drop_duplicates()

8           Australia
36             Canada
49              China
91            Denmark
108            France
173       Netherlands
224                US
281    United Kingdom
Name: Country_Region, dtype: object

## Scrape countries without states/provinces first

In [564]:
# Distinct country names, in order of first appearance.
countries = country_state['Country_Region'].drop_duplicates().tolist()

In [565]:
# Download the world population-density page and collect every <td> cell
# on it, in document order.
url = 'https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population_density'
response = requests.get(url, timeout=30)  # without a timeout the call can block forever
response.raise_for_status()  # fail loudly instead of parsing an HTTP error page
html_text = response.text
soup = BeautifulSoup(html_text, 'html.parser').find_all('td')

In [566]:
# Empty result frame for the world table; it starts with no rows.
cols = [
    'country',
    'state',
    'density',  # people / km2
]
df_WD = pd.DataFrame(data=None, columns=cols)

In [567]:
# Walk the flat list of <td> cells. Whenever the text of a cell's first link
# is one of the countries from the train set, store that name together with
# the text of the cell four positions further on as the density.
country_names = set(countries)  # set membership instead of a list scan per cell
ix = 0
for ii, cell in enumerate(soup):
    links = cell.find_all('a')
    if not links:
        # a cell without any link cannot name a country
        continue

    country = links[0].text
    if country not in country_names:
        continue
    if ii + 4 >= len(soup):
        # a match too close to the end has no density cell to read
        continue

    df_WD.loc[ix, 'country'] = country
    # [:-1] drops the last character of the cell text
    # (presumably a trailing newline -- confirm against the page markup)
    df_WD.loc[ix, 'density'] = soup[ii + 4].text[:-1]
    ix += 1

del soup
df_WD = df_WD.reset_index(drop=True)


In [568]:
# Remove thousands separators from the scraped text, then convert to float.
df_WD['density'] = df_WD['density'].str.replace(",", "").astype(float)
df_WD.head()

Unnamed: 0,country,state,density
0,Monaco,,18960.0
1,Singapore,,7894.0
2,Bahrain,,1983.0
3,Malta,,1510.0
4,Maldives,,1258.0


## Scrape USA

In [569]:
# Province_State values of all rows whose Country_Region is 'US'.
states = country_state.loc[country_state.Country_Region == 'US', 'Province_State'].tolist()

In [570]:
# Download the US states/territories density page and collect every <td>
# cell on it, in document order.
url = 'https://en.wikipedia.org/wiki/List_of_states_and_territories_of_the_United_States_by_population_density'
response = requests.get(url, timeout=30)  # without a timeout the call can block forever
response.raise_for_status()  # fail loudly instead of parsing an HTTP error page
html_text = response.text
soup = BeautifulSoup(html_text, 'html.parser').find_all('td')

In [571]:
# Empty result frame for the US table; it starts with no rows.
cols = [
    'country',
    'state',
    'density',  # people / km2
]
df_US = pd.DataFrame(data=None, columns=cols)

In [572]:
# Walk the flat list of <td> cells. Whenever the text of a cell's first link
# is one of the US states from the train set, store that name together with
# the text of the cell four positions further on as the density.
state_names = set(states)  # set membership instead of a list scan per cell
ix = 0
for ii, cell in enumerate(soup):
    links = cell.find_all('a')
    if not links:
        # a cell without any link cannot name a state
        continue

    state = links[0].text
    if state not in state_names:
        continue
    if ii + 4 >= len(soup):
        # a match too close to the end has no density cell to read
        continue

    df_US.loc[ix, 'state'] = state
    # [:-1] drops the last character of the cell text
    # (presumably a trailing newline -- confirm against the page markup)
    df_US.loc[ix, 'density'] = soup[ii + 4].text[:-1]
    ix += 1

del soup
df_US['country'] = 'US'
df_US = df_US.reset_index(drop=True)


In [573]:
# Keep only the first 53 matches (presumably later matches come from other
# tables on the page -- confirm). The explicit copy makes the column
# assignment below act on an independent frame rather than on a slice.
df_US = df_US.iloc[0:53].copy()
# Strip thousands separators before converting, as the other sections do;
# a value such as "4,251" would otherwise make astype(float) raise.
df_US['density'] = df_US['density'].str.replace(",", "").astype(float)
df_US.head()

Unnamed: 0,country,state,density
0,US,District of Columbia,4251.0
1,US,New Jersey,470.0
2,US,Puerto Rico,404.0
3,US,Rhode Island,394.0
4,US,Massachusetts,336.0


## Scrape Australia

In [574]:
# Province_State values of all rows whose Country_Region is 'Australia'.
states = country_state.loc[country_state.Country_Region == 'Australia', 'Province_State'].tolist()

In [575]:
# Download the Australian states/territories page and collect every <td>
# cell on it, in document order.
url = 'https://en.wikipedia.org/wiki/States_and_territories_of_Australia'
response = requests.get(url, timeout=30)  # without a timeout the call can block forever
response.raise_for_status()  # fail loudly instead of parsing an HTTP error page
html_text = response.text
soup = BeautifulSoup(html_text, 'html.parser').find_all('td')

In [576]:
# Empty result frame for Australia; population and area are collected
# separately here instead of a ready-made density.
cols = [
    'country',
    'state',
    'population',
    'area',
]
df_AU = pd.DataFrame(data=None, columns=cols)

In [577]:
# Walk the flat list of <td> cells. Here the state name is matched against
# the cell's full text (not a link). For a match, population and area are
# read from the cells 4 and 5 positions further on.
state_names = set(states)  # set membership instead of a list scan per cell
# For these two territories both offsets are one larger (presumably their
# table rows contain one extra cell -- confirm against the page markup).
shifted_rows = ('Australian Capital Territory', 'Northern Territory')

ix = 0
for ii, cell in enumerate(soup):
    state = cell.text
    if state not in state_names:
        continue

    shift = int(state in shifted_rows)
    if ii + 5 + shift >= len(soup):
        # a match too close to the end has no population/area cells to read
        continue

    df_AU.loc[ix, 'state'] = state
    df_AU.loc[ix, 'population'] = soup[ii + 4 + shift].text
    # [:-1] drops the last character of the area cell text
    df_AU.loc[ix, 'area'] = soup[ii + 5 + shift].text[:-1]
    ix += 1

del soup
df_AU['country'] = 'Australia'
df_AU = df_AU.reset_index(drop=True)

In [578]:
# Convert the scraped population and area strings to floats (after removing
# thousands separators), derive density as population per unit of area,
# and keep only the density column.
df_AU['population'] = df_AU['population'].str.replace(",", "").astype(float)
df_AU['area'] = df_AU['area'].str.replace(",", "").astype(float)
df_AU['density'] = df_AU['population'].div(df_AU['area'])
df_AU = df_AU.drop(['population', 'area'], axis=1)

In [579]:
# Preview the first five Australian rows.
df_AU.head(n=5)

Unnamed: 0,country,state,density
0,Australia,New South Wales,9.987661
1,Australia,Queensland,2.751526
2,Australia,South Australia,1.6773
3,Australia,Tasmania,5.886875
4,Australia,Victoria,27.749252


## Scrape Canada

In [580]:
# Province_State values of all rows whose Country_Region is 'Canada'.
states = country_state.loc[country_state.Country_Region == 'Canada', 'Province_State'].tolist()

In [581]:
# Download the Canadian provinces/territories population page and collect
# every <td> cell on it, in document order.
url = 'https://en.wikipedia.org/wiki/Population_of_Canada_by_province_and_territory'
response = requests.get(url, timeout=30)  # without a timeout the call can block forever
response.raise_for_status()  # fail loudly instead of parsing an HTTP error page
html_text = response.text
soup = BeautifulSoup(html_text, 'html.parser').find_all('td')

In [582]:
# Empty result frame for Canada; it starts with no rows.
cols = [
    'country',
    'state',
    'density',
]
df_CA = pd.DataFrame(data=None, columns=cols)

In [583]:
# Walk the flat list of <td> cells. Whenever the text of a cell's first link
# is one of the Canadian provinces from the train set, store that name
# together with the text of the cell five positions further on as the density.
state_names = set(states)  # set membership instead of a list scan per cell
ix = 0
for ii, cell in enumerate(soup):
    links = cell.find_all('a')
    if not links:
        # a cell without any link cannot name a province
        continue

    state = links[0].text
    if state not in state_names:
        continue
    if ii + 5 >= len(soup):
        # a match too close to the end has no density cell to read
        continue

    df_CA.loc[ix, 'state'] = state
    # [:-1] drops the last character of the cell text
    # (presumably a trailing newline -- confirm against the page markup)
    df_CA.loc[ix, 'density'] = soup[ii + 5].text[:-1]
    ix += 1

del soup
df_CA['country'] = 'Canada'
df_CA = df_CA.reset_index(drop=True)

In [584]:
# Keep only the first 10 matches (presumably later matches come from other
# tables on the page -- confirm). The explicit copy makes the column
# assignment below act on an independent frame rather than on a slice.
df_CA = df_CA.iloc[0:10].copy()
# Remove thousands separators from the scraped text, then convert to float.
df_CA['density'] = df_CA['density'].str.replace(",", "").astype(float)
df_CA.head()

Unnamed: 0,country,state,density
0,Canada,Ontario,14.8
1,Canada,Quebec,6.0
2,Canada,British Columbia,5.0
3,Canada,Alberta,6.4
4,Canada,Manitoba,2.3


## Scrape China

In [585]:
# Province_State values of all rows whose Country_Region is 'China'.
states = country_state.loc[country_state.Country_Region == 'China', 'Province_State'].tolist()

In [586]:
# Download the Chinese provinces page and collect every <td> cell on it,
# in document order.
url = 'https://en.wikipedia.org/wiki/Provinces_of_China'
response = requests.get(url, timeout=30)  # without a timeout the call can block forever
response.raise_for_status()  # fail loudly instead of parsing an HTTP error page
html_text = response.text
soup = BeautifulSoup(html_text, 'html.parser').find_all('td')

In [587]:
# Empty result frame for China; it starts with no rows.
cols = [
    'country',
    'state',
    'density',
]
df_CH = pd.DataFrame(data=None, columns=cols)

In [588]:
# Link texts on this page may carry an administrative suffix; any text that
# contains one of these is reduced to the bare name before matching.
suffix = ['Province', 'Municipality', 'Autonomous Region', 'Administrative Region']
state_names = set(states)  # set membership instead of a list scan per cell

ix = 0

for ii, cell in enumerate(soup):
    links = cell.find_all('a')
    if not links:
        # a cell without any link cannot name a province
        continue

    state = links[0].text
    if any(x in state for x in suffix):
        if 'Hong Kong' in state:
            # two-word name: cutting at the first space would leave "Hong"
            state = 'Hong Kong'
        elif 'Inner Mongolia' in state:
            # two-word name: cutting at the first space would leave "Inner"
            state = 'Inner Mongolia'
        else:
            # keep only the text before the first space,
            # e.g. "Anhui Province" -> "Anhui"
            first_word, space, _ = state.partition(' ')
            if not space:
                # no space to cut at: treat the cell as a non-match
                continue
            state = first_word

    if state not in state_names:
        continue
    if ii + 4 >= len(soup):
        # a match too close to the end has no density cell to read
        continue

    df_CH.loc[ix, 'state'] = state
    # [:-1] drops the last character of the cell text
    # (presumably a trailing newline -- confirm against the page markup)
    df_CH.loc[ix, 'density'] = soup[ii + 4].text[:-1]
    ix += 1

del soup
df_CH['country'] = 'China'
df_CH = df_CH.reset_index(drop=True)

In [589]:
# Keep matches 2..34 only (presumably the first two and any later matches
# come from other parts of the page -- confirm). The index is not reset
# here, so the kept rows retain labels starting at 2. The explicit copy
# makes the column assignment below act on an independent frame rather
# than on a slice.
df_CH = df_CH.iloc[2:35].copy()
# Remove thousands separators from the scraped text, then convert to float.
df_CH['density'] = df_CH['density'].str.replace(",", "").astype(float)
df_CH.head()

Unnamed: 0,country,state,density
2,China,Anhui,425.91
3,China,Beijing,1167.4
4,China,Chongqing,350.5
5,China,Fujian,304.15
6,China,Guangdong,579.46


## Scrape France

In [590]:
# Province_State values of all rows whose Country_Region is 'France'.
states = country_state.loc[country_state.Country_Region == 'France', 'Province_State'].tolist()

In [591]:
# Download the Overseas France page and collect every <td> cell on it,
# in document order.
url = 'https://en.wikipedia.org/wiki/Overseas_France'
response = requests.get(url, timeout=30)  # without a timeout the call can block forever
response.raise_for_status()  # fail loudly instead of parsing an HTTP error page
html_text = response.text
soup = BeautifulSoup(html_text, 'html.parser').find_all('td')

In [592]:
# Empty result frame for France; it starts with no rows.
cols = [
    'country',
    'state',
    'density',
]
df_FR = pd.DataFrame(data=None, columns=cols)

In [593]:
# Walk the flat list of <td> cells. Whenever the text of a cell's first link
# is one of the French territories from the train set, store that name
# together with the text of the cell four positions further on as the density.
state_names = set(states)  # set membership instead of a list scan per cell
ix = 0
for ii, cell in enumerate(soup):
    links = cell.find_all('a')
    if not links:
        # a cell without any link cannot name a territory
        continue

    state = links[0].text
    if state not in state_names:
        continue
    if ii + 4 >= len(soup):
        # a match too close to the end has no density cell to read
        continue

    df_FR.loc[ix, 'state'] = state
    # [:-1] drops the last character of the cell text
    # (presumably a trailing newline -- confirm against the page markup)
    df_FR.loc[ix, 'density'] = soup[ii + 4].text[:-1]
    ix += 1

del soup
df_FR['country'] = 'France'
df_FR = df_FR.reset_index(drop=True)

In [594]:
# Keep only the first 6 matches (presumably later matches come from other
# tables on the page -- confirm). The explicit copy makes the column
# assignment below act on an independent frame rather than on a slice.
df_FR = df_FR.iloc[0:6].copy()
# Remove thousands separators from the scraped text, then convert to float.
df_FR['density'] = df_FR['density'].str.replace(",", "").astype(float)
df_FR

Unnamed: 0,country,state,density
0,France,French Guiana,3.0
1,France,French Polynesia,78.0
2,France,Guadeloupe,240.0
3,France,Martinique,329.0
4,France,Mayotte,693.0
5,France,New Caledonia,15.0


## Combine

In [595]:
# Stack the per-source frames in a fixed order: world, US, Australia,
# Canada, China, France. DataFrame.append was removed in pandas 2.0;
# pd.concat gives the same row-wise result and keeps each frame's own index.
df = pd.concat([df_WD, df_US, df_AU, df_CA, df_CH, df_FR])

In [596]:
# Replace the index labels inherited from the individual frames with a
# fresh 0..n-1 range, then display the combined table.
df.index = pd.RangeIndex(len(df))
df

Unnamed: 0,country,state,density
0,Monaco,,18960.0
1,Singapore,,7894.0
2,Bahrain,,1983.0
3,Malta,,1510.0
4,Maldives,,1258.0
...,...,...,...
355,France,French Polynesia,78.0
356,France,Guadeloupe,240.0
357,France,Martinique,329.0
358,France,Mayotte,693.0


In [597]:
# Write the combined table to disk; the row index is written as well,
# since to_csv includes it by default.
df.to_csv(path_or_buf='population_densities.csv')