## A Comparative Study of Regional Air Quality

This notebook uses web scraping to collect general city data for a comparative study of air quality across five cities that span three markedly different terrains and two levels of population density:
1. San Diego, California (semi-arid, coastal, 62 ft elevation, population 1,386,932 as of 2020);
2. Los Angeles, California;
3. Denver, Colorado (semi-arid, mountainous, 5414 ft elevation);
4. Atlanta, Georgia;
5. Nashville, Tennessee (humid subtropical, forested, 597 ft elevation).

The data gathered in this notebook is scraped from each city's English Wikipedia page; every page URL starts with https://en.wikipedia.org/wiki/ followed by the city's page id.

### Import the Required Libraries

In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs
import time
import string

### Declare the Static Variables

In [2]:
# Display name of each city paired with the page id that is appended to the
# common Wikipedia url to reach that city's article.
city_id = dict([
    ('San Diego', 'San_Diego'),
    ('Los Angeles', 'Los_Angeles'),
    ('Denver', 'Denver'),
    ('Atlanta', 'Atlanta'),
    ('Nashville', 'Nashville'),
])

In [3]:
# Infobox header labels to search for, keyed by the DataFrame column each
# one is meant to fill.
header_text = dict(
    county='County',
    pop_total='Population',
    pop_density='Density',
    elevation='Elevation',
    climate='Climate',  # a title="SOMETHING SOMETHING climate"
)
# Fields that are located by tag/attribute rather than by header label:
# 'city' in div class="fn org"
# 'usgs_year' in sup id="cite_ref-usgs_7-0", class="reference" UNDER THE ELEVATION HEADER
# 'pop_year' in div class='ib-settlement-fn' -- span class="nowrap" -- a (text) UNDER POPULATION HEADER

In [4]:
# The common url prefix shared by every city page; a value from `city_id`
# is appended to it to form the full page address.
url = 'https://en.wikipedia.org/wiki/'

In [5]:
# Peek at the first url id. This relies on the dict keeping insertion order,
# which the language only guarantees from Python 3.7 onwards.
list(city_id.values())[0]

'San_Diego'

In [6]:
# Peek at the first city display name (same insertion-order caveat as for the values).
list(city_id.keys())[0]

'San Diego'

In [7]:
# Build the page address for the first city only (San_Diego) as a trial run.
urltest = url + list(city_id.values())[0]

# requests applies no timeout by default, so without one this cell can hang
# indefinitely if the server never answers. The status code is inspected in
# the next cell.
wiki_html = requests.get(urltest, timeout=10)

In [8]:
# Report the HTTP status code and the type of the object returned by requests.
for label, value in (('Status Code:', wiki_html.status_code),
                     ('Status Type:', type(wiki_html))):
    print(label, value)
print('Code:')
#print(wiki_html.text)

Status Code: 200
Status Type: <class 'requests.models.Response'>
Code:


In [9]:
# Parse the downloaded page into a BeautifulSoup tree.
# - The parser is named explicitly: without it BeautifulSoup picks whichever
#   parser happens to be installed, so the tree can differ between machines
#   and a GuessedAtParserWarning is emitted.
# - `wiki_html` is rebound from the Response to the soup. The isinstance
#   guard makes re-running this cell a no-op; without it a second run would
#   parse the soup's plain text (its `.text`) instead of the original HTML.
if not isinstance(wiki_html, bs):
    wiki_html = bs(wiki_html.text, 'html.parser')
#wiki_html
#time.sleep(2)  # NOTE(review): presumably a pause between requests once several cities are fetched - confirm

In [10]:
# Show the page's <title> tag (attribute access is shorthand for find('title')).
wiki_html.title

<title>San Diego - Wikipedia</title>

In [11]:
# Show only the text inside the <title> tag.
wiki_html.find('title').get_text()

'San Diego - Wikipedia'

In [12]:
# Sanity check: the title of the downloaded page must mention the first city.
expected_city = list(city_id.keys())[0]
page_title = wiki_html.find('title').text
assert expected_city in page_title

# Empty frame that defines the columns to be collected for each city.
city_info_columns = ['city', 'county', 'pop_year', 'pop_total',
                     'pop_density', 'usgs_year', 'elevation', 'climate']
city_info = pd.DataFrame(columns=city_info_columns)

city_info

Unnamed: 0,city,county,pop_year,pop_total,pop_density,usgs_year,elevation,climate


The tags in the source code:

```python
header_text = {'county':'County',
               'pop_total':'Population',
               'pop_density':'Density',
               'elevation':'Elevation',
               'climate':'Climate'} # a title="SOMETHING SOMETHING climate"
# 'city' in div class="fn org"
# 'usgs_year' in sup id="cite_ref-usgs_7-0", class="reference" UNDER THE ELEVATION HEADER
# 'pop_year' in div class='ib-settlement-fn' -- span class="nowrap" -- a (text) UNDER POPULATION HEADER
```

In [13]:
# Grab the first table carrying the 'ib-settlement' class (the settlement infobox).
# find() returns the first match directly, replacing the deprecated
# findAll(...)[0]; a missing infobox is reported explicitly instead of
# surfacing as a bare IndexError.
table_info = wiki_html.find('table', attrs={'class': 'ib-settlement'})
if table_info is None:
    raise ValueError("No table with class 'ib-settlement' found on the page")

In [14]:
# Every <tr> inside the infobox, in document order.
# find_all is the supported name; findAll is a deprecated legacy alias.
table_rows = table_info.find_all('tr')

In [15]:
# Inspect one row by position: for the San Diego page fetched here, index 27
# is the Elevation row (see the output). The index is hard-coded;
# NOTE(review): it presumably differs for other cities or page revisions.
table_rows[27]

<tr class="mergedtoprow"><th class="infobox-label" scope="row">Elevation<div class="ib-settlement-fn"><sup class="reference" id="cite_ref-usgs_7-0"><a href="#cite_note-usgs-7">[7]</a></sup></div></th><td class="infobox-data">62 ft (19 m)</td></tr>

In [16]:
# Header cells of the first infobox row; this is where the city name lives.
# find_all replaces the deprecated findAll alias.
table_rows[0].find_all('th')

[<th class="infobox-above" colspan="2"><div class="fn org">San Diego</div></th>]

In [17]:
# any() over the keys is True as soon as one key is a non-empty string, so
# for city_id this is always True and does not test anything about the page.
# NOTE(review): presumably meant to check whether any city name occurs in the
# header cell shown above - confirm the intent.
any(city_id.keys())

True

In [19]:
# Read one data cell by position and replace non-breaking spaces ('\xa0')
# with plain spaces. For the San Diego page fetched here, index 33 is the
# population-density row (see the output); the index is hard-coded.
#table_rows[27].find('td').text.replace('\xa0', ' ')
table_rows[33].find('td').text.replace('\xa0', ' ')

'4,255.96/sq mi (1,643.25/km2)'

In [20]:
# The city name sits in the <div> whose class attribute is exactly "fn org".
table_info.find('div', class_='fn org').text

'San Diego'

In [24]:
def _cell_text(tag):
    """Return ``tag.text`` with non-breaking spaces replaced by plain spaces.

    Returns None when ``tag`` is None (a BeautifulSoup ``find`` that matched
    nothing), so the caller can skip the row instead of hitting an
    AttributeError on ``None.text``.
    """
    if tag is None:
        return None
    return tag.text.replace('\xa0', ' ')


# Collect the infobox fields for one city into a plain dict.
info = {}
info['city'] = table_info.find('div', attrs={'class': 'fn org'}).text

# 'Total' and 'Density' labels are only meaningful for the population once
# the Population header row has been passed: a 'Total' row also occurs
# before that header, and accepting it made `pop_total` correct only
# because a later row happened to overwrite it.
seen_population_header = False

for row in table_rows:
    for h in row.find_all('th'):
        label = h.text
        if 'Population' in label:
            seen_population_header = True
            # The census year is the text of the first link in the header row.
            year = _cell_text(row.find('a'))
            if year is not None:
                info['pop_year'] = year
        elif 'Total' in label:
            value = _cell_text(row.find('td'))
            # Keep the first 'Total' after the Population header only.
            if seen_population_header and value is not None and 'pop_total' not in info:
                info['pop_total'] = value
        elif 'Density' in label:
            value = _cell_text(row.find('td'))
            # Keep the first 'Density' after the Population header only.
            if seen_population_header and value is not None and 'pop_density' not in info:
                info['pop_density'] = value
        elif 'Elevation' in label:
            value = _cell_text(row.find('td'))
            if value is not None:
                info['elevation'] = value

# TODO: 'county', 'usgs_year' and 'climate' are columns of city_info but are
# not extracted yet. Earlier stub for the county, kept for reference:
#    if (header.text == 'County') | (header.text == 'City and county'):
#        info['county'] = 's'

In [25]:
# Show the fields collected for the city; 'county', 'usgs_year' and 'climate'
# are absent because the extraction loop does not fill them.
print(info)

{'city': 'San Diego', 'pop_total': '1,386,932', 'elevation': '62 ft (19 m)', 'pop_year': '2020', 'pop_density': '4,255.96/sq mi (1,643.25/km2)'}


In [26]:
# List the target columns, for comparison with the keys collected in `info`.
city_info.columns

Index(['city', 'county', 'pop_year', 'pop_total', 'pop_density', 'usgs_year',
       'elevation', 'climate'],
      dtype='object')