## A Comparative Study of Regional Air Quality

This notebook uses web scraping to collect general city data for a comparative study of air quality between five cities with three significantly different terrains and two levels of population density:
1. San Diego, California
2. Los Angeles, California 
3. Denver, Colorado
4. Atlanta, Georgia
5. Nashville, Tennessee

The data gathered in this notebook is scraped from each city's Wikipedia page, found under https://en.wikipedia.org/wiki/. The resulting data is saved as a DataFrame and exported as an Excel file titled _'city_data.xlsx'_.

NOTES:<br>
- The 'metro' population value is used in this analysis. This value is defined as 'A region that consists of a densely populated urban agglomeration and its surrounding territories sharing industries, commercial areas, transport network, infrastructures and housing. Metropolitan areas typically include satellite cities, towns and intervening rural areas that are socioeconomically tied to the principal cities or urban core, often measured by commuting patterns.'<br>
>(_Source: https://en.wikipedia.org/wiki/Metropolitan_area_)
- The city's population rank is based on Wikipedia's "List of United States cities by population".<br>
>(_Source: https://en.wikipedia.org/wiki/List_of_United_States_cities_by_population_)

### Import the Required Libraries

In [1]:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import numpy as np
import re

### Declare the Static Variables

In [2]:
# Wikipedia page slug for each city, keyed by the city's display name.
# The slug is the part of the page address that follows the common base url.
city_id = dict([
    ('San Diego', 'San_Diego'),
    ('Los Angeles', 'Los_Angeles'),
    ('Denver', 'Denver'),
    ('Atlanta', 'Atlanta'),
    ('Nashville', 'Nashville'),
])

In [3]:
# Base address shared by every city page; a city's slug is appended to it
url = "https://en.wikipedia.org/wiki/"

### Loop through Each City's Page

In [4]:
# Seconds to wait for a page before giving up, so a stalled connection
# cannot hang the notebook indefinitely
REQUEST_TIMEOUT_S = 30


def _parse_elevation_ft(cell_text):
    """Return the elevation in feet, rounded to the nearest 10, from an infobox cell.

    Only the text before ' ft' is used. When that text is a range written with
    an en dash or with ' to ', the mean of the two bounds is returned.
    """
    # Non-breaking spaces are normalised so the ' ft' split works; thousands
    # separators are dropped so the values parse as integers
    feet_text = cell_text.replace('\xa0', ' ').split(' ft')[0].replace(',', '')
    for separator in ('–', ' to '):
        if separator in feet_text:
            low, high = feet_text.split(separator)[:2]
            return int(round(np.mean([int(low), int(high)]), -1))
    return int(round(int(feet_text), -1))


# One dict of scraped values per city, in the order the cities are visited
data_list = []
# Climate names already assigned to a city, used to prefer shared names
climate_types = []

# Loop through each city's page
for city, page_id in city_id.items():

    response = requests.get(url + page_id, timeout=REQUEST_TIMEOUT_S)
    # Fail loudly on an HTTP error instead of parsing an error page
    response.raise_for_status()
    # Name the parser explicitly so the parse does not depend on which
    # optional parsers happen to be installed
    wiki_html = bs(response.text, 'html.parser')
    assert city in wiki_html.find('title').text, 'Unexpected page title for ' + city

    # Save the relevant table and its rows
    table_data = wiki_html.find('table', attrs={'class': 'ib-settlement'})
    if table_data is None:
        raise ValueError('No settlement infobox found on the page for ' + city)
    table_rows = table_data.find_all('tr')

    data_dict = {}

    # Extract the city name (remove any state information after the comma)
    data_dict['city'] = table_data.find('div', attrs={'class': 'fn org'}).text.split(',')[0]

    # Extract the state, county, population and elevation data
    for row in table_rows:
        # First data cell of the row; None for rows that only hold a header
        cell = row.find('td')
        for header in row.find_all('th'):
            label = header.text
            if re.fullmatch('State', label, flags=re.IGNORECASE):
                data_dict['state'] = cell.text
            elif re.fullmatch('County|Counties|City and County', label, flags=re.IGNORECASE):
                # Remove footnote markers (brackets and digits) and the ' County' suffix
                data_dict['county'] = re.sub(r'[\[\d\]]| County', '', cell.text)
            elif 'Population' in label:
                # The text of the row's first link is stored as the census year
                data_dict['pop_year'] = row.find('a').text.replace(',', '')
            elif 'Metro' in label:
                # The text before ' (' is the metro population (rounded to the
                # nearest 1000); the text after it is the rank, with its last
                # three characters dropped
                metro_parts = cell.text.split(' (')
                data_dict['pop_metro'] = round(int(metro_parts[0].replace(',', '')), -3)
                data_dict['pop_rank'] = metro_parts[1][:-3].replace(',', '')
            elif 'Density' in label:
                # Keep only the value before '/sq mi', rounded to the nearest 10
                density_text = cell.text.replace('\xa0', ' ').replace(',', '').split('/sq mi')[0]
                data_dict['pop_density_sqmi'] = int(round(float(density_text), -1))
            elif 'Land' in label:
                # Keep only the value before ' sq', rounded to the nearest 10
                area_text = cell.text.replace('\xa0', ' ').split(' sq')[0].replace(',', '')
                data_dict['city_area_sqmi'] = int(round(float(area_text), -1))
            elif 'Elevation' in label:
                data_dict['elevation_ft'] = _parse_elevation_ft(cell.text)

    # Extract the climate data, accounting for multiple variations per city.
    # Every name is normalised the same way (title case, 'Continental ' prefix
    # removed) so the comparison against climate_types below is like-for-like.
    city_climates = []
    for link in wiki_html.find_all('a'):
        if link.get('title') and re.fullmatch(r'.+\sclimate', link.text, flags=re.IGNORECASE):
            # Drop the trailing 8 characters (whitespace + 'climate')
            city_climates.append(link.text[:-8].title().replace('Continental ', ''))

    # Give precedence to a climate type already used by an earlier city;
    # otherwise fall back to the first type found on this page.
    # A city with no climate links gets no 'climate' entry.
    if city_climates:
        shared_climates = [climate for climate in city_climates if climate in climate_types]
        chosen_climate = shared_climates[0] if shared_climates else city_climates[0]
        data_dict['climate'] = chosen_climate
        if chosen_climate not in climate_types:
            climate_types.append(chosen_climate)

    data_list.append(data_dict)

city_data = pd.DataFrame(data_list)
city_data

Unnamed: 0,city,state,county,city_area_sqmi,elevation_ft,pop_year,pop_density_sqmi,pop_metro,pop_rank,climate
0,San Diego,California,San Diego,330,60,2020,4260,3299000,17,Semi-Arid
1,Los Angeles,California,Los Angeles,470,300,2020,8300,13201000,2,Semi-Arid
2,Denver,Colorado,Denver,150,5410,2020,4670,2964000,19,Semi-Arid
3,Atlanta,Georgia,"Fulton, DeKalb",140,890,2020,3690,6144000,8,Humid Subtropical
4,Nashville,Tennessee,Davidson,500,600,2020,1420,1990000,36,Humid Subtropical


### Export the Dataframe

In [5]:
# Write the city table to an Excel workbook in the data folder,
# on a sheet named 'city_data' and without the DataFrame index column
city_data.to_excel(
    r'../data/city_data.xlsx',
    sheet_name='city_data',
    index=False,
)