## A Comparative Study of Regional Air Quality

This notebook uses web scraping to collect pollen and allergen data for a comparative study of air quality between five cities with three significantly different terrains and two levels of population density:
1. San Diego, California (semi-arid, coastal, 62 ft elevation, population TBD);
2. Los Angeles, California;
3. Denver, Colorado (semi-arid, mountainous, 5414 ft elevation);
4. Atlanta, Georgia;
5. Nashville, Tennessee (humid subtropical, forested, 597 ft elevation).

**Note:** the scraping loop below iterates over the nine cities defined in `zip_dict`, which omits San Diego and Atlanta and includes several cities not listed here; the list above and `zip_dict` still need to be reconciled.

The data gathered in this notebook comes from https://www.pollen.com/research/ (each city's page is that address followed by the city's zip code). The results are collected in a DataFrame and exported as an Excel workbook titled _'pollen_data.xlsx'_ in the `../data/` directory.

### Import the Required Libraries

In [1]:
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.common.by import By

### Set up the Selenium Web Driver to Iterate through the Separate Charts for Each Season and Pollen Type

In [2]:
# Start a Selenium-controlled Chrome session. The same `driver` object is used
# by the scraping loop to load each city's page and click through the
# season/pollen selectors, and is shut down once scraping is finished.
# NOTE(review): presumably requires a local Chrome install and a matching driver binary — confirm.
driver = webdriver.Chrome()

### Declare the Static Variables

In [3]:
# Season name -> XPath of its selector. Each season is the n-th <li> (1-based)
# under the element with id "seasonlist", in the order given here.
seasons = {
    season_name: f'//*[@id="seasonlist"]/li[{position}]'
    for position, season_name in enumerate(('Spring', 'Summer', 'Fall', 'Winter'), start=1)
}

# Pollen type -> XPath of its selector. Each type is the n-th <li> (1-based)
# under the element with id "pollenlist", in the order given here.
pollens = {
    pollen_name: f'//*[@id="pollenlist"]/li[{position}]'
    for position, pollen_name in enumerate(('Tree', 'Grass', 'Ragweed'), start=1)
}

In [4]:
# Zip code -> "City, ST" label. The zip code is appended to the base URL to
# reach a city's page, and the label is checked against the page title.
zip_dict = dict([
    ('98101', 'Seattle, WA'),
    ('90001', 'Los Angeles, CA'),
    ('85001', 'Phoenix, AZ'),
    ('55401', 'Minneapolis, MN'),
    ('80201', 'Denver, CO'),
    ('73301', 'Austin, TX'),
    ('10001', 'New York, NY'),
    ('37201', 'Nashville, TN'),
    ('32099', 'Jacksonville, FL'),
])

# Two-letter state abbreviation -> full state name, used to expand the "ST"
# part of the city labels above.
state_dict = dict(
    WA='Washington',
    CA='California',
    AZ='Arizona',
    MN='Minnesota',
    CO='Colorado',
    TX='Texas',
    NY='New York',
    TN='Tennessee',
    FL='Florida',
)

In [5]:
# Base URL shared by every city page; a city's zip code is appended to it
# (url + code) to form the address that the driver loads.
url = 'https://www.pollen.com/research/'

### Loop through the Charts

In [6]:
# Scrape every (city, season, pollen category) chart and collect one record per
# species. Records are accumulated in a plain list and turned into a DataFrame
# once at the end, because growing a DataFrame row by row inside the loop is slow.
records = []

# Loop through each city's page
for code, city in zip_dict.items():

    driver.get(url + code)
    # Fail fast if the zip code did not land on the expected city's page
    assert city in driver.title
    time.sleep(5)  # pause before interacting (presumably lets the page finish loading)

    # Loop through each season
    for season, season_xpath in seasons.items():

        # Select the season. find_element(By.XPATH, ...) replaces the
        # find_element_by_xpath helper that Selenium 4 removed.
        driver.find_element(By.XPATH, season_xpath).click()
        time.sleep(2)

        # Loop through each pollen category
        for category, category_xpath in pollens.items():

            # Select the category
            driver.find_element(By.XPATH, category_xpath).click()
            time.sleep(2)

            # Re-parse the current page source after the clicks. The parser is
            # named explicitly so the result does not depend on which optional
            # parsers happen to be installed.
            page_content = bs(driver.page_source, 'html.parser')
            species_divs = page_content.find_all(name='div', attrs={'class': 'col-sm-6 no-padding'})

            # Loop through each species in the chart
            for div in species_divs:
                species = div.find('a').text
                # The second CSS class of the first nested <div> is used as the
                # allergenicity label, title-cased.
                allergenicity = div.find('div').get('class')[1].title()

                records.append([city, season, category, species, allergenicity])

pollen_df = pd.DataFrame(records, columns=['city', 'season', 'category', 'species', 'allergenicity'])

pollen_df

Unnamed: 0,city,season,category,species,allergenicity
0,"Seattle, WA",Spring,Tree,Black Walnut (Juglans nigra),Severe
1,"Seattle, WA",Spring,Tree,Coastal Willow (Salix hookeriana),Severe
2,"Seattle, WA",Spring,Tree,European Privet (Ligustrum vulgare),Severe
3,"Seattle, WA",Spring,Tree,Geyer's Willow (Salix geyeriana),Severe
4,"Seattle, WA",Spring,Tree,Green Ash (Fraxinus pennsylvanica),Severe
...,...,...,...,...,...
935,"Jacksonville, FL",Winter,Ragweed,Florida Pellitory (Parietaria floridana),Severe
936,"Jacksonville, FL",Winter,Ragweed,Saltwater False Willow (Baccharis angust...,Severe
937,"Jacksonville, FL",Winter,Ragweed,Silverling (Baccharis glomeruliflora),Severe
938,"Jacksonville, FL",Winter,Ragweed,Small-Head Marsh-Elder (Iva microcephala...,Severe


#### Split the City Column

In [7]:
# Split the combined "City, ST" label into a city column and a state column,
# expanding the state abbreviation to its full name via state_dict.
# The guard makes the cell safe to re-run: once the 'state' column exists the
# labels have already been split and must not be split again.
if 'state' not in pollen_df.columns:
    city_state = pollen_df['city'].str.split(', ', n=1, expand=True)
    state_names = city_state[1].map(state_dict)

    # .map() yields NaN for abbreviations missing from state_dict; raise the
    # same KeyError a direct dictionary lookup would, before modifying the frame.
    unknown_states = city_state.loc[state_names.isna(), 1].unique()
    if len(unknown_states):
        raise KeyError(f'state abbreviations missing from state_dict: {list(unknown_states)}')

    pollen_df['city'] = city_state[0]
    pollen_df['state'] = state_names

In [8]:
# Put 'state' directly after 'city' and keep the remaining columns in their original order
column_order = ['city', 'state', 'season', 'category', 'species', 'allergenicity']
pollen_df = pollen_df.loc[:, column_order]

### Close out the Driver

In [9]:
# quit() ends the whole WebDriver session (every window plus the driver
# process). close() only closes the current window and can leave the driver
# process running in the background.
driver.quit()

### Export the DataFrame

In [10]:
# Write the scraped table to an Excel workbook (single sheet, no index column)
output_path = r'../data/pollen_data.xlsx'
pollen_df.to_excel(output_path, sheet_name='pollen_data', index=False)