## Webscraping for Regional Pollen Data

This notebook uses web scraping to collect pollen and allergen data for a comparative study of air quality among nine cities with different topographies, elevations, population densities and flora:

1.   Seattle, Washington
2.   San Diego, California
3.   Phoenix, Arizona
4.   Minneapolis, Minnesota
5.   Denver, Colorado
6.   Austin, Texas
7.   Philadelphia, Pennsylvania
8.   Nashville, Tennessee
9.   Jacksonville, Florida

The data gathered in this notebook is available at: https://www.pollen.com/research/. The scraped data is collected in a DataFrame and exported to the `../data/` directory as a CSV file titled _'pollen_data.csv'_ and an Excel file titled _'pollen_data.xlsx'_.

### Import the Required Libraries

In [None]:
import time
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.common.by import By

### Set up the Selenium Web Driver to Iterate through the Separate Charts for Each Season and Pollen Type

In [None]:
# Start a Chrome session controlled by Selenium; the scraping cells below load
# pages and click chart tabs through this `driver` object
driver = webdriver.Chrome()

### Declare the Static Variables

In [None]:
# XPath of each season tab: the tabs are the <li> children of the element whose
# id is "seasonlist", addressed by their 1-based position
seasons = {
    season_name: '//*[@id="seasonlist"]/li[{}]'.format(position)
    for position, season_name in enumerate(('Spring', 'Summer', 'Fall', 'Winter'), start=1)
}

# XPath of each pollen-type tab, built the same way from the "pollenlist" element
pollens = {
    pollen_name: '//*[@id="pollenlist"]/li[{}]'.format(position)
    for position, pollen_name in enumerate(('Tree', 'Grass', 'Ragweed'), start=1)
}

In [None]:
# ZIP code -> "City, ST" label; the ZIP code is what gets appended to the base
# URL, and the label is what ends up in the 'city' column
zip_dict = dict([
    ('98101', 'Seattle, WA'),
    ('92101', 'San Diego, CA'),
    ('85001', 'Phoenix, AZ'),
    ('55401', 'Minneapolis, MN'),
    ('80201', 'Denver, CO'),
    ('73301', 'Austin, TX'),
    ('19019', 'Philadelphia, PA'),
    ('37201', 'Nashville, TN'),
    ('32099', 'Jacksonville, FL'),
])

# Two-letter state abbreviation -> full state name
state_dict = dict(
    WA='Washington',
    CA='California',
    AZ='Arizona',
    MN='Minnesota',
    CO='Colorado',
    TX='Texas',
    PA='Pennsylvania',
    TN='Tennessee',
    FL='Florida',
)

In [None]:
# Base URL shared by every city page; a ZIP code from zip_dict is appended to it
# to reach the page for one city
url = 'https://www.pollen.com/research/'

### Loop through the Charts

In [None]:
# Scrape every (city, season, pollen category) chart into a single DataFrame.
# Rows are collected in a plain list and the DataFrame is built once at the end,
# instead of enlarging the frame one row at a time.
pollen_records = []

# Loop through each city's page
for code, city in zip_dict.items():

    driver.get(url + code)
    # Stop immediately if the loaded page's title does not mention the expected city
    assert city in driver.title
    time.sleep(5)  # fixed pause, presumably to let the page finish rendering

    # Loop through each season
    for season, season_xpath in seasons.items():

        # Select the season.
        # find_element(By.XPATH, ...) is the supported locator API; the
        # find_element_by_* helpers were removed in Selenium 4.3.
        driver.find_element(By.XPATH, season_xpath).click()
        time.sleep(2)

        # Loop through each pollen category
        for category, category_xpath in pollens.items():

            # Select the category
            driver.find_element(By.XPATH, category_xpath).click()
            time.sleep(2)

            # Re-parse the page as it looks after the clicks. The parser is named
            # explicitly so the result does not depend on which optional parsers
            # happen to be installed.
            page_content = bs(driver.page_source, 'html.parser')
            species_divs = page_content.find_all(name='div', attrs={'class': 'col-sm-6 no-padding'})

            # Loop through each species in the chart
            for div in species_divs:
                species = div.find('a').text
                # The second CSS class of the first nested <div> is stored,
                # title-cased, as the allergenicity value
                allergenicity = div.find('div').get('class')[1].title()

                pollen_records.append([city, season, category, species, allergenicity])

# Build the collective DataFrame in one step
pollen_df = pd.DataFrame(pollen_records,
                         columns=['city', 'season', 'category', 'species', 'allergenicity'])

#### Split the City Column

In [None]:
# Split the "City, ST" label into the city name and the two-letter state abbreviation.
# n=1 caps the result at two columns, matching the two target columns.
# NOTE: run this cell once per scrape; on a second run 'city' no longer contains
# ', ', so the split yields a single column and the assignment fails.
pollen_df[['city', 'state']] = pollen_df['city'].str.split(', ', n=1, expand=True)

# Fail fast, as a plain dict lookup would, if an abbreviation has no full name
unknown_states = set(pollen_df['state'].unique()) - set(state_dict)
if unknown_states:
    raise KeyError('No full state name for: {}'.format(sorted(unknown_states, key=str)))

# Vectorized lookup of the full state name (replaces a row-by-row iterrows() loop)
pollen_df['state'] = pollen_df['state'].map(state_dict)

In [None]:
# Reorder the columns so that 'state' sits directly after 'city'
pollen_df = pollen_df[['city', 'state', 'season', 'category', 'species', 'allergenicity']]

In [None]:
# Show the assembled DataFrame as this cell's output
pollen_df

### Close the Driver

In [None]:
# quit() ends the whole WebDriver session (every window plus the driver process);
# close() would only close the current browser window
driver.quit()

### Export the DataFrame

In [None]:
# Create the output folder if it is missing, so the export cannot fail after the
# scrape has already finished
output_dir = Path('../data')
output_dir.mkdir(parents=True, exist_ok=True)

# Write the same table twice: once as CSV and once as an Excel workbook
pollen_df.to_csv(output_dir / 'pollen_data.csv', index=False)
pollen_df.to_excel(output_dir / 'pollen_data.xlsx', sheet_name='pollen_data', index=False)