# Authenticate Twitter account

In [1]:
import tweepy

In [2]:
from keys import *

# Build the OAuth handler and API client. The four credential names are not
# defined in this notebook; presumably they come from the `keys` star import
# in this cell -- verify against keys.py.
auth = tweepy.OAuthHandler(API_KEY, API_SECRET_KEY)
auth.set_access_token(TOKEN, TOKEN_SECRET)

api = tweepy.API(auth)

# Make one authenticated call up front so that bad credentials are reported
# here rather than in a later cell.
try:
    api.verify_credentials()
    print("Authentication OK")
except Exception as error:
    # `Exception` instead of a bare `except:` so that KeyboardInterrupt and
    # SystemExit still propagate; the error is printed so the cause of the
    # failure (bad keys, network problem, ...) is visible.
    print("Authentication Error")
    print("{}: {}".format(type(error).__name__, error))

Authentication OK


# Scrape eBird webpage

In [3]:
import urllib.request
import requests
import time
from bs4 import BeautifulSoup

## Verify a checklist exists

In [4]:
def check_checklist_exists(sub_id):
    '''
    Check that an eBird checklist exists
    
    Args:
        sub_id (str or int): numeric part of the submission id, without
            the leading 'S'
    
    Returns:
        the `requests` response if the checklist page answers HTTP 200
        False if the checklist page answers HTTP 400
    
    Raises:
        AssertionError: if the page answers with any other status code
    '''
    
    # str() so that an int id works as well as the str ids used by callers
    checklist_id = 'S' + str(sub_id)
    checklist_start = 'https://ebird.org/view/checklist/'
    url = checklist_start + checklist_id
    
    response = requests.get(url)
    
    # Compare the numeric status code directly instead of asserting on the
    # response's repr: `assert` statements are stripped under `python -O`,
    # which would make every request look like an existing checklist.
    if response.status_code == 200:
        return response
    if response.status_code == 400:
        return False
    
    # Any other status (server error, rate limiting, ...) says nothing about
    # whether the checklist exists. AssertionError is kept as the exception
    # type because that is what the assert-based version raised here.
    raise AssertionError(
        'Unexpected response {!r} for {}'.format(response, url)
    )

In [5]:
import random

In [6]:
def get_soup(response):
    '''
    Parse a fetched page into a BeautifulSoup tree
    
    Args:
        response: object whose `.text` attribute holds the page's HTML
    
    Returns:
        BeautifulSoup object built with the "html.parser" parser
    '''
    html = response.text
    soup = BeautifulSoup(html, "html.parser")
    return soup

# Sampling configuration, named so the values can be tuned in one place.
# NOTE(review): 59344389 is presumably close to the highest submission id
# that existed when this was written -- confirm before relying on it.
MAX_SUBMISSION_ID = 59344389
N_CANDIDATE_IDS = 20       # how many random ids to draw
N_IDS_TO_FETCH = 10        # how many of those are actually requested
REQUEST_DELAY_SECONDS = 1  # pause before each request

# Random candidate ids, as strings. No seed is set, so the sample (and the
# count displayed below) differs from run to run.
numbers = [str(random.randint(0, MAX_SUBMISSION_ID)) for _ in range(N_CANDIDATE_IDS)]

# Fetch the first N_IDS_TO_FETCH candidates and keep the parsed page of each
# one that exists; the remaining candidates stay available in `numbers`.
soups = []
for number in numbers[:N_IDS_TO_FETCH]:
    time.sleep(REQUEST_DELAY_SECONDS) # Wait between requests so not overloading eBird
    response = check_checklist_exists(number)
    if response:
        soups.append(get_soup(response))

# Displayed as the cell output: how many sampled checklists exist
len(soups)

9

## Parse existing checklist

### Get region

In [7]:
def add_to_region(region, to_add, sep = ', '):
    '''
    Extend a string, inserting a separator only when it is non-empty
    
    Inputs:
        region: the text accumulated so far
        to_add: the text to put at the end of `region`
        sep: placed between `region` and `to_add`, but only
            when `region` already holds some text
    
    Returns:
        the extended string
    '''
    
    # An empty `region` takes no separator, so the result never starts with one
    prefix = region + sep if region else region
    return prefix + to_add

In [8]:
def get_region(soup):
    '''
    Build a comma-separated region string from a parsed checklist page
    
    Every <a> tag whose href contains "region" contributes its text, in
    document order, e.g. 'Dane County, Wisconsin, United States'.
    
    Args:
        soup: parsed page (BeautifulSoup object)
    
    Returns:
        the combined region string; '' if no matching link has text
    '''
    
    region = ''
    
    # Find the region as linked to in the eBird text
    for link in soup.find_all('a'):
        # `.get` returns None for anchors without an href, so no exception
        # handling is needed to skip them
        href = link.get('href')
        if not href or 'region' not in href:
            continue

        # For links in format <a href=''>string</a>
        string = link.string

        # For links in format <a href=''><span>string</span></a>
        if not string:
            span = link.find('span')
            string = span.string if span is not None else None

        # A region link with no usable text (no <span>, or a <span> without
        # a single string) is skipped instead of raising
        if not string:
            continue

        region = add_to_region(region, string)

    return region

In [9]:
# Print the region string parsed from each fetched checklist, one per line
for soup in soups:
    print(get_region(soup))

Dane County, Wisconsin, United States
Florence County, Wisconsin, United States
Montréal County, Quebec, Canada
Aruba, Aruba
Grayson County, Texas, United States
Grand Isle County, Vermont, United States
North Goa County, Goa, India
Barton County, Kansas, United States
Ottawa County, Michigan, United States


### Get number of species

In [10]:
def get_species(soup):
    '''
    Get the number of species shown on a parsed checklist page
    
    Args:
        soup: parsed page (BeautifulSoup object)
    
    Returns:
        the text of the first <span> whose first class is
        'StatsIcon-stat-count' (the span's `.string`, not an int)
        None if the page has no such span
    '''
    for span in soup.find_all('span'):
        # `.get` returns None when the span has no class attribute, which
        # replaces the previous bare `except:` around the lookup
        classes = span.get('class')
        if not classes:
            # No class (or an empty class list) for this span
            continue

        if classes[0] == 'StatsIcon-stat-count':
            return span.string

    return None

In [13]:
# Print the species count parsed from each fetched checklist, one per line
for soup in soups:
    print(get_species(soup))

4
49
37
5
66
27
6
9
12
