## Scrape the images you need from the US Congress website and store them locally in the `us_congress_images` directory

In [7]:
import os

import requests
from bs4 import BeautifulSoup

In [8]:
# Root of the US Congress website; it is prepended to each scraped image path
base_url = 'https://www.congress.gov'
# Accumulator of {"url": ..., "party": ...} dicts, one per member of Congress
urls_and_labels = list()

In [9]:
# Function to scrape all the image urls and party affiliations

def get_congress_urls(url_string, timeout=30):
    """
    Scrape one member-listing page of the US Congress website.

    The page at `url_string` is downloaded and parsed, and for every
    `<li class="expanded">` entry that has both a photo and a party field
    a dict `{"url": <photo url>, "party": <0 or 1>}` is appended to the
    module-level `urls_and_labels` list. The photo url is `base_url`
    followed by the `src` attribute of the member's `<img>` tag, so the
    photo can be downloaded from that url later.

    Party coding: the text "Democratic" is coded as 0 and every other
    party text is coded as 1 (this includes Libertarian Amash, who is
    counted as a Republican because he used to be one).

    An entry is skipped, and nothing is appended for it, when it has no
    `member-image` div, no `<img>` with a `src`, no `member-profile` div,
    or no "Party" span with a nested span holding the party name. Skipping
    avoids labelling a member with the party of the previous entry.

    Parameters
    ----------
    url_string : str
        Url of a member-listing page.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    None
        Results are accumulated in `urls_and_labels`.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    """
    page = requests.get(url_string, timeout=timeout)
    page.raise_for_status()  # do not try to parse an error page
    soup = BeautifulSoup(page.content, 'html.parser')

    for person in soup.find_all('li', class_="expanded"):
        # If there's no picture, don't add the congressmember to the dataset
        img_elm = person.find('div', class_='member-image')
        if img_elm is None:
            continue
        img_tag = img_elm.find('img')
        img_src = img_tag.get('src') if img_tag is not None else None
        if not img_src:
            continue

        profile = person.find('div', class_="member-profile")
        if profile is None:
            continue

        # Reset for every member so a missing party field can never
        # silently reuse the label of the previous member.
        party = None
        for elm in profile.find_all('span'):
            if "Party" not in elm.text:
                continue
            party_span = elm.find('span')
            if party_span is None:
                continue
            # Democrats are coded as 0, everyone else as 1
            if party_span.text.strip() == "Democratic":
                party = 0
            else:
                party = 1
        if party is None:  # no usable party field: the photo cannot be labelled
            continue

        urls_and_labels.append({"url": base_url + img_src, "party": party})

#### *Run the above function on all three urls that list members of Congress*

In [10]:
# The three result pages (250 entries per page) listing the members of the 116th Congress
url_1, url_2, url_3 = (
    'https://www.congress.gov/members?q={%22congress%22:%22116%22}&pageSize=250&page=1&searchResultViewType=expanded',
    'https://www.congress.gov/members?q=%7B%22congress%22%3A%22116%22%7D&searchResultViewType=expanded&pageSize=250&page=2',
    'https://www.congress.gov/members?q=%7B%22congress%22%3A%22116%22%7D&searchResultViewType=expanded&pageSize=250&page=3',
)

In [11]:
# Empty the list first so that re-running this cell does not append
# every member of Congress a second time.
urls_and_labels.clear()
for url in [url_1, url_2, url_3]:
    get_congress_urls(url)

In [12]:
# Sanity check: number of {"url", "party"} records collected from the listing pages
len(urls_and_labels)

537

The list has 537 elements, which is odd because there are only 535 voting members of Congress (435 in the House and 100 Senators). After investigating further, it turns out that, in addition to these 535 members, there are 6 non-voting members of the House from US territories. This should put the total at 541, but 4 members of Congress did not have photos, which explains the final count of 537.

#### *Next, download each image and save it locally*

In [15]:
import time

# The function below downloads each image and saves it locally.
# It also keeps track of how long the download took in seconds.
def download_congress_images(url_list, output_dir="us_congress_images", timeout=30):
    """
    Download every image in `url_list` and save it under `output_dir`.

    Each file is named `<party>_<original file name>`, where `<party>` is
    'democrat' when the entry's 'party' value is 0 and 'republican'
    otherwise, and the original file name is the last path segment of the
    image url. Keeping the party in the file name lets the labeled dataset
    be built later from the file names alone.

    The elapsed time in seconds is printed once all downloads are done.

    Parameters
    ----------
    url_list : list of dict
        Entries with the keys 'url' (image url) and 'party' (0 or 1).
        An empty list is allowed; nothing is downloaded in that case.
    output_dir : str, optional
        Directory the images are written to; created if it does not exist.
    timeout : float, optional
        Seconds to wait for the server on each request (default 30).

    Returns
    -------
    None

    Raises
    ------
    requests.HTTPError
        If a download answers with an error status code, so that an error
        page is never stored as if it were an image.
    """
    os.makedirs(output_dir, exist_ok=True)
    start = time.perf_counter()
    for member_obj in url_list:
        url = member_obj['url']
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        party = 'democrat' if member_obj['party'] == 0 else 'republican'
        # Last path segment of the url, independent of the length of the url prefix
        file_name = url.rsplit('/', 1)[-1]
        # The with-block closes the file; no explicit close() is needed
        with open(os.path.join(output_dir, party + '_' + file_name), 'wb') as f:
            f.write(response.content)
    end = time.perf_counter()
    print(end - start, ' seconds')

In [16]:
# Download every collected photo; the elapsed time in seconds is printed when done
download_congress_images(urls_and_labels)

132.96079802513123  seconds


### Now all the images have been downloaded and you can start building the dataset and the model.