In [None]:
from threading import Lock
from time import time
from urllib.request import urlopen

import requests
from lxml import html

## scenario 1: no multithreading
The baseline approach fetches documents from govinfo.gov *sequentially*: each request must finish before the next one starts.

In [None]:
N_DOCUMENTS = 20  # number of documents to crawl
# Bill numbers start at 1, so the upper bound must be N_DOCUMENTS + 1 for the
# range to contain exactly N_DOCUMENTS entries (1 .. N_DOCUMENTS inclusive).
xml_range = range(1, N_DOCUMENTS + 1)

In [None]:
# One entry per fetched bill; each entry is the list of strings returned by
# the corresponding XPath query for that bill.
bill_rollnum = []
bill_chamber = []
bill_action = []
bill_date = []
bill_voteurl = []

# Seconds to wait on the server before giving up. requests applies no timeout
# unless one is passed, so a stalled connection would otherwise hang the loop.
REQUEST_TIMEOUT = 30

start = time()
for i in xml_range:
    print("fetching {}".format(i))
    bill_url = 'https://www.govinfo.gov/bulkdata/BILLSTATUS/115/hr/BILLSTATUS-115hr' + str(i) + '.xml'
    bill_source = requests.get(bill_url, timeout=REQUEST_TIMEOUT)
    # Stop on 4xx/5xx: an error page would parse without complaint and yield
    # empty XPath results, which looks identical to "this bill has no House votes".
    bill_source.raise_for_status()
    # NOTE(review): the XML payload is parsed with lxml's HTML parser and the
    # queries below use lower-case tag names - presumably the parser
    # normalises tag case; confirm before switching parsers.
    bill_ntree = html.document_fromstring(bill_source.content)

    # Every query is restricted to recorded votes whose chamber is "House".
    bill_rollnum_text = bill_ntree.xpath('//recordedvote[descendant::chamber/text()="House"]/rollnumber/text()')
    bill_rollnum.append(bill_rollnum_text)
    bill_chamber_text = bill_ntree.xpath('//recordedvote/chamber[text()="House"]/text()')
    bill_chamber.append(bill_chamber_text)
    bill_action_text = bill_ntree.xpath('//recordedvote[descendant::chamber/text()="House"]/fullactionname/text()')
    bill_action.append(bill_action_text)
    bill_date_text = bill_ntree.xpath('//recordedvote[descendant::chamber/text()="House"]/date/text()')
    bill_date.append(bill_date_text)
    bill_voteurl_text = bill_ntree.xpath('//recordedvote[descendant::chamber/text()="House"]/url/text()')
    bill_voteurl.append(bill_voteurl_text)
print("execution time", time() - start)

## scenario 2: with multithreading
This approach uses a pool of worker threads to fetch documents concurrently. The pool reuses its threads across documents instead of starting a new thread for each one.

In [None]:
from multiprocessing import Manager
from concurrent.futures import ThreadPoolExecutor
from threading import current_thread

In [None]:
# One entry per fetched bill; each entry is the list of strings returned by
# the corresponding XPath query for that bill.
bill_rollnum_temp = []  # temp list for comparison - NEED TO REMOVE TEMP
bill_chamber = []
bill_action = []
bill_date = []
bill_voteurl = []

# Guards the five appends in crawl(). Without it, worker threads can interleave
# their appends, so that index k refers to different bills in different lists.
_results_lock = Lock()


def crawl(i, timeout=30):
    """Fetch the bill-status document for House bill ``i`` and record its House votes.

    The extracted values are appended to the module-level lists
    ``bill_rollnum_temp``, ``bill_chamber``, ``bill_action``, ``bill_date``
    and ``bill_voteurl``. All five appends happen under one lock, so the
    lists stay aligned with each other when this function runs in several
    threads. The order of entries still follows completion order, not ``i``.

    Args:
        i: Bill number, interpolated into the govinfo.gov URL.
        timeout: Seconds to wait on the server before giving up. requests
            applies no timeout unless one is passed.

    Returns:
        A tuple ``(rollnum, chamber, action, date, voteurl)`` holding the same
        five XPath result lists that were appended.

    Raises:
        requests.RequestException: on connection failure, timeout, or a
            4xx/5xx response.
    """
    # define url path
    bill_url = 'https://www.govinfo.gov/bulkdata/BILLSTATUS/115/hr/BILLSTATUS-115hr' + str(i) + '.xml'
    bill_source = requests.get(bill_url, timeout=timeout)
    # An error page would parse without complaint and yield empty results,
    # which looks identical to "this bill has no House votes".
    bill_source.raise_for_status()
    bill_ntree = html.document_fromstring(bill_source.content)

    # Run every query before taking the lock so the lock is held only for the
    # appends themselves.
    rollnum = bill_ntree.xpath('//recordedvote[descendant::chamber/text()="House"]/rollnumber/text()')
    chamber = bill_ntree.xpath('//recordedvote/chamber[text()="House"]/text()')
    action = bill_ntree.xpath('//recordedvote[descendant::chamber/text()="House"]/fullactionname/text()')
    date = bill_ntree.xpath('//recordedvote[descendant::chamber/text()="House"]/date/text()')
    voteurl = bill_ntree.xpath('//recordedvote[descendant::chamber/text()="House"]/url/text()')

    with _results_lock:
        bill_rollnum_temp.append(rollnum)
        bill_chamber.append(chamber)
        bill_action.append(action)
        bill_date.append(date)
        bill_voteurl.append(voteurl)

    return rollnum, chamber, action, date, voteurl

In [None]:
# Upper bound on concurrent downloads. The speed-up over the sequential run is
# at most N_THREADS x; the actual gain depends on the network and the server.
N_THREADS = 10

start = time()

with ThreadPoolExecutor(max_workers=N_THREADS) as executor:
    # Keep every Future: an exception raised inside crawl() is stored on its
    # Future and is lost silently if the Future is thrown away.
    futures = {idx: executor.submit(crawl, idx) for idx in xml_range}

# Leaving the with-block waits for all submitted tasks, so the timing below
# covers the complete crawl.
print("execution time", time() - start)

# Report failed downloads instead of dropping them without a trace.
for idx, future in futures.items():
    error = future.exception()
    if error is not None:
        print("failed to fetch {}: {!r}".format(idx, error))