In [1]:
import base64
import os
import subprocess
import urllib

import openpyxl
import requests
import textract

from api import API_KEY

def try_download(link):
    """Download ``link`` with wget into a local ``temp.<ext>`` file.

    The file extension is guessed from the text of the link so that the
    downstream text extractor can pick a parser based on the file name.

    Parameters
    ----------
    link : str
        URL to download.

    Returns
    -------
    str
        The local file name the download was written to (``temp.<ext>``).
        The name is returned even when the download failed; in that case
        the file does not exist, so callers should check for it.
    """

    # Some possible extensions
    if link.endswith(('.html', '.htm')):
        ext = 'html'
    elif '.pdf' in link or '=pdf' in link:
        ext = 'pdf'
    elif '.aspx' in link:
        ext = 'aspx'
    elif '.docx' in link:
        ext = 'docx'

    # Default to html when nothing else matches
    else:
        ext = 'html'

    # Downloads locally under dl_name
    dl_name = f'temp.{ext}'

    # Pass the arguments as a list instead of building a shell string:
    # URLs routinely contain '&', '?', ';' or spaces, which a shell would
    # interpret (splitting the URL or running extra commands).
    try:
        result = subprocess.run(
            ['wget', '-O', dl_name, link, '--no-check-certificate'],
            check=False,
        )
        succeeded = result.returncode == 0
    except OSError:
        # wget is missing or could not be started
        succeeded = False

    # wget -O can leave an empty or partial file behind when the download
    # fails; remove it so that "file exists" really means "download worked".
    if not succeeded and os.path.exists(dl_name):
        os.remove(dl_name)

    return dl_name

def load_links(sheet=1):
    """Collect the hyperlinks stored in column 2 of one sheet of 'live.xlsx'.

    Parameters
    ----------
    sheet : int
        Positional index of the worksheet to read.

    Returns
    -------
    dict
        Maps ``row number - 2`` to the hyperlink target found in that row.
        Rows whose cell carries no hyperlink are left out.
    """

    # openpyxl is used because it keeps the hyperlinks attached to cells
    workbook = openpyxl.load_workbook('live.xlsx')

    # Pick the worksheet by its position among all sheet names
    sheet_name = workbook.sheetnames[sheet]
    worksheet = workbook[sheet_name]

    found = {}
    row_number = 1
    last_row = worksheet.max_row
    while row_number <= last_row:
        try:
            target = worksheet.cell(row=row_number, column=2).hyperlink.target
            found[row_number - 2] = target
        except AttributeError:
            # No hyperlink on this cell, nothing to record
            pass
        row_number += 1

    return found

def url_to_bill_id(url):
    '''Look up the legiscan bill id that belongs to a legiscan bill url.

    The state is read from the path segment that follows 'legiscan.com',
    the master list for that state is requested from the legiscan api, and
    the bill id is found by matching urls case-insensitively.

    Parameters
    ----------
    url : str
        A legiscan url. A 'text' path segment is treated as 'bill' so that
        a bill-text url matches the bill url listed in the master list.

    Returns
    -------
    The 'bill_id' value of the matching master list entry.

    Raises
    ------
    ValueError
        If the url has no 'legiscan.com' segment.
    KeyError
        If the url is not found in the master list.
    '''

    # Extract state name from url; match the host case-insensitively but
    # keep the state segment exactly as given
    segments = url.split('/')
    lowered = [segment.lower() for segment in segments]
    ind = lowered.index('legiscan.com')
    state = segments[ind + 1]

    # Grab masterlist from api for this state
    response = requests.get(
        'https://api.legiscan.com/',
        params={'key': API_KEY, 'op': 'getMasterList', 'state': state},
    )
    master_list = response.json()['masterlist']

    # Generate mapping from lowercased url to bill id, skipping entries
    # that carry no url
    url_mapping = {
        entry['url'].lower(): entry['bill_id']
        for entry in master_list.values()
        if 'url' in entry
    }

    # The mapping keys are lowercased, so the lookup key must be too.
    # Only a whole 'text' path segment is swapped for 'bill'; a blanket
    # substring replace would also rewrite 'text' inside other segments.
    bill_url = '/'.join(
        'bill' if segment == 'text' else segment for segment in lowered
    )
    return url_mapping[bill_url]

def load_legiscan_bill_text(bill_id):
    """Fetch a bill document through the legiscan api and extract its text.

    The api returns the document base64 encoded together with its mime
    type. The document is written to a temporary ``temp.<ext>`` file so
    that textract can process it, and the file is removed afterwards.

    Parameters
    ----------
    bill_id
        Identifier passed as the ``id`` parameter of the ``getBillText``
        api operation.

    Returns
    -------
    The value returned by ``textract.process`` for the saved document.

    Raises
    ------
    RuntimeError
        If the mime type is neither html nor pdf.
    """

    # Use legiscan api to get bill text
    response = requests.get(
        'https://api.legiscan.com/',
        params={'key': API_KEY, 'op': 'getBillText', 'id': bill_id},
    )

    # Parse the response body once and reuse it
    doc_info = response.json()['text']

    # Decode
    decoded = base64.b64decode(doc_info['doc'])

    # Figure out if returned as html or pdf
    resp_type = doc_info['mime']
    if 'html' in resp_type:
        ext = 'html'
    elif 'pdf' in resp_type:
        ext = 'pdf'
    else:
        raise RuntimeError('Unknown ext:', resp_type)

    temp_path = f'temp.{ext}'
    try:
        # Save as correct type
        with open(temp_path, 'wb') as f:
            f.write(decoded)

        # Load the text w/ textract library
        return textract.process(temp_path)
    finally:
        # Remove temp file when done, also when extraction failed, so a
        # stale file is never left behind
        if os.path.exists(temp_path):
            os.remove(temp_path)

def load_site_text(url):
    """Download ``url`` and return the text extracted from it.

    Parameters
    ----------
    url : str
        Address handed to ``try_download``.

    Returns
    -------
    The value returned by ``textract.process`` for the downloaded file, or
    None when nothing was downloaded or the text could not be extracted.
    """

    # Try downloading the url directly
    file = try_download(url)

    # If did not download, return None
    if not os.path.exists(file):
        return None

    # Try to extract the text from the file
    try:
        return textract.process(file)
    except Exception:
        # Best effort: any extraction error means "no text". Catching
        # Exception (not a bare except) keeps Ctrl-C / kernel interrupts
        # working during a long scraping run.
        return None
    finally:
        # Remove temp file when done, whatever happened above
        os.remove(file)

def process_link(link):
    """Return the bill text behind ``link``, or None when it cannot be loaded.

    Links containing 'legiscan' are served through the legiscan api, but
    only when the url has an 'id' path segment followed by the bill id;
    other legiscan links give None. Every other link is downloaded
    directly, first in lower case and then, if that yields nothing, exactly
    as given.
    """

    as_given = link
    lowered = as_given.lower()

    if 'legiscan' in lowered:
        parts = lowered.split('/')

        # Without an 'id' segment the bill id is not available from the url
        if 'id' not in parts:
            return None

        # The bill id is the segment right after 'id'
        return load_legiscan_bill_text(parts[parts.index('id') + 1])

    # Plain download and extraction, lower-cased url first
    text = load_site_text(lowered)

    # Fall back to the url as it was given
    return text if text is not None else load_site_text(as_given)

In [2]:
# Make sure have reference xlsx file, but avoid re-downloading
if not os.path.exists('live.xlsx'):
    os.system('wget https://raw.githubusercontent.com/sahahn/gag_orders/master/data/live.xlsx')

# Init directory to save bill text
os.makedirs('bill_text', exist_ok=True)

# Only need to use sheet 1, seems to contain everything
links = load_links(sheet=1)

# Iterate through each link and download if not already
for i in links:

    # Skip already saved, so the cell can be re-run to resume
    save_loc = os.path.join('bill_text', f'{i}.txt')
    if os.path.exists(save_loc):
        continue

    link = links[i]

    # Try to load bill text
    text = process_link(link)

    # If found, save
    if text is not None:

        # Save just as text. The encoding is set explicitly because the
        # platform default may not be able to represent every character
        # of the UTF-8 decoded text.
        with open(save_loc, 'w', encoding='utf-8') as f:
            f.write(text.decode("utf-8"))
        print('Saved:', link, i, len(text))

    # Otherwise, indicate that there was an error
    # scraping this file
    else:
        print('Error with', link, i)