In [4]:
def generate_url(case_type, year, number):
    """
    Given a case type, year, and number, generate the URL for it on the MA Appellate Court website
    
    Input:
        case type: Type of case in J, P, SJ, and SJC
        year: Year of case
        number: Case number
    Output:
        URL to case
    """
    
    base = "http://www.ma-appellatecourts.org/display_docket.php?src=party&dno="
    if case_type in ["J", "P"]: # format: http://www.ma-appellatecourts.org/display_docket.php?src=party&dno=1999-P-1
        return base + str(year) + "-" + case_type + "-" + str(number)
    elif case_type in ["SJ"]: # format: http://www.ma-appellatecourts.org/display_docket.php?src=party&dno=SJ-2011-0500
        return base + case_type + "-" + str(year) + "-" + str(number)
    else: #http://www.ma-appellatecourts.org/display_docket.php?src=party&dno=SJC-10108
        return base + case_type + "-" + str(number)

# Number of J cases by year
j_limits = {}
j_limits[2008] = 547
j_limits[2009] = 565
j_limits[2010] = 589
j_limits[2011] = 550
j_limits[2012] = 482
j_limits[2013] = 568
j_limits[2014] = 514
j_limits[2015] = 527
j_limits[2016] = 539
j_limits[2017] = 581
j_limits[2018] = 107

# Number of P cases by year
p_limits = {}
p_limits[2008] = 2156
p_limits[2009] = 2354
p_limits[2010] = 2281
p_limits[2011] = 2182
p_limits[2012] = 2023
p_limits[2013] = 2031
p_limits[2014] = 1995
p_limits[2015] = 1755
p_limits[2016] = 1758
p_limits[2017] = 1634
p_limits[2018] = 343

# Number of SJ cases by year
sj_limits = {}
sj_limits[2008] = 575
sj_limits[2009] = 668
sj_limits[2010] = 586
sj_limits[2011] = 555
sj_limits[2012] = 521
sj_limits[2013] = 503
sj_limits[2014] = 529
sj_limits[2015] = 561
sj_limits[2016] = 536
sj_limits[2017] = 519
sj_limits[2018] = 105

# Lower and upper limit of SJC case numbers within current window (2008-2018)
sjc_lower = 10108
sjc_upper = 12497

# Links to all cases
links = []

# Generate all the links based on the above controls
for year, n in j_limits.items():
    for i in range(n):
        links.append(generate_url("J", year, i + 1))
for year, n in p_limits.items():
    for i in range(n):
        links.append(generate_url("P", year, i + 1))
for year, n in sj_limits.items():
    for i in range(n):
        links.append(generate_url("SJ", year, i + 1))
for i in range(sjc_upper - sjc_lower):
    links.append(generate_url("SJC", 0, i + sjc_lower + 1))

with open("urls_total.txt", "w") as text_file:
    for link in links:
        print(link, file=text_file)

In [5]:
import requests

def get_page_text(url):
    """
    Given a URL, return the text content
    
    Input:
        url: a string representing a URL
    Output:
        the content of said URL
    """
    
    headers = {"Connection" : "close"}
    page = ""
    while (page == ""):
        try:
            # time.sleep(1)
            page = requests.get(url, headers)
        except:
            time.sleep(1)
            continue
    return page.text

def write_page_text(url, text):
    """
    Write a page's text content to a file
    
    Input:
        url: a string representing the source of the text
        text: the text content
    Output:
        the filename under which the content was written
    """
    filename = r'C:\Users\jcraver\Desktop\BENCHMARKS\%s.html' % url.split('dno=')[-1]
    with open(filename, "w") as text_file:
        print(text, file=text_file)
    return filename

links = set([])

with open("urls_todo.txt", "r") as text_file:
    for line in text_file:
        links.add(line.strip())

# Starting from where we left off, pull down pages and write them
# This is to limit what we do at once (if desired)
done = set([])
countdown = 2
for link in links:
    write_page_text(link, get_page_text(link))
    countdown -= 1
    done.add(link)
    if countdown <= 0:
        break

# Write down what we've done
links.difference_update(done)
with open("urls_done.txt", "a") as text_file:
    for link in done:
        print(done, file=text_file)
with open("urls_todo.txt", "w") as text_file:
    for link in links:
        print(link, file=text_file)

FileNotFoundError: [Errno 2] No such file or directory: 'urls_todo.txt'

In [None]:
from bs4 import BeautifulSoup

def scrape_page(filename):
    """
    Open the file and Soup it as beautifully as possible
    
    Input:
        filename: The filename to parse
    Output:
        A dictionary of the terms found in the case page
    """
    soup = BeautifulSoup(open(filename), 'html.parser')
    info = {}
    
    # Get case tags
    header = soup.find('td', class_="largefont")
    center_cells = list(soup.find_all("td", align="center")[1].stripped_strings)
    info["Court Type"] = header.b.contents[0]
    info["Panel"] = header.b.contents[1].text
    info["Case Name"] = center_cells[0]
    info["Case Id"] = center_cells[-1]
    
    # Get court tags
    tables = soup.find_all("table", class_="lightborder")
    attr_table = tables[0]
    for row in attr_table.find_all("tr", valign="top"): 
        items = row.find_all("b")
        for item in items:
            k = item.text
            v = item.next.next.text.strip()
            info[k] = v
    
    # Get parties
    parties_table = soup.find("table", class_="lightborder", cellpadding="5")
    if parties_table is None:
        return {}
    for row in parties_table.find_all("tr")[1:]:
        k = row.b.nextSibling.next.strip().split('/')[0]
        v = row.b.text.strip()

        if k in info:
            info[k].append(v)
        else: 
            info[k] = [v]
    return info

# Read in all the downloaded pages and print / process them
import os
for file in os.listdir(r'C:\Users\jcraver\Desktop\BENCHMARKS'):
    if file.endswith(".html"):
        fullname = os.path.join(r'C:\Users\jcraver\Desktop\BENCHMARKS', file)
        print(fullname)
        print(scrape_page(fullname))