This code generates the URLs for ma-appellatecourts.org which we will want to download. There is probably no need to run this again unless we need to capture more current cases.

In [None]:
def generate_url(base, case_type, year, number):
    """
    Build the docket URL for a single case on the MA Appellate Court website

    Input:
        base: URL prefix that the docket number is appended to
        case_type: Type of case; "J", "P" and "SJ" have their own layouts,
            anything else is laid out like an SJC number
        year: Year of case (not used for the SJC layout)
        number: Case number
    Output:
        URL to case
    """
    year_part = str(year)
    number_part = str(number)

    if case_type == "SJ":
        # e.g. http://www.ma-appellatecourts.org/display_docket.php?src=party&dno=SJ-2011-0500
        # NOTE(review): the sample above is zero-padded to four digits but the
        # number is not padded here -- confirm the site accepts e.g. SJ-2011-1
        pieces = (case_type, year_part, number_part)
    elif case_type in ("J", "P"):
        # e.g. http://www.ma-appellatecourts.org/display_docket.php?src=party&dno=1999-P-1
        pieces = (year_part, case_type, number_part)
    else:
        # e.g. http://www.ma-appellatecourts.org/display_docket.php?src=party&dno=SJC-10108
        pieces = (case_type, number_part)

    return base + "-".join(pieces)

base = "http://www.ma-appellatecourts.org/display_docket.php?src=party&dno="

# For each case type, the dictionaries below map a year to the number of cases
# in that year. Only the years that are not commented out get URLs generated;
# uncomment a year to include it again.

# Number of J cases by year
j_limits = {}
#j_limits[2008] = 547
#j_limits[2009] = 565
#j_limits[2010] = 589
#j_limits[2011] = 550
#j_limits[2012] = 482
#j_limits[2013] = 568
#j_limits[2014] = 514
#j_limits[2015] = 527
#j_limits[2016] = 539
j_limits[2017] = 581
j_limits[2018] = 107

# Number of P cases by year
p_limits = {}
#p_limits[2008] = 2156
#p_limits[2009] = 2354
#p_limits[2010] = 2281
#p_limits[2011] = 2182
#p_limits[2012] = 2023
#p_limits[2013] = 2031
#p_limits[2014] = 1995
#p_limits[2015] = 1755
#p_limits[2016] = 1758
p_limits[2017] = 1634
p_limits[2018] = 343

# Number of SJ cases by year
sj_limits = {}
#sj_limits[2008] = 575
#sj_limits[2009] = 668
#sj_limits[2010] = 586
#sj_limits[2011] = 555
#sj_limits[2012] = 521
#sj_limits[2013] = 503
#sj_limits[2014] = 529
#sj_limits[2015] = 561
#sj_limits[2016] = 536
sj_limits[2017] = 519
sj_limits[2018] = 105

# Lower and upper limit of SJC case numbers within current window (2008-2018)
# Both limits are inclusive
sjc_lower = 10108
sjc_upper = 12497

# Links to all cases
links = []

# Generate all the links based on the above controls
# J, P and SJ cases are numbered 1..n within each year
for case_type, limits in (("J", j_limits), ("P", p_limits), ("SJ", sj_limits)):
    for year, n in limits.items():
        links.extend(
            generate_url(base, case_type, year, number)
            for number in range(1, n + 1)
        )

# SJC numbers are not per-year, so 0 is passed as a placeholder year.
# The range covers sjc_lower through sjc_upper inclusive (the previous loop
# started at sjc_lower + 1 and therefore skipped the first case).
links.extend(
    generate_url(base, "SJC", 0, number)
    for number in range(sjc_lower, sjc_upper + 1)
)

# One URL per line
with open("urls_total.txt", "w") as text_file:
    for link in links:
        print(link, file=text_file)

This code takes the URLs (from a different file than the one written to above, so that we can limit the scope if we so desire) and pulls the text down for us to keep. Be advised that the operation succeeds even if the page we pull down is "hey you've been blocked so GTFO", so be sure to remove any downloaded files that are too small to be court cases. In my case, the minimum size is 7 KB, which is a "this number wasn't found" page; most cases are much larger. The 'blocked' responses are 3 KB, though this may vary with your ISP; still, these are probably always smaller than actual court cases. The main loop which controls the page reads also checks whether we already have the case before we pull it, so there's no need to worry about pulling duplicates. However, if we pull a 'blocked' response, we do need to remove that file and add its URL back to urls_todo.txt.

In [None]:
import requests
import os
import time

def get_page_text(url, timeout=30):
    """
    Given a URL, return the text content, retrying until the request succeeds

    The HTTP status code is not checked: whatever page the server sends back
    (including an error or 'blocked' page) is returned as-is.

    Input:
        url: a string representing a URL
        timeout: seconds to wait on the server before giving up on one
            attempt; a timed-out attempt is retried like any other failure
    Output:
        the content of said URL
    Raises:
        requests.exceptions.MissingSchema, InvalidSchema or InvalidURL when
        the URL itself is malformed, since retrying can never fix that
    """
    while True:
        # Pause before every attempt so we do not hammer the server
        time.sleep(1)
        try:
            page = requests.get(url, timeout=timeout)
        except (requests.exceptions.MissingSchema,
                requests.exceptions.InvalidSchema,
                requests.exceptions.InvalidURL):
            raise
        except requests.exceptions.RequestException:
            # Transient network trouble: back off, then try again.
            # Only requests' own errors are caught so that Ctrl-C still
            # stops the loop.
            time.sleep(5)
            continue
        return page.text

def write_page_text(url, text, folder=r'C:\Users\jcraver\Desktop\BENCHMARKS'):
    """
    Write a page's text content to a file

    The file is named after the docket number, i.e. whatever follows the last
    'dno=' in the URL, with '.html' appended.

    Input:
        url: a string representing the source of the text
        text: the text content
        folder: directory to write into; defaults to the download folder used
            by the rest of this code
    Output:
        the filename under which the content was written
    """
    docket_number = url.split('dno=')[-1]
    filename = os.path.join(folder, docket_number + '.html')
    with open(filename, "w") as text_file:
        # print() adds a trailing newline after the page text
        print(text, file=text_file)
    return filename

folder = r'C:\Users\jcraver\Desktop\BENCHMARKS'

# URLs still to be downloaded, one per line. Blank lines are skipped: an empty
# string is not a URL and could never be fetched successfully.
with open("urls_todo.txt", "r") as text_file:
    stripped_lines = (line.strip() for line in text_file)
    links = {line for line in stripped_lines if line}

# Get files that have already been done
done = set(os.listdir(folder))

# Starting from where we left off, pull down pages and write them
# This is to limit what we do at once (if desired)
countdown = 600
processed = []
for link in links:
    processed.append(link)
    # Don't download a file we already have
    if (link.split('dno=')[-1] + '.html') in done:
        continue
    write_page_text(link, get_page_text(link))
    # Only actual downloads count against the limit
    countdown -= 1
    #print(link)
    if countdown <= 0:
        break

# Write down what we've done: everything visited above (downloaded now or
# already on disk) is dropped from the to-do file
links.difference_update(processed)
with open("urls_todo.txt", "w") as text_file:
    for link in links:
        print(link, file=text_file)

This reads in the HTML files from the hard drive (not the web). This will need some work to read the docket entries, but that work can all be done within the scrape_page method; further web queries are probably unnecessary.

In [None]:
import re
import os
from bs4 import BeautifulSoup

def scrape_page(filename):
    """
    Parse one downloaded docket page into a flat dictionary

    Input:
        filename: The filename to parse
    Output:
        A dictionary of the items found in the case page:
            "Court Type", "Panel", "Case Name", "Case Id" from the page header,
            one entry per bold label in the first "lightborder" table,
            one entry per party role (names joined with ", "),
            "Docket Entries" when a third "lightborder" table exists
                (entries look like "date ::: text" and are joined with " %%% ")
        An empty dictionary is returned when the page does not have the
        expected header, attribute table or parties table.
    """
    # Parse inside a with-block so the file handle is closed right away
    with open(filename) as html_file:
        soup = BeautifulSoup(html_file, 'html.parser')
    info = {}

    # Get case tags
    header = soup.find('td', class_="largefont")
    centered = soup.find_all("td", align="center")
    if header is None or header.b is None or len(centered) < 2:
        # Not a case page (for example a 'not found' or 'blocked' response)
        return {}
    center_cells = list(centered[1].stripped_strings)
    info["Court Type"] = header.b.contents[0]
    info["Panel"] = header.b.contents[1].text
    info["Case Name"] = center_cells[0]
    info["Case Id"] = center_cells[-1]

    # Get court tags
    tables = soup.find_all("table", class_="lightborder")
    if not tables:
        return {}
    attr_table = tables[0]
    for row in attr_table.find_all("tr", valign="top"):
        for item in row.find_all("b"):
            # The bold text is the label; the value is taken from the element
            # two steps further on in document order
            k = item.text
            v = item.next.next.text.strip()
            info[k] = v

    # Get parties
    parties_table = soup.find("table", class_="lightborder", cellpadding="5")
    if parties_table is None:
        return {}
    p_k = set()
    for row in parties_table.find_all("tr")[1:]:
        # Bold text is the party name; the text after it starts with the role,
        # of which only the part before the first '/' is kept
        k = row.b.next_sibling.next.strip().split('/')[0]
        v = row.b.text.strip()

        if k in info:
            info[k].append(v)
        else:
            info[k] = [v]
            p_k.add(k)
    # Collapse each role's list of names into a single string
    for k in p_k:
        info[k] = ", ".join(info[k])

    # Get docket entries
    if len(tables) >= 3:
        docket = []
        docket_table = tables[2]
        for row in docket_table.find_all("tr")[1:]:
            items = row.find_all("td")
            date = items[0].text.strip()
            entry = ""
            if len(items) >= 3:
                # Squash runs of whitespace (including newlines) to one space
                entry = re.sub(r"\s+", " ", items[2].text.strip(), flags=re.UNICODE)
            docket.append("%s ::: %s" % (date, entry))
        info['Docket Entries'] = " %%% ".join(docket)

    return info

base = "http://www.ma-appellatecourts.org/display_docket.php?src=party&dno="
folder = r'C:\Users\jcraver\Desktop\BENCHMARKS'
cases = []      # one dictionary per downloaded page
keys = set([])  # every key seen in any case

# Read in all the downloaded pages and print / process them
for file in os.listdir(folder):
    if not file.endswith(".html"):
        continue
    fullname = os.path.join(folder, file)
    case = scrape_page(fullname)
    # The docket number is the file name without its ".html" extension;
    # leaving the extension on would record a URL that does not match the
    # one the page was downloaded from
    case['URL'] = base + os.path.splitext(file)[0]
    keys.update(case.keys())
    cases.append(case)

In [None]:
import unicodecsv as csv

# Write out the csv
# keys is a set, so it is sorted here to give the columns a stable order from
# one run to the next
with open('cases.csv', 'wb') as output_file:
    dict_writer = csv.DictWriter(output_file, sorted(keys))
    dict_writer.writeheader()
    dict_writer.writerows(cases)

This code does not really work. It was intended to download the Lexis cases, but that has not proved fruitful. Left for posterity's sake.

In [None]:
import requests

def get_page_text_with_session(url):
    """
    GET and then POST a URL within one requests session, printing both pages

    Each request is retried once a second until it goes through.

    Input:
        url: a string representing a URL
    Output:
        the text of the response to the POST
    Raises:
        requests.exceptions.MissingSchema, InvalidSchema or InvalidURL when
        the URL itself is malformed, since retrying can never fix that
    """
    malformed_url = (requests.exceptions.MissingSchema,
                     requests.exceptions.InvalidSchema,
                     requests.exceptions.InvalidURL)

    s = requests.Session()

    # First request: plain GET, whose cookies are reused for the POST below
    while True:
        time.sleep(1)
        try:
            page = s.get(url)
        except malformed_url:
            raise
        except requests.exceptions.RequestException:
            # Only requests' own errors are retried so Ctrl-C still works
            time.sleep(1)
            continue
        break
    cookies = dict(page.cookies)
    print(page.text)
    print("roflcopter")

    # Second request: POST to the same URL with the cookies from the GET
    # NOTE(review): verify=False turns off TLS certificate verification
    while True:
        time.sleep(1)
        try:
            page = s.post(url, verify=False, cookies=cookies)
        except malformed_url:
            raise
        except requests.exceptions.RequestException:
            time.sleep(1)
            continue
        break
    print(page.text)
    return page.text

# Lexis URL whose search term (pdsearchterms) is the docket number SJC-11926;
# this is the input for the disabled get_page_text_with_session call below
url = 'https://advance.lexis.com/container/?pdmfid=1000516&crid=3df8cfbc-a7e7-4a44-a279-1c90333eda06&pdsearchterms=SJC-11926&pdstartin=hlct%3A1%3A1&pdtypeofsearch=searchboxclick&pdsearchtype=SearchBox&pdqttype=and&pdsf=&pdquerytemplateid=urn%3Aquerytemplate%3A3d60bd7967c1b2f691b2d991f141df6a~%5ESources&pdsourcetype=all&pdparentqt=urn%3Aquerytemplate%3A3d60bd7967c1b2f691b2d991f141df6a~%5ESources&config=00JAA3MTBiMzg0Yi1iY2IxLTQ0ODktYWNlMi02MWEzYjExMThhY2UKAFBvZENhdGFsb2edTHaWI0HCIdzKAqqpyUcJ&ecomp=kg2_9kk&earg=pdsf&prid=a21a0be5-aa4f-4d73-af33-a73415932187'

# The call is left commented out, so nothing is fetched here
#print(get_page_text_with_session(url))

def get_lexis_session(url):
    """
    Fetch a URL with a fresh, short-lived requests session

    Input:
        url: a string representing a URL
    Output:
        the response to a GET of the URL (despite the function's name, this
        is the response object and not the session)
    """
    # The with-block closes the session's connections once the request is
    # done. The body is downloaded before that happens (the request is not
    # streamed), so the returned response can still be read afterwards.
    with requests.Session() as s:
        return s.get(url)

# Lexis container URL to fetch (presumably the landing page, going by the
# variable name -- not verified)
homeurl = 'https://advance.lexis.com/container?config=0152JAAzMjQ0NTBmOS05ZmRhLTQ4NmQtOTk0NC1lNmQ1MzdmYmZlN2UKAFBvZENhdGFsb2dFqubrO2osfYY9MSag4F39&crid=bc684bfd-28bb-40c7-8178-f00690c843ed&prid=a21a0be5-aa4f-4d73-af33-a73415932187'
# Runs as soon as this code is executed: performs the GET and prints the body
r = get_lexis_session(homeurl)
print(r.text)

This would be where we put code to analyze/cluster our ma-appellatecourts.org data

In [None]:
# TODO: implement

# Run LSA on the docket entries (I estimate 2 components, but I haven't checked yet)
# Cluster (k-means?) on tfidf scores

This would be where we put code to analyze/cluster our Lexis opinions

In [None]:
# TODO: implement

# Run LSA on the opinions (I estimate 2 components, but I haven't checked yet)
# Cluster (k-means?) on tfidf scores