This code generates the URLs for ma-appellatecourts.org which we will want to download. There is probably no need to run this again unless we need to capture more current cases.

In [None]:
def generate_url(base, case_type, year, number):
    """
    Build the docket URL for one case on the MA Appellate Court website

    Input:
        base: base of URL (everything up to and including "dno=")
        case_type: Type of case, one of J, P, SJ, and SJC
        year: Year of case (not used for SJC cases)
        number: Case number
    Output:
        URL to case

    Docket id layouts:
        J / P : <year>-<type>-<number>   e.g. ...dno=1999-P-1
        SJ    : SJ-<year>-<number>       e.g. ...dno=SJ-2011-0500
        other : <type>-<number>          e.g. ...dno=SJC-10108

    NOTE(review): the SJ example above shows a zero-padded number ("0500"),
    but the number is inserted here without padding -- confirm which form
    the site expects.
    """
    if case_type in ("J", "P"):
        parts = [str(year), case_type, str(number)]
    elif case_type == "SJ":
        parts = [case_type, str(year), str(number)]
    else:
        parts = [case_type, str(number)]
    return base + "-".join(parts)

base = "http://www.ma-appellatecourts.org/display_docket.php?src=party&dno="

# Number of J cases by year
j_limits = {
    2008: 547,
    2009: 565,
    2010: 589,
    2011: 550,
    2012: 482,
    2013: 568,
    2014: 514,
    2015: 527,
    2016: 539,
    2017: 581,
    2018: 130,
}

# Number of P cases by year
p_limits = {
    2008: 2156,
    2009: 2354,
    2010: 2281,
    2011: 2182,
    2012: 2023,
    2013: 2031,
    2014: 1995,
    2015: 1755,
    2016: 1758,
    2017: 1634,
    2018: 365,
}

# Number of SJ cases by year
sj_limits = {
    2008: 575,
    2009: 668,
    2010: 586,
    2011: 555,
    2012: 521,
    2013: 503,
    2014: 529,
    2015: 561,
    2016: 536,
    2017: 511,
    2018: 125,
}

# Lower and upper limit of SJC case numbers within current window (2008-2018)
sjc_lower = 10108
sjc_upper = 12510

# Links to all cases
links = []

# Generate all the links based on the above controls.
# J, P and SJ dockets are numbered 1..limit within each year.
for case_type, limits in (("J", j_limits), ("P", p_limits), ("SJ", sj_limits)):
    for year, n in limits.items():
        for number in range(1, n + 1):
            links.append(generate_url(base, case_type, year, number))

# SJC dockets use a single running number. Both limits are inclusive, so the
# range has to start at sjc_lower itself (it was previously skipped).
for number in range(sjc_lower, sjc_upper + 1):
    links.append(generate_url(base, "SJC", 0, number))

# One URL per line
with open("urls_todo.txt", "w") as text_file:
    for link in links:
        print(link, file=text_file)

This code takes the URLs (from a different file than the one written to above; this way, we can limit the scope if we so desire) and pulls the text down for us to keep. Be advised that the operation succeeds even if the page we pull down is a "you've been blocked" response, so be sure to remove any downloaded files that are too small to be court cases. (In my case, the minimum size is 7 KB, which is a "this number wasn't found" page; most cases are much larger. The 'blocked' responses are 3 KB, though this may vary with your ISP; still, these are probably always smaller than actual court cases.) The main loop, which controls the page reads, also checks whether we already have a case before we pull it, so there's no need to worry about pulling duplicates. However, if we pull a 'blocked' response, we do need to delete that file and add its URL back to the list of URLs to download.

In [None]:
import requests
import os
import sys
import time
import random

# List of user agents to choose from.
# The literal holds one User-Agent string per line. Each entry is stripped (and
# blank lines dropped) so stray whitespace in the literal can never end up in
# an HTTP header value.
useragents = [agent.strip() for agent in '''Mozilla/5.0 (Windows; U; ; en-NZ) AppleWebKit/527  (KHTML, like Gecko, Safari/419.3) Arora/0.8.0
Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser; Avant Browser; .NET CLR 1.0.3705; .NET CLR 1.1.4322; Media Center PC 4.0; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30)
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.8 (KHTML, like Gecko) Beamrise/17.2.0.9 Chrome/17.0.939.0 Safari/535.8
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML like Gecko) Chrome/28.0.1469.0 Safari/537.36
Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML like Gecko) Chrome/28.0.1469.0 Safari/537.36
Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36
Mozilla/5.0 (Windows NT 6.0; rv:14.0) Gecko/20100101 Firefox/14.0.1
Mozilla/5.0 (Windows NT 6.1; WOW64; rv:15.0) Gecko/20120427 Firefox/15.0a1
Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0) Gecko/16.0 Firefox/16.0
Mozilla/5.0 (Windows NT 6.2; rv:19.0) Gecko/20121129 Firefox/19.0
Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20130401 Firefox/21.0
Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0
iTunes/9.0.2 (Windows; N)
Mozilla/5.0 (compatible; Konqueror/4.5; Windows) KHTML/4.5.4 (like Gecko)
Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; Maxthon 2.0)
Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.1 (KHTML, like Gecko) Maxthon/3.0.8.2 Safari/533.1
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML like Gecko) Maxthon/4.0.0.2000 Chrome/22.0.1229.79 Safari/537.1
Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)
Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)
Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)
Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)
Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Trident/4.0)
Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)
Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Trident/5.0)
Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)
Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.2; Trident/5.0)
Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.2; WOW64; Trident/5.0)
Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; Media Center PC 6.0; InfoPath.3; MS-RTC LM 8; Zune 4.7)
Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)
Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; Trident/6.0)
Mozilla/5.0 (compatible; MSIE 10.6; Windows NT 6.1; Trident/5.0; InfoPath.2; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727) 3gpp-gba UNTRUSTED/1.0
Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko
Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko
Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.3; Trident/7.0; .NET4.0E; .NET4.0C)
Opera/9.80 (Windows NT 6.1; U; en) Presto/2.7.62 Version/11.01
Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14
Opera/9.80 (Windows NT 6.1; WOW64) Presto/2.12.388 Version/12.16
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.12 Safari/537.36 OPR/14.0.1116.4
Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.29 Safari/537.36 OPR/15.0.1147.24 (Edition Next)
Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.57 Safari/537.36 OPR/18.0.1284.49
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.76 Safari/537.36 OPR/19.0.1326.56
Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/533.17.8 (KHTML, like Gecko) Version/5.0.1 Safari/533.17.8
Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.2 Safari/533.18.5
Mozilla/5.0 (Windows; U; Windows NT 6.2; es-US ) AppleWebKit/540.0 (KHTML like Gecko) Version/6.0 Safari/8900.00
Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB; rv:1.9.1.17) Gecko/20110123 (like Firefox/3.x) SeaMonkey/2.0.12
Mozilla/5.0 (Windows NT 5.2; rv:10.0.1) Gecko/20100101 Firefox/10.0.1 SeaMonkey/2.7.1
Mozilla/5.0 (Windows NT 6.1; WOW64; rv:12.0) Gecko/20120422 Firefox/12.0 SeaMonkey/2.9'''.splitlines() if agent.strip()]

def get_page_text(url, useragent, sleep_timings=(2, 3, 5, 8), exception_timings=(5, 10, 15)):
    """
    Given a URL, return the text content, retrying until a request succeeds

    Input:
        url: a string representing a URL
        useragent: value to send as the User-Agent request header
        sleep_timings: sequence of possible numbers of seconds to wait before each request
        exception_timings: sequence of possible numbers of seconds to wait after a failed
                           request before retrying
    Output:
        the content of said URL

    Note that any HTTP response counts as success here; the status code is not
    inspected.
    """

    # Construct the header
    headers = {"Connection": "close", "user-agent": useragent}

    # Request until we have a result
    while True:
        # Pause before every attempt (first try and retries alike). Kept outside
        # the try block so a bad timing list raises instead of retrying forever.
        time.sleep(random.choice(sleep_timings))
        try:
            # The headers must be passed explicitly or the user agent is never sent
            page = requests.get(url, headers=headers)
        except Exception as error:
            # Best-effort retry on any failure; unlike a bare except this still
            # lets KeyboardInterrupt / SystemExit stop the loop.
            print("Unexpected error:", type(error))
            time.sleep(random.choice(exception_timings))
            continue

        # Keep the page text
        return page.text

def write_page_text(url, text, folder=r'C:\Users\jcraver\Desktop\BENCHMARKS'):
    """
    Write a page's text content to a file

    The file is named after the docket id, i.e. whatever follows the last
    "dno=" in the URL, with ".html" appended.

    Input:
        url: a string representing the source of the text
        text: the text content
        folder: directory to write into (defaults to the original hard-coded location)
    Output:
        the filename under which the content was written
    """
    filename = os.path.join(folder, '%s.html' % url.split('dno=')[-1])
    with open(filename, "w") as text_file:
        # print() appends a trailing newline after the page text
        print(text, file=text_file)
    return filename

folder = r'C:\Users\jcraver\Desktop\BENCHMARKS'

# URLs that are still waiting to be downloaded
with open("urls_todo.txt", "r") as text_file:
    links = {line.strip() for line in text_file}

# Get files that have already been done
done = set(os.listdir(folder))

# Pull down pages and write them, skipping anything already on disk.
# countdown caps the number of downloads per run (if desired).
countdown = 1000
processed = []
for link in links:
    processed.append(link)
    docket_file = link.split('dno=')[-1] + '.html'
    if docket_file not in done:
        useragent = random.choice(useragents)
        page_text = get_page_text(link, useragent, [3, 5, 8, 13], [10, 20, 30])
        write_page_text(link, page_text)
        countdown -= 1
        #print(link)
        if countdown <= 0:
            break

# Write down what we've done: everything visited leaves the to-do file
links.difference_update(processed)
with open("urls_todo.txt", "w") as text_file:
    text_file.writelines(link + "\n" for link in links)

This reads in the HTML files from the hard drive (not the web). This will need some work to read the docket entries, but that work can all be done within the `scrape_mac_page` function; further web queries are probably unnecessary.

In [3]:
import re
import os
from bs4 import BeautifulSoup

def scrape_mac_page(filename):
    """
    Open the MA Appellate Court html file and Soup it as beautifully as possible

    Input:
        filename: The filename to parse
    Output:
        A dictionary of the items found in the case page
        A list of dated docket entries, each of the form [case id, date, entry text]

        Both are empty ({} and []) when the page lacks the expected header
        cell, centered cells, attribute table or parties table.
    """
    # Parse inside a with-block so the file handle is closed promptly
    with open(filename) as html_file:
        soup = BeautifulSoup(html_file, 'html.parser')
    info = {}

    # Get case tags
    header = soup.find('td', class_="largefont")
    centered = soup.find_all("td", align="center")
    if header is None or len(centered) < 2:
        return {}, []
    center_cells = list(centered[1].stripped_strings)
    info["Court Type"] = header.b.contents[0]
    info["Panel"] = header.b.contents[1].text
    info["Case Name"] = center_cells[0]
    info["Case Id"] = center_cells[-1]

    # Get court tags: each bold label in the first "lightborder" table is a key
    tables = soup.find_all("table", class_="lightborder")
    if not tables:
        return {}, []
    for row in tables[0].find_all("tr", valign="top"):
        for item in row.find_all("b"):
            info[item.text] = item.next.next.text.strip()

    # Get parties
    parties_table = soup.find("table", class_="lightborder", cellpadding="5")
    if parties_table is None:
        return {}, []
    party_roles = set()
    for row in parties_table.find_all("tr")[1:]:
        # Bold text is the party name; the text after it (up to any '/') is the role
        role = row.b.nextSibling.next.strip().split('/')[0]
        name = row.b.text.strip()

        if role in info:
            info[role].append(name)
        else:
            info[role] = [name]
            party_roles.add(role)
    # Several parties can share a role; flatten each list into one string
    for role in party_roles:
        info[role] = ", ".join(info[role])

    # Get docket entries (third "lightborder" table, header row skipped)
    docket = []
    if len(tables) >= 3:
        for row in tables[2].find_all("tr")[1:]:
            items = row.find_all("td")
            date = items[0].text.strip()
            entry = ""
            if len(items) >= 3:
                # Collapse every run of whitespace into a single space
                entry = re.sub(r"\s+", " ", items[2].text.strip(), flags=re.UNICODE)
            docket.append([info['Case Id'], date, entry])

    return info, docket

base = "http://www.ma-appellatecourts.org/display_docket.php?src=party&dno="
folder = r'C:\Users\jcraver\Desktop\BENCHMARKS'
cases = []
dockets = []
keys = set([])

# Read in all the downloaded pages and print / process them
for file in os.listdir(folder):
    if file.endswith(".html"):
        fullname = os.path.join(folder, file)
        case, docket = scrape_mac_page(fullname)
        # Files are named "<docket id>.html"; drop the extension so the
        # rebuilt URL ends in the docket id rather than in ".html"
        case['URL'] = base + file[:-len(".html")]
        keys.update(case.keys())
        cases.append(case)
        dockets += docket

In [4]:
import unicodecsv as csv

print(keys)

# Write out the case csv: one row per case, one column per key seen
with open('cases.csv', 'wb') as output_file:
    dict_writer = csv.DictWriter(output_file, fieldnames=keys)
    dict_writer.writeheader()
    dict_writer.writerows(cases)

# Write out the docket csv: a header row followed by one row per docket entry
with open('dockets.csv', 'wb') as output_file:
    writer = csv.writer(output_file)
    writer.writerow(['Case ID', 'Date', 'Entry'])
    writer.writerows(dockets)

{'Pro Se Third-party defendant', 'Route to SJC', 'Court Type', 'Clerk for Commonwealth', 'Respondent', 'Pro Se Guardian ad litem', 'Out-of-state counsel', 'Pro Se Other', 'CPCS administrator', 'TC Number', 'cmt', 'Out of state counsel', 'Minor', 'Amicus (plaintiff)', 'Student appearing under SJC Rule 3:03', 'amid', 'Nature', 'Other Appellee', 'Out-of-state attorney', 'e3pp', 'Pro Se Respondent', 'Amicus', 'Third-party Defendant', 'Pro Se Amicus (plaintiff)', 'Massacusetts Correctional Institution', 'Nominal Party', 'Out-of-state counsel for defendant', 'Invited to file amicus brief', 'Pro Se Third-party plaintiff', 'Single Justice', 'Pro Se poth', 'Massachusetts attorney pending BBO number', 'Pro Se pet', 'clk', 'Argued/Submitted', 'Student Attorney purs to SJC 3:03', 'Receiver', 'DAR/FAR Number', 'Pro Se Amicus', '(Lower Court: jury)', 'Plaintiff', 'Third-party defendant', 'Panel', '3pp', 'rplf', 'TC Entry Date', '(Lower Court: Jury)', 'scom', 'Pro Se Defendant', 'amip', 'Out-of-state

These are some labeling and graphing utilities I developed

In [None]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

%matplotlib inline

# Custom English stop-word set, passed as `stop_words` to the TfidfVectorizer
# calls in this notebook that reference it.
# NOTE(review): this looks like it was adapted from scikit-learn's built-in
# English stop-word list -- confirm. The negations "no", "nor" and "not" do not
# appear in this set, so they are kept as terms rather than filtered out.
ENGLISH_STOP_WORDS = frozenset([
    "a", "about", "above", "across", "after", "afterwards", "again", "against",
    "all", "almost", "alone", "along", "already", "also", "although", "always",
    "am", "among", "amongst", "amoungst", "amount", "an", "and", "another",
    "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are",
    "around", "as", "at", "back", "be", "became", "because", "become",
    "becomes", "becoming", "been", "before", "beforehand", "behind", "being",
    "below", "beside", "besides", "between", "beyond", "bill", "both",
    "bottom", "but", "by", "call", "can", "cannot", "cant", "co", "con",
    "could", "couldnt", "cry", "de", "describe", "detail", "do", "done",
    "down", "due", "during", "each", "eg", "eight", "either", "eleven", "else",
    "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone",
    "everything", "everywhere", "except", "few", "fifteen", "fifty", "fill",
    "find", "fire", "first", "five", "for", "former", "formerly", "forty",
    "found", "four", "from", "front", "full", "further", "get", "give", "go",
    "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter",
    "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his",
    "how", "however", "hundred", "i", "ie", "if", "in", "inc", "indeed",
    "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter",
    "latterly", "least", "less", "ltd", "made", "many", "may", "me",
    "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly",
    "move", "much", "must", "my", "myself", "name", "namely", "neither",
    "never", "nevertheless", "next", "nine", "nobody", "none", "noone",
    "nothing", "now", "nowhere", "of", "off", "often", "on",
    "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our",
    "ours", "ourselves", "out", "over", "own", "part", "per", "perhaps",
    "please", "put", "rather", "re", "same", "see", "seem", "seemed",
    "seeming", "seems", "serious", "several", "she", "should", "show", "side",
    "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone",
    "something", "sometime", "sometimes", "somewhere", "still", "such",
    "system", "take", "ten", "than", "that", "the", "their", "them",
    "themselves", "then", "thence", "there", "thereafter", "thereby",
    "therefore", "therein", "thereupon", "these", "they", "thick", "thin",
    "third", "this", "those", "though", "three", "through", "throughout",
    "thru", "thus", "to", "together", "too", "top", "toward", "towards",
    "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us",
    "very", "via", "was", "we", "well", "were", "what", "whatever", "when",
    "whence", "whenever", "where", "whereafter", "whereas", "whereby",
    "wherein", "whereupon", "wherever", "whether", "which", "while", "whither",
    "who", "whoever", "whole", "whom", "whose", "why", "will", "with",
    "within", "without", "would", "yet", "you", "your", "yours", "yourself",
    "yourselves"])

def extract_groupings(X, corpus, k, labels):
    """
    Return data points grouped by cluster, plus descriptive terms for each cluster

    Input:
        X = data points
        corpus = the source text behind each data point (same order as X)
        k = number of clusters
        labels = cluster number (0 .. k-1) of each point in X
    Output:
        Two lists of lists. The ith sublist of the first holds all data points in the
        ith cluster. The ith sublist of the second holds up to 10 terms (uni- to trigrams)
        with the highest idf among that cluster's texts; it is empty when the cluster
        has no members.
    """
    # Grouping of points by cluster and metadata by cluster
    groupings = [[] for _ in range(k)]
    corpora = [[] for _ in range(k)]

    # Sort each point (and associated metadata) into bins for each cluster label
    for index, point in enumerate(X):
        label = labels[index]
        groupings[label].append(point)
        corpora[label].append(corpus[index])

    # Score the terms within each cluster and keep the highest-idf ones as its title
    titles = [[] for _ in range(k)]
    for cluster, documents in enumerate(corpora):
        if not documents:
            # A cluster can be empty (e.g. the clustering produced fewer than k
            # distinct labels); fitting a vectorizer on no documents would raise.
            continue
        tiv = TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS, min_df=(0.01 * k), ngram_range=(1, 3))
        tiv.fit(documents)
        # get_feature_names() no longer exists in scikit-learn 1.2+; prefer its
        # replacement when available, and look the vocabulary up only once.
        if hasattr(tiv, "get_feature_names_out"):
            terms = tiv.get_feature_names_out()
        else:
            terms = tiv.get_feature_names()
        highest_idf_first = np.argsort(tiv.idf_)[::-1]
        titles[cluster] = [str(terms[j]) for j in highest_idf_first[:10]]

    return groupings, titles

def plot_clustering(k, groupings, labels, title):
    """
    Scatter-plot k clusters on one figure (figsize 16x16, equal axis scaling).

    Input:
        k = how many clusters to draw
        groupings = list of lists; groupings[i] holds the points of cluster i
                    (element 1 of a point is plotted as x, element 0 as y)
        labels = the legend label of each cluster
        title = the heading of the plot
    Output:
        None
    """
    # This size seems quite reasonable/readable
    plt.figure(figsize=(16, 16))
    plt.axes().set_aspect('equal')

    # One scatter series per cluster; keep the handles for the legend
    handles = []
    for cluster_index in range(k):
        members = groupings[cluster_index]
        xs = [point[1] for point in members]
        ys = [point[0] for point in members]
        handles.append(plt.scatter(xs, ys, alpha=0.5))

    # Label each cluster, then title and show the figure
    plt.legend(handles, labels, fancybox=True, framealpha=0.5)
    plt.title(title)
    plt.show()

This would be where we put code to analyze/cluster our ma-appellatecourts.org data

In [None]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn import mixture
from sklearn import metrics
import scipy.cluster.hierarchy as hierarchy
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import pandas
from collections import Counter

%matplotlib inline

# Corpus: docket text (digits removed) of criminal cases whose status mentions
# 'Rescript' and whose docket mentions 'reversed'. Every key is read with
# .get() so a case missing any of them is skipped instead of raising KeyError.
corpus_dockets = [
    re.sub(r'\d', '', case['Docket Entries'])
    for case in cases
    if case.get('Docket Entries')
    and case.get('Case Type') == 'Criminal'
    and 'Rescript' in case.get('Case Status', '')
    and 'reversed' in case['Docket Entries'].lower()
]

# Plot histogram of lower court judges
#judges = []
#for case in cases:
#    if 'Docket Entries' in case and len(case['Docket Entries']) > 0 and 'Case Type' in case and case['Case Type'] == 'Criminal' and (case['Case Status'].find('Rescript') >= 0):
#        if 'Lower Ct Judge' in case and len(case['Lower Ct Judge']) > 0:
#            judges.append(case['Lower Ct Judge'])
#judge_counts = Counter(judges)
#common_judges = [[j[0]] * j[1] for j in judge_counts.most_common(20)]
#common_judges = [x for y in common_judges for x in y]
#common_judge_counts = Counter(common_judges)
#df = pandas.DataFrame.from_dict(common_judge_counts, orient='index')
#df.plot(kind='bar')

# Compute the tf-idf scores of the docket corpus.
# Uni- to trigrams are used to account for adverbs and for multi-word legal terms.
fe_tfv = TfidfVectorizer(
    stop_words=ENGLISH_STOP_WORDS,
    ngram_range=(1, 3),
    min_df=0.01,
    max_df=0.5,
)
tfidf_dockets = fe_tfv.fit_transform(corpus_dockets)

# Compute the LSA (truncated SVD) of the tf-idf scores.
# Earlier note: after 4 components we don't get much more ROI (the plot leading
# to this conclusion is commented out below); n_c is currently set to 6.
# min/max document frequency play a huge role here.
n_c = 6
dc_tsvd = TruncatedSVD(n_components=n_c)
lsa_dockets = dc_tsvd.fit_transform(tfidf_dockets)

# This is the code we would use to graph the singular values
#fe_tfv = TfidfVectorizer(stop_words='english', min_df = 0.01, max_df = 0.5)
#tfidf_dockets = fe_tfv.fit_transform(corpus_dockets)
#dc_tsvd = TruncatedSVD(n_components=50)
#lsa_dockets = dc_tsvd.fit_transform(tfidf_dockets)
#plt.plot(range(1,51), dc_tsvd.singular_values_)
#print(dc_tsvd.singular_values_)

# Defining terms for each component
#terms = fe_tfv.get_feature_names()
#for i in range(n_c):
#    top = np.argsort(dc_tsvd.components_[i])
#    topterms = [terms[top[f]] for f in range(25)]
#    print (i, topterms)

# Cluster via GMM
# Best silhouette score is found with k=3, with 5 also being a local maximum
# Plot leading to this conclusion commented out below
#k = 5
#gmm_dockets = mixture.GaussianMixture(n_components=k).fit(lsa_dockets)
#groupings_dockets, titles_dockets = extract_groupings(lsa_dockets, corpus_dockets, k, gmm_dockets.predict(lsa_dockets))
#plot_clustering(k, groupings_dockets, titles_dockets, 'Docket Entries 2015-18')

# Cluster via average-linkage hierarchical clustering on cosine distance.
# Earlier note: best silhouette score is found with k=3, with 5 also being a
# local maximum (plot leading to this conclusion commented out below); k is
# currently set to 4.
k = 4
linkage_dockets = hierarchy.linkage(lsa_dockets, method="average", metric="cosine")
# fcluster numbers clusters from 1; shift to 0-based labels for list indexing
hier_dockets = hierarchy.fcluster(linkage_dockets, t=k, criterion='maxclust') - 1
groupings_dockets, titles_dockets = extract_groupings(
    lsa_dockets, corpus_dockets, k, hier_dockets
)
plot_clustering(
    k, groupings_dockets, titles_dockets, 'Cosine Similarity of Docket Entries'
)

# Without labeling, we may graph this way
#_ = plt.scatter(lsa_dockets[:,0], lsa_dockets[:,1], c=gmm_dockets.predict(lsa_dockets))

# This is the code we would use to display silhouette scores per k for GMM
#x = list(range(2,15))
#y = []
#for i in x:
#    gmm = mixture.GaussianMixture(n_components=i).fit(lsa_dockets)
#    y.append(metrics.silhouette_score(lsa_dockets, gmm.predict(lsa_dockets)))
#_ = plt.plot(x, y)

# This is the code we would use to display silhouette scores per k for cosine
#x = list(range(2,15))
#y = []
#linkage_dockets = hierarchy.linkage(lsa_dockets, "average", metric="cosine")
#for i in x:
#    hier = hierarchy.fcluster(linkage_dockets, i, criterion='maxclust') - 1
#    y.append(metrics.silhouette_score(lsa_dockets, hier))
#_ = plt.plot(x, y)

In [None]:
# Show the second-to-last '%%%'-separated docket segment (digits removed) of
# criminal cases whose status mentions 'Rescript' and whose docket mentions
# 'verdict'. Keys are read with .get() so a case missing one is skipped rather
# than raising KeyError.
# NOTE(review): [-2] still raises IndexError for a docket without any '%%%'
# separator -- confirm whether such cases can occur.
print([
    re.sub(r'\d', '', case['Docket Entries']).split('%%%')[-2]
    for case in cases
    if case.get('Docket Entries')
    and case.get('Case Type') == 'Criminal'
    and 'Rescript' in case.get('Case Status', '')
    and 'verdict' in case['Docket Entries'].lower()
])

This would be where we put code to analyze/cluster our Lexis opinions

In [None]:
import re
import os
from bs4 import BeautifulSoup

def scrape_lexis_page(filename):
    """
    Open the Lexis html file and Soup it as beautifully as possible

    Input:
        filename: The filename to parse
    Output:
        A dictionary of the items found in the case page
        (empty when the page has no "document-text" div)
    """
    # Parse inside a with-block so the file handle is closed promptly
    with open(filename, 'rb') as html_file:
        soup = BeautifulSoup(html_file, 'html.parser')
    info = {}

    # Get document text
    doctext = soup.find("div", {"class": "document-text"})
    # TODO: Figure out why some documents return None from the previous step
    if not doctext:
        return {}

    # Parse metadata: the first three SS_DocumentInfo paragraphs are read as
    # court, dates and case number, in that order
    title = doctext.find("h1", {"id": "SS_DocumentTitle"}).text.strip()
    docinfo = doctext.find_all("p", {"class": "SS_DocumentInfo"})
    court = docinfo[0].text.strip()
    dates = docinfo[1].text.strip().split(';')
    case = docinfo[2].text.strip()
    info['Case Title'] = title
    info['Court'] = court
    # TODO: Fix date parsing
    #info['Date Argued'] = dates[0]
    #info['Date Decided'] = dates[1]
    info['Case Number'] = case
    reporter = [
        span.text.strip()
        for span in doctext.find_all("span", {"class": "SS_NonPaginatedRptr"})
    ]
    # TODO: Get more from this section
    info['Reporter'] = ' | '.join(reporter)
    # TODO: Find out how to parse Prior History and similar (e.g. subsequent history)
    #prior = doctext.find_all("p", {"class": "SS_InlineText"})[-1].text
    #prior = re.sub(r"\s+", " ", prior, flags=re.UNICODE)
    #info['Prior History'] = prior


    #start = doctext.find("span", id="JUMPTO_Counsel")
    #for i in range(25):
    #    print(str(start.next_sibling).strip())
    #    start = start.next_sibling
    #here get stuff until br (end of category) and span (end of section)

    # TODO: Parse headnotes

    # Parse opinions
    # NOTE(review): 'Concuring Opinion Author' is misspelled, but it is an
    # output key (and CSV column name), so it is left unchanged here.
    info['Opinion Author'] = get_text_after_span(doctext, "JUMPTO_Opinionby")
    info['Opinion'] = get_text_after_id(doctext, "JUMPTO_Opinion")
    info['Concuring Opinion Author'] = get_text_after_span(doctext, "JUMPTO_Concurby")
    info['Concurring Opinion'] = get_text_after_id(doctext, "JUMPTO_Concur")
    info['Dissenting Opinion Author'] = get_text_after_span(doctext, "JUMPTO_Dissentby")
    info['Dissenting Opinion'] = get_text_after_id(doctext, "JUMPTO_Dissent")

    return info

def get_text_after_span(document, s_id):
    """
    Get the text immediately following some span with given id

    Input:
        document: The section of text
        s_id: The id of the span
    Output:
        The stripped string form of the node immediately following said span,
        or the empty string if the span does not exist or nothing follows it
    """
    start = document.find("span", id=s_id)
    if start is None:
        return ""
    sibling = start.next_sibling
    if sibling is None:
        # Without this guard str(None) would leak the literal text "None"
        return ""
    return str(sibling).strip()

def get_text_after_id(document, e_id):
    """
    Collect the text of the run of <p> siblings that directly follows an element

    Input:
        document: The section of text
        e_id: The id of the element to start after
    Output:
        The whitespace-collapsed text of each consecutive 'p' sibling, joined
        with " %%% " (the empty string if the id does not exist or no 'p'
        sibling follows immediately)
    """
    anchor = document.find(id=e_id)
    if not anchor:
        return ""

    paragraphs = []
    node = anchor.next_sibling
    # Stop at the first sibling that is missing, falsy, or not a <p>
    while node and node.name == 'p':
        collapsed = re.sub(r"\s+", " ", node.text.strip(), flags=re.UNICODE)
        paragraphs.append(collapsed)
        node = node.next_sibling
    return " %%% ".join(paragraphs)

#base = "http://www.ma-appellatecourts.org/display_docket.php?src=party&dno="
folder = 'Reversal Opinions HTML'
cases = []
keys = set()

# Parse every saved Lexis page in the folder, collecting the union of keys seen
for file in os.listdir(folder):
    if not file.endswith(".html"):
        continue
    fullname = os.path.join(folder, file)
    case = scrape_lexis_page(fullname)
    keys |= set(case)
    cases.append(case)

In [None]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn import mixture
from sklearn import metrics
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

%matplotlib inline

# Opinion text of every parsed case that has one
corpus_opinions = [
    case['Opinion']
    for case in cases
    if 'Opinion' in case
]

# Compute the tf-idf scores of the opinions
fe_tfv = TfidfVectorizer(
    stop_words='english',
    max_df=0.5,
    min_df=0.01,
)
tfidf_opinions = fe_tfv.fit_transform(corpus_opinions)

# Compute the LSA (truncated SVD) of the scored opinions.
# After 4 components, we don't get much more ROI (plot leading to this
# conclusion is commented out below).
# min/max document frequency play a huge role here.
n_c = 4
dc_tsvd = TruncatedSVD(n_components=n_c)
lsa_opinions = dc_tsvd.fit_transform(tfidf_opinions)

# This is the code we would use to graph the singular values
#fe_tfv = TfidfVectorizer(stop_words='english', min_df = 0.01, max_df = 0.5)
#tfidf_opinions = fe_tfv.fit_transform(corpus_opinions)
#dc_tsvd = TruncatedSVD(n_components=50)
#lsa_opinions = dc_tsvd.fit_transform(tfidf_opinions)
#plt.plot(range(1,51), dc_tsvd.singular_values_)
#print(dc_tsvd.singular_values_)

# Defining terms for each component
#terms = fe_tfv.get_feature_names()
#for i in range(n_c):
#    top = np.argsort(dc_tsvd.components_[i])
#    topterms = [terms[top[f]] for f in range(25)]
#    print (i, topterms)

# Cluster via a Gaussian mixture model.
# Best silhouette score is found with k=2, and k=5 is the next local optimum;
# 5 is used to obtain more semantic criteria for reversal.
# The plot leading to this conclusion is commented out below.
gmm_opinions = mixture.GaussianMixture(n_components=5)
gmm_opinions.fit(lsa_opinions)

# Scatter the first two LSA components, coloured by predicted cluster
_ = plt.scatter(
    lsa_opinions[:, 0],
    lsa_opinions[:, 1],
    c=gmm_opinions.predict(lsa_opinions),
)

# This is the code we would use to display silhouette scores per k
#x = list(range(2,15))
#y = []
#for i in x:
#    gmm = mixture.GaussianMixture(n_components=i).fit(lsa_opinions)
#    y.append(metrics.silhouette_score(lsa_opinions, gmm.predict(lsa_opinions)))
#plt.plot(x, y)

In [None]:
import unicodecsv as csv

print(keys)

# Write out the csv: one row per parsed opinion page, one column per key seen
with open('sjc-opinions.csv', 'wb') as output_file:
    dict_writer = csv.DictWriter(output_file, fieldnames=keys)
    dict_writer.writeheader()
    dict_writer.writerows(cases)