# Filter Dataset from oyez_gather script
This script takes in input oyez.json as generated by the oyez_gather script.
Here's an already generated dataset:

Minimal JSON compact form (216MB):
https://www.dropbox.com/s/9kyk0dr2gf3ls23/oyez.json?dl=0

Prettified JSON human-readable form (431 MB):
https://www.dropbox.com/s/52a58aac8iujupv/oyez_pretty.json?dl=0

## Imports

In [1]:
# !pip install json
# !pip install pandas
# !pip install numpy
# !pip install nltk
# !pip install bs4
# !pip install re
# !pip install contractions

import json
import pandas as pd
import numpy as np
import nltk
nltk.download('wordnet')
import re
from bs4 import BeautifulSoup
import gensim
import contractions
from sklearn.svm import LinearSVC
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import torch

# Just for visuals
pd.set_option('display.max_colwidth', None)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\smitp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# 1. Dataset Preparation

## Load the dataset

In [2]:
with open('oyez.json', 'r') as f:
    data = json.load(f)

## Filter the dataset to only the columns we need with cases with non-missing values

In [3]:
# Returns true if the given case's judgment has been decided, false otherwise
def is_decided(entry):
    for timeline in entry['timeline']:
        if timeline['event'] == "Decided":
            return True
    return False

# Returns true if the given case has the necessary columns for our analysis, false otherwise
# Currently only considers:
#    1- case's judgment has been decided
#    2- case has a non-empty facts field
#    3- case has a non-empty decisions field
def is_entry_complete(entry):
    try:
        if not is_decided(entry):
            return False
        facts = entry['facts_of_the_case']
        if facts == None or len(str(facts)) == 0:
            return False
        if entry['decisions'] == None or len(entry['decisions']) == 0:
            return False
        return True
    except:
        return False

# Returns a dict with only the necessary columns we need from a given case
# Currently the followings columns are considered:
#    1-  case ID (assigned by oyez.org)
#    2-  case name
#    3-  href URL to the oyez.org case
#    4-  term (read: year)
#    5-  name of the first party
#    6-  name of the second party
#    7-  facts of the case
#    8-  majority vote count
#    9-  minority vote count
#    10- name of the winning party
#    11- the decision type (see below for more detials)
#    12- the disposition type (see below for more details)
def filter_entry(entry):
    row = {}
    row['ID'] = entry['ID']
    row['name'] = entry['name']
    row['href'] = entry['href']
    row['term'] = entry['term']
    row['first_party'] = entry['first_party']
    row['second_party'] = entry['second_party']
    row['facts'] = entry['facts_of_the_case']
    
    # As a simplification, only consider the first decision of the case
    # Its rare, but there are cases with multiple decisions.
    # How should we handle those? At the moment, we ignore subsequent decisions
    decision = entry['decisions'][0]
    row['majority_vote'] = decision['majority_vote']
    row['minority_vote'] = decision['minority_vote']
    row['winning_party'] = decision['winning_party']
    row['decision_type'] = decision['decision_type']
    row['disposition'] = decision['disposition']
    return row

# Decision Type can be one the following:
#     majority - an opinion in a case that is shared by more than half of the members of a court 
#     per curiam -  said of a judicial opinion presented as that of the entire court rather than that of any one judge
#     plurality opinion - an opinion that received the most votes of any opinion but not enough to be the majority opinion
#     equally divided - ?
#     dismissal - moot - ?
#     dismissal - rule 46 - ?
#     dismissal - other - ?
#     dismissal - improvidently granted - ?
#     memorandum - ?

# Disposition can be a combination of the following:
#     affirmed - lower court judgment was correct
#     reversed - lower court judgment was incorrect
#     remanded - send case back to lower court
#     vacated - lower court judgment has been cancelled or rendered void

In [4]:
filtered_data = []
for entry in data:
    if is_entry_complete(entry):
        filtered_data.append(filter_entry(entry))

## Prepare Pandas DataFrame for the Dataset 

In [5]:
df = pd.DataFrame(filtered_data)
dfCopy = pd.DataFrame(filtered_data)

# Fill empty/None values with an empty string
df['winning_party'] = df['winning_party'].fillna('')
df['disposition'] = df['disposition'].fillna('')

# Remove rows with missing values in the listed columns
# df.dropna(inplace=True, subset=['first_party', 'second_party', 'facts'])

print(f'There are {len(df)} cases.')

There are 3303 cases.


In [6]:
display(df.head(n=3))

Unnamed: 0,ID,name,href,term,first_party,second_party,facts,majority_vote,minority_vote,winning_party,decision_type,disposition
0,50606,Roe v. Wade,https://api.oyez.org/cases/1971/70-18,1971,Jane Roe,Henry Wade,"<p>In 1970, Jane Roe (a fictional name used in court documents to protect the plaintiff’s identity) filed a lawsuit against Henry Wade, the district attorney of Dallas County, Texas, where she resided, challenging a Texas law making abortion illegal except by a doctor’s orders to save a woman’s life. In her lawsuit, Roe alleged that the state laws were unconstitutionally vague and abridged her right of personal privacy, protected by the First, Fourth, Fifth, Ninth, and Fourteenth Amendments.</p>\n",7,2,Jane Roe,majority opinion,reversed
1,50613,Stanley v. Illinois,https://api.oyez.org/cases/1971/70-5014,1971,"Peter Stanley, Sr.",Illinois,"<p>Joan Stanley had three children with Peter Stanley. The Stanleys never married, but lived together off and on for 18 years. When Joan died, the State of Illinois took the children. Under Illinois law, unwed fathers were presumed unfit parents regardless of their actual fitness and their children became wards of the state. Peter appealed the decision, arguing that the Illinois law violated the Equal Protection Clause of the Fourteenth Amendment because unwed mothers were not deprived of their children without a showing that they were actually unfit parents. The Illinois Supreme Court rejected Stanley’s Equal Protection claim, holding that his actual fitness as a parent was irrelevant because he and the children’s mother were unmarried.</p>\n",5,2,Stanley,majority opinion,reversed/remanded
2,50623,Giglio v. United States,https://api.oyez.org/cases/1971/70-29,1971,John Giglio,United States,"<p>John Giglio was convicted of passing forged money orders. While his appeal to the U.S. Court of Appeals for the Second Circuit was pending, Giglio’s counsel discovered new evidence. The evidence indicated that the prosecution failed to disclose that it promised a key witness immunity from prosecution in exchange for testimony against Giglio. The district court denied Giglio’s motion for a new trial, finding that the error did not affect the verdict. The Court of Appeals affirmed.</p>\n",7,0,Giglio,majority opinion,reversed/remanded


# 2. Preprocess Dataset

## Statistics before Preprocessing

In [7]:
avg_before_preprocessing = df['facts'].apply(lambda x: len(str(x))).mean()
print(f'Average facts character length (before preprocesesing): {avg_before_preprocessing:.0f}')

Average facts character length (before preprocesesing): 1112


## Lowercase

In [8]:
df['facts'] = df['facts'].str.lower()
dfCopy['facts'] = dfCopy['facts'].str.lower()

print(np.average(dfCopy['facts'].apply(lambda x: len(x.split()))))

172.37874659400546


## Remove HTML/URL

In [9]:
# Function to remove HTML tags and URLs from a string
def sanitize_review(text):
    # remove HTML tags
    text = BeautifulSoup(str(text), 'html.parser').get_text()   
    # remove URLS
    text = re.sub(r'http\S+', '', str(text))
    return text

df['facts'] = df['facts'].apply(sanitize_review)
dfCopy['facts'] = dfCopy['facts'].apply(sanitize_review)

## Remove Contractions

In [None]:
def fix_contractions(text):
    return contractions.fix(text)

df['facts'] = df['facts'].apply(fix_contractions)
dfCopy['facts'] = dfCopy['facts'].apply(fix_contractions)

## Remove non-alphabetical characters

In [None]:
df['facts'] = df['facts'].str.replace('[^a-zA-Z\s]', ' ')
dfCopy['facts'] = dfCopy['facts'].str.replace('[^a-zA-Z\s]', ' ')

## Remove extra spaces

In [None]:
def remove_extra_spaces(text):
    return ' '.join(str(text).split())

df['facts'] = df['facts'].apply(remove_extra_spaces)
dfCopy['facts'] = dfCopy['facts'].apply(remove_extra_spaces)

## Remove stop words

In [None]:
from nltk.corpus import stopwords
nltk.download('stopwords')
stop = stopwords.words('english')

def remove_stop_words(text):
    return ' '.join([word for word in str(text).split() if word not in (stop)])

df['facts'] = df['facts'].apply(remove_stop_words)

## Lemmatization

In [None]:
from nltk.stem import WordNetLemmatizer

tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = WordNetLemmatizer()

def lemmatize(text):
    return ' '.join([lemmatizer.lemmatize(word) for word in tokenizer.tokenize(text)])

df['facts'] = df['facts'].apply(lemmatize)

## Statistics after Preprocessing

In [15]:
avg_after_preprocessing = df['facts'].apply(lambda x: len(str(x))).mean()
print(f'Average facts character length (after preprocesesing): {avg_after_preprocessing:.0f}')

Average facts character length (after preprocesesing): 776


In [16]:
display(df.head(n=3))

Unnamed: 0,ID,name,href,term,first_party,second_party,facts,majority_vote,minority_vote,winning_party,decision_type,disposition
0,50606,Roe v. Wade,https://api.oyez.org/cases/1971/70-18,1971,Jane Roe,Henry Wade,jane roe fictional name used court document protect plaintiff identity filed lawsuit henry wade district attorney dallas county texas resided challenging texas law making abortion illegal except doctor order save woman life lawsuit roe alleged state law unconstitutionally vague abridged right personal privacy protected first fourth fifth ninth fourteenth amendment,7,2,Jane Roe,majority opinion,reversed
1,50613,Stanley v. Illinois,https://api.oyez.org/cases/1971/70-5014,1971,"Peter Stanley, Sr.",Illinois,joan stanley three child peter stanley stanley never married lived together year joan died state illinois took child illinois law unwed father presumed unfit parent regardless actual fitness child became ward state peter appealed decision arguing illinois law violated equal protection clause fourteenth amendment unwed mother deprived child without showing actually unfit parent illinois supreme court rejected stanley equal protection claim holding actual fitness parent irrelevant child mother unmarried,5,2,Stanley,majority opinion,reversed/remanded
2,50623,Giglio v. United States,https://api.oyez.org/cases/1971/70-29,1971,John Giglio,United States,john giglio convicted passing forged money order appeal court appeal second circuit pending giglio counsel discovered new evidence evidence indicated prosecution failed disclose promised key witness immunity prosecution exchange testimony giglio district court denied giglio motion new trial finding error affect verdict court appeal affirmed,7,0,Giglio,majority opinion,reversed/remanded


# 3. How controversial is a case?
Based on the votes distribution and judge opinions, determine how controversial or disputed a case is.

In [17]:
# Extract only the columns we need
df2 = df[['ID', 'href', 'name', 'facts', 'majority_vote', 'minority_vote']].copy()

# Remove rows with no majority_vote or no minority_vote
df2.dropna(subset=['majority_vote', 'minority_vote'], inplace=True)

# Given a row, return the controversial class, or -1 if its an invalid row (i.e. 0, 0 for majority/minority votes)
'''
    Returns:
        -1 - invalid row
         0 - not controversial
         1 - controversial
'''
def label_row(row):
    majority = row['majority_vote']
    minority = row['minority_vote']
    total = majority + minority
    if total == 0:
        return -1
    else:
        controversial = 1 - majority / total
        # The threshold is 0.3 rather than 0.5
        # You may be tempted to set this to 0.5, but only ~16 cases have controversial value of [0.5, 1.0]
        if controversial <= 0.3:
            return 0
        else:
            return 1

# Create a new column for the controversial class for each row
df2['controversial'] = df2.apply(lambda row: label_row(row), axis=1)

# Remove rows with invalid controversial class
df2 = df2[df2['controversial'] != -1]

# Remove unneeded columns
df2.drop(['majority_vote', 'minority_vote'], axis=1, inplace=True)

In [18]:
display(df2.head(n=3))

Unnamed: 0,ID,href,name,facts,controversial
0,50606,https://api.oyez.org/cases/1971/70-18,Roe v. Wade,jane roe fictional name used court document protect plaintiff identity filed lawsuit henry wade district attorney dallas county texas resided challenging texas law making abortion illegal except doctor order save woman life lawsuit roe alleged state law unconstitutionally vague abridged right personal privacy protected first fourth fifth ninth fourteenth amendment,0
1,50613,https://api.oyez.org/cases/1971/70-5014,Stanley v. Illinois,joan stanley three child peter stanley stanley never married lived together year joan died state illinois took child illinois law unwed father presumed unfit parent regardless actual fitness child became ward state peter appealed decision arguing illinois law violated equal protection clause fourteenth amendment unwed mother deprived child without showing actually unfit parent illinois supreme court rejected stanley equal protection claim holding actual fitness parent irrelevant child mother unmarried,0
2,50623,https://api.oyez.org/cases/1971/70-29,Giglio v. United States,john giglio convicted passing forged money order appeal court appeal second circuit pending giglio counsel discovered new evidence evidence indicated prosecution failed disclose promised key witness immunity prosecution exchange testimony giglio district court denied giglio motion new trial finding error affect verdict court appeal affirmed,0


## Display frequency of each controversial class

In [19]:
index, counts = np.unique(df2['controversial'].values, return_counts=True)
print(index) # classes
print(counts) # frequency

[0 1]
[1976 1241]


In [20]:
display(dfCopy.head(n=3))

Unnamed: 0,ID,name,href,term,first_party,second_party,facts,majority_vote,minority_vote,winning_party,decision_type,disposition
0,50606,Roe v. Wade,https://api.oyez.org/cases/1971/70-18,1971,Jane Roe,Henry Wade,in jane roe a fictional name used in court documents to protect the plaintiff s identity filed a lawsuit against henry wade the district attorney of dallas county texas where she resided challenging a texas law making abortion illegal except by a doctor s orders to save a woman s life in her lawsuit roe alleged that the state laws were unconstitutionally vague and abridged her right of personal privacy protected by the first fourth fifth ninth and fourteenth amendments,7,2,Jane Roe,majority opinion,reversed
1,50613,Stanley v. Illinois,https://api.oyez.org/cases/1971/70-5014,1971,"Peter Stanley, Sr.",Illinois,joan stanley had three children with peter stanley the stanleys never married but lived together off and on for years when joan died the state of illinois took the children under illinois law unwed fathers were presumed unfit parents regardless of their actual fitness and their children became wards of the state peter appealed the decision arguing that the illinois law violated the equal protection clause of the fourteenth amendment because unwed mothers were not deprived of their children without a showing that they were actually unfit parents the illinois supreme court rejected stanley s equal protection claim holding that his actual fitness as a parent was irrelevant because he and the children s mother were unmarried,5,2,Stanley,majority opinion,reversed/remanded
2,50623,Giglio v. United States,https://api.oyez.org/cases/1971/70-29,1971,John Giglio,United States,john giglio was convicted of passing forged money orders while his appeal to the you s court of appeals for the second circuit was pending giglio s counsel discovered new evidence the evidence indicated that the prosecution failed to disclose that it promised a key witness immunity from prosecution in exchange for testimony against giglio the district court denied giglio s motion for a new trial finding that the error did not affect the verdict the court of appeals affirmed,7,0,Giglio,majority opinion,reversed/remanded


### Train Word2Vec Model

In [21]:
word2vec = gensim.models.Word2Vec(sentences = dfCopy['facts'].apply(lambda x: x.split()), vector_size=100, window=11)


In [22]:
def calculateAvgVector(review):
    avgVec = np.zeros(100)
    num = 0
    for word in review:
        if word in word2vec.wv:
            avgVec += word2vec.wv[word]
            num += 1
    avgVec /= num
    return avgVec


df2['avgWord2Vec'] = df['facts'].apply(calculateAvgVector)

display(df2.head(n=1))

Unnamed: 0,ID,href,name,facts,controversial,avgWord2Vec
0,50606,https://api.oyez.org/cases/1971/70-18,Roe v. Wade,jane roe fictional name used court document protect plaintiff identity filed lawsuit henry wade district attorney dallas county texas resided challenging texas law making abortion illegal except doctor order save woman life lawsuit roe alleged state law unconstitutionally vague abridged right personal privacy protected first fourth fifth ninth fourteenth amendment,0,"[-0.2147746344655752, 0.1555396685997645, 0.37713931924353045, 0.27548077730617176, 0.2836545525351539, -0.002906513847410679, 0.420634981294473, 0.26051431926433, -0.11545287030438582, 0.2326540336990729, 0.11856872833644351, -0.2924590972562631, 0.33246960045769813, 0.5163163010776043, -0.3086835518262039, 0.059263788672784964, -0.05673542956278349, -0.15400740347802638, -0.027797024132062993, -0.6733945586780707, 0.19539391488729355, 0.010653671749557059, 0.2179606451342503, -0.11485163840154806, 0.09168348658209045, 0.00015018497904141745, 0.1419217615785601, -0.11161981110305835, -0.8370522925754388, -0.25504131921799855, 0.06922466423362493, 0.005723467571660876, -0.05825177914307763, -0.3733747008939584, 0.10120032807812095, -0.1505576040269807, -0.0009496871226777633, -0.021931776873146493, -0.22414211375017962, -0.7134683486322562, -0.02965872305445373, -0.2591561682522297, -0.1708799807416896, 0.31113021649420264, 0.18806260467196503, -0.3773106664450218, -0.43701099207003913, 0.161124746829737, 0.19853670397152504, 0.1384008579258807, -0.12626477303138622, 0.1128284882629911, 0.08823897570371628, 0.14573298244737087, -0.15123370630084537, -0.10183177204181751, 0.11699323664108913, -0.2152861072557668, -0.4109185595600866, 0.04488039909861982, 0.29730659667402504, -0.02103337547198559, 0.10366517174988985, -0.10990773168824186, -0.6709972395002842, 0.4728614409516255, -0.2619458982363964, 0.1063814707348744, -0.5414489042262236, 0.019532115377951412, -0.5057210169235865, 0.5179418066889048, 0.19314716504265864, -0.14546642091746131, 0.45177034541964534, -0.11592952281236649, 0.4537292083601157, 0.03459292645178114, -0.4578340260932843, -0.1954996273604532, -0.0006656653185685476, -0.05180199009055893, -0.009347545423855384, 0.5512251868844033, -0.19694734343638023, -0.00985147464205511, 0.2168656200590582, -0.0035145952738821506, 0.2092997016128114, 0.4424186132242903, -0.21791437707996617, 0.13925800899975002, 0.4881681837389866, 0.1291978319020321, 0.6021646431088448, 0.15163587469607592, -0.014843907452498873, 0.10354407933695862, -0.04580927594875296, 0.3084240119997412]"


### Test Basic Linear Classifier

In [23]:
svc = LinearSVC(random_state=0)
svc.fit(df2['avgWord2Vec'].values.tolist(), df2['controversial'].values.tolist())


LinearSVC(random_state=0)

In [24]:
predictions = svc.predict(df2['avgWord2Vec'].values.tolist())
counts = {}
for item in predictions:
    if item not in counts:
        counts[item] = 1
    else:
        counts[item] += 1

print(accuracy_score(predictions, df2['controversial'].values.tolist()))
print(counts)

0.6151694124961143
{0: 3214, 1: 3}


In [25]:
maxFactLen = max(dfCopy['facts'].apply(lambda x: len(x.split(" "))))

In [26]:
def concatWord2Vec(item):
    if len(item.split()) >= maxFactLen:
        print(len(item.split(" "))) 
    vecList = []
    for word in item.split():
        if word in word2vec.wv:
            vecList.append(word2vec.wv[word])
    while len(vecList) < maxFactLen:
        vecList.append([0 for i in range(100)])
    if len(vecList) != 952:
        print(len(vecList))
    return vecList

df2['concatWord2Vec'] = dfCopy['facts'].apply(concatWord2Vec)

952


In [27]:
class BLSTM(torch.nn.Module):
    def __init__(self):
        super(BLSTM, self).__init__()
        self.lstm = torch.nn.LSTM(input_size=100,hidden_size=256,num_layers=1,batch_first=True,dropout=0.33, bidirectional=True).cuda()
        self.linear = torch.nn.Linear(512, 128).cuda()
        self.classify = torch.nn.Linear(128, 2).cuda()
        self.elu = torch.nn.ELU()
        self.softmax = torch.nn.Softmax(dim=2)
        
    def forward(self, x):
        initHidden = torch.zeros(2, x.size(0), 256).type(torch.cuda.FloatTensor)
        initInternal = torch.zeros(2, x.size(0), 256).type(torch.cuda.FloatTensor)
        initialOutput, hidden = self.lstm(x.float(), (initHidden, initInternal))
        out = self.linear(initialOutput)
        out = self.elu(out)
        out = self.classify(out)
        return out

In [28]:
finalLabels = [[df2['controversial'].values.tolist()[i] for j in range(952)] for i in range(len(df2['controversial'].values.tolist()))]

trainingVectors = torch.as_tensor(df2['concatWord2Vec'].values.tolist()).cuda()
trainingLabels = torch.flatten(torch.as_tensor(finalLabels)).cuda()



In [29]:
display(df2['controversial'].head(n=3))

0    0
1    0
2    0
Name: controversial, dtype: int64

In [30]:
blstmModel = BLSTM()
torch.cuda.empty_cache()

def initWeight(m):
    if isinstance(m, torch.nn.Linear) or isinstance(m, torch.nn.LSTM):
        torch.nn.init.uniform_(m.weight)
        torch.nn.init.ones_(m.bias)
        
initWeight(blstmModel)

optimizer = torch.optim.Adam(blstmModel.parameters(), lr=0.01)
lossFn = torch.nn.CrossEntropyLoss()

#Needed to use only a subset of the training data due to limited GPU memory availability
numItems = int(trainingVectors.size(0)/12)
for i in range(500):
    optimizer.zero_grad()
    #tempOut=torch.as_tensor([item[-1] for item in blstmModel.forward(trainingVectors[:numItems]).tolist()]).cuda()
    tempOut = blstmModel.forward(trainingVectors[:numItems])
    out = torch.flatten(tempOut, start_dim=0, end_dim=1)
    #print(tempOut)
    #print(tempOut.shape)
#     print(out.shape)
#     print(trainingLabels[:numItems].shape)
    loss = lossFn(out, trainingLabels[:numItems*952].long())
    if i%10 == 0:
        print(loss) 
    loss.backward()
    optimizer.step()



tensor(0.6889, device='cuda:0', grad_fn=<NllLossBackward>)


RuntimeError: CUDA out of memory. Tried to allocate 1.47 GiB (GPU 0; 8.00 GiB total capacity; 5.26 GiB already allocated; 468.12 MiB free; 5.68 GiB reserved in total by PyTorch)