# Make Predictions

In [None]:
pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/fd/1a/41c644c963249fd7f3836d926afa1e3f1cc234a1c40d80c5f03ad8f6f1b2/transformers-4.8.2-py3-none-any.whl (2.5MB)
[K     |████████████████████████████████| 2.5MB 6.9MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 48.8MB/s 
Collecting huggingface-hub==0.0.12
  Downloading https://files.pythonhosted.org/packages/2f/ee/97e253668fda9b17e968b3f97b2f8e53aa0127e8807d24a547687423fe0b/huggingface_hub-0.0.12-py3-none-any.whl
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K     |█████

In [None]:
import json
import pandas as pd
import numpy as np
from numpy import mean
from collections import Counter
import pickle
import re
import random

from google.colab import drive, files

import torch
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast#, Trainer, TrainingArguments

In [None]:
# Mount Google Drive: the dataset, model and output paths used below all live
# under /content/drive/MyDrive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook does not fail on a runtime without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# JSON file with the sentence-level corpus that get_clean_data() loads.
dataset_path = '/content/drive/MyDrive/SCOTUS/sentences.json'

In [None]:
# Toggle: True drops per curiam opinions from the corpus, False keeps them.
remove_per_curiam = True

# Each setting reads its own model directory and writes its own pair of pickle
# files, so the two configurations use distinct paths.
model_path, output_path, combined_output_path = (
    (
        '/content/drive/MyDrive/SCOTUS/sc_model_distilbert_clean_sentences_non_percur',
        '/content/drive/MyDrive/SCOTUS/predictions_no_percur.p',
        '/content/drive/MyDrive/SCOTUS/predictions_no_percur_df.p',
    )
    if remove_per_curiam
    else (
        '/content/drive/MyDrive/SCOTUS/sc_model_distilbert_clean_sentences',
        '/content/drive/MyDrive/SCOTUS/predictions.p',
        '/content/drive/MyDrive/SCOTUS/predictions_df.p',
    )
)

# Clean Corpus Sentences
General cleaning of improperly processed text and of remnants of the Supreme Court opinion layout (e.g. headers, citations, titles). Sentences that are mostly numbers, punctuation, or spaces (typically citations) are removed; in sentences that contain runs of whitespace, the text from the whitespace run through shortly after "Opinion of" (the page header) is stripped.

In [None]:
def clean_regex(df, column):
  """Normalize the strings stored in ``df[column]``.

  The steps run in this order: newlines become spaces, whitespace runs
  collapse to one space, square brackets are removed, a hyphen followed by
  a space is removed (re-joining words split across lines), soft hyphens
  and apostrophes are removed, and the 0x97 character becomes a comma.

  Args:
    df: DataFrame holding the column to clean.
    column: Name of the column to clean. Earlier versions ignored this
      argument and always cleaned ``"text"``.

  Returns:
    The cleaned column. ``df[column]`` is also overwritten in place, as the
    original implementation did for ``"text"``.
  """
  # (pattern, replacement) pairs, applied in order as regular expressions.
  # Raw strings keep escapes such as \s from being read as (invalid) Python
  # string escapes.
  substitutions = [
      (r'\s+', ' '),
      (r'\[', ''),
      (r'\]', ''),
      (r'\- ', ''),
      (r'\xad', ''),
      (r'\'', ''),
      (r'\x97', ','),
  ]

  cleaned = df[column].str.replace('\n', ' ')
  for pattern, replacement in substitutions:
    cleaned = cleaned.replace(pattern, replacement, regex=True)

  df[column] = cleaned
  return df[column]

In [None]:
# Keep only sentences above a certain threshold of alphabetic characters
def percent_text(text):
    """Return the percentage (0-100) of characters in ``text`` that are letters.

    Letters are the characters for which ``str.isalpha()`` is true; digits,
    punctuation and whitespace count towards the total only.

    Args:
        text: The string to measure.

    Returns:
        A float between 0 and 100. An empty string yields 0.0.
    """
    total_count = len(text)
    if total_count == 0:
        # Cleaning can reduce a sentence to "" (e.g. one made only of brackets
        # or apostrophes); report 0 instead of dividing by zero.
        return 0.0

    alpha_count = sum(1 for char in text if char.isalpha())
    return float(alpha_count) / float(total_count) * 100

In [None]:
def header_eraser(text):
    """Strip a page-header remnant containing "Opinion of" from ``text``.

    A header is matched as: a run of two or more spaces/tabs, the shortest
    following stretch of text up to "Opinion of", and the 15 characters after
    that phrase. Every such match is removed. Text with no match (including
    text with fewer than 15 characters after "Opinion of") is returned
    unchanged.

    Args:
        text: The sentence string to clean.

    Returns:
        The string with any matched header spans removed.
    """
    # Cheap guard: without the phrase the pattern cannot match, so skip the
    # regex scan entirely.
    if 'Opinion of' not in text:
        return text

    # Raw string so that \s and \S reach the regex engine as written; in a
    # non-raw literal "\s" is an invalid string escape that newer Python
    # versions warn about. The pattern requires the whitespace run itself, so
    # no separate search for it is needed.
    header_pattern = r'[ \t]{2,}.*?Opinion of[\s\S]{15}'
    return re.sub(header_pattern, '', text)

In [None]:
def get_clean_data(dataset_path, device):
  """Load the sentence corpus from JSON and return it cleaned and filtered.

  Steps: clean the "text" column with clean_regex, keep rows whose text is
  more than 50% letters (percent_text), strip header remnants
  (header_eraser), optionally drop per curiam rows, and relabel
  'second_dissenting' as 'dissenting'.

  Reads the module-level flag ``remove_per_curiam`` to decide whether rows
  with category "per_curiam" are dropped.

  Args:
    dataset_path: Path to the JSON file to load.
    device: Not used by this function; kept so existing calls still work.

  Returns:
    A tuple ``(df, all_sentences)``: the filtered DataFrame (with an added
    "percent_letter" column) and the list of its "text" values in row order.
  """
  # Context manager closes the file handle as soon as the JSON is parsed.
  with open(dataset_path) as dataset_file:
    opinion_js = json.load(dataset_file)
  df = pd.DataFrame.from_dict(opinion_js)

  # Clean sentences
  df["text"] = clean_regex(df, "text")

  # Get sentences with more letters.
  # .copy() detaches the filtered rows from the source frame so the column
  # assignments below write to an independent frame (no SettingWithCopyWarning).
  df["percent_letter"] = df["text"].apply(percent_text)
  df = df[df["percent_letter"] > 50].copy()

  # Remove header
  # NOTE(review): clean_regex has already collapsed whitespace runs to single
  # spaces, so header_eraser's "two or more spaces/tabs" pattern can only match
  # where a later removal step re-created adjacent spaces -- confirm this
  # ordering is intended.
  df["text"] = df["text"].apply(header_eraser)

  # Remove per_curiam if chosen
  if remove_per_curiam:
    df = df[df["category"] != "per_curiam"].copy()

  # Normalize dissenting category
  df.loc[df["category"] == 'second_dissenting', 'category'] = 'dissenting'

  # Prepare sentences for tokenization
  all_sentences = df["text"].to_list()

  return df, all_sentences

In [None]:
# Load and clean the corpus. all_sentences is the "text" column of df as a
# list, in the same row order as df.
df, all_sentences = get_clean_data(dataset_path, device)

In [None]:
# Sanity check: list the opinion categories that remain after cleaning.
df.category.unique()

array(['majority', 'concurring', 'dissenting'], dtype=object)

# Predictions
Predict all sentences with trained, fine-tuned DistilBERT model.

In [None]:
# Tokenizer comes from the stock 'distilbert-base-uncased' checkpoint; the
# classifier weights are loaded from model_path and moved to the chosen device.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained(model_path)
model = model.to(device)

# Inputs and accumulator consumed by the batch loop below.
batchsize = 16
worklist = all_sentences
predictions = []

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




Iterate over the corpus in batches. This loop processes every sentence in the corpus, so do not run this cell unless you need to regenerate the predictions.

In [None]:
# Start from an empty list so that re-running this cell does not append a
# second copy of every batch to an already-filled list.
predictions = []

# Make sure dropout is disabled for inference.
model.eval()

# Inference only: no_grad() stops autograd from recording a graph for each
# forward pass, which reduces memory use. The outputs are unchanged.
with torch.no_grad():
    for i in range(0, len(worklist), batchsize):
        batch = worklist[i:i+batchsize]
        test_encodings = tokenizer(batch, truncation=True, padding=True, return_tensors="pt").to(device)
        output = model(**test_encodings)
        # One [prob_0, prob_1] pair per sentence; stored as one list per batch.
        monologic_results = torch.softmax(output.logits, dim=1).tolist()
        predictions.append(monologic_results)
        # i advances in steps of batchsize, so with batchsize 16 this prints
        # every 400 sentences (the multiples shared by 16 and 100).
        if i % 100 == 0:
            print(str(i)+" in "+str(len(worklist)))

0 in 1515372
400 in 1515372
800 in 1515372
1200 in 1515372
1600 in 1515372
2000 in 1515372
2400 in 1515372
2800 in 1515372
3200 in 1515372
3600 in 1515372
4000 in 1515372
4400 in 1515372
4800 in 1515372
5200 in 1515372
5600 in 1515372
6000 in 1515372
6400 in 1515372
6800 in 1515372
7200 in 1515372
7600 in 1515372
8000 in 1515372
8400 in 1515372
8800 in 1515372
9200 in 1515372
9600 in 1515372
10000 in 1515372
10400 in 1515372
10800 in 1515372
11200 in 1515372
11600 in 1515372
12000 in 1515372
12400 in 1515372
12800 in 1515372
13200 in 1515372
13600 in 1515372
14000 in 1515372
14400 in 1515372
14800 in 1515372
15200 in 1515372
15600 in 1515372
16000 in 1515372
16400 in 1515372
16800 in 1515372
17200 in 1515372
17600 in 1515372
18000 in 1515372
18400 in 1515372
18800 in 1515372
19200 in 1515372
19600 in 1515372
20000 in 1515372
20400 in 1515372
20800 in 1515372
21200 in 1515372
21600 in 1515372
22000 in 1515372
22400 in 1515372
22800 in 1515372
23200 in 1515372
23600 in 1515372
24000 in 1

In [None]:
# Save predictions to output path. The context manager closes the file when
# the dump finishes, so the data is flushed to disk and no handle is left open.
with open(output_path, "wb") as predictions_file:
    pickle.dump(predictions, predictions_file)

From this point on, continue in the `clean_sents_predictions.ipynb` notebook in Jupyter.

In [None]:
# Flatten the per-batch lists into one [prob_0, prob_1] pair per sentence.
flat_list = [item for sublist in predictions for item in sublist]
assert len(flat_list) == len(df), "number of predictions does not match number of sentences in df"

# Work on an independent copy so the column assignments below never write
# into a slice of another frame.
df = df.copy()
df["predictions"] = flat_list
df[['prob_0','prob_1']] = pd.DataFrame(df.predictions.tolist(), index=df.index)
# Label a sentence 1 when the second class probability exceeds 0.5.
df['monologic_prediction'] = np.where(df['prob_1'] > .50, 1, 0)

# Rename categories
category_labels = {
    'majority': 'Majority',
    'dissenting': 'Dissenting',
    'concurring': 'Concurring',
}
if not remove_per_curiam:
  category_labels['per_curiam'] = 'Per Curiam'
df['category'] = df['category'].replace(category_labels)

# Remove bad names
wrong_names = ["Justice And", "Justice O2122", "Justice Or", "Justice Connor", "Justice Holmes", "Justice Fuller", "Justice Waite", "Justice Woods", "Justice McReynolds", "Justice Stone"]
df = df[~df['author'].isin(wrong_names)]

# Remove Justice White errors: rows attributed to Justice White with year 2005
# or 2010 (presumably misattributed -- confirm against the source data).
# Filtering with a mask replaces the earlier in-place drops on a filtered frame.
white_errors = (df['author'] == "Justice White") & df['year'].isin([2005, 2010])
df = df[~white_errors].copy()

# Add Chief Justice
conditions = [
    (df['year'] <= 1953),
    (df['year'] > 1953) & (df['year'] <= 1969),
    (df['year'] > 1969) & (df['year'] <= 1986),
    (df['year'] > 1986) & (df['year'] <= 2004),
    (df['year'] > 2004)
    ]

values = ["Vinson", "Warren", "Burger", "Rehnquist", "Roberts"]
# The default must be a string: the implicit integer default 0 has no common
# dtype with the string choices on current NumPy. '0' is the value older NumPy
# produced for rows matching no condition (e.g. a missing year).
df['chief_justice'] = np.select(conditions, values, default='0')


In [None]:
# Spot-check ten randomly chosen rows of the combined frame (unseeded, so the
# rows differ on every run).
df.sample(10)

Unnamed: 0,opinion_num,category,author,case,year,text,sent_index,length,sent_location,percent_letter,predictions,prob_0,prob_1,monologic_prediction,chief_justice
173114,1690,Majority,Justice Douglas,"Askew v. American Waterways Operators, Inc.",1973,The fact that a whole system of liabilities wa...,177,188,0.941489,81.318681,"[0.2843386232852936, 0.7156614065170288]",0.284339,0.715661,1,Burger
1279562,11961,Dissenting,Justice Burton,United States v. Twin City Power Co.,1956,"The first ten amendments to the Constitution, ...",170,174,0.977011,81.73913,"[0.9959807395935059, 0.0040192087180912495]",0.995981,0.004019,0,Warren
627965,5961,Majority,Justice Thomas,Mitchell v. Helms,2000,"The Court of Appeals viewed this distinction, ...",51,405,0.125926,81.042654,"[0.9924142956733704, 0.00758577324450016]",0.992414,0.007586,0,Rehnquist
383219,3708,Majority,Justice Scalia,Branch v. Smith,2003,"736 (""until such State be redistricted as here...",217,316,0.686709,69.565217,"[0.9961373209953308, 0.003862607292830944]",0.996137,0.003863,0,Rehnquist
1095507,10256,Majority,Justice O'Connor,Ayotte v. Planned Parenthood of Northern New Eng.,2006,Only a few applications of New Hampshires pare...,102,113,0.902655,86.486486,"[0.9958237409591675, 0.0041762664914131165]",0.995824,0.004176,0,Roberts
114111,1113,Majority,Justice Thomas,Carcieri v. Salazar,2009,Cite as: 555 U. S. ____ (2009) 15 Opinion of t...,164,186,0.88172,65.6,"[0.9852756261825562, 0.014724351465702057]",0.985276,0.014724,0,Roberts
1278271,11951,Dissenting,Justice Whittaker,Boynton v. Virginia,1960,"In truth, the record does not even show the na...",19,51,0.372549,80.434783,"[0.9964197874069214, 0.0035802361089736223]",0.99642,0.00358,0,Warren
1166937,10943,Concurring,Justice Powell,Hishon v. King & Spalding,1984,I write to make clear my understanding that th...,3,17,0.176471,81.045752,"[0.010635418817400932, 0.9893646240234375]",0.010635,0.989365,1,Burger
1336609,12534,Majority,Justice Blackmun,National Assn. of Greeting Card Publishers v. ...,1983,27 *832 The Conference Committee abandoned the...,210,239,0.878661,80.851064,"[0.9939409494400024, 0.006059098523110151]",0.993941,0.006059,0,Burger
609521,5783,Dissenting,Justice Brennan,United States v. Brewster,1972,Part of the alleged conspiracy was a speech de...,81,225,0.36,82.307692,"[0.9957544803619385, 0.004245483782142401]",0.995754,0.004245,0,Burger


In [None]:
# Persist the combined DataFrame (sentences plus prediction columns) as a
# pickle at combined_output_path.
df.to_pickle(combined_output_path)