In [1]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.22.1-py3-none-any.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 40.5 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 51.1 MB/s 
Collecting huggingface-hub<1.0,>=0.9.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 73.2 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.9.1 tokenizers-0.12.1 transformers-4.22.1


In [2]:
pip install pickle5

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pickle5
  Downloading pickle5-0.0.12-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (256 kB)
[K     |████████████████████████████████| 256 kB 32.1 MB/s 
[?25hInstalling collected packages: pickle5
Successfully installed pickle5-0.0.12


In [3]:
import json
import pandas as pd
import numpy as np
from numpy import mean
from collections import Counter
import pickle
import re
import random
import os

from google.colab import drive, files

import torch
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

In [4]:
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
device = 'cuda'

scotus_dir = '/content/drive/MyDrive/SCOTUS/'

model_path = os.path.join(scotus_dir, 'models', 'mono_types_bert') # where the fine-tuned model is saved

# INPUT DATA
input_path = os.path.join(scotus_dir, 'data', 'combined_output')

# OUTPUT
output_path = os.path.join(scotus_dir, 'output', 'mono_types_bert') # main output dir
raw_output_path = os.path.join(output_path, 'raw_output_indi_coll') # where to save raw predictions
combined_output_path = os.path.join(output_path, 'combined_output_indi_coll') # where to save combined predictions labels and text
combined_csv_path = os.path.join(output_path, 'combined_output_indi_coll.csv') # where to save main output doc

In [6]:
with open(input_path, "rb") as fh:
  df = pickle.load(fh)

In [7]:
df["key"] = range(1, len(df.index)+1)

In [8]:
monologic_df = df[df["monologic_prediction"] == 1]

### Prepare Data

In [9]:
remove_per_curiam = False

In [10]:
def clean_regex(df, column):
  
  df["text"] = df["text"].str.replace('\n', ' ')
  df["text"] = df["text"].replace('\s+', ' ', regex = True)
  df["text"] = df["text"].replace(r'\[','', regex=True) 
  df["text"] = df["text"].replace(r'\]','', regex=True)
  df["text"] = df["text"].replace(r'\- ','', regex=True)
  df["text"] = df["text"].replace(r'\xad','', regex=True)
  df["text"] = df["text"].replace(r'\'','', regex=True)
  df["text"] = df["text"].replace(r'\x97',',', regex=True)

  return df["text"]

In [11]:
# Keep only sentences above certain threshold of alphanumeric characters
def percent_text(text):
    char_dict = dict()
    char_dict["alpha_count"] = 0
    char_dict["total_count"] = 0

    for char in text:
        char_dict["total_count"] += 1
        if char.isalpha():
            char_dict["alpha_count"] += 1
    
    percent_letter = float(char_dict["alpha_count"]) / float(char_dict["total_count"]) * 100

    return percent_letter

In [12]:
def header_eraser(text):
    spaces = re.search(r'[ \t]{2,}', text)
    opinion = re.search(r'Opinion of', text)
    if spaces and opinion:
        # delete text between first space and opinion of + 20 char 
        result = re.sub('[ \t]{2,}.*?Opinion of[\s\S]{15}', '', text)
    else:
        result = text
    return result

In [13]:
def get_clean_data(df, device):

  # Clean sentences
  df["text"] = clean_regex(df, "text")

  # Get sentences with more letters
  df["percent_letter"] = df["text"].apply(percent_text)
  df = df[df["percent_letter"] > 50]

  # Remove header
  df["text"] = df["text"].apply(header_eraser)
  
  # Remove per_curiam if chosen
  if remove_per_curiam:
    df = df[df["category"]!= "per_curiam"]

  # Normalize dissenting category
  df.loc[(df.category == 'second_dissenting'),'category']='dissenting'

  # Prepare sentences for tokenization
  all_sentences = df["text"].to_list()

  return df, all_sentences

In [14]:
monologic_df, worklist = get_clean_data(monologic_df, device)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[r

In [15]:
worklist[:10]

['Today, however, the Court permits exactly that.',
 'The Court does so by reading one statutory provision in isolation while giving short shrift to the statutory context, the relationships between the provisions at issue, and the framework set forth in precedent.',
 'The Court’s holding is inconsistent with the structure of the Medicaid program and will cause needless unfairness and disruption.',
 'I respectfully dissent.',
 'I Congress conditions a State’s receipt of federal Medicaid funding, see 42 U. S. C. §1396d(b), on compliance with federal requirements for the program.',
 'This Court’s task is to interpret these provisions “ ‘as a symmetrical and coherent regulatory scheme’ ” while “ ‘fitting .',
 'The Court commits several errors on the path to its holding, which departs from the statutory scheme as understood in Ahlborn and forces the Court to adopt an implausible workaround in order to mitigate the absurd consequence, discussed above, of its acontextual reading.',
 'A The Co

### Make Predictions

In [16]:
# choose batchsize --> reduce this if out of GPUs
batchsize = 8
predictions = []

# load the fine-tuned model from our directory and send it to cuda
model = DistilBertForSequenceClassification.from_pretrained(model_path).to(device)

# load the tokenizer (make sure this is the same type of tokenizer as what we used when training)
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/411 [00:00<?, ?B/s]

In [17]:
# get predictions in batches

for i in range(0, len(worklist), batchsize):
    batch = worklist[i:i+batchsize] # extract batch from worklist
    test_encodings = tokenizer(batch, truncation=True, padding=True, return_tensors="pt").to(device) # tokenize the posts
    output = model(**test_encodings) # make predictions with model on our test_encodings for this batch
    batch_predictions = torch.softmax(output.logits, dim=1).tolist() # get the predictions result
    predictions.append(batch_predictions)
    if i % 10000 == 0:
      print("Processing "+ str(round(i/len(worklist), 2)*100)+ "% complete")

Processing 0.0% complete
Processing 4.0% complete
Processing 8.0% complete
Processing 12.0% complete
Processing 16.0% complete
Processing 19.0% complete
Processing 23.0% complete
Processing 27.0% complete
Processing 31.0% complete
Processing 35.0% complete
Processing 39.0% complete
Processing 43.0% complete
Processing 47.0% complete
Processing 51.0% complete
Processing 55.00000000000001% complete
Processing 57.99999999999999% complete
Processing 62.0% complete
Processing 66.0% complete
Processing 70.0% complete
Processing 74.0% complete
Processing 78.0% complete
Processing 82.0% complete
Processing 86.0% complete
Processing 90.0% complete
Processing 94.0% complete
Processing 97.0% complete


In [18]:
# Save raw predictions to output path
pickle.dump(predictions, open(raw_output_path, "wb"))

In [19]:
# if just loading existing pickled predictions:
# with open(raw_output_path, 'rb') as pickle_file:
#     predictions = pickle.load(pickle_file)

In [20]:
# check length of predictions
len([item for sublist in predictions for item in sublist])

256580

In [21]:
# check length of text
len(worklist)

256580

In [22]:
# add predictions to main df
flat_list = [item for sublist in predictions for item in sublist]
monologic_df["mono_type"] = flat_list
monologic_df[['prob_0','prob_1']] = pd.DataFrame(monologic_df["mono_type"].tolist(), index=monologic_df.index)
monologic_df["mono_type"] = np.where(monologic_df['prob_1'] > .50, 1, 0) # this is the column we're interested in, since this is a binary label

In [23]:
# monologic_df = monologic_df[["key", "mono_type"]]

In [24]:
monologic_df.to_pickle(combined_output_path)

In [25]:
# if just loading existing pickled predictions:
# with open(combined_output_path, 'rb') as pickle_file:
#     df = pickle.load(pickle_file)

In [26]:
Counter(monologic_df["mono_type"])

Counter({1: 175801, 0: 80779})

In [27]:
monologic_df.sample(10)

Unnamed: 0,opinion_num,category,author,case,year,token_count,text,sent_index,length,sent_location,percent_letter,predictions,prob_0,prob_1,monologic_prediction,chief_justice,key,mono_type
653301,5628,Majority,Justice Vinson,United States v. Wyoming,1947,4937,Having decided that plaintiff has title to Sec...,102,144,0.708333,81.21547,"[0.030479712411761284, 0.9695203304290771]",0.001204,0.998796,1,Vinson,568099,1
536026,4631,Majority,Justice Kennedy,"Ragsdale v. Wolverine World Wide, Inc.",2002,3840,"In sustaining the regulation, we observed that...",117,183,0.639344,84.134615,"[0.022143594920635223, 0.977856457233429]",0.001325,0.998675,1,Rehnquist,466279,1
448329,3896,Majority,Justice Kennedy,"Hiibel v. Sixth Judicial Dist. Court of Nev., ...",2004,3548,We disagree.,105,204,0.514706,83.333333,"[0.011153098195791245, 0.9888468980789185]",0.001542,0.998458,1,Rehnquist,390288,1
680357,5864,Majority,Justice Rehnquist,Washington v. Glucksberg,1997,6819,"Respondents contend that in Cruzan we ""acknowl...",200,316,0.632911,81.777778,"[0.04204067215323448, 0.9579592943191528]",0.001397,0.998603,1,Rehnquist,591399,1
216805,1881,Dissenting,Justice Rehnquist,Boeing Co. v. Van Gemert,1980,1728,"Fearful that, by waiting for a ""final order"" i...",56,61,0.918033,78.417266,"[0.013144643977284431, 0.9868553280830383]",0.995382,0.004618,1,Burger,188436,0
1695972,14769,Majority,Justice Blackmun,California v. Acevedo,1991,4464,Our holding today neither extends the Carroll ...,207,219,0.945205,84.076433,"[0.029793500900268555, 0.9702064990997314]",0.001692,0.998308,1,Rehnquist,1475046,1
1265734,10869,Concurring,Justice O'Connor,"Cruzan v. Director, Mo. Dept. of Health",1990,1143,Because our notions of liberty are inextricabl...,5,51,0.098039,83.898305,"[0.039655111730098724, 0.9603449106216431]",0.002496,0.997504,1,Rehnquist,1100335,1
1091874,9357,Majority,Justice O'Connor,Bread Political Action Committee v. FEC,1982,1888,"As we have said: ""Jurisdictional statutes are ...",17,68,0.25,80.451128,"[0.02490919642150402, 0.9750908613204956]",0.001339,0.998661,1,Burger,949012,1
252565,2216,Dissenting,Justice Black,Wisconsin v. Constantineau,1971,452,It seems to me therefore wholly uncertain that...,9,14,0.642857,81.681682,"[0.013690881431102753, 0.9863090515136719]",0.9952,0.0048,1,Burger,219635,0
715090,6144,Majority,Justice Breyer,Colorado Republican Federal Campaign Comm. v. ...,1996,11106,"I As an initial matter, I write to make clear ...",306,482,0.634855,79.497908,"[0.010768194682896137, 0.9892318844795227]",0.994932,0.005068,1,Rehnquist,621195,0


In [28]:
monologic_df[monologic_df["mono_type"] ==1].sample(100).text.to_list()

['We disagree.',
 '8 *747 A closer issue under our precedents is presented by the contention that the Authority could become deeply involved in the day-to-day financial and policy decisions of the College.',
 'We granted certiorari.',
 'Neither of our two more recent opinions construing \x9e 1985(3) has answered the question left open in Griffin or has involved the second clause of the statute.',
 '*86 We are advised that the provisions of the Alaska Constitution at issue have never been interpreted by an Alaska court.',
 'It is important to emphasize that statutory waiver requirements always mandate, by their plain terms, that courts shall not consider arguments not properly raised before the agency; we have never Cite as: 548 U. S. ____ (2006) 9 STEVENS, J., dissenting suggested that the word “exhaustion,” standing alone, imposes a statutory waiver requirement.',
 'It was federal priority attaching as of the time of insolvency that we adjudicated, not something less.',
 'Respondent a

In [29]:
combined_csv_path

'/content/drive/MyDrive/SCOTUS/output/mono_types_bert/combined_output_indi_coll.csv'

In [30]:
monologic_df.to_csv(combined_csv_path)

In [31]:
monologic_df.sample(10)

Unnamed: 0,opinion_num,category,author,case,year,token_count,text,sent_index,length,sent_location,percent_letter,predictions,prob_0,prob_1,monologic_prediction,chief_justice,key,mono_type
541274,4672,Concurring,Justice Burger,Bifulco v. United States,1980,274,But we perform that task by beginning with the...,4,14,0.285714,82.105263,"[0.011401386000216007, 0.9885985851287842]",0.001374,0.998626,1,Burger,470792,1
543395,4686,Dissenting,Justice Brennan,Dellmuth v. Muth,1989,2676,"I entirely fail to see, for example, why the ""...",55,91,0.604396,79.574468,"[0.01011836901307106, 0.9898815751075745]",0.995131,0.004869,1,Rehnquist,472605,0
1580531,13697,Per Curiam,per_curiam,Goett v. Union Carbide Corp.,1960,764,To facilitate our discretionary review of *344...,23,27,0.851852,82.936508,"[0.05258449539542198, 0.9474154710769653]",0.002056,0.997944,1,Warren,1374139,1
10452,68,Dissenting,Justice Breyer,Carson v. Makin,2022,4847,But we have never said that the Free Exercise ...,175,217,0.806452,82.352941,"[0.014892960898578167, 0.9851070046424866]",0.001376,0.998624,1,Roberts,9041,1
179853,1566,Dissenting,Justice Rutledge,In Re Yamashita,1946,8683,"We are technically still at war, because peace...",53,310,0.170968,82.417582,"[0.01034399401396513, 0.9896559715270996]",0.001504,0.998496,1,Vinson,155883,1
908464,7776,Dissenting,Justice Jackson,Terminiello v. Chicago,1949,7703,"I said, `Fellow Christians, and I suppose ther...",55,354,0.155367,74.242424,"[0.012985742650926113, 0.9870142936706543]",0.994521,0.005479,1,Vinson,789617,0
1103332,9451,Majority,Justice Breyer,Meyer v. Holley,2003,2996,We conclude that the Act imposes liability wit...,3,151,0.019868,82.969432,"[0.011002135463058949, 0.9889978766441345]",0.001172,0.998828,1,Rehnquist,958902,1
279618,2435,Dissenting,Justice White,Lindahl v. Office of Personnel Management,1985,2521,The majority begins by asserting that the lang...,6,90,0.066667,81.165919,"[0.380939245223999, 0.6190608143806458]",0.981076,0.018924,1,Burger,243079,0
1573936,13629,Majority,Justice Blackmun,Bates v. State Bar of Ariz.,1977,11543,III Although I disagree strongly with the Cour...,459,499,0.91984,83.858268,"[0.010314499028027058, 0.9896855354309082]",0.994786,0.005214,1,Burger,1368480,0
375634,3272,Dissenting,Justice Douglas,McKeiver v. Pennsylvania,1971,4440,"""We discovered that during the past five and a...",67,182,0.368132,75.510204,"[0.01617925800383091, 0.9838207960128784]",0.001314,0.998686,1,Burger,326868,1


### Combine with main dataframe (and non monologic sentences)

In [32]:
monologic_df = pd.read_csv(combined_csv_path)
monologic_df = monologic_df[["key", "mono_type"]]

In [33]:
# merge df and monologic df on key
combined_df = pd.merge(df, monologic_df, on = "key", how = "left")

In [34]:
combined_df.sample(15)

Unnamed: 0,opinion_num,category,author,case,year,token_count,text,sent_index,length,sent_location,percent_letter,predictions,prob_0,prob_1,monologic_prediction,chief_justice,key,mono_type
1435590,14339,Majority,Justice Ginsburg,Mount Lemmon Fire Dist. v. Guido,2018,1684,Federal courts have divided on this question.,31,78,0.397436,84.444444,"[0.9010551571846008, 0.09894488751888275]",0.901055,0.098945,0,Roberts,1435591,
361648,3609,Majority,Justice Rehnquist,Pace v. DiGuglielmo,2005,2457,"See Irwin v. Department of Veterans Affairs, s...",121,126,0.960317,70.588235,"[0.9976402521133423, 0.002359757199883461]",0.99764,0.00236,0,Roberts,361649,
450509,4461,Majority,Justice Kennedy,Miller-El v. Cockrell,2003,6771,Subsection (d)(2) contains the unreasonable *3...,219,298,0.734899,77.622378,"[0.9978221654891968, 0.0021778687369078398]",0.997822,0.002178,0,Rehnquist,450510,
1140100,11250,Concurring,Justice O'Connor,Rufo v. Inmates of Suffolk County Jail,1992,1854,*396 Public officials often operate within dif...,30,78,0.384615,81.25,"[0.9974014759063721, 0.002598523162305355]",0.997401,0.002599,0,Rehnquist,1140101,
90681,912,Majority,Justice Clark,Rohr Corp. v. San Diego County,1960,1982,Nor can we agree that the short administrative...,77,83,0.927711,84.0,"[0.016062777489423752, 0.9839372038841248]",0.016063,0.983937,1,Warren,90682,1.0
1509224,15139,Dissenting,Justice Sotomayor,Lucia v. SEC,2018,1437,The Court noted that STJs could enter final de...,59,71,0.830986,82.608696,"[0.9977372884750366, 0.0022626486606895924]",0.997737,0.002263,0,Roberts,1509225,
632258,6238,Majority,Justice White,"NLRB v. Sears, Roebuck & Co.",1975,7306,"Finally, and more comprehensively, all ""identi...",10,266,0.037594,80.991736,"[0.998166024684906, 0.0018339701928198338]",0.998166,0.001834,0,Burger,632259,
1007965,9945,Majority,Justice Blackmun,Dawson Chemical Co. v. Rohm & Haas Co.,1980,12082,"As a result, it is no longer necessary to reso...",249,550,0.452727,78.787879,"[0.9978703260421753, 0.0021296944469213486]",0.99787,0.00213,0,Burger,1007966,
929498,9159,Majority,Justice Ginsburg,"Illinois Ex Rel. Lisa Madigan, Attorney Genera...",2003,5828,. . antifraud laws to prohibit professional fu...,217,272,0.797794,82.677165,"[0.9975606203079224, 0.002439416479319334]",0.997561,0.002439,0,Rehnquist,929499,
1034844,10224,Dissenting,Justice Brennan,United States v. Leon,1984,12217,"Therefore, although the Courts decisions are c...",248,417,0.594724,83.524904,"[0.020887399092316628, 0.9791126251220703]",0.020887,0.979113,1,Burger,1034845,0.0


In [35]:
combined_df.monologic_prediction.unique()

array([0, 1])

In [36]:
combined_df.mono_type.unique()

array([nan,  1.,  0.])

In [37]:
output_path

'/content/drive/MyDrive/SCOTUS/output/mono_types_bert'

In [38]:
combined_df.to_csv(os.path.join(output_path, 'combined_main_df.csv'))

In [39]:
combined_df[combined_df["mono_type"] ==1].sample(10).text.tolist()

['And when another similarly situated defendant comes before us, we must grant the same relief or give a principled reason for acting differently.',
 'In particular, we find nothing in the exclusion of bacteria from plant variety protection to support the petitioners position.',
 'We must conclude that, absent deliberately coercive or improper tactics in obtaining the initial statement, the mere fact that a suspect has made an unwarned admission does not warrant a presumption of compulsion.',
 'We have held that the freedom could be overridden "by regulations adopted to serve compelling state interests, unrelated to the suppression of ideas, that cannot be achieved through means significantly less restrictive of associational freedoms."',
 'II We now turn to the Boards application of its Station KKHI no-presumption approach in this case.',
 'In Bellotti, we struck down a statute requiring a minor to obtain the consent of both parents before having an abortion, subject to a judicial byp

In [40]:
combined_df[combined_df["mono_type"] ==0].sample(10).text.tolist()

['I thus believe that it is incumbent upon us to make a careful study of the facts and opinions below in this case, and at least to embark upon the formulation of standards for the application of § 7 to mergers which are neither horizontal nor vertical and which previously have not been considered in depth by this Court.',
 'I regret that I cannot join an opinion which fails to give due consideration to the unmistakable intent of the Social Security Act to give HEW primary jurisdiction over these highly technical and difficult welfare questions, which affirms what is to me a clear abuse of discretion by the District Court, and which plunges this Court and other federal courts into an ever-increasing and unnecessary involvement in the administration of the Nations categorical assistance programs administered by the States.',
 '. . involves significantly different questions of analysis and policy," ante, at 51 n. 18, I suspect this purported distinction may be as difficult to justify as 

In [41]:
pickle.dump(combined_df, open(os.path.join(output_path, 'pickled_big_file'), "wb"))

In [42]:
pickle.dump(combined_df, open(os.path.join(output_path, 'pickled_big_file_2'), "wb"), protocol = 4)

In [43]:
pickle_path = os.path.join(output_path, 'pickled_big_file')

In [44]:
with open(pickle_path, 'rb') as pickle_file:
  new = pickle.load(pickle_file)

In [45]:
new.monologic_prediction.unique()

array([0, 1])

In [46]:
output_path

'/content/drive/MyDrive/SCOTUS/output/mono_types_bert'