In [1]:
import os

scotus_dir = '/content/drive/MyDrive/SCOTUS/'
indiv_collect_annotations = os.path.join(scotus_dir, 'annotations', 'indi_coll_annotations.json')
model_output_path = os.path.join(scotus_dir, 'models', 'mono_types_bert')

In [2]:
!pip3 install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 18.6 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 11.0 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 62.8 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 63.7 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    U

In [3]:
# Basic Python modules
from collections import defaultdict, Counter
import random
import pickle
import json
import regex as re

# For data manipulation and analysis
import pandas as pd
import numpy as np

# For machine learning tools and evaluation
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate, cross_val_score, train_test_split

# For deep learning
# https://pytorch.org/tutorials/beginner/basics/quickstart_tutorial.html
import torch

# using DistilBERT for testing --> can switch to BERT once set up
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments

## Clean the data

In [4]:
# helpful functions
def clean_regex(df, column):
  
  df["text"] = df["text"].str.replace('\n', ' ')
  df["text"] = df["text"].replace('\s+', ' ', regex = True)
  df["text"] = df["text"].replace(r'\[','', regex=True) 
  df["text"] = df["text"].replace(r'\]','', regex=True)
  df["text"] = df["text"].replace(r'\- ','', regex=True)
  df["text"] = df["text"].replace(r'\xad','', regex=True)
  df["text"] = df["text"].replace(r'\'','', regex=True)
  df["text"] = df["text"].replace(r'\x97',',', regex=True)

  return df["text"]

In [5]:
# Keep only sentences above certain threshold of alphanumeric characters
def percent_text(text):
    char_dict = dict()
    char_dict["alpha_count"] = 0
    char_dict["total_count"] = 0

    for char in text:
        char_dict["total_count"] += 1
        if char.isalpha():
            char_dict["alpha_count"] += 1
    
    percent_letter = float(char_dict["alpha_count"]) / float(char_dict["total_count"]) * 100

    return percent_letter

In [6]:
def header_eraser(text):
    spaces = re.search(r'[ \t]{2,}', text)
    opinion = re.search(r'Opinion of', text)
    if spaces and opinion:
        # delete text between first space and opinion of + 20 char 
        result = re.sub('[ \t]{2,}.*?Opinion of[\s\S]{15}', '', text)
    else:
        result = text
    return result

In [7]:
binary_annotations = []
for line in open(indiv_collect_annotations, 'r'):
    binary_annotations.append(json.loads(line))
binary_df = pd.DataFrame(binary_annotations)

# get binary_annotations that are ignore --> remove and add to 0 general_df
non_monologic_new = binary_df[binary_df["answer"] == "ignore"]

binary_df = binary_df[binary_df["answer"] != "ignore"]

binary_df['accept'] = [','.join(map(str, l)) for l in binary_df['accept']]
binary_df["label_num"] = binary_df["accept"].map({'COLLECTIVE': 1, 'INDIVIDUALISTIC': 0})


In [8]:
binary_df = binary_df.dropna(subset=['label_num'])

In [9]:
binary_df.label_num = binary_df.label_num.astype(int)

In [10]:
# Clean sentences
binary_df["text"] = clean_regex(binary_df, "text")

# Get sentences with more letters
binary_df["percent_letter"] = binary_df["text"].apply(percent_text)
binary_df = binary_df[binary_df["percent_letter"] > 50]

# Remove header
binary_df["text"] = binary_df["text"].apply(header_eraser)

## Set up classification task

In [11]:
# Choose the BERT model that we want to use (make sure to keep the cased/uncased consistent)
model_name = 'distilbert-base-cased'  

# Choose the GPU we want to process this script
device_name = 'cuda'       

# This is the maximum number of tokens in any document sent to BERT
max_length = 512                                                        

In [12]:
# Mount the Google drive for access to files
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [13]:
# Set up training and testing sets
X = binary_df["text"].to_list()
y = binary_df["label_num"].to_list()

In [14]:
print("Number of collective label: " + str(Counter(y)[1]))
print("Number of individualistic label: " + str(Counter(y)[0]))
print("Total labeled sentences: " + str(len(y)))

Number of collective label: 1256
Number of individualistic label: 562
Total labeled sentences: 1818


In [15]:
train_texts, test_texts, train_labels, test_labels = train_test_split(X, y, test_size = 0.25)

In [16]:
Counter(train_labels)

Counter({0: 423, 1: 940})

### BERT Encoding 

In [17]:
# load the encoder/tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/411 [00:00<?, ?B/s]

In [18]:
test_texts[1:10]

['We stated in Gideon v. Wainwright, 372 U.S. 335, 344, "From the very beginning, our state and national constitutions and laws have laid great emphasis on procedural and substantive safeguards designed to assure fair trials before impartial tribunals in which every defendant stands equal before the law."',
 'I agree that Kansas’ prosecutions and convictions of respondents for identity theft and making false information are not pre-empted by §101(a)(1) of the Immigration Reform and Control Act of 1986, 8 U.S. C. §1324a.',
 'We observed in Scheiner that the Commerce Clause " `by its own force created an area of trade free from interference by the States.',
 'We are unsure whether it was presented below and whether, in any event, there is record support for it.',
 'Hence, it is said that we can hold that the companys employees are engaged in the production of goods for interstate commerce only if we say that their work in supplying water to the farmers is an integral part of the producti

In [19]:
# Pass training/testing sentences to tokenizer, truncate them if over max length, and add padding (PAD tokens up to 512)
train_encodings = tokenizer(train_texts,  truncation=True, padding=True)
test_encodings = tokenizer(test_texts,  truncation=True, padding=True)

## Convert into a Torch Dataset
Combine encoded text and labels into a torch dataset object.

In [20]:
class SCDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [21]:
train_dataset = SCDataset(train_encodings, train_labels)
test_dataset = SCDataset(test_encodings, test_labels)

## Set up the training task

Choose the arguments that will be used with the HuggingFace TrainingArguments object, that will be passed to the HuggingFace Trainer object. 

In [22]:
# Set up training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=20,   # batch size for evaluation
    learning_rate=5e-5,              # initial learning rate for Adam optimizer
    warmup_steps=50,                 # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    evaluation_strategy='steps',
)

Load the pretrained model and send this to cuda. This pretrained model is trained on a range of English language texts, like Wikipedia entries or books. When fine-tuning it, we make it more attuned to our corpus (in this case, reddit posts about endometriosis).

In [23]:
model = DistilBertForSequenceClassification.from_pretrained(model_name).to(device_name)

Downloading:   0%|          | 0.00/251M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'pre_classifier

In [24]:
# Define a custom evaluation function (this could be changes to return accuracy metrics)
def compute_metrics(pred):
  labels = pred.label_ids
  preds = pred.predictions.argmax(-1)
  acc = accuracy_score(labels, preds)
  return {
      'accuracy': acc,
  }

Create the trainer object based on what we've set up prior to this point! This combines our `model`, `training_args`, `train_dataset` and `test_dataset`, and custom evaluation function `compute_metrics`. 

In [25]:
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset,            # evaluation dataset
    compute_metrics=compute_metrics      # custom evaluation function
)

Fine-tune the model on our dataset/labels. The trainer object will periodically output the state of the model.

In [26]:
trainer.train()

***** Running training *****
  Num examples = 1363
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 258


Step,Training Loss,Validation Loss,Accuracy
10,0.7778,0.721649,0.305495
20,0.6694,0.614346,0.694505
30,0.6217,0.575796,0.694505
40,0.525,0.341801,0.953846
50,0.2564,0.160792,0.949451
60,0.2535,0.121401,0.96044
70,0.1314,0.123568,0.962637
80,0.0679,0.148377,0.962637
90,0.2198,0.139118,0.958242
100,0.0437,0.144714,0.96044


***** Running Evaluation *****
  Num examples = 455
  Batch size = 20
***** Running Evaluation *****
  Num examples = 455
  Batch size = 20
***** Running Evaluation *****
  Num examples = 455
  Batch size = 20
***** Running Evaluation *****
  Num examples = 455
  Batch size = 20
***** Running Evaluation *****
  Num examples = 455
  Batch size = 20
***** Running Evaluation *****
  Num examples = 455
  Batch size = 20
***** Running Evaluation *****
  Num examples = 455
  Batch size = 20
***** Running Evaluation *****
  Num examples = 455
  Batch size = 20
***** Running Evaluation *****
  Num examples = 455
  Batch size = 20
***** Running Evaluation *****
  Num examples = 455
  Batch size = 20
***** Running Evaluation *****
  Num examples = 455
  Batch size = 20
***** Running Evaluation *****
  Num examples = 455
  Batch size = 20
***** Running Evaluation *****
  Num examples = 455
  Batch size = 20
***** Running Evaluation *****
  Num examples = 455
  Batch size = 20
***** Running Evalua

TrainOutput(global_step=258, training_loss=0.18846073932945728, metrics={'train_runtime': 142.4305, 'train_samples_per_second': 28.709, 'train_steps_per_second': 1.811, 'total_flos': 176673994625844.0, 'train_loss': 0.18846073932945728, 'epoch': 3.0})

In [28]:
# built in evaluation function
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 455
  Batch size = 20


{'epoch': 3.0,
 'eval_accuracy': 0.9692307692307692,
 'eval_loss': 0.12011811882257462,
 'eval_runtime': 3.3152,
 'eval_samples_per_second': 137.246,
 'eval_steps_per_second': 6.938}

### Save the model

In [29]:
model_output_path

'/content/drive/MyDrive/SCOTUS/models/mono_types_bert'

In [30]:
model.save_pretrained(model_output_path)

Configuration saved in /content/drive/MyDrive/SCOTUS/models/mono_types_bert/config.json
Model weights saved in /content/drive/MyDrive/SCOTUS/models/mono_types_bert/pytorch_model.bin


## Assess performance

In [31]:
Counter(test_labels)

Counter({0: 139, 1: 316})

In [32]:
predicted_labels = trainer.predict(test_dataset)

***** Running Prediction *****
  Num examples = 455
  Batch size = 20


In [33]:
actual_predicted_labels = predicted_labels.predictions.argmax(-1)
Counter(actual_predicted_labels)

Counter({0: 139, 1: 316})

In [34]:
Counter(predicted_labels.label_ids.flatten())

Counter({0: 139, 1: 316})

In [35]:
from sklearn.metrics import classification_report
class_report = classification_report(predicted_labels.label_ids.flatten(), actual_predicted_labels.flatten(), output_dict=True)
print(classification_report(predicted_labels.label_ids.flatten(), actual_predicted_labels.flatten()))

              precision    recall  f1-score   support

           0       0.95      0.95      0.95       139
           1       0.98      0.98      0.98       316

    accuracy                           0.97       455
   macro avg       0.96      0.96      0.96       455
weighted avg       0.97      0.97      0.97       455

