In [None]:
# In preparing for this project I read the following tutorials. I lost track of precisely which pieces of code and which ideas
# I drew from which source, but here is a list of what I believe is everything I read and actually chose to use.
# https://medium.com/@abdurhmanfayad_73788/fine-tuning-bert-for-a-multi-label-classification-problem-on-colab-5ca5b8759f3f
# https://huggingface.co/blog/Valerii-Knowledgator/multi-label-classification
# https://towardsai.net/p/artificial-intelligence/fine-tuning-legal-bert-llms-for-automated-legal-text-classification
# https://mccormickml.com/2019/07/22/BERT-fine-tuning/
# https://medium.com/@armandj.olivares/using-bert-for-classifying-documents-with-long-texts-5c3e7b04573d
# https://aditya007.medium.com/understanding-the-cls-token-in-bert-a-comprehensive-guide-a62b3b94a941

In [None]:
import pandas as pd
import numpy as np
import pickle
import torch
from transformers import AutoTokenizer, AutoModel

In [None]:
# Mount Google Drive (Colab-only) so the files under /content/drive used below are readable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the dataset; lines=True tells pandas to parse the file as one JSON object per line.
df = pd.read_json('/content/drive/MyDrive/Colab Notebooks/TRDataChallenge2023.txt', lines=True)

In [None]:
def concatenate_from_list_of_dicts(row):
    """Flatten a document's sections into a single space-separated string.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) whose "sections"
            entry is a list of dicts, each with a "headtext" string and a
            "paragraphs" list of strings.

    Returns:
        str: For every section in order, its header followed by its
        paragraphs, all joined with single spaces. Empty headers are kept,
        so they contribute an extra separator.
    """
    # Each section contributes its header first, then its paragraphs in order.
    pieces = [
        piece
        for section in row["sections"]
        for piece in [section["headtext"], *section["paragraphs"]]
    ]
    return " ".join(pieces)

# Build one flat text string per document; axis=1 passes each row to the function.
df["concatenated_text"] = df.apply(concatenate_from_list_of_dicts, axis=1)

In [None]:
# Load the Legal-BERT tokenizer. Documents are tokenized with it and split into
# 512-token chunks using a 256-token stride (roughly half overlap) by chunk_document below.
tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")

def chunk_document(text, max_length=512, stride=256):
    """Split a document into fixed-length, overlapping chunks of token IDs.

    The text is tokenized once with the module-level ``tokenizer`` (without
    special tokens). A window of ``max_length - 2`` tokens is then slid over
    the token list in steps of ``stride``; each window is wrapped in
    [CLS] ... [SEP] and right-padded with the pad token to ``max_length``.

    Chunking stops at the first window that reaches the end of the document.
    (Previously the stop check ran after padding and so never fired, which
    produced extra tail chunks fully contained in the preceding window.)

    Args:
        text: Document text to tokenize.
        max_length: Length of every returned chunk, including [CLS]/[SEP]
            and padding. Must be at least 3.
        stride: Number of tokens the window advances per chunk. Must be at
            least 1. A stride larger than ``max_length - 2`` leaves gaps
            between chunks.

    Returns:
        list[list[int]]: One or more chunks, each exactly ``max_length``
        token IDs long. Empty text yields a single [CLS] [SEP] chunk plus
        padding, so callers always get at least one row.

    Raises:
        ValueError: If ``max_length`` < 3 or ``stride`` < 1.
    """
    if max_length < 3:
        raise ValueError("max_length must be at least 3 to fit [CLS], one token and [SEP]")
    if stride < 1:
        raise ValueError("stride must be a positive integer")

    # Encoding the whole document triggers the tokenizer's "sequence length is
    # longer than the specified maximum" warning; it is harmless here because
    # the IDs are windowed manually below and never fed to the model unsplit.
    tokens = tokenizer.encode(text, add_special_tokens=False)  # Token IDs, no CLS/SEP yet

    window = max_length - 2  # Reserve space for CLS and SEP
    chunks = []
    start = 0
    while True:
        chunk = [tokenizer.cls_token_id] + tokens[start : start + window] + [tokenizer.sep_token_id]

        # Pad to max_length if needed (only the final chunk can be short)
        chunk.extend([tokenizer.pad_token_id] * (max_length - len(chunk)))
        chunks.append(chunk)

        # Stop once this window has covered the last token; any later window
        # would only repeat tokens that are already in this chunk.
        if start + window >= len(tokens):
            break
        start += stride

    return chunks

# The attention mask tells the model to ignore the padding that was added to bring every chunk up to 512 tokens.
def get_attention_masks(chunks):
    """Build a 0/1 attention mask for each chunk.

    Args:
        chunks: List of token-ID lists, as produced by chunk_document.

    Returns:
        list[list[int]]: Same shape as ``chunks``; 1 where the token differs
        from the tokenizer's pad token ID, 0 where it is padding.
    """
    pad_id = tokenizer.pad_token_id
    masks = []
    for chunk in chunks:
        masks.append([int(token != pad_id) for token in chunk])
    return masks

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/222k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
# Load Legal-BERT (without classification head). It is used below only as a
# feature extractor: get_document_embedding runs it under torch.no_grad().
model = AutoModel.from_pretrained('nlpaueb/legal-bert-base-uncased')

def get_document_embedding(text, batch_size=32):
    """Embed a whole document as the mean of its chunks' [CLS] vectors.

    The text is split into 512-token chunks by ``chunk_document``; each chunk
    is run through the module-level ``model`` and the last-layer hidden state
    at position 0 ([CLS]) is taken as that chunk's embedding. The document
    embedding is the unweighted mean of those chunk embeddings.

    Args:
        text: Document text to embed.
        batch_size: Maximum number of chunks sent through the model in one
            forward pass. Bounds peak memory for very long documents; the
            result is the same up to floating-point noise.

    Returns:
        torch.Tensor: 1-D tensor of the model's hidden size, located on the
        device the model was run on (CUDA when available, otherwise CPU).

    Raises:
        ValueError: If ``batch_size`` < 1 or the document produced no chunks.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Chunk the document
    chunks = chunk_document(text)
    if not chunks:
        # An empty batch would otherwise fail inside the model with an obscure shape error.
        raise ValueError("Document produced no chunks; cannot compute an embedding.")
    chunk_masks = get_attention_masks(chunks)

    # Ensure chunks are 512 because BERT is expecting it
    assert all(len(chunk) == 512 for chunk in chunks), "Chunk size mismatch!"

    # Convert to PyTorch tensors
    chunk_tensors = torch.tensor(chunks, dtype=torch.long)  # Shape: (num_chunks, 512)
    chunk_masks_tensors = torch.tensor(chunk_masks, dtype=torch.long)  # Same shape

    # Run on the GPU when one is available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Get the CLS embedding of each chunk, a bounded number of chunks at a time
    cls_embeddings = []
    with torch.no_grad():  # No gradients needed for inference
        for start in range(0, chunk_tensors.shape[0], batch_size):
            batch_ids = chunk_tensors[start : start + batch_size].to(device)
            batch_masks = chunk_masks_tensors[start : start + batch_size].to(device)
            outputs = model(batch_ids, attention_mask=batch_masks)

            # last_hidden_state is the final layer's output, (batch, 512, hidden);
            # reading it directly avoids keeping every layer's hidden states around.
            # Position 0 is the CLS token, used as the summary of the chunk.
            cls_embeddings.append(outputs.last_hidden_state[:, 0, :])

    chunk_embeddings = torch.cat(cls_embeddings, dim=0)  # Shape: (num_chunks, hidden)

    # Average chunk embeddings into a single document vector
    document_embedding = torch.mean(chunk_embeddings, dim=0)

    return document_embedding

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
# Embed every document row by row; each result is moved to the CPU and stored as a NumPy array.
# NOTE(review): this runs the model once per document over the whole frame, so it is likely the
# slowest step — the commented-out pickle round-trip in the next cell looks intended to avoid redoing it.
df['embedding'] = df.apply(lambda row: get_document_embedding(row['concatenated_text']).cpu().numpy(), axis=1)

Token indices sequence length is longer than the specified maximum sequence length for this model (5819 > 512). Running this sequence through the model will result in indexing errors


In [None]:
#df.to_pickle("/content/drive/MyDrive/Colab Notebooks/df_w_embeddings.pkl")
#df = pd.read_pickle("/content/drive/MyDrive/Colab Notebooks/df_w_embeddings.pkl")

In [None]:
# Inspect the frame now that the 'embedding' column has been added.
df

Unnamed: 0,documentId,postures,sections,concatenated_text,embedding
0,Ib4e590e0a55f11e8a5d58a2c8dcb28b5,[On Appeal],"[{'headtext': '', 'paragraphs': ['Plaintiff Dw...",Plaintiff Dwight Watson (“Husband”) appeals f...,"[0.13452414, 0.1782438, -0.054326914, 0.307920..."
1,Ib06ab4d056a011e98c7a8e995225dbf9,"[Appellate Review, Sentencing or Penalty Phase...","[{'headtext': '', 'paragraphs': ['After pleadi...","After pleading guilty, William Jerome Howard,...","[0.038121928, 0.32856792, -0.3703702, 0.239396..."
2,Iaa3e3390b93111e9ba33b03ae9101fb2,"[Motion to Compel Arbitration, On Appeal]","[{'headtext': '', 'paragraphs': ['Frederick Gr...","Frederick Greene, the plaintiff below, deriva...","[0.3980108, 0.35936612, -0.32921728, 0.1975094..."
3,I0d4dffc381b711e280719c3f0e80bdd0,"[On Appeal, Review of Administrative Decision]","[{'headtext': '', 'paragraphs': ['Appeal from ...",Appeal from an amended judgment of the Suprem...,"[0.17980985, 0.25053117, 0.13405542, 0.2323921..."
4,I82c7ef10d6d111e8aec5b23c3317c9c0,[On Appeal],"[{'headtext': '', 'paragraphs': ['Order, Supre...","Order, Supreme Court, New York County (Arthur...","[0.030824184, 0.04649551, 0.020237295, -0.3026..."
...,...,...,...,...,...
17995,Ia5743cf0e4b611e99e94fcbef715f24d,[Appellate Review],"[{'headtext': '', 'paragraphs': ['¶1 On Februa...","¶1 On February 5, 2017, a jury in the Fifth J...","[0.32977995, -0.38087356, -0.35429198, 0.02168..."
17996,I974c18f08f1611e998e8870e22e55653,[Objection to Proof of Claim],[{'headtext': 'ORDER OVERRULING DEBTOR'S OBJEC...,ORDER OVERRULING DEBTOR'S OBJECTION TO CLAIMS ...,"[0.16209525, -0.14097951, 0.057170607, -0.3524..."
17997,Idaaa92f0886f11e998e8870e22e55653,"[Appellate Review, Trial or Guilt Phase Motion...","[{'headtext': '', 'paragraphs': ['A jury convi...",A jury convicted Antonio Avila Medrano of Con...,"[0.22042644, 0.1092553, -0.8069598, 0.28300485..."
17998,I247a8420677e11e9a072efd81f5238d6,"[Appellate Review, Jury Selection Challenge or...","[{'headtext': '', 'paragraphs': ['Defendant Ch...","Defendant Charles York Walker, Jr., appeals f...","[0.35001236, -0.10290279, -0.6039653, 0.064887..."


In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
# Multi-hot encode the labels: each entry of df['postures'] is a list of posture labels,
# and fit_transform returns a 0/1 indicator matrix with one row per document.
mlb = MultiLabelBinarizer()
binary_labels = mlb.fit_transform(df['postures'])
binary_labels

array([[0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 1],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
# Store each document's indicator row as a plain Python list ...
df['ohe_postures'] = list(map(list, binary_labels))
# ... and keep a second copy with every entry converted to float.
df["ohe_postures_float"] = df["ohe_postures"].apply(lambda labels: list(map(float, labels)))

In [None]:
# Inspect the frame again to check the two new label-vector columns.
df

Unnamed: 0,documentId,postures,sections,concatenated_text,embedding,ohe_postures,ohe_postures_float
0,Ib4e590e0a55f11e8a5d58a2c8dcb28b5,[On Appeal],"[{'headtext': '', 'paragraphs': ['Plaintiff Dw...",Plaintiff Dwight Watson (“Husband”) appeals f...,"[0.13452414, 0.1782438, -0.054326914, 0.307920...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,Ib06ab4d056a011e98c7a8e995225dbf9,"[Appellate Review, Sentencing or Penalty Phase...","[{'headtext': '', 'paragraphs': ['After pleadi...","After pleading guilty, William Jerome Howard,...","[0.038121928, 0.32856792, -0.3703702, 0.239396...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,Iaa3e3390b93111e9ba33b03ae9101fb2,"[Motion to Compel Arbitration, On Appeal]","[{'headtext': '', 'paragraphs': ['Frederick Gr...","Frederick Greene, the plaintiff below, deriva...","[0.3980108, 0.35936612, -0.32921728, 0.1975094...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,I0d4dffc381b711e280719c3f0e80bdd0,"[On Appeal, Review of Administrative Decision]","[{'headtext': '', 'paragraphs': ['Appeal from ...",Appeal from an amended judgment of the Suprem...,"[0.17980985, 0.25053117, 0.13405542, 0.2323921...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,I82c7ef10d6d111e8aec5b23c3317c9c0,[On Appeal],"[{'headtext': '', 'paragraphs': ['Order, Supre...","Order, Supreme Court, New York County (Arthur...","[0.030824184, 0.04649551, 0.020237295, -0.3026...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...,...,...
17995,Ia5743cf0e4b611e99e94fcbef715f24d,[Appellate Review],"[{'headtext': '', 'paragraphs': ['¶1 On Februa...","¶1 On February 5, 2017, a jury in the Fifth J...","[0.32977995, -0.38087356, -0.35429198, 0.02168...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
17996,I974c18f08f1611e998e8870e22e55653,[Objection to Proof of Claim],[{'headtext': 'ORDER OVERRULING DEBTOR'S OBJEC...,ORDER OVERRULING DEBTOR'S OBJECTION TO CLAIMS ...,"[0.16209525, -0.14097951, 0.057170607, -0.3524...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
17997,Idaaa92f0886f11e998e8870e22e55653,"[Appellate Review, Trial or Guilt Phase Motion...","[{'headtext': '', 'paragraphs': ['A jury convi...",A jury convicted Antonio Avila Medrano of Con...,"[0.22042644, 0.1092553, -0.8069598, 0.28300485...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
17998,I247a8420677e11e9a072efd81f5238d6,"[Appellate Review, Jury Selection Challenge or...","[{'headtext': '', 'paragraphs': ['Defendant Ch...","Defendant Charles York Walker, Jr., appeals f...","[0.35001236, -0.10290279, -0.6039653, 0.064887...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [None]:
#df.to_pickle("/content/drive/MyDrive/Colab Notebooks/df_w_embeddings_ohe.pkl")
#df = pd.read_pickle("/content/drive/MyDrive/Colab Notebooks/df_w_embeddings_ohe.pkl")