<b><p style="font-size: XX-large"><font color = "green">Retrieval-Augmented Generation for United States Congressional Bills and Laws (Pre-Processing)</font></p></b>
<b><p style="font-size: X-large"><font color = "green">Andrew Cai (cai.and@northeastern.edu)</font></p></b>
<b><p style="font-size: X-large"><font color = "green">Joynae Whitehurst (whitehurst.j@northeastern.edu)</font></p></b>
<b><p style="font-size: X-large"><font color = "green">Lili Xiang (xiang.l@northeastern.edu)</font></p></b>
<b><p style="font-size: X-large"><font color = "green">April 24, 2025</font></p></b>

# Packages

In [None]:
# File Retrieval
import os
import requests
import zipfile
from io import BytesIO

# Nice to have:
from tqdm import tqdm

# Text Cleaning
import xml.etree.ElementTree as ET
import re
import html
import unicodedata
from ftfy import fix_text

# Data Processing
import pandas as pd
import numpy as np
import ast  # To safely evaluate string representations of lists

# Chunking
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Topic Tagging/Embedding
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
)
import torch
import faiss
import accelerate
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from nltk.corpus import wordnet

# Pre-Trained Model Retrieval
from transformers import pipeline


# Bill File Retrieval

In [None]:
# Function to download and extract bill summaries
def download_and_extract_bills(congress, bill_type, save_dir="congressional_bills"):
    """Download one BILLSUM ZIP archive from govinfo.gov and extract it.

    The archive is saved as ``<save_dir>/BILLSUM-<congress>-<bill_type>.zip`` and
    extracted into a sibling folder of the same name without the ``.zip`` suffix.
    Failures (non-200 status, network error, corrupt archive) are reported with
    ``print`` and the function returns without raising, so a loop over many
    archives keeps going.

    Args:
        congress (int): Congress number.
        bill_type (str): Bill category.
        save_dir (str, optional): Directory where files will be stored. Defaults to "congressional_bills".

    Returns:
        None
    """
    # Base URL for downloading bill summaries
    BASE_URL = "https://www.govinfo.gov/bulkdata/BILLSUM/{congress}/{bill_type}/BILLSUM-{congress}-{bill_type}.zip"
    # Seconds to wait for the server before giving up instead of hanging forever
    REQUEST_TIMEOUT = 60
    # Size of each streamed block written to disk (1 MiB)
    DOWNLOAD_BLOCK_SIZE = 1024 * 1024

    # Construct URL
    url = BASE_URL.format(congress=congress, bill_type=bill_type)

    # Create the base directory if it doesn't exist
    os.makedirs(save_dir, exist_ok=True)

    # Define path for ZIP file
    zip_filename = f"BILLSUM-{congress}-{bill_type}.zip"
    zip_path = os.path.join(save_dir, zip_filename)

    print(f"Downloading: {url}")
    try:
        # The context manager releases the connection on every path, including non-200 replies
        with requests.get(url, stream=True, timeout=REQUEST_TIMEOUT) as response:
            if response.status_code != 200:
                print(f"Failed to download {url}. Status Code: {response.status_code}")
                return

            # Stream the body to disk block by block rather than holding the whole archive in memory
            with open(zip_path, "wb") as f:
                for block in response.iter_content(chunk_size=DOWNLOAD_BLOCK_SIZE):
                    f.write(block)
    except requests.RequestException as exc:
        print(f"Failed to download {url}. Error: {exc}")
        return
    print(f"Saved ZIP: {zip_path}")

    # Create extraction folder named after the ZIP file (without .zip)
    extract_folder = os.path.join(save_dir, os.path.splitext(zip_filename)[0])
    os.makedirs(extract_folder, exist_ok=True)  # Ensure folder exists

    # Extract ZIP contents into the specific folder
    try:
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(extract_folder)
    except zipfile.BadZipFile as exc:
        print(f"Failed to extract {zip_path}: {exc}")
        return

    print(f"Extracted to: {extract_folder}")



# Pre-Processing

## Topic Tagging

In [None]:
# Drop keyword arguments that were found to be problematic during debugging
# before they reach accelerate.Accelerator.__init__.
# NOTE(review): presumably a transformers/accelerate version mismatch — confirm against the pinned versions.
_original_accelerator_init = accelerate.Accelerator.__init__

def new_accelerator_init(self, *args, **kwargs):
    """Call the original ``Accelerator.__init__`` without the filtered keywords.

    Positional arguments and every other keyword argument are forwarded unchanged.
    """
    dropped_keys = ('dispatch_batches', 'even_batches', 'use_seedable_sampler')
    forwarded_kwargs = {
        name: value for name, value in kwargs.items() if name not in dropped_keys
    }
    _original_accelerator_init(self, *args, **forwarded_kwargs)

# Install the wrapper so every Accelerator created afterwards goes through it
accelerate.Accelerator.__init__ = new_accelerator_init

# Pick the compute device: Apple's MPS backend when PyTorch reports it as
# available, otherwise the CPU. The training-arguments cell below branches on
# device.type to decide which mixed-precision flag to pass.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

# choose smallest Flan-T5
model_name = "google/flan-t5-small"

# Load the pretrained tokenizer and seq2seq model for that checkpoint, then
# move the model onto the selected device.
# NOTE: `tokenizer` and `model` are module-level names; the inference cell
# further down rebinds both when it calls load_model().
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model.to(device)

# Load the labelled bill dataset from the Hugging Face Hub. preprocess_function
# reads its "text" and "policy_area" columns.
dataset = load_dataset("dreamproit/bill_labels_us")

# Truncation limits passed to the tokenizer as max_length in preprocess_function:
# one for the input text, one for the target label.
max_input_length = 512
max_target_length = 64


def preprocess_function(examples):
    """Tokenize a batch of bills (inputs) and their policy areas (targets).

    Intended for ``dataset.map(..., batched=True)``. Relies on the module-level
    ``tokenizer``, ``max_input_length`` and ``max_target_length``.

    Args:
        examples (dict): Batch of columns; reads ``examples["text"]`` (bill
            content) and ``examples["policy_area"]`` (label text).

    Returns:
        model_inputs: The tokenizer output for the inputs with an added
        ``"labels"`` entry holding the target token ids.
    """
    inputs = examples["text"]
    targets = examples["policy_area"]
    # Tokenize the input text (bill content)
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    # Tokenize the target labels (policy tags). `text_target` replaces the
    # deprecated `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# tokenize the entire dataset using multiple processes for speedup
tokenized_dataset = dataset.map(preprocess_function, batched=True, num_proc=4)

# The trainer below needs both a 'train' and a 'test' split. When either is
# missing, carve a 10% test set out of 'train' if it exists, otherwise out of
# the first available split (indexing 'train' unconditionally would raise
# KeyError exactly when 'train' is the missing one).
if "train" not in tokenized_dataset or "test" not in tokenized_dataset:
    split_source = "train" if "train" in tokenized_dataset else next(iter(tokenized_dataset))
    tokenized_dataset = tokenized_dataset[split_source].train_test_split(test_size=0.1)

# Directory used both for trainer checkpoints and for the final saved model
finetuned_model_dir = "./flan-t5-finetuned-bill_labels"

# Settings shared by every device; only the mixed-precision flag differs
training_kwargs = dict(
    output_dir=finetuned_model_dir,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    save_strategy="epoch",
    predict_with_generate=True,
)
if device.type == "mps":
    # for MPS, disable mixed precision flags (bf16/fp16)
    training_kwargs["fp16"] = False
else:
    # bf16 for supported CPU devices
    # NOTE(review): this branch is taken whenever MPS is unavailable — confirm
    # the target hardware actually supports bfloat16.
    training_kwargs["bf16"] = True
training_args = Seq2SeqTrainingArguments(**training_kwargs)

# create a data collator that dynamically pads the inputs
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# initialize the Seq2SeqTrainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# start training
trainer.train()

# save the fine-tuned model and tokenizer
model.save_pretrained(finetuned_model_dir)
tokenizer.save_pretrained(finetuned_model_dir)

In [None]:
# Load fine-tuned tokenizer and model
def load_model(path="./checkpoint-26904"):
    """Restore the topic-tagging tokenizer and seq2seq model from disk.

    Args:
        path (str, optional): Directory holding the saved model and tokenizer.
            Defaults to "./checkpoint-26904".

    Returns:
        tuple: ``(tokenizer, model)`` loaded from ``path``.
    """
    return (
        AutoTokenizer.from_pretrained(path),
        AutoModelForSeq2SeqLM.from_pretrained(path),
    )


# Perform inference for multiple topic tags
def infer_topics(tokenizer, model, text, top_k=5):
    """Generate a list of candidate topics for a body of text using beam search.

    Args:
        tokenizer (tokenizer): Topic tokenizer
        model (model): Topic tag model
        text (str): Text to be tagged (truncated to 512 tokens)
        top_k (int, optional): Number of topic sequences to return. Defaults to 5.

    Returns:
        topics: A list of ``top_k`` decoded topic strings
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    # Keep the inputs on the same device as the model (e.g. after model.to("mps"))
    encoded = encoded.to(model.device)

    # generate() requires num_return_sequences <= num_beams; keep the original
    # 10 beams and widen the search only when more sequences are requested.
    num_beams = max(10, top_k)

    with torch.no_grad():
        output = model.generate(
            encoded.input_ids,
            max_length=64,
            num_beams=num_beams,
            num_return_sequences=top_k,  # Generate multiple outputs
            early_stopping=True,
        )

    topics = tokenizer.batch_decode(output, skip_special_tokens=True)
    return topics  # Return a list of topics

# Load the fine-tuned tokenizer and model once, at module level, from
# load_model()'s default path. xml_bill_parse reads these two globals when it
# calls infer_topics. NOTE: this rebinds the `tokenizer` and `model` names that
# the training cell above also assigns.
tokenizer, model = load_model()


## Text Cleaning

In [None]:
def clean_text(text):
    """Fix encoding, strip HTML tags, decode HTML entities, normalize quotes and whitespace.

    Args:
        text (str): Body of text

    Returns:
        text: Cleaned body of text ("" when the input is empty or None)
    """
    if not text:
        return ""

    # Curly double quotes -> ", curly single quotes/apostrophes -> '
    # (mapping the single ones to " would turn "Secretary’s" into 'Secretary"s')
    quote_map = {0x201C: '"', 0x201D: '"', 0x2018: "'", 0x2019: "'"}

    text = fix_text(text)  # Fix encoding issues
    text = unicodedata.normalize("NFKC", text)  # Normalize Unicode characters
    text = re.sub(r'<[^>]*>', ' ', text)  # Remove HTML tags (also ones spanning several lines)
    text = html.unescape(text)  # Decode HTML entities (&nbsp;, &amp;, etc.)
    text = text.translate(quote_map)  # Normalize quotation marks
    text = re.sub(r"[^\w\s.,;:'\"!?-]", '', text)  # Remove unexpected symbols (except punctuation)
    # Collapse whitespace last so gaps left by removed tags/symbols do not survive
    text = re.sub(r'\s+', ' ', text)
    return text.strip()


## Text Chunking Function

In [None]:
def chunk_text(text, chunk_size=500, overlap=50):
    """Split a body of text into overlapping chunks for easier LLM processing.

    Args:
        text (str): A body of text
        chunk_size (int, optional): Size limit handed to the splitter for each chunk. Defaults to 500.
        overlap (int, optional): Overlap handed to the splitter for consecutive chunks. Defaults to 50.

    Returns:
        The list of chunks produced by RecursiveCharacterTextSplitter.split_text
    """
    splitter_options = {"chunk_size": chunk_size, "chunk_overlap": overlap}
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)

## File and Data Processing

In [None]:
def xml_bill_parse(xml_file):
    """Parse a bill-summary XML file into one row per summary text chunk.

    For every ``<item>`` under the root, the summary text is cleaned, tagged
    with up to 10 topics (via the module-level ``tokenizer``/``model``) and
    chunked; each chunk becomes one dict carrying the item's metadata. Items
    without usable summary text produce no rows.

    Args:
        xml_file (XML): Bill in XML form (path or file object accepted by ``ET.parse``)

    Returns:
        bill_data: List of dicts, one per text chunk
    """
    # Convert bill category from nickname to actual name
    measure_type_names = {
        "hconres": "House Concurrent Resolution",
        "hjres": "House Joint Resolution",
        "hr": "House Bill",
        "hres": "House Simple Resolution",
        "s": "Senate Bill",
        "sconres": "Senate Concurrent Resolution",
        "sjres": "Senate Joint Resolution",
        "sres": "Senate Simple Resolution",
    }

    root = ET.parse(xml_file).getroot()

    # Initialize empty list to store data
    bill_data = []

    # Data parsing
    for item in root.findall("item"):
        raw_summary = _element_text(item, ".//summary-text")
        summary_text = clean_text(raw_summary) if raw_summary is not None else None
        if not summary_text:
            # No text means no chunks and therefore no rows; skip the costly topic inference
            continue

        raw_title = _element_text(item, "title")
        measure_code = item.get("measure-type")
        summary_node = item.find("summary")

        # Fields shared by every chunk of this item, computed once
        shared_fields = {
            "congress": item.get("congress"),
            # Unknown codes fall back to the raw value instead of aborting the whole file
            "measure_type": measure_type_names.get(measure_code, measure_code),
            "measure_number": item.get("measure-number"),
            "measure_id": item.get("measure-id"),
            "origin_chamber": item.get("originChamber"),
            "current_chamber": summary_node.get("currentChamber") if summary_node is not None else None,
            "orig_publish_date": item.get("orig-publish-date"),
            "update_date": item.get("update-date"),
            "title": clean_text(raw_title) if raw_title is not None else None,
            "action_date": _element_text(item, ".//action-date"),
            "action_desc": _element_text(item, ".//action-desc"),
            "summary_text": summary_text,
            "topic_tags": infer_topics(tokenizer, model, summary_text, 10),  # Multiple topics as a list
        }

        # Each chunk gets its own row
        bill_data.extend(
            {**shared_fields, "text_chunk": chunk}  # One chunk per row
            for chunk in chunk_text(summary_text)
        )

    return bill_data  # Return list of expanded rows


def _element_text(parent, path):
    """Return the stripped text of the first element matching ``path``, or None if the element or its text is missing."""
    node = parent.find(path)
    if node is None or node.text is None:
        return None
    return node.text.strip()


def process_bills(directory, output_path="parsed_bills.csv"):
    """Parse every XML file under a directory tree and compile the rows into a CSV.

    Files that fail to parse are reported and skipped so one bad file does not
    abort the run. The CSV is written even when no rows were extracted.

    Args:
        directory (file path): Main folder that houses the extracted bill folders
        output_path (file path, optional): Where the CSV is written. Defaults to "parsed_bills.csv".

    Returns:
        None
    """
    print("Files and directories in:", directory, os.listdir(directory))  # Debugging

    # Find XML files; sorted because os.walk yields entries in arbitrary order,
    # which would otherwise make the row order differ between machines
    xml_files = sorted(
        os.path.join(root, filename)
        for root, _, files in os.walk(directory)
        for filename in files
        if filename.endswith(".xml")
    )

    all_bills = []
    failed_files = 0

    # Use tqdm and suppress inner function prints
    for file_path in tqdm(xml_files, desc="Processing Bills", unit="file", leave=True):
        try:
            all_bills.extend(xml_bill_parse(file_path))  # Extract data from XML
        except Exception as e:
            # Deliberate best-effort: report the file and keep going
            failed_files += 1
            print(f"Error processing {file_path}: {e}")

    if failed_files:
        print(f"Skipped {failed_files} of {len(xml_files)} files because of errors.")

    # Convert data into a dataframe
    df = pd.DataFrame(all_bills)

    if df.empty:
        print("No data extracted! Check file paths and XML structure.")
    else:
        print(f"Successfully processed {len(df)} rows.")

    df.to_csv(output_path, index=False)

# Data Processing Run

In [None]:
# Directory where the ZIP archives are saved and extracted
BILLS_DIR = "congressional_bills"

# Congresses of interest (the first and last entries are reused later to name the output files)
congress_number = [115, 116, 117, 118, 119]

# Bill categories, using the short codes that appear in the govinfo download URL
bill_category = ["hconres", "hjres", "hr", "hres", "s", "sconres", "sjres", "sres"]

# Download and extract one archive per (congress, category) pair
for num in congress_number:
    for cat in bill_category:
        download_and_extract_bills(num, cat, save_dir=BILLS_DIR)

# Parse every extracted XML file; process_bills writes the result to "parsed_bills.csv"
process_bills(BILLS_DIR)

Downloading: https://www.govinfo.gov/bulkdata/BILLSUM/115/hconres/BILLSUM-115-hconres.zip
Saved ZIP: congressional_bills\BILLSUM-115-hconres.zip
Extracted to: congressional_bills\BILLSUM-115-hconres
Downloading: https://www.govinfo.gov/bulkdata/BILLSUM/115/hjres/BILLSUM-115-hjres.zip
Saved ZIP: congressional_bills\BILLSUM-115-hjres.zip
Extracted to: congressional_bills\BILLSUM-115-hjres
Downloading: https://www.govinfo.gov/bulkdata/BILLSUM/115/hr/BILLSUM-115-hr.zip
Saved ZIP: congressional_bills\BILLSUM-115-hr.zip
Extracted to: congressional_bills\BILLSUM-115-hr
Downloading: https://www.govinfo.gov/bulkdata/BILLSUM/115/hres/BILLSUM-115-hres.zip
Saved ZIP: congressional_bills\BILLSUM-115-hres.zip
Extracted to: congressional_bills\BILLSUM-115-hres
Downloading: https://www.govinfo.gov/bulkdata/BILLSUM/115/s/BILLSUM-115-s.zip
Saved ZIP: congressional_bills\BILLSUM-115-s.zip
Extracted to: congressional_bills\BILLSUM-115-s
Downloading: https://www.govinfo.gov/bulkdata/BILLSUM/115/sconres/BIL

In [None]:
# Base file name (no extension) shared by every export, e.g. "parsed_bills_115-119"
save_df = f"parsed_bills_{congress_number[0]}-{congress_number[-1]}"

# Read in the parsed file and save a copy under the Congress-range name.
# The ".csv" suffix is added here so this export matches its sibling files;
# save_df itself stays extension-free because it is reused as a prefix.
df_load = pd.read_csv("parsed_bills.csv")
df_load.to_csv(save_df + ".csv", index=False)

# Lower file size by removing summary text
df_load_chunk = df_load.drop('summary_text', axis=1)
df_load_chunk.to_csv(save_df + "_chunks_only.csv" , index=False)


121238


<bound method DataFrame.info of         congress                 measure_type  measure_number  \
0            115  House Concurrent Resolution               1   
1            115  House Concurrent Resolution              10   
2            115  House Concurrent Resolution             100   
3            115  House Concurrent Resolution             100   
4            115  House Concurrent Resolution             101   
...          ...                          ...             ...   
121233       119     Senate Simple Resolution               8   
121234       119     Senate Simple Resolution              81   
121235       119     Senate Simple Resolution              81   
121236       119     Senate Simple Resolution              88   
121237       119     Senate Simple Resolution               9   

             measure_id origin_chamber current_chamber orig_publish_date  \
0         id115hconres1          HOUSE           HOUSE        2017-01-03   
1        id115hconres10          HO

# Embedding and Indexing Data

In [None]:
# Retrieve only chunk dataframe
# NOTE: this is an alias, not a copy — the "embedding" column added below also
# appears on df_load_chunk.
df_chunk = df_load_chunk

# Load Sentence Transformer for embeddings
embed_model = SentenceTransformer('all-MiniLM-L6-v2')

# Convert text chunks to embeddings using batch processing
embeddings = embed_model.encode(df_chunk["text_chunk"].tolist(), batch_size=32, show_progress_bar=True)

# Convert the 2D NumPy array into a list of lists for Pandas
df_chunk["embedding"] = embeddings.tolist()

# Save to CSV
df_chunk.to_csv(save_df + "_chunks_only_embedded.csv", index=False)

# Convert each embedding to a float32 NumPy array. In this cell the values are
# still Python lists, and ast.literal_eval raises ValueError on anything that
# is not a string, so only parse values that really are strings (i.e. when the
# frame was re-read from the CSV).
df_chunk["embedding"] = df_chunk["embedding"].apply(
    lambda x: np.array(ast.literal_eval(x) if isinstance(x, str) else x, dtype=np.float32)
)

# Convert embeddings to a single 2D NumPy array
embeddings = np.vstack(df_chunk["embedding"].values)

# Get embedding dimension (taken from the stacked array rather than label-based df_chunk["embedding"][0])
embedding_dim = embeddings.shape[1]

# Initialize FAISS index
faiss_index = faiss.IndexFlatL2(embedding_dim)

# Batch size for adding embeddings
batch_size = 1000

# Add embeddings in batches with progress tracking
for i in tqdm(range(0, len(embeddings), batch_size), desc="Indexing embeddings"):
    faiss_index.add(embeddings[i : i + batch_size])

# Save FAISS index
faiss.write_index(faiss_index, "bill_embeddings.index")