# Information Extraction from Unstructured Data
Built an LLM-based system to process large volumes of litigation files—extracting key information, organizing content, and enabling fast search, filtering, and review.

In [1]:
!pip install transformers PyPDF2

from transformers import pipeline
from pathlib import Path
import json
import time
from PyPDF2 import PdfReader
import pandas as pd



  from .autonotebook import tqdm as notebook_tqdm


In [3]:
pdf_dir = Path("legal_processing/pdfs")
result_dir = Path("legal_processing/results")
result_dir.mkdir(parents=True, exist_ok=True)

In [5]:
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")

Device set to use mps:0


In [7]:
extraction_questions = {
    "case_number": "What is the case number or docket number?",
    "parties": "Who are the parties involved in this case?",
    "court": "Which court is handling this case?",
    "date": "What is the date of the ruling or decision?",
    "summary": "What is this case about? Give a brief summary.",
}

In [11]:
def get_full_text(pdf_path):
    reader = PdfReader(pdf_path)
    full_text = ""
    for page in reader.pages:
        full_text += page.extract_text() + "\n"
    return full_text

def extract_info(text):
    extracted = {}
    for key, question in extraction_questions.items():
        result = qa_pipeline(question=question, context=text)
        extracted[key] = result["answer"]
    return extracted

def process_pdf(pdf_path):
    try:
        print(f"Processing: {pdf_path.name}")
        text = get_full_text(str(pdf_path))     
        result = extract_info(text)      

        output_file = result_dir / f"{pdf_path.stem}.json"
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(result, f, indent=2)
        return (pdf_path.name, "Success")
    except Exception as e:
        return (pdf_path.name, f"Failed: {e}")

In [12]:
start = time.time()
all_results = []

for pdf_path in pdf_dir.glob("*.pdf"):
    print(f"Processing: {pdf_path.name}")
    text = get_full_text(str(pdf_path))
    result = extract_info(text)
    result["file"] = pdf_path.name
    
    # Save JSON per file
    output_file = result_dir / f"{pdf_path.stem}.json"
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(result, f, indent=2)
    
    all_results.append(result)

print(f"Done. Processed {len(all_results)} files in {round(time.time() - start, 2)} seconds.")


Processing: case_1.pdf
Processing: case_3.pdf
Processing: case_2.pdf
Processing: case_6.pdf
Processing: case_7.pdf
Processing: case_5.pdf
Processing: case_4.pdf
Processing: case_9.pdf
Processing: case_8.pdf
Processing: case_10.pdf
Done. Processed 10 files in 92.41 seconds.


In [14]:
import json
from pathlib import Path

result_dir = Path("legal_processing/results")

for json_file in result_dir.glob("*.json"):
    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)
        print(f"\n File: {json_file.name}")
        print(json.dumps(data, indent=2))


 File: case_5.json
{
  "case_number": "8\u20139",
  "parties": "management-\nlevel employees",
  "court": "Fifth Circuit",
  "date": "June 2, 2025",
  "summary": "acts of discrimination",
  "file": "case_5.pdf"
}

 File: case_9.json
{
  "case_number": "443",
  "parties": "Sherbert  and Yoder",
  "court": "R. Nelson",
  "date": "May 27, 2025",
  "summary": "a \nplan to build a road near religious sites",
  "file": "case_9.pdf"
}

 File: case_8.json
{
  "case_number": "433\u2013434",
  "parties": "the Government",
  "court": "Distri ct Court",
  "date": "January 20, 2025",
  "summary": "the Government has plainly failed to satisfy\nits burden of demonstrating irreparable harm",
  "file": "case_8.pdf"
}

 File: case_4.json
{
  "case_number": "563 P. 3d, at 1053",
  "parties": "four current or former Seattle police officers",
  "court": "Washington Supreme Court",
  "date": "June 4, 2025",
  "summary": "disclosure at issue in this  case is consistent with the \nFirst Amendment",
  "file":