                                    [Raw “facts” files]  
                                      ↓ clean & chunk  
                                    [Chunked text files]  
                                      ↓ embed (Sentence-Transformer)  
                                    [FAISS vector DB]  
                                      ↓ (at query time) embed(query) → FAISS.search → top-K chunks  
                                      ↓ build prompt (snippets + instruction)  
                                    [Prompt]  
                                      ↓ Llama-2 (fine-tuned on Zenodo)  
                                    [Generated summary]

### Install & authenticate Kaggle

In [2]:
# In a Colab cell
!pip install --quiet kaggle pandas sentence-transformers faiss-cpu

In [11]:
# Upload your kaggle.json (username + key)

# Make sure the .kaggle folder exists
mkdir -p ~/.kaggle

# Open a text editor to create the file
nano ~/.kaggle/kaggle.json

# Test
!chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle (1).json


### Download the dataset

In [4]:
# create the directory in Python
import os
os.makedirs(os.path.expanduser('~/legal_data/supreme_cp'), exist_ok=True)

# then call kaggle via the ! magic
!kaggle datasets download \
  -d deepcontractor/supreme-court-judgment-prediction \
  -p ~/legal_data/supreme_cp \
  --unzip

Dataset URL: https://www.kaggle.com/datasets/deepcontractor/supreme-court-judgment-prediction
License(s): CC0-1.0
Downloading supreme-court-judgment-prediction.zip to /home/jovyan/legal_data/supreme_cp
  0%|                                               | 0.00/1.33M [00:00<?, ?B/s]
100%|███████████████████████████████████████| 1.33M/1.33M [00:00<00:00, 348MB/s]


### Load & inspect the CSV

In [8]:
import os
import pandas as pd

# Expand '~' to /home/jovyan
DATA_DIR = os.path.expanduser('~/legal_data/supreme_cp')

# Sanity check
assert os.path.isdir(DATA_DIR), f"Directory not found: {DATA_DIR}"

# Directly reference the known CSV
csv_path = os.path.join(DATA_DIR, 'justice.csv')
assert os.path.isfile(csv_path), f"File not found: {csv_path}"

# Load it
df = pd.read_csv(csv_path)

# Inspect
print(f"Loaded: {csv_path}")
print(f"Total rows: {len(df)}")
print("Columns:", df.columns.tolist())

# Display first few rows of 'name' and 'facts'
df[['name', 'facts']].head(3)

Loaded: /home/jovyan/legal_data/supreme_cp/justice.csv
Total rows: 3303
Columns: ['Unnamed: 0', 'ID', 'name', 'href', 'docket', 'term', 'first_party', 'second_party', 'facts', 'facts_len', 'majority_vote', 'minority_vote', 'first_party_winner', 'decision_type', 'disposition', 'issue_area']


Unnamed: 0,name,facts
0,Roe v. Wade,"<p>In 1970, Jane Roe (a fictional name used in..."
1,Stanley v. Illinois,<p>Joan Stanley had three children with Peter ...
2,Giglio v. United States,<p>John Giglio was convicted of passing forged...


### Clean & prepare documents

In [12]:
import os
import re
import pandas as pd
from tqdm import tqdm

# 1) Load the CSV
BASE_DIR   = os.path.expanduser('~/legal_data/supreme_cp')
csv_path = os.path.join(BASE_DIR, 'justice.csv')
df = pd.read_csv(csv_path)

# 2) Prepare output directory
OUT_DIR    = os.path.join(BASE_DIR, 'rag_txt')   
os.makedirs(OUT_DIR, exist_ok=True)

# 3) Cleaning function
def clean_text(text: str) -> str:
    # normalize newlines
    text = text.replace('\r\n', '\n').replace('\r', '\n')
    # collapse repeated blank lines
    text = re.sub(r'\n{2,}', '\n\n', text)
    # strip each line and drop empty ones
    lines = [line.strip() for line in text.split('\n')]
    return "\n".join([line for line in lines if line])

# 4) Iterate and write
for idx, row in tqdm(df.iterrows(), total=len(df), desc="Writing RAG texts"):
    name  = str(row.get("name", "")).strip()
    facts = str(row.get("facts", "")).strip()

    if not facts:
        continue

    # sanitize filename (no slashes or illegal chars)
    safe_name = re.sub(r'[\\/:"*?<>|]+', '_', name) or f"case_{idx}"
    cleaned   = clean_text(facts)
    fname     = f"{idx:05d}_{safe_name}.txt"

    out_path = os.path.join(OUT_DIR, fname)
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(cleaned)

Writing RAG texts: 100%|██████████| 3303/3303 [00:01<00:00, 3237.39it/s]


### Chunking longer summaries (e.g. >512 tokens) with overlap

In [13]:
import os
from transformers import AutoTokenizer

# text files from previous step
CHUNK_DIR  = os.path.join(BASE_DIR, 'rag_chunks')   # where chunks will go

# 2) Make sure the chunk output directory exists
os.makedirs(CHUNK_DIR, exist_ok=True)

# 3) Load your tokenizer
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

# 4) Chunk settings
MAX_TOKENS = 512
OVERLAP    = 64

# 5) Iterate over each cleaned .txt file and split into overlapping chunks
for fn in os.listdir(OUT_DIR):
    if not fn.lower().endswith(".txt"):
        continue

    # Read the full text
    in_path = os.path.join(OUT_DIR, fn)
    with open(in_path, encoding="utf-8") as f:
        text = f.read()

    # Tokenize once
    toks = tokenizer.encode(text)

    start = 0
    cid   = 0

    # Slide window over tokens
    while start < len(toks):
        chunk_toks = toks[start : start + MAX_TOKENS]
        chunk_text = tokenizer.decode(chunk_toks, skip_special_tokens=True)

        # Name each chunk file: originalname_chunk000.txt
        base, _    = os.path.splitext(fn)
        out_name   = f"{base}_chunk{cid:03d}.txt"
        out_path   = os.path.join(CHUNK_DIR, out_name)

        # Write it out
        with open(out_path, "w", encoding="utf-8") as out:
            out.write(chunk_text)

        # Advance window
        start += MAX_TOKENS - OVERLAP
        cid   += 1

print(f"Finished chunking. Chunks are in: {CHUNK_DIR!r}")

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (536 > 512). Running this sequence through the model will result in indexing errors


Finished chunking. Chunks are in: '/home/jovyan/legal_data/supreme_cp/rag_chunks'


### Embed & build FAISS index

In [16]:
import os
import numpy as np
import faiss
import pickle
from sentence_transformers import SentenceTransformer

MODEL_DIR   = os.path.join(BASE_DIR, 'model_rag')

# Create a folder for the RAG model artifacts
os.makedirs(MODEL_DIR, exist_ok=True)

# Load embedder and prepare FAISS index
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
dim      = embedder.get_sentence_embedding_dimension()
index    = faiss.IndexFlatL2(dim)

# Loop over your chunk files, embed & add to FAISS
filenames = []
for i, fname in enumerate(sorted(os.listdir(CHUNK_DIR))):
    if not fname.lower().endswith(".txt"):
        continue

    path = os.path.join(CHUNK_DIR, fname)
    with open(path, encoding="utf-8") as f:
        text = f.read()

    vec = embedder.encode(text)
    index.add(np.array([vec]))  # add vector # i
    filenames.append(fname)

# Save the FAISS index and the metadata mapping
index_path = os.path.join(MODEL_DIR, "legal-facts.index")
meta_path  = os.path.join(MODEL_DIR, "index_to_chunk.pkl")

faiss.write_index(index, index_path)
with open(meta_path, "wb") as f:
    pickle.dump(filenames, f)

print(f"Saved FAISS index to: {index_path!r}")
print(f"Saved metadata mapping to: {meta_path!r}")

Saved FAISS index to: '/home/jovyan/legal_data/supreme_cp/model_rag/legal-facts.index'
Saved metadata mapping to: '/home/jovyan/legal_data/supreme_cp/model_rag/index_to_chunk.pkl'


### FastAPI (or any retrieval) service

In [17]:
import os
import faiss
import pickle

INDEX_PATH   = os.path.join(MODEL_DIR, "legal-facts.index")
MAPPING_PATH = os.path.join(MODEL_DIR, "index_to_chunk.pkl")

# Load the FAISS index
index = faiss.read_index(INDEX_PATH)

# Load the ID → chunk‐filename mapping
with open(MAPPING_PATH, "rb") as f:
    metadata = pickle.load(f)

print(f"Loaded FAISS index with {index.ntotal} vectors")
print(f"Loaded metadata for {len(metadata)} chunks")

Loaded FAISS index with 3464 vectors
Loaded metadata for 3464 chunks


### Retrieval at query time

In [18]:
import os
import pickle
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

INDEX_PATH     = os.path.join(MODEL_DIR, "legal-facts.index")
MAPPING_PATH   = os.path.join(MODEL_DIR, "index_to_chunk.pkl")
EMB_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# FAISS index
index = faiss.read_index(INDEX_PATH)

# Metadata: list of chunk filenames, in order of index IDs
with open(MAPPING_PATH, "rb") as f:
    metadata = pickle.load(f)

# Embedder
embedder = SentenceTransformer(EMB_MODEL_NAME)

def retrieve(query: str, top_k: int = 5):
    # Embed the query
    q_emb = embedder.encode([query])
    distances, indices = index.search(np.array(q_emb), top_k)

    results = []
    for score, idx in zip(distances[0], indices[0]):
        if idx < 0 or idx >= len(metadata):
            continue

        # Metadata entry is just the filename
        fname = metadata[idx]
        file_path = os.path.join(CHUNK_DIR, fname)

        # Derive chunk_id from filename (prefix before first '_chunk')
        chunk_id = fname.split("_chunk")[0]

        # Read snippet text
        try:
            with open(file_path, encoding="utf-8") as f:
                snippet = f.read()
        except FileNotFoundError:
            snippet = ""

        results.append({
            "idx":        idx,
            "chunk_id":   chunk_id,
            "file_path":  file_path,
            "score":      float(score),
            "text":       snippet
        })

    return results

if __name__ == "__main__":
    query = "Fourth Amendment search warrant scope"
    top_k = 5
    for res in retrieve(query, top_k):
        print(f"ID {res['idx']} (chunk {res['chunk_id']}, score {res['score']:.3f}):")
        print(res["text"][:300].replace("\n", " "), "...\n")

ID 1565 (chunk 01541_Groh v. Ramirez, score 0.642):
< p > jeff groh, a special agent for the u. s. bureau of alcohol, tobacco, and firearms, applied for a search warrant to search the ramirez ranch for illegal weapons. on the warrant, groh mistakenly omitted the exact items sought ( though he correctly listed the items on the application itself ). a  ...

ID 1758 (chunk 01731_United States v. Grubbs, score 0.701):
< p > on federal trial for possessing child pornography, grubbs asked the judge to suppress evidence officers seized from his home. grubbs said the search violated the fourth amendment because the officers showed him an " anticipatory warrant, " something valid only after triggering events take plac ...

ID 1999 (chunk 01962_Arizona v. Gant, score 0.702):
< p > rodney gant was apprehended by arizona state police on an outstanding warrant for driving with a suspended license. after the officers handcuffed gant and placed him in their squad car, they went on to search his vehic