                                    [Raw “facts” files]  
                                      ↓ clean & chunk  
                                    [Chunked text files]  
                                      ↓ embed (Sentence-Transformer)  
                                    [FAISS vector DB]  
                                      ↓ (at query time) embed(query) → FAISS.search → top-K chunks  
                                      ↓ build prompt (snippets + instruction)  
                                    [Prompt]  
                                      ↓ Llama-2 (fine-tuned on Zenodo)  
                                    [Generated summary]

### Install & authenticate Kaggle

In [10]:
# In a Colab cell
!pip install --quiet kaggle pandas sentence-transformers faiss-cpu

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m37.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m62.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m34.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m40.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [11]:
# Upload your kaggle.json (username + key)
from google.colab import files
uploaded = files.upload()  # select your kaggle.json

# Place it so the CLI can see it
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle (1).json


### Download the dataset

In [12]:
!mkdir -p /content/legal_data/supreme_cp
!kaggle datasets download -d deepcontractor/supreme-court-judgment-prediction \
    --unzip -p /content/legal_data/supreme_cp

Dataset URL: https://www.kaggle.com/datasets/deepcontractor/supreme-court-judgment-prediction
License(s): CC0-1.0


### Load & inspect the CSV

In [14]:
import os
import pandas as pd

DATA_DIR = "/content/legal_data/supreme_cp"

# 1) List and locate the CSV file
files = os.listdir(DATA_DIR)
print("Files in data dir:", files)

csv_files = [f for f in files if f.lower().endswith(".csv")]
if not csv_files:
    raise FileNotFoundError(f"No CSV found in {DATA_DIR}")
csv_path = os.path.join(DATA_DIR, csv_files[0])

# 2) Load and inspect
df = pd.read_csv(csv_path)
print("Loaded CSV:", csv_path)
print("Total cases:", len(df))
print("Columns:", df.columns.tolist())

# 3) View the first few name/facts pairs
print(df[["name", "facts"]].head(3))

Files in data dir: ['justice.csv']
Loaded CSV: /content/legal_data/supreme_cp/justice.csv
Total cases: 3303
Columns: ['Unnamed: 0', 'ID', 'name', 'href', 'docket', 'term', 'first_party', 'second_party', 'facts', 'facts_len', 'majority_vote', 'minority_vote', 'first_party_winner', 'decision_type', 'disposition', 'issue_area']
                      name                                              facts
0              Roe v. Wade  <p>In 1970, Jane Roe (a fictional name used in...
1      Stanley v. Illinois  <p>Joan Stanley had three children with Peter ...
2  Giglio v. United States  <p>John Giglio was convicted of passing forged...


### Clean & prepare documents

In [16]:
import os, re
from tqdm import tqdm

SRC = df
OUT_DIR = "/content/legal_data/supreme_cp/rag_txt"
os.makedirs(OUT_DIR, exist_ok=True)

def clean_text(text: str) -> str:
    # normalize whitespace
    text = text.replace('\r\n','\n').replace('\r','\n')
    text = re.sub(r'\n{2,}', '\n\n', text)
    return "\n".join(line.strip() for line in text.split('\n') if line.strip())

for idx, row in tqdm(SRC.iterrows(), total=len(SRC)):
    name  = re.sub(r'[\\/:"*?<>|]+', '_', str(row["name"]))  # safe filename
    facts = str(row["facts"] or "").strip()
    if not facts:
        continue
    cleaned = clean_text(facts)
    fname = f"{idx:05d}_{name}.txt"
    with open(os.path.join(OUT_DIR, fname), "w", encoding="utf-8") as f:
        f.write(cleaned)

100%|██████████| 3303/3303 [00:00<00:00, 3648.81it/s]


### Chunking longer summaries (e.g. >512 tokens) with overlap

In [17]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
CHUNK_DIR = "/content/legal_data/supreme_cp/rag_chunks"
os.makedirs(CHUNK_DIR, exist_ok=True)

MAX_TOKENS  = 512
OVERLAP     = 64

for fn in os.listdir(OUT_DIR):
    text = open(os.path.join(OUT_DIR, fn), encoding="utf-8").read()
    toks = tokenizer.encode(text)
    start = 0
    cid   = 0
    while start < len(toks):
        chunk_toks = toks[start : start + MAX_TOKENS]
        chunk_text = tokenizer.decode(chunk_toks)
        out_name   = fn.replace(".txt", f"_chunk{cid:03d}.txt")
        with open(os.path.join(CHUNK_DIR, out_name), "w", encoding="utf-8") as out:
            out.write(chunk_text)
        start += MAX_TOKENS - OVERLAP
        cid   += 1

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (646 > 512). Running this sequence through the model will result in indexing errors


### Embed & build FAISS index

In [21]:
from sentence_transformers import SentenceTransformer
import faiss, pickle, os, numpy as np

# 1) Load/instantiate your embedder and FAISS index
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
dim      = embedder.get_sentence_embedding_dimension()
index    = faiss.IndexFlatL2(dim)

# 2) Prepare a list to hold metadata for each chunk
metadata = []

# 3) Loop over your chunk files, embed & add to FAISS
chunk_folder = "/content/legal_data/supreme_cp/rag_chunks"
for i, fname in enumerate(sorted(os.listdir(chunk_folder))):
    text = open(os.path.join(chunk_folder, fname), encoding="utf-8").read()
    vec  = embedder.encode(text)
    index.add(np.array([vec]))   # vector # i

    # record the mapping for vector i
    metadata.append({
        "chunk_id":  fname.replace(".txt",""),
        "file_path": os.path.join(chunk_folder, fname),
        "case_name": fname.split("_chunk")[0],
        "date":      fname.split("_")[1].replace(".txt","")
    })

# ─────────────── SAVE YOUR ARTIFACTS ────────────────────

os.makedirs("model_rag", exist_ok=True)

# A) write the FAISS index
faiss.write_index(index, "model_rag/legal-facts.index")

# B) write the metadata list
with open("model_rag/index_to_doc.pkl", "wb") as f:
    pickle.dump(metadata, f)

### FastAPI (or any retrieval) service

In [22]:
import faiss, pickle

INDEX_PATH   = "model_rag/legal-facts.index"
MAPPING_PATH = "model_rag/index_to_doc.pkl"

# load vector DB
index = faiss.read_index(INDEX_PATH)
# load the ID→document mapping
with open(MAPPING_PATH, "rb") as f:
    metadata = pickle.load(f)

### Retrieval at query time

In [23]:
import os
import pickle
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

# ─── Configuration ────────────────────────────────────────────────
INDEX_PATH     = "model_rag/legal-facts.index"
MAPPING_PATH   = "model_rag/index_to_doc.pkl"
EMB_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"  # pre-trained

# ─── Load Artifacts ──────────────────────────────────────────────
index = faiss.read_index(INDEX_PATH)

with open(MAPPING_PATH, "rb") as f:
    metadata = pickle.load(f)

embedder = SentenceTransformer(EMB_MODEL_NAME)

# ─── Retrieval Function ──────────────────────────────────────────
def retrieve(query: str, top_k: int = 5):
    q_emb = embedder.encode([query])
    distances, indices = index.search(np.array(q_emb), top_k)

    results = []
    for score, idx in zip(distances[0], indices[0]):
        if idx < 0 or idx >= len(metadata):
            continue
        meta = metadata[idx]
        try:
            snippet = open(meta["file_path"], encoding="utf-8").read()
        except FileNotFoundError:
            snippet = ""
        results.append({
            "chunk_id":  meta.get("chunk_id", idx),
            "file_path": meta.get("file_path", ""),
            "case_name": meta.get("case_name", ""),
            "date":      meta.get("date", ""),
            "score":     float(score),
            "text":      snippet
        })
    return results

# ─── Example Usage ───────────────────────────────────────────────
if __name__ == "__main__":
    for res in retrieve("Fourth Amendment search warrant scope", 5):
        print(f"{res['chunk_id']} (score {res['score']}):")
        print(res["text"][:300].replace("\n"," "), "...\n")

01541_Groh v. Ramirez_chunk000 (score 0.6322004795074463):
[CLS] < p > jeff groh, a special agent for the u. s. bureau of alcohol, tobacco, and firearms, applied for a search warrant to search the ramirez ranch for illegal weapons. on the warrant, groh mistakenly omitted the exact items sought ( though he correctly listed the items on the application itself ...

01731_United States v. Grubbs_chunk000 (score 0.7000200748443604):
[CLS] < p > on federal trial for possessing child pornography, grubbs asked the judge to suppress evidence officers seized from his home. grubbs said the search violated the fourth amendment because the officers showed him an " anticipatory warrant, " something valid only after triggering events tak ...

01962_Arizona v. Gant_chunk000 (score 0.7038705945014954):
[CLS] < p > rodney gant was apprehended by arizona state police on an outstanding warrant for driving with a suspended license. after the officers handcuffed gant and placed him in their squad car, they 