In [1]:
!pip install transformers faiss-cpu



In [2]:
import os
import json
import faiss
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch

# Load embedding model and tokenizer first, so the index below can be sized
# from the model instead of from a hand-maintained constant.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model.eval()  # Inference only: make sure dropout stays disabled

# Initialize FAISS index.
# The vector width is read from the model config (384 for all-MiniLM-L6-v2),
# so changing `model_name` cannot leave the index with a mismatched dimension.
embedding_dim = model.config.hidden_size
index = faiss.IndexFlatL2(embedding_dim)  # Exact (brute-force) L2 search
metadata = []  # metadata[i] describes the vector stored at row i of `index`

def embed_text(text):
    """Generate embeddings for a given text.

    The text is tokenized (truncated to 512 tokens), run through the module-level
    ``model``, and the token vectors are averaged into one vector per input.
    The average is weighted by the attention mask, so padding positions never
    contribute; for a single unpadded string this equals a plain mean.

    Args:
        text: A string, or a list of strings to embed as one padded batch.

    Returns:
        A float32 NumPy array with one row per input text and one column per
        hidden dimension of the model.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        token_embeddings = model(**inputs).last_hidden_state
        # (batch, seq) -> (batch, seq, 1) so the mask broadcasts over the hidden axis.
        mask = inputs["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
        summed = (token_embeddings * mask).sum(dim=1)
        # Guard against division by zero for a (theoretical) fully masked row.
        token_counts = mask.sum(dim=1).clamp(min=1.0)
        pooled = summed / token_counts
    # FAISS expects float32 vectors.
    return pooled.float().cpu().numpy()

# Directory containing JSON files.
# The indexing loop lists this directory and processes only the entries whose
# names end in ".json"; everything else is reported and skipped.
json_dir = "./json"  # Update this path based on your folder structure


2024-12-04 09:35:51.848691: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Parse JSON files and add embeddings to FAISS index.
# Each file is read as {"casebody": {"opinions": [{"text": ...}, ...]}};
# missing keys or empty texts simply contribute no passages.

PASSAGE_CHARS = 300  # Passage length in characters (fixed-width, non-overlapping)

# Start from an empty index and metadata list so that re-running this cell
# rebuilds them instead of appending a second copy of every passage.
index.reset()
metadata.clear()

# os.listdir() returns entries in arbitrary order; sorting keeps the row order
# of the saved index reproducible between runs.
for file_name in sorted(os.listdir(json_dir)):
    if not file_name.endswith(".json"):
        print(f"Skipping non-JSON file: {file_name}")
        continue

    file_path = os.path.join(json_dir, file_name)
    try:
        # Read as UTF-8 explicitly instead of relying on the platform default.
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        # Process the JSON content
        for opinion in data.get("casebody", {}).get("opinions", []):
            text = opinion.get("text", "")
            if not text:
                continue  # Skip if no text found

            # Split into smaller passages if too long
            passages = [text[i:i + PASSAGE_CHARS] for i in range(0, len(text), PASSAGE_CHARS)]
            for passage in passages:
                embedding = embed_text(passage)
                index.add(embedding)
                # Appended right after the vector so metadata[i] matches row i.
                metadata.append({"file": file_name, "text": passage})
    except json.JSONDecodeError:
        print(f"Invalid JSON in file: {file_name}")
    except Exception as e:
        # Deliberate best-effort: report the file and keep indexing the rest.
        print(f"Error processing file {file_name}: {e}")

# Every vector needs exactly one metadata entry; otherwise search hits would
# be mapped to the wrong passage.
assert index.ntotal == len(metadata), "FAISS index and metadata are out of sync"

# Save FAISS index and metadata
faiss.write_index(index, "legal_cases_index.faiss")
with open("metadata.json", "w", encoding="utf-8") as f:
    json.dump(metadata, f)

print("Indexing completed!")


Skipping non-JSON file: ideas.qmd
Indexing completed!


In [4]:
import json

# Read the saved passage metadata back from disk and report what it contains.
with open("metadata.json", "r") as metadata_file:
    metadata = json.load(metadata_file)

# Show the first entry as a sample, or a placeholder when nothing was indexed.
if metadata:
    sample_entry = metadata[0]
else:
    sample_entry = "No entries found."

print(f"Number of passages indexed: {len(metadata)}")
print("Sample metadata entry:", sample_entry)


Number of passages indexed: 3777
Sample metadata entry: {'file': '0001-01.json', 'text': 'DAWSON, District Judge.\nPetitioner, by his guardian, ad litem, sets forth that he is unlawfully restrained of his liberty by Lieutenant Commander J. S. Newell, naval officer in charge at this station, and in command of the United States steamer and man-of-war Pinta. He states that he was enlisted in'}
