In [1]:
import json
import torch
import faiss
import numpy as np
from transformers import AutoModel, AutoTokenizer
from fastapi import FastAPI
import faiss
import numpy as np
import uvicorn
import asyncio
import nest_asyncio

In [3]:
# Load Legal-BERT Model
MODEL_NAME = "nlpaueb/legal-bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Set to evaluation mode
model.eval()


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/222k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [3]:
json_path = "merged_outputV5.json"
with open(json_path, "r", encoding="utf-8") as file:
    data = json.load(file)

documents = data.get("documents", [])


In [7]:
# Function to generate embeddings
def get_embedding(text):
    """Generate embedding for a given text using Legal-BERT."""
    if not text:
        return np.zeros((768,))  # Return zero vector for empty text
    
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :].numpy().flatten()  # Extract CLS token


In [9]:
# Prepare text and embeddings
doc_embeddings = []
doc_ids = []

for doc in documents:
    doc_id = doc["DocID"]
    paragraphs = doc.get("paragraphs", [])
    
    for para in paragraphs:
        context_text = para.get("context", "")
        qas_texts = [qa["question"] for qa in para.get("qas", [])]
        csvqas_texts = [csvqa["question"] for csvqa in para.get("CSVqas", [])]
        
        combined_text = f"{context_text} {' '.join(qas_texts)} {' '.join(csvqas_texts)}"
        embedding = get_embedding(combined_text)
        
        doc_embeddings.append(embedding)
        doc_ids.append(doc_id)

In [11]:
# Convert embeddings to FAISS format
doc_embeddings = np.array(doc_embeddings).astype("float32")
faiss_index = faiss.IndexFlatL2(768)  # 768 is the embedding dimension
faiss_index.add(doc_embeddings)

In [13]:
# Save FAISS index
faiss.write_index(faiss_index, "slacara.index")


In [15]:
# Save DocID mapping
with open("docid_mapping.json", "w", encoding="utf-8") as f:
    json.dump(doc_ids, f)

print("✅ Legal-BERT embeddings created and saved successfully!")

✅ Legal-BERT embeddings created and saved successfully!


In [17]:
# Load FAISS Index
faiss_index = faiss.read_index("slacara.index")

# Load DocID Mapping
with open("docid_mapping.json", "r", encoding="utf-8") as f:
    doc_ids = json.load(f)

print(f"✅ FAISS index contains {faiss_index.ntotal} vectors")
print(f"✅ Document ID list contains {len(doc_ids)} entries")

# Validate that embeddings match doc count
assert faiss_index.ntotal == len(doc_ids), "❌ Mismatch between FAISS embeddings and DocIDs!"
print("✅ Embeddings successfully validated!")


✅ FAISS index contains 510 vectors
✅ Document ID list contains 510 entries
✅ Embeddings successfully validated!


In [19]:
def retrieve_similar(query, top_k=5):
    """Retrieve top-k similar documents using FAISS."""
    query_embedding = get_embedding(query).reshape(1, -1)
    distances, indices = faiss_index.search(query_embedding, top_k)

    results = []
    for i in range(top_k):
        doc_id = doc_ids[indices[0][i]]
        results.append({"DocID": doc_id, "Score": distances[0][i]})
    
    return results

# Example Query
query = "What are the terms of the agreement?"
results = retrieve_similar(query)

# Display Results
for result in results:
    print(f"DocID: {result['DocID']}, Score: {result['Score']}")


DocID: 443, Score: 413.92498779296875
DocID: 164, Score: 415.1919860839844
DocID: 93, Score: 416.2054138183594
DocID: 47, Score: 418.238037109375
DocID: 305, Score: 418.8833312988281


In [23]:
# Check a document's original text
doc_index = 0  # Change this index to check different docs
print("📄 Original Document Text:")
print(documents[doc_index]["paragraphs"][0]["context"][:500])  # Print first 500 chars

# Check the corresponding embedding
print("\n📊 Corresponding Embedding Vector:")
print(doc_embeddings[doc_index][:10])  # Print first 10 values


📄 Original Document Text:
CO-BRANDING AND ADVERTISING AGREEMENT

THIS CO-BRANDING AND ADVERTISING AGREEMENT (the "Agreement") is made as of June 21, 1999 (the "Effective Date") by and between I-ESCROW, INC., with its principal place of business at 1730 S. Amphlett Blvd., Suite 233, San Mateo, California 94402 ("i-Escrow"), and 2THEMART.COM, INC. having its principal place of business at 18301 Von Karman Avenue, 7th Floor, Irvine, California 92612 ("2TheMart").

1. DEFINITIONS.

(a) "CONTENT" means all content or informat

📊 Corresponding Embedding Vector:
[ 0.29355064 -0.07363994 -0.52576756  0.44006988 -0.27381048  0.25637665
  0.09302463  0.35416535 -0.22798778  0.48588574]
