In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import random

In [3]:
# Dummy dataset
legal_documents = [
    "In the event of a default by either party, this agreement may be terminated by written notice to the defaulting party.",
    "The lessee shall maintain the property in good condition and make any necessary repairs.",
    "Confidential information shall not be disclosed to any third party without prior written consent.",
    "The party shall indemnify and hold harmless the other party from any claims arising out of the performance of this agreement.",
    "The buyer shall pay the seller the purchase price in accordance with the terms set forth herein."
]

In [4]:
#Tokenization & Embedding
# Load Sentence Transformer Model for Embeddings
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for each document
document_embeddings = embedding_model.encode(legal_documents, convert_to_tensor=True)




In [5]:
#FAISS Vector database for retrieval
# Convert embeddings to numpy array
document_embeddings_np = document_embeddings.detach().cpu().numpy()

# Initialize FAISS index
dimension = document_embeddings_np.shape[1]
index = faiss.IndexFlatL2(dimension)

# Add document embeddings to the FAISS index
index.add(document_embeddings_np)


In [6]:
#RAG
# Function to retrieve similar documents
def retrieve_similar_documents(query, top_k=2):
    query_embedding = embedding_model.encode([query], convert_to_tensor=True).detach().cpu().numpy()
    distances, indices = index.search(query_embedding, top_k)
    return [legal_documents[idx] for idx in indices[0]]

# Test Retrieval Function
test_query = "What are the terms for termination?"
similar_docs = retrieve_similar_documents(test_query)
print("Retrieved Documents:", similar_docs)


Retrieved Documents: ['In the event of a default by either party, this agreement may be terminated by written notice to the defaulting party.', 'The party shall indemnify and hold harmless the other party from any claims arising out of the performance of this agreement.']


In [7]:
#Fine-Tuning (PEFT) with Prompt Engineering
# Load Pre-trained BERT Summarization Model
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")

# Function to Generate Summaries
def generate_summary(text, max_length=50):
    inputs = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=1024, truncation=True)
    summary_ids = model.generate(inputs, max_length=max_length, min_length=30, length_penalty=2.0, num_beams=4, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Test Summarization
for doc in similar_docs:
    summary = generate_summary(doc)
    print("Summary:", summary)


Summary: This agreement may be terminated by written notice to the defaulting party. In the event of a default by either party, this agreement could be terminated.
Summary: summarize: The party shall indemnify and hold harmless the other party from any claims arising out of the performance of this agreement. The parties agree that they will not sue each other for damages.


In [9]:
#Measure performance: document retrieval speed, summarization accuracy
import time

# Measure Retrieval Speed
start_time = time.time()
retrieve_similar_documents(test_query)
end_time = time.time()
print("Retrieval Time (ms):", (end_time - start_time) * 1000)

# Accuracy and Summary Length could be computed by comparing against a ground-truth set if available.


Retrieval Time (ms): 164.3538475036621
