In [1]:
# !pip install sentence-transformers faiss-cpu
# !pip install sentence-transformers
# !pip install fastparquet
# !pip install langchain-openai
# !pip install langchain>=0.3.0 langchain-community
# !pip install -U langchain-ollama



In [2]:
#environment / terminal
# python -m pip install "langchain>=0.3.0" langchain-community langchain-openai
# pip install -U langchain-ollama


In [3]:
# ==========================================
# DEPENDENCIES
# ==========================================

import pandas as pd
import json

import re
from pathlib import Path



In [4]:
# ==========================================
# CONSTANTS
# ==========================================

# with open('/Users/samarthsingh/PycharmProjects/conrad_law_llm_chatbot/data/raw/ContractNLI/contract-nli/train.json', "r") as f:
#     data = json.load(f)
# 
# df1 = pd.json_normalize(data)

json_path='/Users/samarthsingh/PycharmProjects/conrad_law_llm_chatbot/data/raw/ContractNLI/contract-nli/train.json'

In [5]:
###########################LOADING JSON FILE###################


json_path = Path(json_path)
# with json_path.open("r", encoding="utf-8") as f:
#     data = json.load(f)
# data

In [6]:
###########################NORMALIZING###################

# text = text.replace("\r\n", "\n").replace("\r", "\n").strip()


# Prepping clause df

In [7]:
import json
import re
from pathlib import Path

import pandas as pd

# =========================
# 0. Config
# =========================

# Adjust these paths for your setup
# BASE_DIR = Path(__file__).resolve().parent


output_clauses_csv = "/Users/samarthsingh/PycharmProjects/conrad_law_llm_chatbot/notebook/contractnli_clauses.csv"
output_labels_csv =  "/Users/samarthsingh/PycharmProjects/conrad_law_llm_chatbot/notebook/contractnli_clause_labels.csv"

# =========================
# 1. Load JSON
# =========================

print(f"Loading ContractNLI JSON from: {json_path}")

with json_path.open("r", encoding="utf-8") as f:
    data = json.load(f)

documents = data["documents"]
print(f"Loaded {len(documents)} documents from {json_path}")

# =========================
# 2. Prepare regexes
# =========================

NUMBERED_CLAUSE_RE = re.compile(r"(?m)^\d+(\.\d+)*\s")  # '1 ', '2.3 ', '2.3.2.1 '
RECITALS_RE = re.compile(r"(?m)^RECITALS\b")
WHEREAS_RE = re.compile(r"(?m)^WHEREAS\b")
NOW_THEREFORE_RE = re.compile(r"(?m)^NOW,\s+THEREFORE\b")
IN_WITNESS_RE = re.compile(r"(?m)^IN WITNESS WHEREOF\b")
DOUBLE_NEWLINE_RE = re.compile(r"\n\s*\n")  # paragraph breaks

# =========================
# 3. Build clause table (clauses_df)
# =========================

clauses_rows: list[dict] = []

for doc in documents:
    doc_id = doc.get("id")
    file_name = doc.get("file_name", "")
    raw_text = doc.get("text", "") or ""

    # Only normalize line breaks; keep indices consistent
    text = raw_text.replace("\r\n", "\n").replace("\r", "\n")

    if text == "":
        continue

    # 3.1 Find clause boundaries
    boundaries = set()
    boundaries.add(0)
    boundaries.add(len(text))

    for m in NUMBERED_CLAUSE_RE.finditer(text):
        boundaries.add(m.start())

    for regex in (RECITALS_RE, WHEREAS_RE, NOW_THEREFORE_RE, IN_WITNESS_RE):
        for m in regex.finditer(text):
            boundaries.add(m.start())

    for m in DOUBLE_NEWLINE_RE.finditer(text):
        boundaries.add(m.end())

    boundaries_list = sorted(boundaries)

    # 3.2 Build clauses within this document
    clause_id = 0
    for i in range(len(boundaries_list) - 1):
        start = boundaries_list[i]
        end = boundaries_list[i + 1]
        if start >= end:
            continue

        clause_text = text[start:end]
        if clause_text.strip() == "":
            continue

        clause_id += 1

        clauses_rows.append(
            {
                "document_id": doc_id,
                "file_name": file_name,
                "clause_id": clause_id,
                "char_start": start,
                "char_end": end,
                "clause_text": clause_text,
            }
        )

clauses_df = pd.DataFrame(clauses_rows)
print("\nClause-level table shape:", clauses_df.shape)
print("First 5 clauses:")
print(clauses_df.head())

# =========================
# 4. Build labeled table (label_df)
# =========================

label_rows: list[dict] = []

for doc in documents:
    doc_id = doc.get("id")
    file_name = doc.get("file_name", "")
    raw_text = doc.get("text", "") or ""
    text = raw_text.replace("\r\n", "\n").replace("\r", "\n")

    if text == "":
        continue

    # Reconstruct clauses for this document (same logic as above)
    boundaries = set()
    boundaries.add(0)
    boundaries.add(len(text))

    for m in NUMBERED_CLAUSE_RE.finditer(text):
        boundaries.add(m.start())
    for regex in (RECITALS_RE, WHEREAS_RE, NOW_THEREFORE_RE, IN_WITNESS_RE):
        for m in regex.finditer(text):
            boundaries.add(m.start())
    for m in DOUBLE_NEWLINE_RE.finditer(text):
        boundaries.add(m.end())

    boundaries_list = sorted(boundaries)

    clauses = []
    clause_id = 0
    for i in range(len(boundaries_list) - 1):
        start = boundaries_list[i]
        end = boundaries_list[i + 1]
        if start >= end:
            continue
        clause_text = text[start:end]
        if clause_text.strip() == "":
            continue
        clause_id += 1
        clauses.append(
            {
                "clause_id": clause_id,
                "char_start": start,
                "char_end": end,
                "clause_text": clause_text,
            }
        )

    if not clauses:
        continue

    # Helper: find clause for a given span by midpoint
    def find_clause_for_span(span_start: int, span_end: int):
        mid = (span_start + span_end) // 2
        for c in clauses:
            if c["char_start"] <= mid < c["char_end"]:
                return c["clause_id"], c["clause_text"]
        return None, None

    annotation_sets = doc.get("annotation_sets", [])
    if not annotation_sets:
        continue

    ann = annotation_sets[0]
    ann_dict = ann.get("annotations", {})
    span_list = doc.get("spans", [])

    for question_id, ann_entry in ann_dict.items():
        label_choice = ann_entry.get("choice")
        span_indices = ann_entry.get("spans", [])

        if not span_indices:
            continue

        for span_idx in span_indices:
            if span_idx < 0 or span_idx >= len(span_list):
                continue
            span_start, span_end = span_list[span_idx]
            mapped_clause_id, mapped_clause_text = find_clause_for_span(span_start, span_end)

            label_rows.append(
                {
                    "document_id": doc_id,
                    "file_name": file_name,
                    "question_id": question_id,
                    "label": label_choice,
                    "span_index": span_idx,
                    "span_start": span_start,
                    "span_end": span_end,
                    "clause_id": mapped_clause_id,
                    "clause_text": mapped_clause_text,
                }
            )

label_df = pd.DataFrame(label_rows)

if not label_df.empty:
    label_df = label_df.sort_values(
        ["document_id", "question_id", "clause_id", "span_index"]
    ).reset_index(drop=True)

print("\nLabeled clause-level table shape:", label_df.shape)
print("First 10 labeled rows:")
print(label_df.head(10))

# =========================
# 5. Save to disk (CSV)
# =========================

clauses_df.to_csv(output_clauses_csv, index=False)
label_df.to_csv(output_labels_csv, index=False)

print(f"\nSaved clauses to: {output_clauses_csv}")
print(f"Saved labeled clause table to: {output_labels_csv}")


Loading ContractNLI JSON from: /Users/samarthsingh/PycharmProjects/conrad_law_llm_chatbot/data/raw/ContractNLI/contract-nli/train.json
Loaded 423 documents from /Users/samarthsingh/PycharmProjects/conrad_law_llm_chatbot/data/raw/ContractNLI/contract-nli/train.json

Clause-level table shape: (2790, 6)
First 5 clauses:
   document_id                                          file_name  clause_id  \
0           34  Annex E_Non-Disclosure and Confidentiality Agr...          1   
1           34  Annex E_Non-Disclosure and Confidentiality Agr...          2   
2           34  Annex E_Non-Disclosure and Confidentiality Agr...          3   
3           34  Annex E_Non-Disclosure and Confidentiality Agr...          4   
4           34  Annex E_Non-Disclosure and Confidentiality Agr...          5   

   char_start  char_end                                        clause_text  
0           0       692  NON-DISCLOSURE AND CONFIDENTIALITY AGREEMENT\n...  
1         692       701                       

In [8]:
label_df

Unnamed: 0,document_id,file_name,question_id,label,span_index,span_start,span_end,clause_id,clause_text
0,34,Annex E_Non-Disclosure and Confidentiality Agr...,nda-1,Entailment,14,1294,1683,6,"NOW, THEREFORE, the Parties agree as follows:\..."
1,34,Annex E_Non-Disclosure and Confidentiality Agr...,nda-10,Entailment,51,7292,7645,17,2.6 The Recipient will not copy or reproduce t...
2,34,Annex E_Non-Disclosure and Confidentiality Agr...,nda-12,Entailment,30,3834,3924,16,2.5 The Recipient shall not be precluded from ...
3,34,Annex E_Non-Disclosure and Confidentiality Agr...,nda-12,Entailment,34,4263,4364,16,2.5 The Recipient shall not be precluded from ...
4,34,Annex E_Non-Disclosure and Confidentiality Agr...,nda-13,Entailment,30,3834,3924,16,2.5 The Recipient shall not be precluded from ...
...,...,...,...,...,...,...,...,...,...
8336,624,1693664_0001193125-18-171470_d426098dex99d3.htm,nda-7,Entailment,25,4330,4480,1,Exhibit (d)(3)\nNON-DISCLOSURE AGREEMENT\nThis...
8337,624,1693664_0001193125-18-171470_d426098dex99d3.htm,nda-7,Entailment,104,19711,19832,1,Exhibit (d)(3)\nNON-DISCLOSURE AGREEMENT\nThis...
8338,624,1693664_0001193125-18-171470_d426098dex99d3.htm,nda-7,Entailment,105,19832,19886,1,Exhibit (d)(3)\nNON-DISCLOSURE AGREEMENT\nThis...
8339,624,1693664_0001193125-18-171470_d426098dex99d3.htm,nda-7,Entailment,106,19886,20060,1,Exhibit (d)(3)\nNON-DISCLOSURE AGREEMENT\nThis...


In [9]:
clauses_df

Unnamed: 0,document_id,file_name,clause_id,char_start,char_end,clause_text
0,34,Annex E_Non-Disclosure and Confidentiality Agr...,1,0,692,NON-DISCLOSURE AND CONFIDENTIALITY AGREEMENT\n...
1,34,Annex E_Non-Disclosure and Confidentiality Agr...,2,692,701,RECITALS\n
2,34,Annex E_Non-Disclosure and Confidentiality Agr...,3,701,964,"WHEREAS in connection with RFP/2014/620, Reque..."
3,34,Annex E_Non-Disclosure and Confidentiality Agr...,4,964,1099,WHEREAS UNHCR agrees to provide such data and ...
4,34,Annex E_Non-Disclosure and Confidentiality Agr...,5,1099,1248,WHEREAS the Bidder is willing to ensure that U...
...,...,...,...,...,...,...
2785,623,1689602_0001144204-16-140940_v455386_ex10-3.htm,3,550,786,"WHEREAS, [●] (the “Restricted Party”) acknowle..."
2786,623,1689602_0001144204-16-140940_v455386_ex10-3.htm,4,786,1149,"WHEREAS, as a material inducement to the Alliq..."
2787,623,1689602_0001144204-16-140940_v455386_ex10-3.htm,5,1149,18032,"NOW, THEREFORE, in consideration of the recita..."
2788,623,1689602_0001144204-16-140940_v455386_ex10-3.htm,6,18032,18476,"IN WITNESS WHEREOF, the parties have executed ..."


# setting up vector db

In [10]:
from pathlib import Path

import pandas as pd
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

# =========================
# 0. Config
# =========================

BASE_DIR = "/Users/samarthsingh/PycharmProjects/conrad_law_llm_chatbot/notebook/"
clauses_csv_path = BASE_DIR + "contractnli_clauses.csv"

# Convert FAISS directory into a *Path object*
faiss_store_dir = Path(BASE_DIR + "vectorstores/contractnli_faiss")
faiss_store_dir.mkdir(parents=True, exist_ok=True)

# =========================
# 1. Load clauses
# =========================

print(f"Using clauses CSV: {clauses_csv_path}")
print(f"FAISS store directory: {faiss_store_dir}")

clauses_df = pd.read_csv(clauses_csv_path, dtype={"document_id": str, "clause_id": str})

print(f"\nLoaded clauses_df with shape: {clauses_df.shape}")
print("Columns:", list(clauses_df.columns))

# Drop any empty clauses just in case
clauses_df = clauses_df[clauses_df["clause_text"].astype(str).str.strip() != ""].copy()
print("After dropping empty clause_text rows:", clauses_df.shape)

# Prepare texts and metadata
texts = clauses_df["clause_text"].tolist()
metadatas = clauses_df[["document_id", "file_name", "clause_id"]].to_dict(orient="records")

print("\nExample metadata row:")
print(metadatas[0])

# =========================
# 2. Build embeddings
# =========================

embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
print(f"\nLoading HuggingFaceEmbeddings model: {embedding_model_name}")

embeddings = HuggingFaceEmbeddings(
    model_name=embedding_model_name,
    encode_kwargs={"normalize_embeddings": True},
)

# =========================
# 3. Build FAISS vector store
# =========================

print(f"\nBuilding FAISS vector store over {len(texts)} clauses...")

vector_store = FAISS.from_texts(
    texts=texts,
    embedding=embeddings,
    metadatas=metadatas,
)

print("FAISS store created.")
print("Index size (ntotal) should match number of clauses.")

# =========================
# 4. Save FAISS store
# =========================

print(f"\nSaving FAISS store to: {faiss_store_dir}")

vector_store.save_local(str(faiss_store_dir))

print("FAISS store saved successfully.")


Using clauses CSV: /Users/samarthsingh/PycharmProjects/conrad_law_llm_chatbot/notebook/contractnli_clauses.csv
FAISS store directory: /Users/samarthsingh/PycharmProjects/conrad_law_llm_chatbot/notebook/vectorstores/contractnli_faiss

Loaded clauses_df with shape: (2790, 6)
Columns: ['document_id', 'file_name', 'clause_id', 'char_start', 'char_end', 'clause_text']
After dropping empty clause_text rows: (2790, 6)

Example metadata row:
{'document_id': '34', 'file_name': 'Annex E_Non-Disclosure and Confidentiality Agreement.pdf', 'clause_id': '1'}

Loading HuggingFaceEmbeddings model: sentence-transformers/all-MiniLM-L6-v2


  embeddings = HuggingFaceEmbeddings(
  from .autonotebook import tqdm as notebook_tqdm



Building FAISS vector store over 2790 clauses...
FAISS store created.
Index size (ntotal) should match number of clauses.

Saving FAISS store to: /Users/samarthsingh/PycharmProjects/conrad_law_llm_chatbot/notebook/vectorstores/contractnli_faiss
FAISS store saved successfully.


# RAG

In [11]:
# ===============================
# PART 3 — RAG with FAISS + HF Transformers
# ===============================
from pathlib import Path

from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFacePipeline

from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline


# ---------------------------------------
# Paths
# ---------------------------------------
base_dir = Path("/Users/samarthsingh/PycharmProjects/conrad_law_llm_chatbot/notebook")
faiss_store_dir = base_dir / "vectorstores/contractnli_faiss"

print("Loading FAISS from:", faiss_store_dir)


# ---------------------------------------
# Load Embeddings
# ---------------------------------------
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)


# ---------------------------------------
# Load FAISS index
# ---------------------------------------
db = FAISS.load_local(
    folder_path=str(faiss_store_dir),
    embeddings=embeddings,
    allow_dangerous_deserialization=True
)

retriever = db.as_retriever(search_kwargs={"k": 4})
print("FAISS retriever ready.")


# ---------------------------------------
# Use HuggingFace Transformers (CPU-friendly)
# ---------------------------------------
model_name = "google/flan-t5-base"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

hf_pipeline = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=512,
    temperature=0.0
)

llm = HuggingFacePipeline(pipeline=hf_pipeline)


# ---------------------------------------
# Prompt for RAG
# ---------------------------------------
prompt = ChatPromptTemplate.from_template("""
You are a legal contract analysis assistant.

Use ONLY the context provided below to answer the user question.
If the answer is not found in the retrieved clauses, say:
"The contract does not contain this information."

Context:
{context}

Question:
{question}

Answer in clear legal language.
""")


# ---------------------------------------
# Format docs
# ---------------------------------------
def format_docs(docs):
    return "\n\n".join([d.page_content for d in docs])


# ---------------------------------------
# Build RAG chain
# ---------------------------------------
rag_chain = (
    {
        "context": retriever | format_docs,
        "question": RunnablePassthrough()
    }
    | prompt
    | llm
    | StrOutputParser()
)

print("RAG chain with HuggingFace Transformer ready.")


Loading FAISS from: /Users/samarthsingh/PycharmProjects/conrad_law_llm_chatbot/notebook/vectorstores/contractnli_faiss
FAISS retriever ready.


Device set to use mps:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
  llm = HuggingFacePipeline(pipeline=hf_pipeline)


RAG chain with HuggingFace Transformer ready.


In [13]:
response = rag_chain.invoke("What does the NDA say about confidentiality obligations?")
print(response)


The obligations of the Parties under this Agreement shall be in addition to and not in lieu of any obligations under other confidentiality agreement(s) or obligations of confidence between the Parties and/or between the Parties, solely or jointly, and/or any third party.


In [None]:
# use  chainlit run src/chainlit_app.py -w to execute