In [18]:
import os
from dotenv import load_dotenv
from langchain_groq import ChatGroq
from langchain.prompts import PromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import DirectoryLoader, CSVLoader

In [19]:

load_dotenv()

groq_api_key = os.getenv("GROQ_API_KEY")
google_api_key = os.getenv("GOOGLE_API_KEY")

embadding_model = HuggingFaceEmbeddings(
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
)

llm = ChatGroq(
    model="llama-3.1-8b-instant",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
    # other params...
    api_key= groq_api_key
)


In [20]:
loader = DirectoryLoader(
    path="D:/legal_clause/",
    glob="**/*.csv",
    loader_cls=lambda path: CSVLoader(file_path=path, encoding='utf-8'),  # or "cp1252", "ISO-8859-1"
    show_progress=True
)


docs = loader.load()
len(docs)

  0%|          | 0/395 [00:00<?, ?it/s]

100%|██████████| 395/395 [00:06<00:00, 62.26it/s]


150881

In [40]:
print(docs[0])

page_content='plan, payment or arrangement for any such Highly Compensated Employee; (v) labor dispute or, to the' metadata={'source': 'D:\\legal_clause\\absence-of-certain-changes.csv', 'row': 14}


In [21]:
first_100_docs = docs[:500]

for i, doc in enumerate(first_100_docs):
    print(f"\nDocument {i+1}")
    print("Source:", doc.metadata.get("source"))
    print("Content:", doc.page_content[:150])  # Print first 150 characters



Document 1
Source: D:\legal_clause\absence-of-certain-changes-or-events.csv
Content: clause_text: Absence of Certain Changes or Events. Except as set forth in Section 4.07 of the Disclosure Schedule, from January 26, 1997 to the date o

Document 2
Source: D:\legal_clause\absence-of-certain-changes-or-events.csv
Content: clause_text: Absence of Certain Changes or Events. Since March 31, 1997, except ------------------------------------ as disclosed in the Source Financ

Document 3
Source: D:\legal_clause\absence-of-certain-changes-or-events.csv
Content: clause_text: Absence of Certain Changes or Events. (a) Since December 31, 2017, no event or events have occurred that have had or would reasonably be 

Document 4
Source: D:\legal_clause\absence-of-certain-changes-or-events.csv
Content: clause_text: Absence of Certain Changes or Events. Since the date of the Company Balance Sheet there has not been any Company Material Adverse Effect 

Document 5
Source: D:\legal_clause\absence-of-certa

In [22]:
from collections import Counter

clause_types = []

for doc in first_100_docs:
    content = doc.page_content
    clause_type = content.split(".")[0].replace("clause_text:", "").strip()
    clause_types.append(clause_type)

# Apply value counts (like pandas)
type_counts = Counter(clause_types)

# Sort like pandas' value_counts (descending)
sorted_counts = dict(sorted(type_counts.items(), key=lambda item: item[1], reverse=True))

# Print results
print("Clause Type Value Counts:\n")
for clause_type, count in sorted_counts.items():
    print(f"{clause_type}: {count}")


Clause Type Value Counts:

Absence of Certain Changes or Events: 350
Absence of Certain Changes: 150


In [23]:
docs[:3000]

[Document(metadata={'source': 'D:\\legal_clause\\absence-of-certain-changes-or-events.csv', 'row': 0}, page_content="clause_text: Absence of Certain Changes or Events. Except as set forth in Section 4.07 of the Disclosure Schedule, from January 26, 1997 to the date of this Agreement, the Company has conducted its business only in the ordinary course, and there has not been (i) any Company Material Adverse Effect, (ii) except for regular quarterly dividends payable, any declaration, setting aside or payment of any dividend or other distribution (whether in cash, Stock or property) with respect to the Common Stock, (iii) any split, combination or reclassification of any of its capital stock or any issuance or the authorization of any issuance of any other securities in respect of, in lieu of or in substitution for shares of its capital stock, (iv) (A) any granting by the Company or any of its Significant Subsidiaries to any executive officer of the Company or any Significant Subsidiaries

In [24]:
# os.environ["GOOGLE_API_KEY"] = google_api_key

# embedding_model = GoogleGenerativeAIEmbeddings(
#     model="models/embedding-001",  # this is the correct embedding model name
# )

In [25]:
doc_spliter = RecursiveCharacterTextSplitter(
     chunk_size = 100,
     chunk_overlap = 15,
)

docs_chunks = doc_spliter.split_documents(documents = docs[:2500])
docs_chunks

[Document(metadata={'source': 'D:\\legal_clause\\absence-of-certain-changes-or-events.csv', 'row': 0}, page_content='clause_text: Absence of Certain Changes or Events. Except as set forth in Section 4.07 of the'),
 Document(metadata={'source': 'D:\\legal_clause\\absence-of-certain-changes-or-events.csv', 'row': 0}, page_content='4.07 of the Disclosure Schedule, from January 26, 1997 to the date of this Agreement, the Company'),
 Document(metadata={'source': 'D:\\legal_clause\\absence-of-certain-changes-or-events.csv', 'row': 0}, page_content='the Company has conducted its business only in the ordinary course, and there has not been (i) any'),
 Document(metadata={'source': 'D:\\legal_clause\\absence-of-certain-changes-or-events.csv', 'row': 0}, page_content='been (i) any Company Material Adverse Effect, (ii) except for regular quarterly dividends payable,'),
 Document(metadata={'source': 'D:\\legal_clause\\absence-of-certain-changes-or-events.csv', 'row': 0}, page_content='payable, any 

In [26]:
db = Chroma.from_documents(documents = docs_chunks, embedding = embadding_model)

In [27]:
retriever = db.as_retriever(
    search_type="mmr",
        search_kwargs={'k': 15, 'fetch_k': 50}
    
)

In [28]:
query = "What is the termination clause in the contract?"
docs = retriever.get_relevant_documents(query)
docs

  docs = retriever.get_relevant_documents(query)


[Document(metadata={'source': 'D:\\legal_clause\\access.csv', 'row': 2}, page_content='Agreement and continuing until the earlier to occur of the termination of this Agreement pursuant'),
 Document(metadata={'source': 'D:\\legal_clause\\access-to-information.csv', 'row': 9}, page_content='Date and the termination of this Agreement in accordance with its terms, the Sellers shall cause'),
 Document(metadata={'row': 330, 'source': 'D:\\legal_clause\\additional-agreements.csv'}, page_content='of the reason for termination, it is agreed:'),
 Document(metadata={'row': 99, 'source': 'D:\\legal_clause\\access.csv'}, page_content='Closing Date (or earlier termination of this Agreement) but subject to (i) the other provisions of'),
 Document(metadata={'source': 'D:\\legal_clause\\acceleration.csv', 'row': 237}, page_content='terminate and the Obligations shall immediately become due and payable without presentment, demand,'),
 Document(metadata={'source': 'D:\\legal_clause\\access.csv', 'row': 1

In [29]:
context = "\n\n".join([doc.page_content for doc in docs])
context

"Agreement and continuing until the earlier to occur of the termination of this Agreement pursuant\n\nDate and the termination of this Agreement in accordance with its terms, the Sellers shall cause\n\nof the reason for termination, it is agreed:\n\nClosing Date (or earlier termination of this Agreement) but subject to (i) the other provisions of\n\nterminate and the Obligations shall immediately become due and payable without presentment, demand,\n\nprovided by Seller to Buyer for a period of twelve (12) months from the date of such termination.\n\ntermination of this Agreement in accordance with the provisions of Article IX, and in each case\n\nobligations under this Section 5(a) shall survive the termination of this Agreement and the\n\nthe earlier of the termination of this Agreement and the Closing, (i) the Company shall afford\n\nwith the execution of this Agreement shall not have been terminated and shall remain in full force\n\ntermination and remain in full force and effect.\n

In [None]:
# from langchain.chains import RetrievalQA

# qa_chain = RetrievalQA.from_chain_type(
#     llm=llm,
#     retriever=retriever,
#     chain_type="stuff"  # or "map_reduce" / "refine"
# )
# query = "What is the termination clause in the contract?"
# response = qa_chain.run(query)
# print(response)


In [30]:
def ClassifyQuery(query):
   
    prompt = PromptTemplate(

        input_variables=["user_query"],
        template="""
            You are a legal AI assistant that classifies user queries based on their intent.

            Your task is to analyze any kind of user query — whether short, long, or complex — and classify it into **exactly one** of the following three categories:

            1. DefinitionQuery → If the user is asking for the meaning, explanation, or definition of a legal term or concept.
            2. ClauseRetrieval → If the user is asking to find, show, or understand a specific clause from a contract or document.
            3. ComparativeAnalysis → If the user is asking to compare clauses or legal elements between two or more documents.

            Return only one of these three words exactly: `DefinitionQuery`, `ClauseRetrieval`, or `ComparativeAnalysis`.  
            Do not provide any explanation.

            ---

            User Query: "{user_query}"

            Answer:

    """
    )

    parser = StrOutputParser()

    intent_classifier = prompt | llm | parser
    intent = intent_classifier.invoke({"user_query": query})
    
    return intent


In [31]:
definition_prompt = PromptTemplate(
    template="You are a legal assistant. Define the following legal term **strictly using the provided context**\
             only. Do not use external knowledge.:\n\nQuery: {user_query}\n\nContext:\n{context}",
    input_variables=["user_query", "context"]
)

clause_prompt = PromptTemplate(
    template="Find and explain the clause mentioned in the query using **only the provided context**.\
             Do not guess or use outside knowledge.:\n\nQuery: {user_query}\n\nContext:\n{context}",
    input_variables=["user_query", "context"]
)

comparison_prompt = PromptTemplate(
    template="You are a legal assistant. Compare the clauses mentioned in the query using **only the content provided in the context**.\
    Do not include any external assumptions. If one or more clauses are missing in the context, say so clearl. \
    :\n\nQuery: {user_query}\n\nContext:\n{context}",
    input_variables=["user_query", "context"]
)


In [32]:
from langchain.schema.runnable import RunnableLambda, RunnableBranch
parser = StrOutputParser()

branch_chain = RunnableBranch(
    (lambda x: x["intent"] == "DefinitionQuery", definition_prompt | llm | parser),
    (lambda x: x["intent"] == "ClauseRetrieval", clause_prompt | llm | parser),
    (lambda x: x["intent"] == "ComparativeAnalysis", comparison_prompt | llm | parser),
    RunnableLambda(lambda x: "Unknown intent")
)


In [33]:
final_chain = RunnableLambda(
    lambda x: {
        "user_query": x["user_query"],
        "intent": ClassifyQuery(x["user_query"]),
        "context": "\n\n".join([doc.page_content for doc in retriever.get_relevant_documents(x["user_query"])]),
    }
) | branch_chain


In [34]:
query = """"In both the employment agreement and the freelance consultancy contract, there are arbitration clauses governing dispute resolution. I’d like a detailed comparison focusing on the arbitration mechanisms used, the governing law and jurisdiction, and any clauses related to enforcement of arbitration awards. Highlight any major discrepancies in the arbitration approach between the two agreements."
"""
result = final_chain.invoke({"user_query": query})
print(result)


Based on the provided context, there are no explicit clauses mentioned for comparison. However, there are mentions of arbitration clauses in the employment agreement and the freelance consultancy contract.

The context mentions the following arbitration-related information:

1. The employment agreement has an arbitration clause (Mutual Agreement to Arbitrate Claims) attached to it.
2. The employment agreement also mentions arbitration in the context of labor disputes and litigation.
3. The employment agreement mentions arbitration in the context of disputes between the parties, including labor disputes.

However, there is no explicit mention of the arbitration mechanisms used, the governing law and jurisdiction, or any clauses related to enforcement of arbitration awards in the provided context.

Additionally, the freelance consultancy contract is mentioned, but there is no explicit mention of its arbitration clause or any related information.

Therefore, a detailed comparison of the a

In [None]:
# def debug_and_prepare_input(x):
#     user_query = x["user_query"]
#     intent = ClassifyQuery(user_query)
#     context_docs = retriever.get_relevant_documents(user_query)
#     context = "\n\n".join([doc.page_content for doc in context_docs])

#     # 👇 Print debug info
#     print("🟡 DEBUG INFO")
#     print("🔍 Query:", user_query)
#     print("🧠 Intent:", intent)
#     print("📄 Context Sample:", context[:500], "...")  # print first 500 chars

#     return {
#         "user_query": user_query,
#         "intent": intent,
#         "context": context
#     }

# # Use it in final_chain
# from langchain.schema.runnable import RunnableLambda

# final_chain = RunnableLambda(debug_and_prepare_input) | branch_chain
# result = final_chain.invoke({"user_query": query})
# print("\n✅ Final Answer:\n", result)


In [35]:
docs = retriever.get_relevant_documents(query)
docs

[Document(metadata={'source': 'D:\\legal_clause\\absence-of-certain-changes.csv', 'row': 14}, page_content='plan, payment or arrangement for any such Highly Compensated Employee; (v) labor dispute or, to the'),
 Document(metadata={'row': 30, 'source': 'D:\\legal_clause\\additional-agreements.csv'}, page_content='and Work Product Assignment Agreement and Mutual Agreement to Arbitrate Claims, which is attached'),
 Document(metadata={'source': 'D:\\legal_clause\\absence-of-certain-changes-or-events.csv', 'row': 130}, page_content='other than the negotiation, execution and delivery of this Agreement and ancillary matters related'),
 Document(metadata={'source': 'D:\\legal_clause\\access-to-information.csv', 'row': 104}, page_content='dispute or litigation, complying with their obligations under this Agreement or any Ancillary'),
 Document(metadata={'row': 195, 'source': 'D:\\legal_clause\\absence-of-certain-changes-or-events.csv'}, page_content='practices and those contemplated by this Agr

In [36]:
import os
from dotenv import load_dotenv
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain.prompts import PromptTemplate
from langchain_community.vectorstores import FAISS
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnableLambda, RunnableBranch

load_dotenv()

class QueryRouter:
    def __init__(self, db_path="faiss_index", llm_name="models/gemini-1.5-flash", embedding_model="models/embedding-001"):
        gemini_api_key = os.getenv("GOOGLE_API_KEY")

        # Use Google Generative AI Embeddings
        self.embedding = GoogleGenerativeAIEmbeddings(
            model=embedding_model,
            google_api_key=gemini_api_key
        )

        # Use Gemini Flash via ChatGoogleGenerativeAI
        self.llm = ChatGoogleGenerativeAI(
            model=llm_name,
            temperature=0,
            max_tokens=None,
            google_api_key=gemini_api_key
        )

In [37]:
import pandas as pd

pd.read_csv('D://legal_clause/tax-returns.csv')

Unnamed: 0,clause_text,clause_type
0,SPECIAL TERMS AND CONDITIONS OF TRUST. The fol...,tax-returns
1,SPECIAL TERMS AND CONDITIONS OF TRUST. Section...,tax-returns
2,SPECIAL TERMS AND CONDITIONS OF TRUST. The fol...,tax-returns
3,SPECIAL TERMS AND CONDITIONS OF TRUST. SENIOR ...,tax-returns
4,SPECIAL TERMS AND CONDITIONS OF TRUST. FLORIDA...,tax-returns
...,...,...
135,SPECIAL TERMS AND CONDITIONS OF TRUST. SEMICON...,tax-returns
136,SPECIAL TERMS AND CONDITIONS OF TRUST. MUNICIP...,tax-returns
137,SPECIAL TERMS AND CONDITIONS OF TRUST. DIVIDEN...,tax-returns
138,SPECIAL TERMS AND CONDITIONS OF TRUST. DOW(R) ...,tax-returns


In [38]:
import os
import glob
import pandas as pd
from langchain.docstore.document import Document

In [39]:

# Step 1: Define path to CSV directory
csv_dir = "D:/legal_clause/"

# Step 2: Load and process all CSVs
documents = []                  # For vector DB: includes page_content + clause_type
documents_metadata_only = []   # For MySQL: only metadata (no content)

for csv_file in glob.glob(os.path.join(csv_dir, "*.csv")):
    df = pd.read_csv(csv_file)
    for idx, row in df.iterrows():
        if pd.notnull(row['clause_text']) and pd.notnull(row['clause_type']):
            # Document for vector DB
            documents.append(
                Document(
                    page_content=row['clause_text'],
                    metadata={"clause_type": row['clause_type']}
                )
            )

            # Metadata-only document (for MySQL)
            documents_metadata_only.append({
                "clause_type": row['clause_type'],
                "source": csv_file,
                "row": idx
            })


In [None]:
documents

[Document(metadata={'clause_type': 'limitations'}, page_content='Limitations. Without the consent of each Securityholder affected, an amendment or waiver may not:'),
 Document(metadata={'clause_type': 'limitations'}, page_content='Limitations. FOR BREACH OF ANY PROVISION FOR WHICH AN EXPRESS REMEDY OR MEASURE OF DAMAGES IS PROVIDED, SUCH EXPRESS REMEDY OR MEASURE OF DAMAGES SHALL BE THE SOLE AND EXCLUSIVE REMEDY. A PARTY’S LIABILITY HEREUNDER SHALL BE LIMITED AS SET FORTH IN SUCH PROVISION, AND ALL OTHER REMEDIES OR DAMAGES AT LAW OR IN EQUITY ARE WAIVED. IF NO REMEDY OR MEASURE OF DAMAGES IS EXPRESSLY PROVIDED HEREIN OR IN A TRANSACTION, A PARTY’S LIABILITY SHALL BE LIMITED TO DIRECT ACTUAL DAMAGES ONLY. SUCH DIRECT ACTUAL DAMAGES SHALL BE THE SOLE AND EXCLUSIVE REMEDY, AND ALL OTHER REMEDIES OR DAMAGES AT LAW OR IN EQUITY ARE WAIVED. UNLESS EXPRESSLY HEREIN PROVIDED, NEITHER PARTY SHALL BE LIABLE FOR CONSEQUENTIAL, INCIDENTAL, PUNITIVE, EXEMPLARY OR INDIRECT DAMAGES, LOST PROFITS OR 

In [None]:
documents_metadata_only

[{'clause_type': 'limitations',
  'source': '/home/ubaid-ur-rehman/Downloads/legal_clause/limitations.csv',
  'row': 0},
 {'clause_type': 'limitations',
  'source': '/home/ubaid-ur-rehman/Downloads/legal_clause/limitations.csv',
  'row': 1},
 {'clause_type': 'limitations',
  'source': '/home/ubaid-ur-rehman/Downloads/legal_clause/limitations.csv',
  'row': 2},
 {'clause_type': 'limitations',
  'source': '/home/ubaid-ur-rehman/Downloads/legal_clause/limitations.csv',
  'row': 3},
 {'clause_type': 'limitations',
  'source': '/home/ubaid-ur-rehman/Downloads/legal_clause/limitations.csv',
  'row': 4},
 {'clause_type': 'limitations',
  'source': '/home/ubaid-ur-rehman/Downloads/legal_clause/limitations.csv',
  'row': 5},
 {'clause_type': 'limitations',
  'source': '/home/ubaid-ur-rehman/Downloads/legal_clause/limitations.csv',
  'row': 6},
 {'clause_type': 'limitations',
  'source': '/home/ubaid-ur-rehman/Downloads/legal_clause/limitations.csv',
  'row': 7},
 {'clause_type': 'limitations',


Update

In [None]:
import os
import glob
import pandas as pd
from langchain.docstore.document import Document
path = "/home/ubaid-ur-rehman/Downloads/legal_clause/waiver.csv"
df = pd.read_csv(path)

documents = []                  # For vector DB: includes page_content + clause_type
documents_metadata_only = []   # For MySQL: only metadata (no content)

for idx, row in df.iterrows():
    if pd.notnull(row['clause_text']) and pd.notnull(row['clause_type']):
        # Document for vector DB
        documents.append(
            Document(
                page_content=row['clause_text'],
                metadata={"clause_type": row['clause_type']}
            )
        )

        # Metadata-only document (for MySQL)
        documents_metadata_only.append({
            "clause_type": row['clause_type'],
            "source": path,
            "row": idx
        })

In [None]:
len(documents_metadata_only)

400

In [None]:
len(documents)

400

In [None]:

for idx, row in df.iterrows():
    if pd.notnull(row['clause_text']) and pd.notnull(row['clause_type']):
        # Document for vector DB
        documents.append(
            Document(
                page_content=row['clause_text'],
                metadata={"clause_type": row['clause_type']}
            )
        )

        # Metadata-only document (for MySQL)
        documents_metadata_only.append({
            "clause_type": row['clause_type'],
            "source": "/home/ubaid-ur-rehman/Downloads/legal_clause/waiver.csv",  # fixed
            "row": idx
        })


[{'clause_type': 'waiver',
  'source': '/home/ubaid-ur-rehman/Downloads/legal_clause/waiver.csv',
  'row': 0},
 {'clause_type': 'waiver',
  'source': '/home/ubaid-ur-rehman/Downloads/legal_clause/waiver.csv',
  'row': 1},
 {'clause_type': 'waiver',
  'source': '/home/ubaid-ur-rehman/Downloads/legal_clause/waiver.csv',
  'row': 2},
 {'clause_type': 'waiver',
  'source': '/home/ubaid-ur-rehman/Downloads/legal_clause/waiver.csv',
  'row': 3},
 {'clause_type': 'waiver',
  'source': '/home/ubaid-ur-rehman/Downloads/legal_clause/waiver.csv',
  'row': 4},
 {'clause_type': 'waiver',
  'source': '/home/ubaid-ur-rehman/Downloads/legal_clause/waiver.csv',
  'row': 5},
 {'clause_type': 'waiver',
  'source': '/home/ubaid-ur-rehman/Downloads/legal_clause/waiver.csv',
  'row': 6},
 {'clause_type': 'waiver',
  'source': '/home/ubaid-ur-rehman/Downloads/legal_clause/waiver.csv',
  'row': 7},
 {'clause_type': 'waiver',
  'source': '/home/ubaid-ur-rehman/Downloads/legal_clause/waiver.csv',
  'row': 8},
 

In [None]:
import os
from dotenv import load_dotenv
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings

load_dotenv()

google_api_key = os.getenv("GOOGLE_API_KEY")

# ✅ Use Gemini-compatible embeddings
embedding_model = GoogleGenerativeAIEmbeddings(
    model="models/embedding-004",  # default Google embedding model
    google_api_key=google_api_key
)

# ✅ Use Gemini Flash model as the LLM
llm_google = ChatGoogleGenerativeAI(
    model="models/gemini-2.0-flash",  # or "models/gemini-1.5-pro" for full version
    temperature=0,
    max_tokens=None,
    google_api_key=google_api_key
)


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
llm_google.invoke("hi")

AIMessage(content='Hi there! How can I help you today?', additional_kwargs={}, response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'safety_ratings': []}, id='run--f0dd4e8b-77b8-4a90-9e7a-73776fcb0102-0', usage_metadata={'input_tokens': 1, 'output_tokens': 11, 'total_tokens': 12, 'input_token_details': {'cache_read': 0}})

In [None]:
llm.invoke("hi")

AIMessage(content='How can I assist you today?', additional_kwargs={}, response_metadata={'token_usage': {'completion_tokens': 8, 'prompt_tokens': 36, 'total_tokens': 44, 'completion_time': 0.010666667, 'prompt_time': 0.002066503, 'queue_time': 0.048546229, 'total_time': 0.01273317}, 'model_name': 'llama-3.1-8b-instant', 'system_fingerprint': 'fp_510c177af0', 'finish_reason': 'stop', 'logprobs': None}, id='run--9b86cce2-78f6-402f-a059-970d54e1b695-0', usage_metadata={'input_tokens': 36, 'output_tokens': 8, 'total_tokens': 44})