In [5]:
%pip install -U \
  langchain \
  langchain-community \
  langchain-google-genai \
  faiss-cpu \
  pandas \
  dotenv


Note: you may need to restart the kernel to use updated packages.


In [None]:
import os
os.environ["GOOGLE_API_KEY"] =""

In [7]:
import os
import json
import pandas as pd
from dotenv import load_dotenv

from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_google_genai import (
    ChatGoogleGenerativeAI,
    GoogleGenerativeAIEmbeddings,
)
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

load_dotenv()


DATA_PATH = "data/legal_cases.csv"
INDEX_DIR = "data/legal_faiss_index"

# LLM + Embeddings (Gemini)
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    temperature=0.2,
    convert_system_message_to_human=True,
)

embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")


In [8]:
def load_legal_cases(path: str) -> pd.DataFrame:
    df = pd.read_csv(path)

    required_cols = [
        "case_id",
        "case_name",
        "Facts",
        "Issues",
        "Analysis of the Law",
        "Petitioner's Arguments",
        "Respondent's Arguments",
        "Court's Reasoning",
        "Conclusion",
    ]
    for c in required_cols:
        if c not in df.columns:
            raise ValueError(f"Missing column: {c}")

    # avoid NaN
    for c in required_cols:
        df[c] = df[c].fillna("")

    def build_full_text(row):
        parts = [
            f"Case Name: {row['case_name']}",
            "",
            "Facts:",
            row["Facts"],
            "",
            "Issues:",
            row["Issues"],
            "",
            "Analysis of the Law:",
            row["Analysis of the Law"],
            "",
            "Petitioner's Arguments:",
            row["Petitioner's Arguments"],
            "",
            "Respondent's Arguments:",
            row["Respondent's Arguments"],
            "",
            "Court's Reasoning:",
            row["Court's Reasoning"],
            "",
            "Conclusion:",
            row["Conclusion"],
        ]
        return "\n".join(parts)

    df["full_text"] = df.apply(build_full_text, axis=1)

    # you don’t have these, but we add simple defaults
    df["court"] = "Unknown Court"
    df["date"] = "Unknown Date"
    df["url"] = "N/A"
    df["case_id"] = df["case_id"].astype(str)

    return df[["case_id", "case_name", "court", "date", "url", "full_text"]]

df_cases = load_legal_cases(DATA_PATH)
df_cases.head()


Unnamed: 0,case_id,case_name,court,date,url,full_text
0,,,Unknown Court,Unknown Date,,Case Name: \n\nFacts:\n\n\nIssues:\n\n\nAnalys...
1,,,Unknown Court,Unknown Date,,Case Name: \n\nFacts:\n\n\nIssues:\n\n\nAnalys...
2,1.0,Imax Corporation vs E-City Entertainment (I) P...,Unknown Court,Unknown Date,,Case Name: Imax Corporation vs E-City Entertai...
3,,,Unknown Court,Unknown Date,,Case Name: \n\nFacts:\n4. I have collated the ...
4,,,Unknown Court,Unknown Date,,Case Name: \n\nFacts:\nIMAX Limited was incorp...


In [9]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=600,
    chunk_overlap=100,
    separators=["\n\n", "\n", ". ", " ", ""],
)

def build_documents_from_df(df: pd.DataFrame):
    docs = []
    for _, row in df.iterrows():
        base_meta = {
            "case_id": row["case_id"],
            "case_name": row["case_name"],
            "court": row["court"],
            "date": row["date"],
            "url": row["url"],
        }
        chunks = text_splitter.split_text(row["full_text"])
        for i, chunk in enumerate(chunks):
            meta = base_meta.copy()
            meta["chunk_index"] = i
            docs.append(Document(page_content=chunk, metadata=meta))
    return docs

docs = build_documents_from_df(df_cases)
print(f"Total chunks: {len(docs)}")
print(docs[0])


Total chunks: 3640
page_content='Case Name: 

Facts:


Issues:


Analysis of the Law:


Petitioner's Arguments:


Respondent's Arguments:


Court's Reasoning:


Conclusion:' metadata={'case_id': '', 'case_name': '', 'court': 'Unknown Court', 'date': 'Unknown Date', 'url': 'N/A', 'chunk_index': 0}


In [10]:
# from langchain_community.vectorstores import FAISS

# def build_or_load_vectorstore():
#     if os.path.exists(INDEX_DIR):
#         print(f"Loading existing index from {INDEX_DIR}")
#         vs = FAISS.load_local(
#             INDEX_DIR,
#             embeddings,
#             allow_dangerous_deserialization=True,
#         )
#         return vs

#     print("Building new vectorstore...")
#     docs = build_documents_from_df(df_cases)
#     vs = FAISS.from_documents(docs, embeddings)
#     os.makedirs(INDEX_DIR, exist_ok=True)
#     vs.save_local(INDEX_DIR)
#     print(f"Saved index to {INDEX_DIR}")
#     return vs

# vectorstore = build_or_load_vectorstore()
# retriever = vectorstore.as_retriever(search_kwargs={"k": 3})  # top 3 chunks


In [11]:
%pip install -U langchain-core


Note: you may need to restart the kernel to use updated packages.


In [12]:
from langchain_core.embeddings import DeterministicFakeEmbedding

embeddings = DeterministicFakeEmbedding(size=1536)


In [13]:
from langchain_community.vectorstores import FAISS
def build_or_load_vectorstore():
  if os.path.exists(INDEX_DIR):
        print(f"Loading existing index from {INDEX_DIR}")
        vs = FAISS.load_local(
            INDEX_DIR,
            embeddings,
            allow_dangerous_deserialization=True,
        )
        return vs
  docs = build_documents_from_df(df_cases)
  vs = FAISS.from_documents(docs, embeddings)
  os.makedirs(INDEX_DIR, exist_ok=True)
  vs.save_local(INDEX_DIR)
  print(f"Saved index to {INDEX_DIR}")
  return vs

vectorstore = build_or_load_vectorstore()
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})  # top 3 chunks

Saved index to data/legal_faiss_index


In [14]:
query = "tenant eviction without proper notice"
results = retriever.invoke(query)
for i, d in enumerate(results, start=1):
    print(f"\n=== RESULT {i} ===")
    print("Case:", d.metadata["case_name"])
    print("Excerpt:", d.page_content[:300], "...")



=== RESULT 1 ===
Case: 
Excerpt: Case Name: 

Facts:


Issues:


Analysis of the Law:
33. Mr. Khambata has submitted that in any case, the Rules are a piece of subordinate legislation. They cannot be considered to control the provisions of the Act especially if they cause conflict or absurdity in reading of the substantive provisio ...

=== RESULT 2 ===
Case: 
Excerpt: Court's Reasoning:
It further took a specific stand that even if the letter dated 28/11/2000 gave rise to legally binding obligations, the agreement was vitiated by misrepresentation and performance of such agreement has been terminated by abandonment or acquiescence by both the parties.

Conclusion ...

=== RESULT 3 ===
Case: 
Excerpt: . It being a well settled principle, right from the decision of the Apex Court in Renusagar (supra) that violation of Foreign Exchange Act and disregarding the orders of superior courts in India, would be regarded as being contrary to the fundamental policy of Indian Law, since the manda

In [15]:
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

def format_context(docs):
    lines = []
    for i, d in enumerate(docs, start=1):
        m = d.metadata
        lines.append(
            f"[CASE {i}] {m.get('case_name', 'Unknown')} "
            f"({m.get('court', 'Unknown Court')}, {m.get('date', 'Unknown Date')})\n"
            f"Excerpt:\n{d.page_content}\n"
            "-----"
        )
    return "\n".join(lines)

rag_prompt = PromptTemplate(
    input_variables=["question", "context"],
    template=(
        "You are a legal assistant for Indian case law. You will answer based ONLY "
        "on the provided case excerpts.\n\n"
        "If you are not sure, say you are not sure and suggest consulting a lawyer.\n\n"
        "Context (case excerpts):\n"
        "{context}\n\n"
        "User question:\n"
        "{question}\n\n"
        "Instructions:\n"
        "- Use only the information in the context.\n"
        "- Mention the relevant case names when explaining.\n"
        "- If you add your own general interpretation, mark it clearly.\n"
        "- End with: 'This is not formal legal advice.'\n\n"
        "Answer:\n"
    ),
)

rag_chain = (
    rag_prompt
    | llm
    | StrOutputParser()
)


In [16]:
def ask_legal_assistant(question: str) -> str:
    docs = retriever.invoke(question)

    if not docs:
        return (
            "I could not find any relevant cases in the current dataset for this question. "
            "Please consult a lawyer or a larger legal database. "
            "This is not formal legal advice."
        )

    context = format_context(docs)

    answer = rag_chain.invoke({"question": question, "context": context})
    return answer


In [18]:
questions = [
    "Can a company that owns sound recordings issue public performance licences without being registered as a copyright society under Section 33 of the Copyright Act?",
    "If a tenant is living for more than 10 years and the landlord tries to evict them without notice, what are the tenant’s rights under Indian law?",
]

for q in questions:
    print("QUESTION:", q)
    print()
    print(ask_legal_assistant(q))
    print("\n\n")


QUESTION: Can a company that owns sound recordings issue public performance licences without being registered as a copyright society under Section 33 of the Copyright Act?

Based on the provided case excerpts, there is no information regarding Section 33 of the Copyright Act, copyright societies, or the ability of a company that owns sound recordings to issue public performance licenses.

Therefore, I am not sure and suggest consulting a lawyer.

This is not formal legal advice.



QUESTION: If a tenant is living for more than 10 years and the landlord tries to evict them without notice, what are the tenant’s rights under Indian law?

Based ONLY on the provided case excerpts, there is no information regarding a tenant's rights under Indian law if a landlord tries to evict them without notice after more than 10 years of tenancy. The excerpts discuss:

*   **[CASE 1]**: A show cause notice under Section 25 of the MVAT Act and Rule 30 of the Maharashtra Value Added Tax Rules, 2005, concer

In [20]:
questions = [
    "What limitation period did the Court apply for filing a petition to enforce the foreign arbitral awards in IMAX Corporation vs E-City Entertainment (I) Pvt. Ltd., and was the petition held to be within time?",
    "In the IMAX vs E-City Entertainment case, did the Court consider the absence of RBI/FEMA approval as a ground of “public policy of India” to refuse enforcement of the foreign arbitral awards under Section 48 of the Arbitration and Conciliation Act, 1996?",
]

for q in questions:
    print("QUESTION:", q)
    print()
    print(ask_legal_assistant(q))
    print("\n\n")


QUESTION: What limitation period did the Court apply for filing a petition to enforce the foreign arbitral awards in IMAX Corporation vs E-City Entertainment (I) Pvt. Ltd., and was the petition held to be within time?

Based on the provided case excerpts, there is no information regarding the case of IMAX Corporation vs E-City Entertainment (I) Pvt. Ltd., nor any discussion about the limitation period applied for filing a petition to enforce foreign arbitral awards or whether such a petition was held to be within time.

I am not sure. I suggest consulting a lawyer.

This is not formal legal advice.



QUESTION: In the IMAX vs E-City Entertainment case, did the Court consider the absence of RBI/FEMA approval as a ground of “public policy of India” to refuse enforcement of the foreign arbitral awards under Section 48 of the Arbitration and Conciliation Act, 1996?

Based on the provided case excerpts, there is no information about the "IMAX vs E-City Entertainment" case, nor any discussio