<a href="https://colab.research.google.com/github/sagarj1209/Legal_chatbot_RAG/blob/master/Legal_chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install langchain faiss-cpu transformers chromadb streamlit pypdf



In [2]:
import json

# Load the 3 legal documents
def load_json(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)
    return data

# Upload your JSON files
file_paths = ["/content/constitution_qa.json", "/content/crpc_qa.json", "/content/ipc_qa.json"]  # Change these paths accordingly

# Load all documents into a list
legal_data = []
for file_path in file_paths:
    data = load_json(file_path)
    legal_data.extend(data)  # Assuming each document is a list of Q&A pairs

# Display the first few records
legal_data[:4]


[{'question': 'What is India according to the Union and its Territory?',
  'answer': 'India, that is Bharat, shall be a Union of States.'},
 {'question': 'How is India, that is Bharat, defined in terms of its political structure?',
  'answer': 'India, that is Bharat, is defined as a Union of States according to the Union and its Territory.'},
 {'question': 'What does the territory of India comprise of?',
  'answer': 'The territory of India shall comprise the territories of the States, the Union territories specified in the First Schedule, and such other territories as may be acquired.'},
 {'question': 'What does the territory of a country, such as India, comprise of, according to their constitutional provisions?',
  'answer': 'The territory of a country like India comprises the territories of the States, the Union territories specified in the First Schedule, and such other territories as may be acquired.'}]

In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Initialize the text splitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)

# Split each answer into smaller chunks
doc_chunks = []
for item in legal_data:
    answer_chunks = text_splitter.split_text(item['answer'])
    doc_chunks.extend(answer_chunks)

# Check how many chunks have been created
print(f"Total chunks created: {len(doc_chunks)}")

Total chunks created: 14691


In [4]:
!pip install -U langchain-community



In [17]:
from langchain.embeddings import HuggingFaceEmbeddings

# Use the sentence-transformers model
embeddings = HuggingFaceEmbeddings(model_name="nlpaueb/legal-bert-base-uncased")

# # Generate embeddings for the text chunks
# chunk_embeddings = embeddings.embed_documents(doc_chunks)

# # Display a sample embedding
# print(f"Sample embedding: {chunk_embeddings[0]}")

# Create FAISS Vector Store using LangChain
faiss_vectorstore = FAISS.from_texts(doc_chunks, embeddings)

# Save FAISS index correctly as a directory
faiss_vectorstore.save_local("/content/legal_faiss")



In [18]:
# import faiss
# import numpy as np
# from langchain.vectorstores import FAISS

# # Convert the embeddings to numpy array
# embedding_matrix = np.array(chunk_embeddings)

# # Create the FAISS index
# dimension = embedding_matrix.shape[1]  # Dimensionality of embeddings
# faiss_index = faiss.IndexFlatL2(dimension)  # Using L2 distance for similarity search

# # Add embeddings to FAISS index
# faiss_index.add(embedding_matrix)

# # Save the FAISS index for future use
# faiss.write_index(faiss_index, "/content/legal_faiss.index")

In [19]:
#Load FAISS Index Properly
faiss_vectorstore = FAISS.load_local("/content/legal_faiss", embeddings, allow_dangerous_deserialization=True)

In [20]:
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
from transformers import pipeline

# Load the language model for text generation (Flan-T5)
llm_pipeline = pipeline("text2text-generation", model="google/flan-t5-small", device=-1)
llm = HuggingFacePipeline(pipeline=llm_pipeline)

# # Set up the retriever to use FAISS index
# from langchain.vectorstores import FAISS
# faiss_vectorstore = FAISS.load_local("/content/legal_faiss.index", embeddings, allow_dangerous_deserialization=True)

# Create the RetrievalQA chain
qa_chain = RetrievalQA.from_chain_type(llm, retriever=faiss_vectorstore.as_retriever())

print("RetrievalQA Chain is ready! 🚀")

Device set to use cpu


RetrievalQA Chain is ready! 🚀


In [21]:
import streamlit as st

# Initialize the chatbot function
def chatbot(query):
    response = qa_chain.run(query)
    return response

# Streamlit app interface
st.title("Legal Document Chatbot")
st.write("Ask any legal question based on the uploaded documents.")

user_input = st.text_input("Enter your question:")

if user_input:
    response = chatbot(user_input)
    st.write("Answer:", response)

2025-02-06 17:48:16.790 
  command:

    streamlit run /usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
2025-02-06 17:48:16.808 Session state does not function when running a script without `streamlit run`


In [25]:
!pip install streamlit
!pip install pyngrok



In [69]:
%%writefile app.py
import streamlit as st

# Import your RAG pipeline and necessary modules
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
from transformers import pipeline

# Load the embeddings and vector database
@st.cache_resource
def load_vector_store():
    return FAISS.load_local("/content/legal_faiss", HuggingFaceEmbeddings(model_name="nlpaueb/legal-bert-base-uncased"),allow_dangerous_deserialization=True)

vector_store = load_vector_store()

# Load the language model
@st.cache_resource
def load_llm():
    llm_pipeline = pipeline("text2text-generation", model="google/flan-t5-small")
    return HuggingFacePipeline(pipeline=llm_pipeline)

llm = load_llm()


# Create a RAG-based Q&A chain
qa_chain = RetrievalQA.from_chain_type(llm, retriever=vector_store.as_retriever())

# Initialize the chatbot function
def chatbot(query):
    response = qa_chain.run(query)
    return response

# Streamlit app interface
st.title("Legal Document Chatbot")
st.write("Ask any legal question based on the uploaded documents.")

user_input = st.text_input("Enter your question:")

if user_input:
    response = chatbot(user_input)
    st.write("Answer:", response)

Overwriting app.py


In [70]:
from google.colab import userdata
NGROK_TOKEN = userdata.get('NGROK_TOKEN')

In [71]:
# Authenticate ngrok with your token
!ngrok authtoken $NGROK_TOKEN

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [72]:
!streamlit run app.py &>/dev/null &

In [76]:
from pyngrok import ngrok

# Kill existing tunnels if re-running
ngrok.kill()

# Open a new tunnel using http://localhost:8501
public_url = ngrok.connect("8501", "http", bind_tls=True)
print(f"Streamlit app is running at: {public_url}")

Streamlit app is running at: NgrokTunnel: "https://94bb-34-125-50-110.ngrok-free.app" -> "http://localhost:8501"
