In [1]:
import os

from dotenv import load_dotenv
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_community.vectorstores import Chroma

In [2]:
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [3]:
from langchain.embeddings import HuggingFaceEmbeddings
class Document:
    def __init__(self, page_content, metadata):
        self.page_content = page_content
        self.metadata = metadata

class TextLoader:
    def __init__(self, filepath):
        self.filepath = filepath

    def load(self):
        with open(self.filepath, 'r', encoding='utf-8') as file:
            # Create Document objects instead of dictionaries
            return [Document(line.strip(), {}) for line in file if line.strip()]

load_dotenv()
current_dir = os.getcwd()
cases_dir = os.path.join(current_dir, "cases")
db_dir = os.path.join(current_dir, "db")
persistent_directory = os.path.join(db_dir, "chroma_db_with_metadata")

if not os.path.exists(persistent_directory):
    print("Persistent directory does not exist. Initializing vector store...")
    if not os.path.exists(cases_dir):
        raise FileNotFoundError(f"The directory {cases_dir} does not exist. Please check the path.")
    
    case_files = [f for f in os.listdir(cases_dir) if f.endswith(".txt")]
    documents = []
    for file in case_files:
        file_path = os.path.join(cases_dir, file)
        loader = TextLoader(file_path)
        book_docs = loader.load()
        for doc in book_docs:
            doc.metadata['source'] = file
            documents.append(doc)
    
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    docs = text_splitter.split_documents(documents)
    print("\n--- Document Chunks Information ---")
    print(f"Number of document chunks: {len(docs)}")
    print("\n--- Creating embeddings ---")
    huggingface_embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-mpnet-base-v2"
    )
    print("\n--- Finished creating embeddings ---")
    print("\n--- Creating and persisting vector store ---")
    db = Chroma.from_documents(
            docs, huggingface_embeddings, persist_directory=persistent_directory)
    print("\n--- Finished creating and persisting vector store ---")
else:
    print("Vector store already exists. No need to initialize.")




Persistent directory does not exist. Initializing vector store...

--- Document Chunks Information ---
Number of document chunks: 957501

--- Creating embeddings ---


  huggingface_embeddings = HuggingFaceEmbeddings(
  from tqdm.autonotebook import tqdm, trange





To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development



--- Finished creating embeddings ---

--- Creating and persisting vector store ---


MemoryError: 

In [None]:
from langchain_core.messages import HumanMessage, SystemMessage
current_dir = os.getcwd()
persistent_directory = os.path.join(current_dir, "db", "chroma_db_with_metadata")
embeddings=HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-mpnet-base-v2"
    )
db = Chroma(persist_directory=persistent_directory, embedding_function=embeddings)
legal_retriever = db.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"k": 3, "score_threshold":0.4},
)
llm=ChatOpenAI(model="gpt-4o")
qa_system_prompt=(
    "You are an expert assissant for a lawyer,Use the following pieces of retrived content to answer the questions.Give a detailed walkin about the query asked and clearly show how the query and information in the query has been used in the retrived documents as well. "
    "\n\n"
    "{context}"
)
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", qa_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
contextualize_q_system_prompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, "
    "formulate a standalone question which can be understood "
    "without the chat history. Do NOT answer the question, just "
    "reformulate it if needed and otherwise return it as is."
)
contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
history_aware_retriever = create_history_aware_retriever(
    llm, legal_retriever, contextualize_q_prompt
)
qa_chain=create_stuff_documents_chain(llm,qa_prompt)
rag_chain=create_retrieval_chain(history_aware_retriever, qa_chain)
def continual_chat():
    print("Start chatting with the AI! Type 'exit' to end the conversation.")
    chat_history = []
    while True:
        query = input("You: ")
        if query.lower() == "exit":
            break
        result = rag_chain.invoke({"input": query, "chat_history": chat_history})
        # Display the AI's response
        print(f"AI: {result['answer']}")
        chat_history.append(HumanMessage(content=query))
        chat_history.append(SystemMessage(content=result["answer"]))

In [None]:
import requests
import json
from googleapiclient.discovery import build
load_dotenv()
GOOGLE_API_KEY=os.getenv('GOOGLE_API_KEY')
CX = os.getenv('CX')
def fetch_news(query,count=10):
    service = build("customsearch", "v1", developerKey=GOOGLE_API_KEY)
    res = service.cse().list(q=query, cx=CX, num=count).execute()
    return res.get('items', [])
def get_news_articles(query):
    print(f"Searching for articles related to: {query}")
    search_results = fetch_news(query)
    articles = []
    for result in search_results:
        title = result.get('title')
        link = result.get('link')
        snippet = result.get('snippet')
        articles.append({"title": title, "link": link, "snippet": snippet})
    return articles
#write the creation and pf the news db
def embed_and_store_news_articles(articles, persistent_directory_news):
    news_documents = []
    for article in articles:
        news_text = article['title'] + " " + article['snippet']
        news_documents.append(news_text)
        news_db = Chroma.from_documents(
        news_documents, embedding_function=embeddings, persist_directory=persistent_directory_news)
        return news_db
def integrate_news_into_system(query):
    articles = get_news_articles(query)
    current_dir = os.getcwd()
    news_persistent_directory = os.path.join(current_dir, "db", "chroma_db_with_metadata_news")
    if not os.path.exists(persistent_directory):
        print("Persistent directory does not exist. Initializing vector store...")
        if not os.path.exists(cases_dir):
            raise FileNotFoundError(f"The directory {cases_dir} does not exist. Please check the path.")
        news_db = embed_and_store_news_articles(articles, news_persistent_directory)
    return news_db
news_retriever = news_db.as_retriever(search_type="similarity_score_threshold", search_kwargs={"k": 3, "score_threshold": 0.4})


In [None]:
def hybrid_search(query, legal_retriever, news_retriever):
    legal_results = legal_retriever.retrieve(query)
    news_results = news_retriever.retrieve(query)
    combined_results = legal_results + news_results
    return combined_results
combined_results = hybrid_search(query, legal_retriever, news_retriever)
answer = rag_chain.invoke({"input": query, "chat_history": [], "documents": combined_results})
print(answer)

In [None]:
#combined code
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_core.messages import HumanMessage, SystemMessage
from googleapiclient.discovery import build
import os
import json
from dotenv import load_dotenv
import requests

load_dotenv()
GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
CX = os.getenv('CX')
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2"
)
current_dir = os.getcwd()
cases_dir = os.path.join(current_dir, "cases")
db_dir = os.path.join(current_dir, "db")
persistent_directory = os.path.join(db_dir, "chroma_db_with_metadata")
news_persistent_directory = os.path.join(db_dir, "chroma_db_with_metadata_news")
class Document:
    def __init__(self, page_content, metadata):
        self.page_content = page_content
        self.metadata = metadata

class TextLoader:
    def __init__(self, filepath):
        self.filepath = filepath

    def load(self):
        with open(self.filepath, 'r', encoding='utf-8') as file:
            return [Document(line.strip(), {}) for line in file if line.strip()]
if not os.path.exists(persistent_directory):
    print("Persistent directory does not exist. Initializing vector store...")
    case_files = [f for f in os.listdir(cases_dir) if f.endswith(".txt")]
    documents = []
    for file in case_files:
        file_path = os.path.join(cases_dir, file)
        loader = TextLoader(file_path)
        book_docs = loader.load()
        for doc in book_docs:
            doc.metadata['source'] = file
            documents.append(doc)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    docs = text_splitter.split_documents(documents)
    db = Chroma.from_documents(
        docs, embeddings, persist_directory=persistent_directory)
else:
    print("Vector store already exists. No need to initialize.")

def fetch_news(query, count=10):
    service = build("customsearch", "v1", developerKey=GOOGLE_API_KEY)
    res = service.cse().list(q=query, cx=CX, num=count).execute()
    return res.get('items', [])

def get_news_articles(query):
    search_results = fetch_news(query)
    articles = []
    for result in search_results:
        title = result.get('title')
        link = result.get('link')
        snippet = result.get('snippet')
        articles.append({"title": title, "link": link, "snippet": snippet})
    return articles

def embed_and_store_news_articles(articles, persistent_directory_news):
    news_documents = []
    for article in articles:
        news_text = article['title'] + " " + article['snippet']
        news_documents.append(Document(news_text, {}))
    news_db = Chroma.from_documents(
        news_documents, embeddings, persist_directory=persistent_directory_news)
    return news_db
def integrate_news_into_system(query):
    articles = get_news_articles(query)
    if not os.path.exists(news_persistent_directory):
        print("Persistent directory for news does not exist. Initializing vector store for news...")
        news_db = embed_and_store_news_articles(articles, news_persistent_directory)
    else:
        print("News vector store already exists. No need to initialize.")
        news_db = Chroma(persist_directory=news_persistent_directory, embedding_function=embeddings)
    
    return news_db

# Hybrid search across legal and news documents
def hybrid_search(query, legal_retriever, news_retriever):
    legal_results = legal_retriever.retrieve(query)
    news_results = news_retriever.retrieve(query)
    if pdf.retriver:
        combined_results = legal_results + pdf_results
    else:
        combined_results = legal_results + news_results
    return combined_results

legal_retriever = db.as_retriever(search_type="similarity_score_threshold", search_kwargs={"k": 5, "score_threshold": 0.4})
def integrate_news_and_search(query):
    news_db = integrate_news_into_system(query)  # Pass the query dynamically
    news_retriever = news_db.as_retriever(search_type="similarity_score_threshold", search_kwargs={"k": 5, "score_threshold": 0.4})
    combined_results = hybrid_search(query, legal_retriever, news_retriever)
    return combined_results
llm = ChatOpenAI(model="gpt-4")
qa_system_prompt = (
    "You are an expert assistant for a lawyer. Use the following pieces of retrieved content "
    "to answer the questions. Give a detailed explanation and clearly show how the query and "
    "information in the query have been used in the retrieved documents as well."
    "\n\n"
    "{context}"
)

qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", qa_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
contextualize_q_system_prompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, "
    "formulate a standalone question which can be understood "
    "without the chat history. Do NOT answer the question, just "
    "reformulate it if needed and otherwise return it as is."
)

contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
history_aware_retriever = create_history_aware_retriever(
    llm, legal_retriever, contextualize_q_prompt
)

qa_chain = create_stuff_documents_chain(llm, qa_prompt)
rag_chain = create_retrieval_chain(history_aware_retriever, qa_chain)

# Chat function
def continual_chat():
    print("Start chatting with the AI! Type 'exit' to end the conversation.")
    chat_history = []
    while True:
        query = input("You: ")
        if query.lower() == "exit":
            break
        combined_results = integrate_news_and_search(query)
        result = rag_chain.invoke({"input": query, "chat_history": chat_history, "documents": combined_results})
        # Display the AI's response
        print(f"AI: {result['answer']}")
        chat_history.append(HumanMessage(content=query))
        chat_history.append(SystemMessage(content=result["answer"]))
if __name__ == "__main__":
    continual_chat()


In [None]:
#if to upload file 
#write for th eupload of the file 
import fitz
def text_from_pdf(pdf_path):
    text = ""
    with fitz.open(pdf_path) as doc:
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            text += page.get_text("text")
    return text
def embed_pdf_text_to_store(text, embedding_model, vector_store_path):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    pdf_docs = text_splitter.split_documents([Document(page_content=text, metadata={})])
    db = Chroma.from_documents(
        pdf_docs, embeddings, persist_directory=persistent_directory)
    return db
pdf_retriver=db.as_retriever(search_type="similarity_score_threshold", search_kwargs={"k": 5, "score_threshold": 0.4})
def handle_pdf_qa(query, pdf_retriever):
    pdf_results = pdf_retriever.retrieve(query)
    return pdf_results


In [None]:
def classify_case_outcome(document, query_outcome):
    case_outcome = llm.predict(f"Does this case support or contradict the following outcome: {query_outcome}? \n\nCase: {document.page_content}")
    return case_outcome

def classify(docs,query):
    classifications = []
    for doc in docs:
        outcome = classify_case_outcome(doc, query['legal_position'])
        classifications.append({
            'document': doc,
            'classification': outcome
        })
    return classifications
def identify_contradictions_and_supports(classified_docs, query_outcome):
    contradictory_cases = []
    supported_cases=[]
    for item in classified_docs:
        if item['classification'] == 'Contradictory':
            contradictory_cases.append(item['document'])
        elif item['classification']=='Supportive':
            supported_cases.append(item['document'])
    return contradictory_cases,supported_cases
def supportive_and_contradiction_explanation(contradictions,supports, query):
    supported_explanations = []
    condractive_explanations=[]
    for case in supports:
        support_explanation = llm.predict(f"Explain why this case supports the following legal position: {query['legal_position']}. \n\nCase: {case.page_content}")
        supported_explanations.append(support_explanation)
    for case in contradictions:
        contradict_explanation = llm.predict(f"Explain why this case contradicts the following legal position: {query['legal_position']}. \n\nCase: {case.page_content}")
        condractive_explanations.append(contradict_explanation)
    return supported_explanations,condractive_explanations

def retrive_and_classify_docs_fetched(query):
    # Fetch documents from the database
    docs=legal_retriever
    # Classify the documents
    classified_documents=classify(docs)
    contradictions,supports=identify_contradictions_and_supports(classified_documents,query['legal_position'])
    supportive_explanations,contradiction_explanations = supportive_and_contradiction_explanation(contradictions,supports, query)
    return{
       'Supportive':supports,
       'Supportive_explanations':supportive_explanations,
       'Contradictory': contradictions,
       'contradiction_explanations':contradiction_explanations
    }

In [None]:
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.document_loaders import PDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.retrievers import VectorStoreRetriever
from langchain.chains import LLMChain
import openai
import json
from dotenv import load_dotenv
embedding_model = HuggingFaceEmbeddings()
llm = OpenAI(model="gpt-4")
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2"
)
current_dir = os.getcwd()
cases_dir = os.path.join(current_dir, "cases")
db_dir = os.path.join(current_dir, "db")
persistent_directory = os.path.join(db_dir, "chroma_db_with_metadata")
vector_store = Chroma(persist_directory=persistent_directory, embedding_function=embedding_model)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
load_dotenv()
GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
GOOGLE_CSE_ID = os.getenv('CX')
def process_pdf(pdf_path):
    pdf_loader = PDFLoader(file_path=pdf_path)
    pdf_texts = pdf_loader.load_and_split(text_splitter=text_splitter)
    vector_store.add_documents(pdf_texts)
    return pdf_texts
def embed_pdf_text_to_store(pdf_path):
    pdf_texts = process_pdf(pdf_path)
    vector_store.add_documents(pdf_texts)
    print(f"PDF text embedded and added to vector store.")

def retrieve_legal_cases(query):
    retriever = vector_store.as_retriever(search_type="similarity_score_threshold", search_kwargs={"k": 5, "score_threshold": 0.4})
    relevant_cases = retriever.get_relevant_documents(query)
    return relevant_cases

def fetch_news(query):
    import requests
    search_url = f"https://www.googleapis.com/customsearch/v1?q={query}&key={GOOGLE_API_KEY}&cx={GOOGLE_CSE_ID}"
    response = requests.get(search_url)
    articles = response.json().get('items', [])
    news_texts = [article["snippet"] for article in articles]

    news_documents = [{"content": text, "metadata": {"source": "news"}} for text in news_texts]
    vector_store.add_documents(news_documents)
    return news_texts, articles

def integrate_news_and_search(query):
    news_texts, articles = fetch_news(query)
    return news_texts, articles

def classify_case_outcome(case_text, legal_position):
    """Classify if a case is supportive or contradictory to a given legal position."""
    prompt = f"""
    The legal position is: "{legal_position}"
    Based on the given legal position, classify the following case as either 'supportive' or 'contradictory':

    {case_text}

    Respond with just 'supportive' or 'contradictory'.
    """
    classification = llm.predict(prompt)
    return classification.strip()

def classify(retrieved_cases, legal_position):
    """Classify cases into supportive and contradictory categories."""
    supportive_cases = []
    contradictory_cases = []

    for case in retrieved_cases:
        case_text = case['content']
        outcome = classify_case_outcome(case_text, legal_position)

        if outcome == 'supportive':
            supportive_cases.append(case)
        else:
            contradictory_cases.append(case)

    return supportive_cases, contradictory_cases

def generate_case_explanation(cases, classification):
    """Generate explanation for supportive/contradictory cases."""
    prompt = f"""
    The following cases are classified as {classification}:
    
    {cases}

    Provide an explanation of why these cases are {classification}.
    """
    explanation = llm.predict(prompt)
    return explanation


def create_final_response(query, supportive_cases, contradictory_cases, news_articles):
    """Create the final response combining supportive/contradictory cases and news."""
    supportive_explanation = generate_case_explanation(supportive_cases, "supportive")
    contradictory_explanation = generate_case_explanation(contradictory_cases, "contradictory")

    response = f"""
    Query: {query}

    Supportive Cases:
    {supportive_explanation}

    Contradictory Cases:
    {contradictory_explanation}

    Related News:
    {news_articles}
    """

    return response

def handle_query(query, pdf_path=None):
    # Step 1: Handle PDF Upload if applicable
    if pdf_path:
        embed_pdf_text_to_store(pdf_path)

    # Step 2: Retrieve legal cases from the vector store
    retrieved_cases = retrieve_legal_cases(query)

    # Step 3: Fetch related news articles
    news_texts, articles = integrate_news_and_search(query)

    # Step 4: Classify cases into supportive or contradictory categories
    legal_position = query  # In a more complex system, you'd extract the position differently
    supportive_cases, contradictory_cases = classify(retrieved_cases, legal_position)

    # Step 5: Generate the final response
    response = create_final_response(query, supportive_cases, contradictory_cases, articles)
    
    return response

In [None]:
from flask import Flask, request, jsonify
from langchain import rag_chain  # Assuming you have imported and initialized this.
import PyPDF2
app = Flask(__name__)
chat_history = []

@app.route('/chat', methods=['POST'])
def chat():
    data = request.get_json()
    query = data.get('query')

    # Process query using your RAG chain
    result = rag_chain.invoke({"input": query, "chat_history": chat_history})
    chat_history.append({"type": "user", "content": query})
    chat_history.append({"type": "gpt", "content": result['answer']})

    return jsonify({"response": result['answer']})
@app.route('/upload-pdf', methods=['POST'])
def upload_pdf():
    if 'pdf' not in request.files:
        return jsonify({'message': 'No PDF file uploaded'}), 400

    pdf_file = request.files['pdf']
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    
    # Example: Extracting text from the first page
    page = pdf_reader.pages[0]
    text = page.extract_text()

    # Process the text as needed and return a response
    return jsonify({'message': 'PDF uploaded successfully!', 'text': text})

if __name__ == "__main__":
    app.run(debug=True)