In [4]:
!pip install langchain openai PyPDF2 googletrans

Collecting langchain
  Using cached langchain-0.3.13-py3-none-any.whl (1.0 MB)
Collecting openai
  Using cached openai-1.58.1-py3-none-any.whl (454 kB)
Collecting PyPDF2
  Using cached pypdf2-3.0.1-py3-none-any.whl (232 kB)
Collecting googletrans
  Downloading googletrans-3.0.0.tar.gz (17 kB)
Collecting aiohttp<4.0.0,>=3.8.3
  Using cached aiohttp-3.11.11-cp39-cp39-macosx_10_9_x86_64.whl (468 kB)
Collecting SQLAlchemy<3,>=1.4
  Using cached SQLAlchemy-2.0.36-cp39-cp39-macosx_10_9_x86_64.whl (2.1 MB)
Collecting PyYAML>=5.3
  Using cached PyYAML-6.0.2-cp39-cp39-macosx_10_9_x86_64.whl (184 kB)
Collecting pydantic<3.0.0,>=2.7.4
  Using cached pydantic-2.10.4-py3-none-any.whl (431 kB)
Collecting langsmith<0.3,>=0.1.17
  Downloading langsmith-0.2.6-py3-none-any.whl (325 kB)
[K     |████████████████████████████████| 325 kB 1.1 MB/s eta 0:00:01
[?25hCollecting async-timeout<5.0.0,>=4.0.0
  Using cached async_timeout-4.0.3-py3-none-any.whl (5.7 kB)
Collecting langchain-core<0.4.0,>=0.3.26
  U

In [6]:
!pip install langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.13-py3-none-any.whl (2.5 MB)
[K     |████████████████████████████████| 2.5 MB 1.0 MB/s eta 0:00:01
Collecting dataclasses-json<0.7,>=0.5.7
  Using cached dataclasses_json-0.6.7-py3-none-any.whl (28 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0
  Downloading pydantic_settings-2.7.0-py3-none-any.whl (29 kB)
Collecting httpx-sse<0.5.0,>=0.4.0
  Downloading httpx_sse-0.4.0-py3-none-any.whl (7.8 kB)
Collecting marshmallow<4.0.0,>=3.18.0
  Using cached marshmallow-3.23.2-py3-none-any.whl (49 kB)
Collecting typing-inspect<1,>=0.4.0
  Using cached typing_inspect-0.9.0-py3-none-any.whl (8.8 kB)
Collecting python-dotenv>=0.21.0
  Using cached python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Collecting mypy-extensions>=0.3.0
  Using cached mypy_extensions-1.0.0-py3-none-any.whl (4.7 kB)
Installing collected packages: mypy-extensions, typing-inspect, python-dotenv, marshmallow, pydantic-settings, httpx-sse, dataclasses-json, langch

In [1]:
from langchain.schema import Document
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
import os
from googletrans import Translator

# Module-level dictionary used as a throwaway in-memory cache; it is empty at
# start-up and is not persisted anywhere (demonstration only).
cache = dict()

def extract_text_from_pdf(pdf_file):
    """
    Read a PDF from disk and wrap the text of every page in a Document.

    Args:
        pdf_file: Filesystem path of the PDF to read.

    Returns:
        A list holding one Document per page, in page order. Each Document's
        page_content is whatever the page's extract_text() call produced.
    """
    from PyPDF2 import PdfReader

    page_documents = []
    with open(pdf_file, "rb") as pdf_handle:
        # Pages are consumed while the file is still open.
        for page in PdfReader(pdf_handle).pages:
            page_text = page.extract_text()
            page_documents.append(Document(page_content=page_text))
    return page_documents

def translate_text(text):
    """
    Translates the given text to English.

    Args:
        text: The text to be translated. ``None``, empty, or whitespace-only
            input yields an empty string without contacting the translation
            service.

    Returns:
        The translated text, or an empty string when there was nothing to
        translate or the translation failed (the error is printed).
    """
    # Nothing to translate: skip the network round-trip entirely.
    if not text or not text.strip():
        return ""

    try:
        # Constructed inside the try so that a failure while setting up the
        # client is reported the same way as a failed translation request.
        translator = Translator()
        translated = translator.translate(text, dest='en')
        return translated.text
    except Exception as e:
        # Best-effort by design: callers treat "" as "could not translate".
        print(f"Error translating text: {e}")
        return ""

def create_vector_store(documents):
    """
    Creates a Chroma vector store from a list of documents.

    Args:
        documents: A non-empty list of Document objects.

    Returns:
        A Chroma vector store built from the documents using OpenAI embeddings.

    Raises:
        ValueError: If ``documents`` is empty, or if the ``OPENAI_API_KEY``
            environment variable is unset or empty.
    """
    # Fail fast with a clear message instead of sending an empty batch to the
    # embedding/vector-store backends.
    if not documents:
        raise ValueError("No documents provided; cannot build a vector store from an empty list.")

    openai_api_key = os.getenv("OPENAI_API_KEY")
    if not openai_api_key:
        raise ValueError("OpenAI API key not found. Please set the 'OPENAI_API_KEY' environment variable.")

    embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
    return Chroma.from_documents(documents, embeddings)

def get_retrieval_qa_chain(vector_store):
    """
    Creates a RetrievalQA chain backed by the given vector store.

    For every query the chain retrieves the most similar documents from the
    store, inserts them into the prompt read from
    ``prompts/summarize_prompt.txt`` and asks the LLM for an answer.

    Args:
        vector_store: A Chroma vector store.

    Returns:
        A RetrievalQA chain configured to return the retrieved source
        documents alongside the answer.

    Raises:
        OSError: If the prompt file cannot be opened or read.
    """
    llm = OpenAI(
        # text-davinci-003 has been retired by OpenAI; gpt-3.5-turbo-instruct
        # is the completion-style replacement.
        model_name="gpt-3.5-turbo-instruct",
        temperature=0.7,
        max_tokens=500,
        top_p=0.9,
        frequency_penalty=0.2,
        presence_penalty=0.4
    )

    with open("prompts/summarize_prompt.txt", "r", encoding="utf-8") as prompt_file:
        prompt_string = prompt_file.read()

    # The document-combining step of RetrievalQA fills the prompt with the
    # variables `context` and `question`. A template written with a `{query}`
    # placeholder is mapped onto `{question}` so it keeps working.
    if "{question}" not in prompt_string:
        prompt_string = prompt_string.replace("{query}", "{question}")

    prompt_template = PromptTemplate(
        input_variables=["context", "question"],
        template=prompt_string
    )

    # RetrievalQA requires a retriever object rather than a plain function;
    # the vector store provides one. `k` is the number of documents fetched.
    retriever = vector_store.as_retriever(search_kwargs={"k": 5})

    return RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        chain_type_kwargs={"prompt": prompt_template},
        # Callers read result["source_documents"], which is only present
        # when this flag is set.
        return_source_documents=True
    )

def process_pdf(pdf_file, user_query="Please summarize the document."):
    """
    Answers a query about a PDF using retrieval-augmented generation.

    On the first call for a given path the PDF text is extracted, translated
    to English and indexed in a vector store, which is then kept in the
    module-level ``cache``. Later calls for the same path reuse the cached
    store and skip extraction and translation.

    Args:
        pdf_file: Path to the PDF file.
        user_query: The question (or instruction) to run against the document.

    Returns:
        A tuple ``(answer, sources)`` where ``answer`` is a string and
        ``sources`` is a list with the text of the retrieved documents.
        When the PDF has no usable text, or any step fails, ``answer`` is a
        short explanatory message and ``sources`` is an empty list; the
        function itself does not raise.
    """
    try:
        cache_key = f"vector_store_{pdf_file}"  # Example cache key based on PDF path

        # Look the store up first so a cache hit avoids re-extracting and
        # re-translating the whole document.
        vector_store = cache.get(cache_key)

        if vector_store is None:
            # Step 1: Extract text from the PDF
            documents = extract_text_from_pdf(pdf_file)
            # Print a count rather than the full page texts to keep output small.
            print("Documents Extracted:", len(documents))

            # Ensure documents are not empty
            if not documents or all(not doc.page_content for doc in documents):
                return "No valid text content found in the PDF.", []

            # Step 2: Translate to English; translate_text returns "" on
            # failure, so drop those instead of indexing empty documents.
            translated_docs = [
                translated
                for translated in (
                    translate_text(doc.page_content) for doc in documents if doc.page_content
                )
                if translated
            ]
            print("Translated Documents:", len(translated_docs))

            # Ensure we have translated documents
            if not translated_docs:
                return "No text could be translated.", []

            # Step 3: Convert translated text into Document objects
            documents_for_store = [Document(page_content=text) for text in translated_docs]

            # Step 4: Create the vector store and remember it for later calls
            vector_store = create_vector_store(documents_for_store)
            cache[cache_key] = vector_store
        print("Vector Store Created:", vector_store)

        # Step 5: Build the retrieval QA chain. The chain retrieves the
        # relevant documents itself, so no separate retrieval pass or
        # hand-built context is needed here.
        qa_chain = get_retrieval_qa_chain(vector_store)

        # Step 6: Generate the answer
        result = qa_chain.invoke({"query": user_query})
        print("QA Chain Result:", result)

        # Step 7: Return the answer and the text of the source documents
        sources = [doc.page_content for doc in result.get("source_documents", [])]
        return result["result"], sources

    except Exception as e:
        # Deliberate best-effort: report the problem and return a sentinel
        # message so notebook callers always get a two-element tuple.
        print(f"Error in process_pdf: {e}")
        return "An error occurred.", []

# Helper function for retrieving relevant documents
def retrieve_relevant_documents(vector_store, query):
    """
    Retrieves the documents most similar to a query, widening the search once
    if the first attempt yields too few results.

    Args:
        vector_store: An object exposing ``similarity_search(query, k=...)``
            (the LangChain vector-store interface).
        query: The user's query.

    Returns:
        The list of retrieved documents. The search is tried with k=5 and
        then k=10; the first result with at least three documents is
        returned, otherwise whatever the widest search produced (possibly an
        empty list).
    """
    retrieved_docs = []
    for k in (5, 10):
        # LangChain vector stores take the number of results as `k`.
        retrieved_docs = vector_store.similarity_search(query, k=k)
        if len(retrieved_docs) >= 3:
            return retrieved_docs
    # Fewer than three hits even at the widest search: return what was found.
    return retrieved_docs

# Example usage
if __name__ == "__main__":
    # The PDF location can be overridden with the PDF_FILE_PATH environment
    # variable; the default keeps the original example document.
    pdf_file_path = os.getenv(
        "PDF_FILE_PATH",
        "/Users/sandeepkumarpalit/Downloads/Abenteuer_in_der_Sauna.pdf",
    )
    user_query = "Please summarize the main findings of the research."

    try:
        summary, sources = process_pdf(pdf_file_path, user_query)
    except Exception as e:
        # One handler for everything: a ValueError raised anywhere in the
        # pipeline is not necessarily a tuple-unpacking problem, so report
        # the actual error instead of guessing.
        print(f"An error occurred: {e}")
        summary = None
        sources = []

    if summary is not None:
        print("Summary:", summary)
        print("Sources:", sources)
    else:
        print("Error generating summary.")

ModuleNotFoundError: No module named 'langchain'