In [6]:
! pip install PyMuPDF pillow pdfplumber pandas pytesseract





In [7]:
! pip install langchain pypdf pillow pytesseract pdf2image





# PDF pages to image

In [8]:
import fitz  
import os

def convert_pdf_to_images(pdf_path):
    """
    Render every page of a PDF to a JPEG file inside ``<pdf name>_images``.

    The output directory is created relative to the current working directory
    (an existing directory is reused). Files are named ``page0.jpg``,
    ``page1.jpg``, ... using the zero-based page index.

    Args:
    - pdf_path (str): Path to the PDF file.

    Returns:
    - list: Paths of the image files written, in page order.
    """
    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
    output_dir = f"{pdf_name}_images"
    os.makedirs(output_dir, exist_ok=True)

    saved_image_paths = []
    pdf_document = fitz.open(pdf_path)
    try:
        for i, page in enumerate(pdf_document):
            pix = page.get_pixmap()
            image_path = os.path.join(output_dir, f'page{i}.jpg')
            pix.save(image_path)
            saved_image_paths.append(image_path)
    finally:
        # Always release the underlying file handle, even if rendering or
        # saving a page fails part-way through the document.
        pdf_document.close()

    return saved_image_paths

# Render the sample PDF to one JPEG per page; the images land in a
# "data1_images" directory derived from the PDF file name.
pdf_path = "./data/data1.pdf"
saved_paths = convert_pdf_to_images(pdf_path)
print("Images saved to:", saved_paths)


Images saved to: ['data1_images\\page0.jpg', 'data1_images\\page1.jpg', 'data1_images\\page2.jpg', 'data1_images\\page3.jpg', 'data1_images\\page4.jpg', 'data1_images\\page5.jpg', 'data1_images\\page6.jpg', 'data1_images\\page7.jpg', 'data1_images\\page8.jpg', 'data1_images\\page9.jpg', 'data1_images\\page10.jpg']


# extract text from pdf 

In [9]:
import os
import PyPDF2

def extract_text_from_pdf(pdf_path):
    """
    Read a PDF and return the text of all its pages, concatenated in page order.

    A page for which the reader yields no text contributes an empty string.

    Args:
    - pdf_path (str): Path to the PDF file.

    Returns:
    - str: Extracted text from the PDF.

    Raises:
    - SystemExit: If the file cannot be opened or its text cannot be extracted.
    """
    try:
        with open(pdf_path, "rb") as pdf_stream:
            reader = PyPDF2.PdfReader(pdf_stream)
            # Collect per-page text first, then join once at the end.
            page_texts = [page.extract_text() or "" for page in reader.pages]
    except Exception as e:
        raise SystemExit(f"Failed to extract text from PDF. Error: {e}")

    return "".join(page_texts)

def save_text_to_file(text, output_path):
    """
    Write ``text`` to ``output_path`` as UTF-8, replacing any existing file.

    The parent directory of ``output_path`` is created first when it is named
    and does not exist yet.

    Args:
    - text (str): Text to save.
    - output_path (str): Path to the output .txt file.

    Raises:
    - SystemExit: If the file cannot be opened or written.
    """
    parent_dir = os.path.dirname(output_path)
    needs_parent = parent_dir != '' and not os.path.exists(parent_dir)
    if needs_parent:
        os.makedirs(parent_dir)

    try:
        with open(output_path, 'w', encoding='utf-8') as handle:
            handle.write(text)
    except Exception as e:
        raise SystemExit(f"Failed to save text to file. Error: {e}")

# Input PDF and the text file that receives its extracted text.
pdf_path = "./data/data1.pdf"  
output_path = "extracted/extracted_text.txt"  

# Extract the text of every page, then overwrite output_path with it
# (save_text_to_file opens the file in 'w' mode).
extracted_text = extract_text_from_pdf(pdf_path)
save_text_to_file(extracted_text, output_path)

print(f"Extracted text saved to: {output_path}")


Extracted text saved to: extracted/extracted_text.txt


# Get detailed descriptions of tables, graphs, and diagrams from the model

In [10]:
import os
import base64
from langchain_community.llms import Ollama

def process_image_and_return_text(image_path):
    """
    Send one image to the ``llava:13b`` Ollama model and return its description.

    The image is read from disk, base64-encoded and passed to the model together
    with a fixed prompt asking for an analysis of tables, charts and diagrams.
    Failures of the model call are not raised: a short diagnostic message is
    returned in place of the description.

    Args:
    - image_path (str): Path to the input image file.

    Returns:
    - str: The model's text for the image, or a diagnostic message.
    """
    with open(image_path, 'rb') as image_handle:
        raw_bytes = image_handle.read()
    encoded_image = base64.b64encode(raw_bytes).decode('ascii')

    llm = Ollama(model="llava:13b")

    prompt = """
You are an expert in RAG (Retrieval-Augmented Generation) and NLP tasks. Analyze the following image and provide a concise explanation based on the content:

**For Tables:**
- Describe the table and its purpose.
- List and briefly explain each column header.
- Summarize the data in each row.

**For Charts or Graphs:**
- Identify the chart type and describe the axes and legends.
- Provide a brief analysis of the data trends and key insights.

**For Diagrams:**
- Describe the diagram’s layout and components.
- Summarize the process or workflow depicted.

**General Instructions:**
- Keep the explanation clear and in detail view analysis of each things i mentioned above.
- Focus on the most relevant information related to RAG and NLP tasks.
"""

    try:
        response = llm.generate(prompts=[prompt], images=[encoded_image])

        # Guard clauses: bail out with a diagnostic as soon as the response
        # does not have the expected shape.
        if not (response and hasattr(response, 'generations')):
            return "Unexpected response format."

        generations = response.generations
        if not (generations and generations[0]):
            return "No generations available."

        first_generation = generations[0][0]
        if first_generation:
            return first_generation.text
        return "No text available."
    except Exception as e:
        return f"Failed to process the image. Error: {str(e)}"

def _page_sort_key(filename):
    """Sort key ordering embedded numbers numerically (page2 before page10)."""
    import re

    # re.split with a capturing group alternates text / digits, so the same
    # list position always holds the same type and the keys stay comparable.
    return [int(part) if part.isdigit() else part.lower()
            for part in re.split(r"(\d+)", filename)]


def aggregate_text_from_images(directory, output_file):
    """
    Describe every image in a directory and append the descriptions to a file.

    Each ``.jpg`` / ``.png`` file (case-insensitive) is passed to
    ``process_image_and_return_text``; the returned text is printed and
    appended to ``output_file`` followed by a blank line. Images are processed
    in natural file-name order so that page order is preserved.

    Args:
    - directory (str): Path to the directory containing images.
    - output_file (str): Path to the file where all text will be appended.
    """
    # A bare file name has no directory part; os.makedirs('') would raise.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # os.listdir gives no ordering guarantee, and plain lexicographic order
    # would put page10 before page2.
    image_names = sorted(
        (name for name in os.listdir(directory)
         if name.lower().endswith((".jpg", ".png"))),
        key=_page_sort_key,
    )

    # Append as UTF-8 explicitly: the platform default encoding may differ
    # (e.g. cp1252 on Windows) and would produce a mixed-encoding file when
    # the target already contains UTF-8 text.
    with open(output_file, 'a', encoding='utf-8') as outfile:
        for filename in image_names:
            image_path = os.path.join(directory, filename)
            text = process_image_and_return_text(image_path)
            print(text)
            outfile.write(text + "\n\n")

    print(f"All text responses appended to: {output_file}")

# Describe the page images with the vision model. output_file points at the
# same file the PDF text was saved to, and the descriptions are appended, so
# that file ends up holding both the PDF text and the image descriptions.
image_directory = "./data1_images"  
output_file = "./extracted/extracted_text.txt" 
aggregate_text_from_images(image_directory, output_file)


 The image you've provided is a page from a research paper or technical document related to Retrieval-Augmented Generation (RAG) and Natural Language Processing (NLP) tasks. Here is an analysis based on the visible content:

**For Tables:**
The table titled "Data Split" shows how the data has been divided for training, validation, and testing in a machine learning model. The table is structured with three columns labeled "Train," "Val," and "Test." Each row lists a different split ratio, indicating how much of the dataset will be used for each phase:
  
- "Train": This column shows how many examples from the dataset have been set aside for training. It's likely that this number is determined based on the model's requirements to achieve good performance.
- "Val": The "Validation" column indicates the proportion of data used to evaluate and fine-tune the model during development, which helps ensure that the model generalizes well.
- "Test": The "Test" column reveals how much data will be

# load text from file

In [11]:
file_path = "./extracted/extracted_text.txt"

# Decode as UTF-8 and substitute U+FFFD for any stray invalid byte. Falling
# back to ISO-8859-1 for the whole file would instead garble every valid
# multi-byte UTF-8 character (accented names, ligatures) in the corpus.
with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
    full_text = file.read()

# Show only a preview; the full corpus is too large for notebook output.
print(full_text[:1000])


Retrieval-Augmented Generation for
Knowledge-Intensive NLP Tasks
Patrick Lewisyz, Ethan Perez?,
Aleksandra Piktusy, Fabio Petroniy, Vladimir Karpukhiny, Naman Goyaly, Heinrich Küttlery,
Mike Lewisy, Wen-tau Yihy, Tim Rocktäschelyz, Sebastian Riedelyz, Douwe Kielay
yFacebook AI Research;zUniversity College London;?New York University;
plewis@fb.com
Abstract
Large pre-trained language models have been shown to store factual knowledge
in their parameters, and achieve state-of-the-art results when ﬁne-tuned on down-
stream NLP tasks. However, their ability to access and precisely manipulate knowl-
edge is still limited, and hence on knowledge-intensive tasks, their performance
lags behind task-speciﬁc architectures. Additionally, providing provenance for their
decisions and updating their world knowledge remain open research problems. Pre-
trained models with a differentiable access mechanism to explicit non-parametric
memory have so far been only investigated for extractive downstream tas

# split text

In [12]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA

In [13]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=200)


In [14]:
chunks = text_splitter.split_text(full_text)


In [15]:
print(f"Number of chunks created: {len(chunks)}")
print(f"Sample chunk: {chunks[0]}")

Number of chunks created: 213
Sample chunk: Retrieval-Augmented Generation for
Knowledge-Intensive NLP Tasks
Patrick Lewisyz, Ethan Perez?,
Aleksandra Piktusy, Fabio Petroniy, Vladimir Karpukhiny, Naman Goyaly, Heinrich Küttlery,
Mike Lewisy, Wen-tau Yihy, Tim Rocktäschelyz, Sebastian Riedelyz, Douwe Kielay
yFacebook AI Research;zUniversity College London;?New York University;
plewis@fb.com
Abstract
Large pre-trained language models have been shown to store factual knowledge


In [16]:
docs = [Document(page_content=chunk) for chunk in chunks]


# make vector and store in db

In [17]:
embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


  embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  from tqdm.autonotebook import tqdm, trange
Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



In [18]:
# Embed every chunk with `embedding` and store the vectors in a Chroma
# collection persisted under ./chroma_db/embedding.
# NOTE(review): re-running this cell presumably adds the same chunks to the
# persisted collection again — confirm, and clear the directory before a rebuild.
db2 = Chroma.from_documents(docs, embedding=embedding, persist_directory="./chroma_db/embedding")


In [19]:
db3 = Chroma(persist_directory="./chroma_db/embedding", embedding_function=embedding)


  db3 = Chroma(persist_directory="./chroma_db/embedding", embedding_function=embedding)


In [20]:
retriever = db3.as_retriever(search_kwargs={"k": 6})


In [21]:
from langchain_community.llms import Ollama


In [22]:
llm_text = Ollama(model="llama3")


# cohere reranker

In [23]:
# Never store API keys in the notebook: read the key from the environment and
# fall back to an interactive prompt. The key that was previously hard-coded
# here has been exposed and should be revoked.
import getpass
import os

COHERE_API_KEY = os.environ.get("COHERE_API_KEY") or getpass.getpass("Cohere API key: ")


In [24]:
from langchain_cohere import CohereRerank


In [25]:
import getpass
import os
reranker = CohereRerank(cohere_api_key=COHERE_API_KEY)

In [26]:
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever


# Contextual Compression retriever

In [27]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

In [28]:
LLM_extractor = LLMChainExtractor.from_llm(llm_text)

In [29]:
from langchain.retrievers.document_compressors import LLMChainFilter


In [30]:
from langchain.retrievers.document_compressors import LLMChainFilter


In [31]:
_filter = LLMChainFilter.from_llm(llm_text)


In [32]:
from langchain.retrievers.document_compressors import EmbeddingsFilter


In [33]:
embeddings_filter = EmbeddingsFilter(embeddings=embedding, similarity_threshold=0.76)


In [34]:
from langchain.retrievers.document_compressors import DocumentCompressorPipeline


In [35]:
from langchain_community.document_transformers import EmbeddingsRedundantFilter


In [36]:
redundant_filter = EmbeddingsRedundantFilter(embeddings=embedding)


In [37]:
# Compression pipeline for retrieved documents; the transformers are listed in
# order: the Cohere reranker first, then the LLM-based extractor.
# `_filter`, `embeddings_filter` and `redundant_filter` from the earlier cells
# are not part of this pipeline.
pipeline_compressor = DocumentCompressorPipeline(
    transformers=[reranker,LLM_extractor]
)



In [38]:
compression_retriever = ContextualCompressionRetriever(
    base_compressor=pipeline_compressor, base_retriever=retriever
)


In [39]:
chain = RetrievalQA.from_chain_type(
    llm_text, retriever=compression_retriever
)

In [40]:
chain.invoke("what is RAG?")

{'query': 'what is RAG?',
 'result': 'Based on the provided context, the answer to the question "what is RAG?" is:\n\n"Retrieval-Augmented Generation"'}

In [41]:
chain.invoke("Explain the differences between the RAG-Sequence and RAG-Token models in the context of Retrieval-Augmented Generation (RAG) and how they handle the retrieved documents during the generation process?")

{'query': 'Explain the differences between the RAG-Sequence and RAG-Token models in the context of Retrieval-Augmented Generation (RAG) and how they handle the retrieved documents during the generation process?',
 'result': 'According to the context, the main difference between RAG-Sequence and RAG-Token models is how they handle the retrieved documents during the generation process.\n\nRAG-Token Model: In this model, a different latent document is drawn for each target token, and the generator produces a distribution for the next output token for each document. This allows the generator to choose content from several documents when producing an answer. The top K documents are retrieved using the retriever, and then the process repeats with the following output token.\n\nRAG-Sequence Model: It seems that RAG-Sequence model does not specify a different latent document for each target token. Instead, it is likely that RAG-Sequence generates its answers in sequence, without considering in

# chat history

In [42]:
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain


In [43]:
# System instruction for the question-rewriting step: turn a follow-up
# question into a standalone one, without answering it.
contextualize_q_system_prompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, "
    "formulate a standalone question which can be understood "
    "without the chat history. Do NOT answer the question, "
    "just reformulate it if needed and otherwise return it as is."
)

# Message layout: system instruction, then the prior turns supplied as
# `chat_history`, then the latest user message supplied as `input`.
contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

# Retriever built from the text LLM, the compression retriever and the
# rewriting prompt above.
history_aware_retriever = create_history_aware_retriever(
    llm_text, compression_retriever, contextualize_q_prompt
)

In [44]:
# Answering instructions for the QA step. The model is told to rely on the
# retrieved context only; the earlier wording also told it to use its own
# knowledge, which contradicted that rule. The user question is not repeated
# here because the prompt's human message already carries `{input}`.
system_prompt = (
    "You are an expert assistant for answering questions accurately and in detail. "
    "You must base your response strictly on the provided context. "
    "If the context does not contain enough information to answer the question, respond with 'I don't know.' "
    "Do not use any information that is not present in the context. "
    "Ensure your answer is clear, detailed, and directly addresses the question.\n\n"
    "Context:\n{context}\n"
)


# Message layout for answering: system instructions (`system_prompt`), then
# the prior turns supplied as `chat_history`, then the user message `input`.
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

# Answering chain built from the text LLM and the prompt above.
# NOTE(review): per the LangChain docs the retrieved documents are expected to
# fill the prompt's `context` variable — confirm for the installed version.
question_answer_chain = create_stuff_documents_chain(llm_text, qa_prompt)

In [45]:
rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)


In [46]:
store = {}

def get_session_history(session_id: str) -> BaseChatMessageHistory:
    """Return the chat history stored for ``session_id``, creating an empty one on first use."""
    try:
        return store[session_id]
    except KeyError:
        # First request for this session: register a fresh history in `store`.
        history = ChatMessageHistory()
        store[session_id] = history
        return history

conversational_rag_chain = RunnableWithMessageHistory(
    rag_chain,
    get_session_history,
    input_messages_key="input",  
    history_messages_key="chat_history",  
    output_messages_key="answer"  
)

In [47]:
conversational_rag_chain.invoke(
    {"input": "What is RAG?"},
    config={
        "configurable": {"session_id": "abc221"}
    },  
)["answer"]

'Based on the provided context, RAG stands for Retrieval-Augmented Generation.'

In [48]:
store

{'abc221': InMemoryChatMessageHistory(messages=[HumanMessage(content='What is RAG?'), AIMessage(content='Based on the provided context, RAG stands for Retrieval-Augmented Generation.')])}

In [49]:
conversational_rag_chain.invoke(
    {"input": "Give its methods, and other things more in detail?"},
    config={
        "configurable": {"session_id": "abc221"}
    },  
)["answer"]

'Based on the extracted relevant part, it appears that Retrieval-Augmented Generation (RAG) involves incorporating retrieval mechanisms into language generation tasks to improve performance. \n\nFrom what is visible in the image, RAG seems to focus on analyzing various models or approaches, possibly including different methods for retrieving relevant information to augment language generation models.\n\nSome possible methods behind RAG could be:\n\n1. **Retrieval Functions**: This might refer to algorithms that can retrieve relevant information from a vast dataset or corpus, based on certain criteria like similarity in context, relevance, or importance.\n\n2. **Strategies for Integrating Retrieval with Generative Models**: This part likely involves exploring ways to combine the retrieved information with generative models, such as language generators, to produce more accurate and informative outputs.\n\n3. **Potential Applications in Various NLP Tasks**: RAG might be applied to various

In [50]:
conversational_rag_chain.invoke(
    {"input": "what is brain hemridge?"},
    config={
        "configurable": {"session_id": "abc221"}
    },
)["answer"]

"I don't know."