In [4]:
import os
import io
import tempfile
from transformers import pipeline
import base64
from PIL import Image
import pytesseract
import fitz  # PyMuPDF for PDF processing
import pdfplumber  # For table extraction from PDFs
import torch
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import Qdrant
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.llms import Ollama
import base64
import tempfile

In [5]:
# Set up pytesseract for OCR
pytesseract.pytesseract.tesseract_cmd = r"C:\Users\sselva\AppData\Local\Programs\Tesseract-OCR\tesseract.exe"


In [6]:
llm = Ollama(model="qwen2.5:1.5b")

In [8]:
# Function to perform OCR on images
def perform_ocr(image):
    return pytesseract.image_to_string(image)

# Function to process PDF files and convert them into markdown
def process_pdf(file_path):
    pdf_content = []
    with fitz.open(file_path) as pdf_document:
        for page_number in range(len(pdf_document)):
            page = pdf_document[page_number]
            text = page.get_text("text")
            pdf_content.append({'type': 'paragraph', 'text': text, 'page_number': page_number + 1})

            image_list = page.get_images(full=True)
            for img in image_list:
                xref = img[0]
                base_image = pdf_document.extract_image(xref)
                image_bytes = base_image["image"]
                image_stream = io.BytesIO(image_bytes)
                image = Image.open(image_stream)

                # OCR the image
                ocr_text = pytesseract.image_to_string(image)
                pdf_content.append({'type': 'image', 'text': ocr_text, 'page_number': page_number + 1})

    with pdfplumber.open(file_path) as pdf:
        for page_number, page in enumerate(pdf.pages):
            tables = page.extract_tables()
            for table in tables:
                pdf_content.append({'type': 'table', 'data': table, 'page_number': page_number + 1})

    return pdf_content

# Convert extracted content to markdown
def convert_to_markdown(content):
    if content['type'] == 'paragraph':
        return f"{content.get('text', '')}\n\n"
    elif content['type'] == 'table':
        table_md = ""
        for i, row in enumerate(content['data']):  # Convert table data to markdown format
            table_md += "| " + " | ".join(row) + " |\n"
            if i == 0:
                table_md += "|" + "|".join(["---" for _ in row]) + "|\n"
        return table_md + "\n"
    elif content['type'] == 'image':
        return f"[Image OCR: {content.get('text', '')}]\n\n"
    return ""

# Custom function to load a string as a Document object
def load_from_string(text: str):
    document = Document(page_content=text, metadata={"source": "string_input"})
    return [document]

In [13]:

def process_documents_with_qdrant(docs, model_name="paraphrase-multilingual-MiniLM-L12-v2"):
    # Split the document into chunks based on topic structure
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50,
        separators=["\n\n", "\n", " ", ""]
    )
    split_docs = text_splitter.split_documents(docs)
    
    # Embedding with HuggingFace model
    embedding_model = HuggingFaceEmbeddings(model_name)
    # Qdrant for in-memory vector storage
    qdrant = Qdrant.from_embeddings(split_docs,embedding_model, location=":memory:", collection_name="my_documents")
    
    return qdrant


In [9]:
file_path="C:\\Users\\sselva\\Downloads\\florence2\\sama_test 1\\testddoc1.pdf"

In [10]:
content = process_pdf(file_path)

# Convert to markdown
markdown_text = ""
for item in content:
    markdown_text += convert_to_markdown(item)

# Write markdown to file
markdown_file = file_path.replace(".pdf", ".md")
with open(markdown_file, "w", encoding="utf-8") as f:
    f.write(markdown_text)

In [11]:
docs = load_from_string(markdown_text)

In [14]:
Qdrant=process_documents_with_qdrant(docs)

Multiple ONNX files found in 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2': ['onnx/model.onnx', 'onnx/model_O1.onnx', 'onnx/model_O2.onnx', 'onnx/model_O3.onnx', 'onnx/model_O4.onnx', 'onnx/model_qint8_arm64.onnx', 'onnx/model_qint8_avx512.onnx', 'onnx/model_qint8_avx512_vnni.onnx', 'onnx/model_quint8_avx2.onnx'], defaulting to 'onnx/model.onnx'. Please specify the desired file name via `model_kwargs={"file_name": "<file_name>"}`.


TypeError: 'Document' object is not subscriptable

In [16]:
query="what is tesseract?"
def qdrant_search(query,Qdrant):
    search_results = Qdrant.similarity_search_with_score(query)
    return [(doc.page_content, score) for doc, score in search_results]
doc_context = qdrant_search(query,Qdrant)

In [46]:
print(doc_context)

[('[Image OCR: ? Tesseract at UB Mannheim\n\nThe Mannheim University Library (UB Mannheim) uses Tesseract to perform text recognition (OCR = optical character\nrecognition) for historical German newspapers (Allgemeine PreuBische Staatszeitung, Deutscher Reichsanzeiger). The latest\nresults with text from more than 700000 pages are available online.\n\nTesseract installer for Windows', 0.3482387907556797), ('‘The QCD sum rules (QCDSR) is one of the most power-\nful non-perturbative approach, and has been widely used to\nanalyze the mass spectra and the decay behavior of hadrons,\n[35-50]. In recent years, some tasks were carried out by three-\npoint QCDSR, such as the analysis of electroweak and elec-\n‘tromagnetic form factors 12, 51-57], and the strong coupling\nconstants [58-68]. These parameters are very important to\nanalyze the decay process of hadrons. In our previous work,', 0.22397772019376147), (']', 0.2069307255669893), ('(eG), (FG) and (q)(e2G?) are considered in QCD side.\n

In [81]:
prompt="can you summarize the different topics of the pdf for me?"
doc_context = Qdrant.similarity_search(prompt)[0:3]
print(doc_context)

[Document(metadata={'source': 'string_input', '_id': '9615e707968c4f288fcdbcc64fee8609', '_collection_name': 'my_documents'}, page_content='“The layout of this paper is as follows, After introduction in\nSec. I, the radiative decays of the vector heavy-light mesons\nare analyzed inthe framework of SM in Sec. I, and the elec-\ntromagnetic form factor is introduced. In Sec. II, we sys-\ntematically analyze the electromagnetic form factors of vec-\ntor heavy-light meson to pseudoscalar heavy-light meson by\nthe three-point QCDSR, where the contributions of perturba-\ntive part and vacuum condensate including (Gq), @g.Gq),'), Document(metadata={'source': 'string_input', '_id': 'd6ede898583349148fc4e5d87fbd427c', '_collection_name': 'my_documents'}, page_content='(eG), (FG) and (q)(e2G?) are considered in QCD side.\nSec. IV is employed to present the numerical results and dis-\ncussions. Sec. Vis reserved as conclusions. Some important\nfigures are shown in Appendix.'), Document(metadata={'

In [None]:
structured_prompt = f"Document context:\n{doc_context}\n\nQuestion: {prompt}"
response = llm(structured_prompt)

In [14]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Split the document into chunks based on topic structure
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    separators=["\n\n", "\n", " ",".", "",]
)
split_docs = text_splitter.split_documents(docs)
# Assuming 'docs' is a list of Document objects
corpus = [{'id': i, 'metadata': doc.metadata, 'text': doc.page_content} for i, doc in enumerate(split_docs)]

In [16]:
import bm25s
import Stemmer

def init_ret(docs):
    # Assuming 'docs' is a list of Document objects
    corpus = [{'id': i, 'metadata': doc.metadata, 'text': doc.page_content} for i, doc in enumerate(docs)]
    # optional: create a stemmer
    stemmer = Stemmer.Stemmer("english")
    # Extracting just the text from the corpus for tokenization
    texts = [doc['text'] for doc in corpus]  # Assuming 'corpus' is a list of dictionaries

    # Now pass the extracted texts to the tokenize function
    corpus_tokens = bm25s.tokenize(texts, stopwords="en", stemmer=stemmer)

    # Create the BM25 model and index the corpus
    retriever = bm25s.BM25()
    retriever.index(corpus_tokens)
    return retriever,stemmer

retreiver,stemmer = init_ret(docs)


resource module not available on Windows


                                                           

In [22]:
query = "what is tesseract?"
def bm25ans(query, retriever, stemmer, corpus):
    query_tokens = bm25s.tokenize(query, stemmer=stemmer)
    print(f"Tokenized query: {query_tokens}")
    
    # Adjust k based on the number of query tokens
    k = min(2, len(query_tokens[0]))  # Ensure k is <= number of tokens
    print(f"Adjusted k: {k}")
    
    # Retrieve top-k results
    results = retriever.retrieve(query_tokens, corpus=corpus, k=k)
    return results

doc_con=bm25ans(query,retreiver,stemmer,corpus)

                                                    

Tokenized query: Tokenized(ids=[[1, 0]], vocab={'tesseract': 0, 'what': 1})
Adjusted k: 1


  response = llm(structured_prompt)


In [53]:
def bm25s_search(query, retriever, stemmer, corpus):
    query_tokens = bm25s.tokenize(query, stemmer=stemmer)
    print(f"Tokenized query: {query_tokens}")
    
    # Adjust k based on the number of query tokens
    k = min(2, len(query_tokens[0]))  # Ensure k is <= number of tokens
    
    # Retrieve top-k results
    results,scores = retriever.retrieve(query_tokens, corpus=corpus, k=k)
    
    # Combine the results with the scores
    scored_results = [(results[i][i]['text'], scores[i][i]) for i in range(len(results))]
    
    return scored_results

re = bm25s_search(query,retreiver,stemmer,corpus)


                                                    

Tokenized query: Tokenized(ids=[[1, 0]], vocab={'tesseract': 0, 'what': 1})


                                                     

In [55]:
import heapq
from typing import List, Tuple, Dict, Any
# Hybrid Search - Reciprocal Rank Fusion
class ReciprocalRankFusion:
    def __init__(self, k: float = 60.0):
        self.k = k

    def fuse(self, ranked_lists, top_n: int = 3):
        item_ranks = {}
        for lst in ranked_lists:
            for rank, (item, score) in enumerate(lst, start=1):
                if item not in item_ranks:
                    item_ranks[item] = [len(ranked_lists) + 1] * len(ranked_lists)
                item_ranks[item][ranked_lists.index(lst)] = rank

        fused_scores = []
        for item, ranks in item_ranks.items():
            fused_score = sum(1 / (rank + self.k) for rank in ranks)
            heapq.heappush(fused_scores, (-fused_score, item))

        # Return top-n results
        return [(item, -score) for score, item in sorted(fused_scores, reverse=True)[:top_n]]

# Fuse results using Reciprocal Rank Fusion
rrf = ReciprocalRankFusion()
doc_context = rrf.fuse([re,ans], top_n=3)


In [56]:
print(doc_context)



In [57]:
structured_prompt = f"Document context:\n{doc_context}\n\nQuestion: {query}"
response = llm(structured_prompt)

In [None]:
def generate_ko(doc_context, prompt):
    ko_prompt = f"""
    You are an AI assistant tasked with generating a Knowledge Object based on the given context and user input.
    Context: '{doc_context}'
    Use this context to generate a detailed KO in the following format:
    
    - Short Description: (Explain the root cause of the problem)
    - Symptoms: (List observable signs or behaviors indicating the issue)
    - Long Description: (Provide a detailed description of the problem or issue in 50 words)
    - Causes: (Identify the factors that led to this issue)
    - Resolution Note: (Give a step-by-step resolution for the problem, covering all scenarios)
    
    Question: '{prompt}'
    """
    response = st.session_state.llm(ko_prompt)
    return response

In [52]:
import ollama

def generate_knowledge_object(doc_context: str, prompt: str) -> str:
    # Dummy function to simulate generating a KO article
    return f"""
    You are an AI assistant tasked with generating a Knowledge Object based on the given context and user input.
    Context: '{doc_context}'
    Use this context to generate a detailed KO in the following format:
    
    - Short Description: (Explain the root cause of the problem)
    - Symptoms: (List observable signs or behaviors indicating the issue)
    - Long Description: (Provide a detailed description of the problem or issue in 50 words)
    - Causes: (Identify the factors that led to this issue)
    - Resolution Note: (Give a step-by-step resolution for the problem, covering all scenarios)
    
    Question: '{prompt}'
    """

def run(model: str, doc_context: str, question: str):
    client = ollama.Client()

    # Initialize conversation with a user query
    messages = [{"role": "user", "content": question}]

    # First API call: Send the query and function description to the model
    response = client.chat(
        model=model,
        messages=messages,
        tools=[
            {
                "type": "function",
                "function": {
                    "name": "generate_knowledge_object",
                    "description": "Generate a Knowledge Object based on context and user input",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "doc_context": {
                                "type": "string",
                                "description": "Document context relevant to the KO generation"
                            },
                            "prompt": {
                                "type": "string",
                                "description": "The user's question or input for generating the KO"
                            }
                        },
                        "required": ["doc_context", "prompt"],
                    },
                },
            }
        ],
    )

    # Add the model's response to the conversation history
    messages.append(response["message"])

    # Check if the model decided to use the provided function
    if not response["message"].get("tool_calls"):
        print("The model didn't use the function. Its response was:")
        print(response["message"]["content"])
        return

    # Process function calls made by the model
    if response["message"].get("tool_calls"):
        available_functions = {
            "generate_knowledge_object": generate_knowledge_object,
        }

        for tool in response["message"]["tool_calls"]:
            function_to_call = available_functions[tool["function"]["name"]]
            function_args = tool["function"]["arguments"]

            # Ensure the necessary arguments are included
            if "doc_context" not in function_args or "prompt" not in function_args:
                print("Missing required arguments for the function call.")
                return

            # Call the function to generate the KO article
            function_response = function_to_call(doc_context=doc_context, prompt=function_args["prompt"])

            # Add function response to the conversation
            messages.append(
                {
                    "role": "tool",
                    "content": function_response,
                }
            )

    # Second API call: Get final response from the model
    final_response = client.chat(model=model, messages=messages)

    print(final_response["message"]["content"])



In [49]:
print(doc_context)

[('[Image OCR: ? Tesseract at UB Mannheim\n\nThe Mannheim University Library (UB Mannheim) uses Tesseract to perform text recognition (OCR = optical character\nrecognition) for historical German newspapers (Allgemeine PreuBische Staatszeitung, Deutscher Reichsanzeiger). The latest\nresults with text from more than 700000 pages are available online.\n\nTesseract installer for Windows', 0.3482387907556797), ('‘The QCD sum rules (QCDSR) is one of the most power-\nful non-perturbative approach, and has been widely used to\nanalyze the mass spectra and the decay behavior of hadrons,\n[35-50]. In recent years, some tasks were carried out by three-\npoint QCDSR, such as the analysis of electroweak and elec-\n‘tromagnetic form factors 12, 51-57], and the strong coupling\nconstants [58-68]. These parameters are very important to\nanalyze the decay process of hadrons. In our previous work,', 0.22397772019376147), (']', 0.2069307255669893), ('(eG), (FG) and (q)(e2G?) are considered in QCD side.\n

In [53]:
# Example usage
if __name__ == "__main__":
    model = "qwen2.5:1.5b"
    doc_context = doc_context
    question = "generate a KO article on tesseract"
    run(model, doc_context, question)

### Short Description:
Tesseract is an optical character recognition (OCR) library designed for use with Google's Cloud Vision API. It's widely used in various applications to convert images into text.

### Symptoms:
- Users are unable to install or run Tesseract on their local machines.
- Attempting to access the latest results of Tesseract fails due to connection issues, leading to a 403 Forbidden error (as per user feedback).
- The installation process for Windows is not clear and requires extensive manual steps.

### Long Description:
Tesseract, developed by Google, is an OCR library that aims to recognize text from images. It’s crucial in fields such as document digitization and image processing where the ability to interpret scanned documents or images into readable text is essential. However, users often face difficulties with installing Tesseract due to several reasons. The installation process involves downloading a ZIP file containing multiple components (such as libraries an