In [3]:
from fastapi import FastAPI, File, UploadFile, Form
from fastapi.responses import JSONResponse
from API.sama_updated import read_docx, process_pdf, convert_to_markdown, extract_text_from_pptx
from pages.imports.searchmethods import qdrant_search,ReciprocalRankFusion,bm25s_search,KO
from API.open import process
from API.florence import process_file_with_florence
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import Qdrant
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.llms import Ollama
import tempfile
import bm25s    
import Stemmer

In [4]:
def load_from_string(text: str):
    document = Document(page_content=text, metadata={"source": "string_input"})
    return [document]

def process_documents_with_qdrant(docs, model_name="paraphrase-multilingual-MiniLM-L12-v2"):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50, separators=["\n\n", "\n", " ", ""])
    split_docs = text_splitter.split_documents(docs)
    embedding_model = HuggingFaceEmbeddings(model_name=model_name)
    qdrant = Qdrant.from_documents(split_docs, embedding_model, location=":memory:", collection_name="my_documents")
    return qdrant

def init_bm25s_retriever(docs):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50,
        separators=["\n\n", "\n", " ",".", "",]
    )
    split_docs = text_splitter.split_documents(docs)
    corpus = [{'id': i, 'metadata': doc.metadata, 'text': doc.page_content} for i, doc in enumerate(split_docs)]
    stemmer = Stemmer.Stemmer("english")
    texts = [doc['text'] for doc in corpus]
    corpus_tokens = bm25s.tokenize(texts, stopwords="en", stemmer=stemmer)
    retriever = bm25s.BM25()
    retriever.index(corpus_tokens)
    return retriever, corpus, stemmer

In [16]:
import os
import shutil

def chatbot_api(
    file_path,  # Direct file path
    ocr_method,
    search_method,
    prompt
):
    uploads_folder = "uploads"
    os.makedirs(uploads_folder, exist_ok=True)
    
    # Copy the file to uploads folder (optional step if you want to keep a local copy)
    file_dest_path = os.path.join(uploads_folder, os.path.basename(file_path))
    
    try:
        shutil.copy(file_path, file_dest_path)  # Copy file to destination path
    except FileNotFoundError:
        return {"message": f"File not found at the given path: {file_path}"}

    # Verify if the file was successfully copied
    if not os.path.exists(file_dest_path):
        return {"message": f"Failed to copy file to destination path: {file_dest_path}"}

    # Extract content based on file type
    if file_dest_path.endswith(".pdf"):
        content = process_pdf(file_dest_path)
    elif file_dest_path.endswith(".docx"):
        content = read_docx(file_dest_path)
    else:
        return {"message": "Unsupported file format"}

    # OCR processing
    if ocr_method == "openai":
        api_key = os.getenv('OPENAI_API_KEY')
        all_text = process(filename=file_dest_path, api_key=api_key, verbose=True, cleanup=True)
    elif ocr_method == "florence":
        output_folder = os.path.join(uploads_folder, "florence_output")
        ocr_results = process_file_with_florence(file_dest_path, output_folder, verbose=True)
        all_text = " ".join(ocr_results)
    else:  # Default to Tesseract
        all_text = ""
        for item in content:
            all_text += convert_to_markdown(item)

    # Initialize search method
    doc_texts = load_from_string(all_text)
    qdrant = process_documents_with_qdrant(doc_texts) if search_method == "Embedding + Qdrant" else None
    bm25_retriever, bm25_corpus, bm25_stemmer = init_bm25s_retriever(doc_texts) if search_method == "BM25S" else (None, None, None)

    # Get document context using search method
    if search_method == "Embedding + Qdrant":
        doc_context = qdrant_search(prompt, qdrant)
    elif search_method == "BM25S":
        doc_context = bm25s_search(prompt, bm25_retriever, bm25_stemmer, bm25_corpus)
    else:
        return {"message": "Unsupported search method."}

    return {"filename": os.path.basename(file_path), "response": doc_context}

# Example usage in Jupyter Notebook
result = chatbot_api(
    file_path="C:\\Users\\sselva\\Downloads\\KO Documents\\KO Documents\\KS100121_Modify eGroup Approver.docx",
    ocr_method="tesseract",
    search_method="BM25S",
    prompt="how to approve modify egroup request?"
)

print(result)


                                                           

{'filename': 'KS100121_Modify eGroup Approver.docx', 'response': [('**Modify eGroup Approver/** [Size: 304800]\n\n**Administrator/Reviewer** [Size: 304800]\n\n \n\nCheck approval matrix sheet so that we come to know whose approval is required [Color: 201F1E][Size: 139700]\n\nCheck the type of eGroup [Color: 201F1E][Size: 139700]\n\nCheck for the same in approval matrix sheet so as to whose permission is required [Color: 201F1E][Size: 139700]', 1.260368)]}




In [10]:
chatbot_api(file="C:\\Users\\sselva\\Downloads\\KO Documents\\KO Documents\\KS100121_Modify eGroup Approver.docx",ocr_method="tesseract",search_method="BM25S",prompt="how to approve modify egroup request?")

AttributeError: 'str' object has no attribute 'filename'

In [2]:
import os
from langchain_google_genai import ChatGoogleGenerativeAI
os.environ["GOOGLE_API_KEY"] = 'AIzaSyAIWLIxKL1yCk5Skw1WzNZHvPRyK5jhd6g'
llm=ChatGoogleGenerativeAI(model="gemini-1.5-flash")

In [10]:
h=llm.invoke("hi how are you?")
print(h.content)

I am an AI language model, so I don't have feelings or experiences like humans do. But I am here and ready to assist you with any questions or tasks you may have! How can I help you today? 



In [12]:
from API.florence import process_file_with_florence
ocr_results = process_file_with_florence(file_path="C:\\Users\\sselva\\Downloads\\KO Documents\\KO Documents\\KS100121_Modify eGroup Approver.docx", output_folder="uploads", verbose=True)

Processing complete. Cleaned up temporary files.


In [18]:
print(ocr_results[0].get("text"))

Modify eGroup Approver/
Administrator/Reviewer

Check approval matrix sheet so that we come to know whose approval is required
Check the type of eGroup
Check for the same in approval matrix sheet so as to whose permission is required
Navigate to bst.golder.com  eAdministration  Setup  employee group setup  eGroup Type (eTime or eExpense and Supervisor or Administrator)
Find name of that eGroup 
Click on eGroup name and select it
Information will be shown at right side in that update name of Primary or Alternate 1 or Alternate 2 or Alternate 3 as per requirement 
Click on ‘save’ icon



In [1]:
import google
import google.generativeai as genai
import os

genai.configure(api_key='AIzaSyAIWLIxKL1yCk5Skw1WzNZHvPRyK5jhd6g')
model = genai.GenerativeModel("gemini-1.5-flash")
fpath = "C:\\Users\\sselva\\Downloads\\testddoc1.pdf"
with open(fpath, "rb") as f:
    sample_pdf = genai.upload_file(f, mime_type="application/pdf")
response = model.generate_content(["extract all the text word by word from this pdf file.", sample_pdf])
print(response.text)

  from .autonotebook import tqdm as notebook_tqdm


Tesseract at UB Mannheim
The Mannheim University Library (UB Mannheim) uses Tesseract to perform text recognition (OCR = optical character
recognition) for historical German newspapers (Allgemeine Preußische Staatszeitung , Deutscher Reichsanzeiger). The latest
results with text from more than 700000 pages are available online .
Tesseract installer for Windows
Normally we run Tesseract on Debian GNU Linux , but there was also the need for a Windows version . That 's why we have built
a Tesseract installer for Windows .
directory . The uninstaller removes the whole installation directory . If you installed Tesseract in an existing directory , that
directory will be removed with all its subdirectories and files .
The latest installers can be downloaded here :
• tesseract - ocr - w64 - setup - 5.4.0.20240606.exe ( 64 bit )
There are also older versions for 32 and 64 bit Windows available .
In addition , we also provide documentation which was generated by Doxygen .
SUBJECT IN FOCUS : Orig

In [61]:
import boto3
import json
import base64
import pprint
import fitz  # PyMuPDF
import os
import docx  # For DOCX file processing

# Define AWS credentials and setup session
AWS_ACCESS_KEY_ID = "AKIAS54AKEOJBF7WM5WL"
AWS_SECRET_ACCESS_KEY = "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"
REGION_NAME = "us-east-1"

boto3.setup_default_session(
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    region_name=REGION_NAME
)

# Initialize Bedrock client
bedrock_client = boto3.client(service_name='bedrock-runtime', region_name=REGION_NAME)

# Function to encode image to base64
def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")

# Utility function to extract images from PDF pages
def extract_pages_as_images(pdf_path, output_folder):
    doc = fitz.open(pdf_path)
    images = []
    for page_number in range(len(doc)):
        page = doc.load_page(page_number)
        image_list = page.get_images(full=True)

        for image_index, img in enumerate(image_list):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]
            image_filename = f"{output_folder}/page_{page_number + 1}_image_{image_index + 1}.{image_ext}"
            with open(image_filename, "wb") as image_file:
                image_file.write(image_bytes)
            images.append(image_filename)  # Collect image paths
    return images

# Utility function to extract text from DOCX files
def extract_text_from_docx(docx_path):
    doc = docx.Document(docx_path)
    text_content = [paragraph.text for paragraph in doc.paragraphs]
    return "\n".join(text_content)

# Function to perform OCR using Claude
def perform_ocr_with_claude(image_path):
    base64_image = encode_image(image_path)

    # Define the request payload for Bedrock
    payload = {
        "modelId": "anthropic.claude-3-haiku-20240307-v1:0",
        "contentType": "application/json",
        "accept": "application/json",
        "body": json.dumps({
            "anthropic_version": "bedrock-2023-05-31",
            "max_tokens": 1000,
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "source": {
                                "type": "base64",
                                "media_type": "image/jpeg",
                                "data": base64_image
                            }
                        },
                        {
                            "type": "text",
                            "text": "Perform OCR on the image and provide the extracted text."
                        }
                    ]
                }
            ]
        })
    }

    # Make the API call to Bedrock
    response = bedrock_client.invoke_model(**payload)
    response_content = response['body'].read().decode('utf-8')
    response_json = json.loads(response_content)
    
    return response_json['content'][0]['text']

# Main function to process files and perform OCR using Claude
def process_file_with_claude(file_path, output_folder, verbose=False):
    file_extension = os.path.splitext(file_path)[-1].lower()
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    ocr_results = []

    if file_extension == ".pdf":
        # Extract images from PDF and perform OCR
        images = extract_pages_as_images(file_path, output_folder)
        for image_path in images:
            extracted_text = perform_ocr_with_claude(image_path)
            ocr_results.append({"text": extracted_text})

    elif file_extension == ".docx":
        docx_text = extract_text_from_docx(file_path)
        ocr_results.append({"text": docx_text})

        json_output_path = os.path.join(output_folder, "docx_output.json")
        with open(json_output_path, "w") as json_file:
            json.dump({"text": docx_text}, json_file, indent=4)

    else:
        raise ValueError(f"Unsupported file type: {file_extension}")

    if verbose:
        print("Processing complete. Cleaned up temporary files.")

    return ocr_results


In [62]:
process_file_with_claude("C:\\Users\\sselva\\Downloads\\testddoc1.pdf","uploads",verbose=True)

Processing complete. Cleaned up temporary files.


 {'text': 'The text extracted from the image is as follows:\n\nSUBJECT IN FOCUS: Origin of the severe acute respiratory syndrome coronavirus-2 (SARS-CoV-2), the virus causing COVID-19\n\nThe first human cases of COVID-19, the disease caused by the novel coronavirus causing COVID-19, subsequently named SARS-CoV-2 were first reported by officials in Wuhan City, China, in December 2019. Retrospective investigations by Chinese authorities have identified human cases with onset of symptoms in early December 2019.\n\nWhile some of the earliest known cases had a link to a wholesale food market in Wuhan, some did not. Many of the initial patients were either stall owners, market employees, or regular visitors to this market. Environmental samples taken from this market in December 2019 tested positive for SARS-CoV-2, further suggesting that the market in Wuhan City was the source of this outbreak or played a role in the initial amplification of the outbreak. The market was closed on 1 January 

In [37]:
from langchain_openai import AzureChatOpenAI

APIKey = "6527847513f041f1b7b7a30ecf778300"
Endpoint = "https://pstestopenaidply-3wxqngpadhki4.openai.azure.com/"
Deployment = "pstestopenaidply-3wxqngpadhki4"
version = "2024-05-01-preview"

llm = AzureChatOpenAI(
    temperature=.3,
    azure_endpoint=Endpoint,
    api_key=APIKey,
    deployment_name=Deployment,
    openai_api_version=version
)

In [1]:
extracted_text = "Create or Delete eGroup\nCreate eGroup\nCheck approval matrix..."
doc_context = [(extracted_text, 0.07996448453907072)]

In [3]:
from langchain_community.llms import Ollama
llm=Ollama(model="qwen2.5:1.5b")

  llm=Ollama(model="qwen2.5:1.5b")


In [4]:
llm.invoke("hi")

'Hello! How can I help you today?'

In [2]:
process_file_with_gpt_vision("C:\\Users\\sselva\\Downloads\\testddoc1.pdf", "uploads", verbose=True)

AuthenticationError: Error code: 401 - {'error': {'message': 'Incorrect API key provided: 73c1da94********************8ef0. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}

In [3]:
import os 
import configparser
from langchain_google_genai import ChatGoogleGenerativeAI

config = configparser.ConfigParser()
config.read('config.properties')

os.environ["GOOGLE_API_KEY"] = config['google']['api_key']
llm=ChatGoogleGenerativeAI(model="gemini-1.5-flash")