### Script Documentation: Image and Text Processing Pipeline with Azure OCR, OpenAI, and CrateDB

1. Overview

This script implements an end-to-end pipeline for processing documents and images: it extracts images from a PDF, runs Azure OCR on them to obtain text and keywords, generates embeddings with OpenAI's API, stores the results in CrateDB, and answers user questions with an OpenAI chat model using context retrieved from CrateDB. The key functionality includes:

- Extracting images from PDFs
- Performing OCR (Optical Character Recognition) using Azure's Computer Vision API
- Generating embeddings for text using OpenAI's API
- Storing extracted data in CrateDB
- Using KNN (K-Nearest Neighbors) search in CrateDB to find relevant context for a user question
- Generating responses to user queries based on document content

In [None]:
import os
import time
import re
from azure.cognitiveservices.vision.computervision import ComputerVisionClient
from azure.cognitiveservices.vision.computervision.models import VisualFeatureTypes
from msrest.authentication import CognitiveServicesCredentials
from langchain.embeddings.openai import OpenAIEmbeddings
from rake_nltk import Rake
from crate import client
import fitz  
import openai


### Credentials

In [2]:
# Secrets and endpoints are read from the environment. Indexing os.environ
# directly means a missing variable raises KeyError right here instead of
# failing later inside an API call.
openai.api_key = os.environ["OPENAI_API_KEY"]

# Azure Computer Vision endpoint and key.
AZURE_ENDPOINT = os.environ["AZURE_ENDPOINT"]
AZURE_SUBSCRIPTION_KEY = os.environ["AZURE_SUBSCRIPTION_KEY"]

# CrateDB password and connection URI.
CRATEDB_PW = os.environ["CRATEDB_PW"]
CRATEDB_URI = os.environ["CRATEDB_URI"]

# CrateDB user name passed to client.connect().
USERNAME = "admin"

In [9]:

# Azure Computer Vision client shared by the OCR and image-description helpers.
computervision_client = ComputerVisionClient(
    endpoint=AZURE_ENDPOINT,
    credentials=CognitiveServicesCredentials(AZURE_SUBSCRIPTION_KEY),
)

# LangChain OpenAI embeddings wrapper; embed_query() is called on it to embed
# both the stored texts and the user's question.
embeddings = OpenAIEmbeddings()

### Define necessary functions

In [10]:

# Function to extract images from a PDF using PyMuPDF
def extract_images_with_pymupdf(pdf_path, output_folder):
    """Extract every embedded image from a PDF and save it to ``output_folder``.

    Each image is written as ``page_<page>_img_<index>.<ext>``, where both
    numbers are 1-based and ``<ext>`` is the extension reported by PyMuPDF
    for that image.

    Args:
        pdf_path: Path of the PDF file to read.
        output_folder: Directory that receives the image files. It is created
            if it does not exist yet.

    Returns:
        The number of images written.
    """
    os.makedirs(output_folder, exist_ok=True)
    image_count = 0

    # The context manager closes the document even when extraction or a file
    # write fails part-way through.
    with fitz.open(pdf_path) as pdf_document:
        for page_number, page in enumerate(pdf_document, start=1):
            page_images = page.get_images(full=True)

            for img_index, img in enumerate(page_images, start=1):
                xref = img[0]  # First entry of the image tuple is its xref
                base_image = pdf_document.extract_image(xref)
                image_ext = base_image["ext"]

                image_filename = f"page_{page_number}_img_{img_index}.{image_ext}"
                image_path = os.path.join(output_folder, image_filename)
                with open(image_path, "wb") as image_file:
                    image_file.write(base_image["image"])

                print(f"Saved image: {image_filename}")
                image_count += 1

    return image_count

# CrateDB connection setup
def get_crate_connection():
    """
    Open a new CrateDB connection and return it.

    Connects to CRATEDB_URI as USERNAME with CRATEDB_PW and with SSL
    certificate verification enabled.
    """
    connection = client.connect(
        CRATEDB_URI,
        username=USERNAME,
        password=CRATEDB_PW,
        verify_ssl_cert=True,
    )
    return connection

# Function to generate tags
def generate_tags(keywords):
    """
    Pick the tags for an image: at most the first five keywords.

    When there are five keywords or fewer, the input sequence itself is
    returned; otherwise a slice holding its first five items.
    """
    max_tags = 5
    if len(keywords) <= max_tags:
        return keywords
    return keywords[:max_tags]

# Function to extract the page number from the image name
def extract_page_number(image_name):
    """
    Read the page number out of an image file name.

    Looks for the first 'page_<digits>' occurrence anywhere in the name
    (e.g. 'page_12_img_1.png' gives 12). Returns the number as an int,
    or None when the name contains no such pattern.
    """
    found = re.search(r'page_(\d+)', image_name)
    return int(found.group(1)) if found else None

# Function to clean text
def clean_text(text):
    """Normalize whitespace: trim both ends and collapse each whitespace run to one space."""
    words = text.split()
    return " ".join(words)

# Function to extract keywords using RAKE
def extract_keywords(text):
    """Run RAKE over the text and return the phrases from get_ranked_phrases()."""
    extractor = Rake()
    extractor.extract_keywords_from_text(text)
    ranked_phrases = extractor.get_ranked_phrases()
    return ranked_phrases

# Function to filter and prioritize keywords
def filter_and_prioritize_keywords(keywords):
    """Filter candidate key phrases and order them longest first.

    Each phrase is stripped and lower-cased, then kept only when it:
      * has at least two words and more than five characters,
      * contains no parenthesis, '+', '-', '=' or digit, and
      * does not contain "the", "yes" or "no" as a whole word.

    Args:
        keywords: Iterable of candidate phrases (strings).

    Returns:
        A list of the unique surviving phrases, sorted by decreasing length
        and alphabetically among phrases of equal length.
    """
    # Any of these characters anywhere in a phrase disqualifies it.
    forbidden_chars = set("()+-=0123456789")
    # These are compared against whole words, not substrings, so phrases
    # containing e.g. "notebook", "another" or "know" are not discarded
    # just because they contain the letters "no" or "the".
    forbidden_words = {"the", "yes", "no"}

    filtered_keywords = set()
    for keyword in keywords:
        normalized = keyword.strip().lower()
        words = normalized.split()
        if len(words) < 2 or len(normalized) <= 5:
            continue
        if forbidden_chars.intersection(normalized):
            continue
        if forbidden_words.intersection(words):
            continue
        filtered_keywords.add(normalized)

    # The alphabetical tie-break makes the order (and therefore any "top N"
    # selection made by callers) deterministic; set iteration order is not.
    return sorted(filtered_keywords, key=lambda k: (-len(k), k))

# Function to analyze an image with Azure OCR and extract keywords
def analyze_image_with_azure(image_path, poll_interval=5, max_wait_seconds=300):
    """Run Azure OCR on an image and derive keywords from the recognized text.

    The image is submitted through ``read_in_stream`` and the result is
    polled with ``get_read_result`` until the operation leaves the
    "notStarted"/"running" states.

    Args:
        image_path: Path of the image file to analyze.
        poll_interval: Seconds to sleep between two status polls.
        max_wait_seconds: Upper bound on the total polling time, so that an
            operation that never finishes cannot block the pipeline forever.

    Returns:
        A ``(text, keywords)`` tuple: the whitespace-normalized OCR text and
        the filtered keyword list. ``("", [])`` when the operation finishes
        with any status other than "succeeded".

    Raises:
        TimeoutError: If the operation is still pending after
            ``max_wait_seconds``.
    """
    with open(image_path, "rb") as image_file:
        ocr_result = computervision_client.read_in_stream(image_file, raw=True)

    # The operation id is the last path segment of the Operation-Location URL.
    operation_location = ocr_result.headers["Operation-Location"]
    operation_id = operation_location.split("/")[-1]

    deadline = time.monotonic() + max_wait_seconds
    while True:
        result = computervision_client.get_read_result(operation_id)
        if result.status not in ["notStarted", "running"]:
            break
        if time.monotonic() >= deadline:
            raise TimeoutError(
                f"Azure OCR did not finish within {max_wait_seconds} seconds for {image_path}"
            )
        print("Waiting for Azure OCR results...")
        time.sleep(poll_interval)

    if result.status != "succeeded":
        return "", []

    extracted_text = " ".join(
        line.text
        for page in result.analyze_result.read_results
        for line in page.lines
    )
    clean_extracted_text = clean_text(extracted_text)
    raw_keywords = extract_keywords(clean_extracted_text)

    # If RAKE found nothing, fall back to the individual words.
    # NOTE(review): filter_and_prioritize_keywords only keeps phrases of two
    # or more words, so these single words appear to be filtered out again --
    # confirm whether this fallback is meant to bypass the filter.
    if not raw_keywords:
        raw_keywords = clean_extracted_text.split()

    final_keywords = filter_and_prioritize_keywords(raw_keywords)
    return clean_extracted_text, final_keywords
# Function to generate image description
def generate_image_description(image_path):
    """
    Ask Azure Computer Vision for a caption describing the image.

    Returns the text of the caption with the highest confidence,
    "No description available." when the service returns no captions, or
    "Error generating description." when anything goes wrong (the error is
    printed, never raised).
    """
    try:
        with open(image_path, "rb") as image_file:
            description_result = computervision_client.describe_image_in_stream(image_file)

        captions = description_result.captions
        if not captions:
            print("No description could be generated for the image.")
            return "No description available."

        # Keep only the caption the service is most confident about.
        best_caption = max(captions, key=lambda caption: caption.confidence)
        description = best_caption.text
        print(f"Generated image description: {description}")
        return description
    except Exception as error:
        print(f"Error generating image description for {image_path}: {error}")
        return "Error generating description."

# Function to store data in CrateDB
def store_data_to_cratedb(page_number, image_name, extracted_text, keywords, tags, embedding_text, embedding_keywords, image_description):
    """
    Store extracted text, keywords, tags, embeddings, and image description into CrateDB.

    Creates the ``text_data`` table if it does not exist yet and inserts one
    row for the image. Errors are printed and swallowed, so a failing insert
    does not abort the caller's processing loop.

    Args:
        page_number: Page number the image was taken from.
        image_name: File name of the image.
        extracted_text: OCR text of the image.
        keywords: Iterable of keyword strings; stored as one ", "-joined string.
        tags: Iterable of tag strings; stored as one ", "-joined string.
        embedding_text: Embedding vector of the text (column is FLOAT_VECTOR(1536)).
        embedding_keywords: Embedding vector of the keywords (column is FLOAT_VECTOR(1536)).
        image_description: Textual description of the image.

    Returns:
        None.
    """
    conn = None
    cursor = None
    try:
        conn = get_crate_connection()
        cursor = conn.cursor()

        # Ensure the table exists
        create_table_query = """
        CREATE TABLE IF NOT EXISTS text_data (
            page_number INT,
            image_name TEXT,
            text TEXT,
            keywords TEXT,
            tags TEXT,
            embedding_text FLOAT_VECTOR(1536),  -- Embedding for the text
            embedding_keywords FLOAT_VECTOR(1536),
            image_description TEXT  -- Description of the image
        );
        """
        cursor.execute(create_table_query)
        conn.commit()

        # Insert data
        insert_query = """
        INSERT INTO text_data (page_number, image_name, text, keywords, tags, embedding_text, embedding_keywords, image_description)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?);
        """
        cursor.execute(
            insert_query,
            (
                page_number,
                image_name,
                extracted_text,
                ", ".join(keywords),  # Convert keywords to a single string
                ", ".join(tags),  # Convert tags to a single string
                embedding_text,  # Embedding vector for the text
                embedding_keywords,  # Embedding vector for the keywords
                image_description,  # Description of the image
            ),
        )
        conn.commit()
        print(f"Data from {image_name} stored successfully.")

    except Exception as e:
        print(f"Error storing data to CrateDB: {e}")
    finally:
        # This function is called once per image and opens a fresh connection
        # each time, so release the cursor and connection on every path.
        for resource in (cursor, conn):
            if resource is None:
                continue
            try:
                resource.close()
            except Exception as e:
                print(f"Error closing CrateDB resource: {e}")

### Function execution

In [11]:
# Main processing workflow
# Pipeline: extract the images from the PDF, then for each image run OCR,
# derive keywords/tags, compute embeddings, describe the image, and store
# everything in CrateDB. A failure on one image is printed and that image is
# skipped; any other failure is printed by the outer handler.
try:
    pdf_path = r"../../data/Databricks.pdf"
    # Folder that receives the images extracted from the PDF. Set the
    # EXTRACTED_IMAGES_DIR environment variable to use a different location
    # than the default below.
    output_folder = os.environ.get(
        "EXTRACTED_IMAGES_DIR",
        r"C:\Users\sami.arem\OneDrive - Solita Oy\Desktop\extracted_images",
    )

    # Ensure output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # Step 1: Extract images from PDF
    print("Extracting images from PDF...")
    image_count = extract_images_with_pymupdf(pdf_path, output_folder)
    print(f"Extracted {image_count} images from the PDF.")

    # Step 2: Process each image
    for image_file in sorted(os.listdir(output_folder)):
        if not image_file.endswith((".png", ".jpg", ".jpeg")):
            continue

        image_path = os.path.join(output_folder, image_file)
        try:
            print(f"\nProcessing image: {image_file}")

            # Extract page number from the image name
            page_number = extract_page_number(image_file)
            if page_number is None:
                print(f"Could not extract page number from {image_file}. Skipping this image.")
                continue

            # Extract text and keywords
            extracted_text, keywords = analyze_image_with_azure(image_path)

            # Print extracted text and keywords
            print(f"Extracted Text from {image_file} (Page {page_number}):\n{extracted_text}")
            print(f"Extracted Keywords from {image_file} (Page {page_number}):\n{', '.join(keywords)}")

            # Generate tags
            tags = generate_tags(keywords)
            print(f"Generated Tags: {', '.join(tags)}")

            # Generate embeddings for the text and for the joined keywords
            embedding_text = embeddings.embed_query(extracted_text)
            keyword_string = ", ".join(keywords)
            embedding_keywords = embeddings.embed_query(keyword_string)

            # Generate image description
            image_description = generate_image_description(image_path)

            # Store data in CrateDB
            store_data_to_cratedb(
                page_number,
                image_file,
                extracted_text,
                keywords,
                tags,
                embedding_text,
                embedding_keywords,
                image_description,
            )

        except Exception as e:
            print(f"An error occurred while processing {image_file}: {e}. Skipping this image.")
        time.sleep(5)  # Delay between operations

except Exception as e:
    print(f"An unexpected error occurred: {e}")


Extracting images from PDF...
Saved image: page_1_img_1.png
Saved image: page_4_img_1.png
Saved image: page_5_img_1.png
Saved image: page_6_img_1.png
Saved image: page_6_img_2.png
Saved image: page_6_img_3.png
Saved image: page_8_img_1.png
Saved image: page_9_img_1.png
Saved image: page_10_img_1.png
Saved image: page_10_img_2.png
Saved image: page_11_img_1.png
Saved image: page_12_img_1.png
Saved image: page_13_img_1.png
Saved image: page_13_img_2.png
Saved image: page_14_img_1.png
Saved image: page_16_img_1.png
Saved image: page_16_img_2.png
Saved image: page_17_img_1.png
Saved image: page_17_img_2.png
Saved image: page_19_img_1.png
Saved image: page_21_img_1.png
Saved image: page_21_img_2.png
Saved image: page_24_img_1.png
Saved image: page_24_img_2.jpeg
Saved image: page_25_img_1.png
Saved image: page_32_img_1.png
Saved image: page_33_img_1.png
Saved image: page_34_img_1.png
Saved image: page_38_img_1.png
Saved image: page_40_img_1.png
Saved image: page_41_img_1.png
Saved image: pag

KeyboardInterrupt: 

### Chatbot execution

In [10]:
# Answer a question from the stored document content: embed the question,
# fetch the nearest stored texts from CrateDB, and pass them as context to an
# OpenAI chat model.
my_question = "What is Unity catalog?"
query_embedding = embeddings.embed_query(my_question)  # Generate embedding for the question

# KNN query to include page numbers in the result
knn_query = """
    SELECT text, page_number
    FROM text_data
    WHERE knn_match(embedding_text, ?, 4)
    ORDER BY _score DESC
    LIMIT 4
"""

documents = []
page_numbers = []

try:
    conn = get_crate_connection()  # Get CrateDB connection
    try:
        cursor = conn.cursor()

        # Execute the KNN query with the embedding
        cursor.execute(knn_query, [query_embedding])

        # Each record is (text, page_number), matching the SELECT list
        for record in cursor.fetchall():
            documents.append(record[0])
            page_numbers.append(record[1])
    finally:
        # Release the connection whether or not the query succeeded.
        conn.close()
except Exception as e:
    print(f"An error occurred while querying CrateDB: {e}")
else:
    # Combine documents and page numbers to generate context
    context = "".join(
        f"Page {page_number}:\n{doc}\n\n"
        for doc, page_number in zip(documents, page_numbers)
    )

    # Define the system prompt with instructions and context
    system_prompt = f"""
    You are an expert in Databricks and are answering a user's question based on the following context extracted from multiple sources (images of a document). 
    Your task is to synthesize information from ALL the available images to answer the user's question comprehensively. 

    Key Requirements:
    - Synthesize and combine ALL relevant information from the images that pertains to the question.
    - Your response should provide a clear, detailed and concise answer to the user's question.
    - At the END of your response, include a note specifying the page numbers where the referenced information was extracted. 
      For example, 'The information was extracted from pages X, Y, and Z.'
    - if the provided context does not contain relevant information, respond only with: "I don't know."
    - If the query is not about Databricks, respond with: "I don't know."

    Context (summarized from all images):
    {context}

    User's Question:
    {my_question}
    """

    # Reported separately from the database step so that an OpenAI failure is
    # not mislabelled as a CrateDB error.
    try:
        # The key is read from the OPENAI_API_KEY environment variable
        openai.api_key = os.getenv('OPENAI_API_KEY')

        # NOTE: openai.ChatCompletion is the legacy (pre-1.0) openai SDK
        # interface, consistent with the module-level openai.api_key usage here.
        chat_completion = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",  # Specify the model to use
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": my_question},
            ],
        )

        # Extract and print the response
        response = chat_completion['choices'][0]['message']['content']
        print("Chatbot's response:\n", response)
    except Exception as e:
        print(f"An error occurred while generating the chatbot response: {e}")


Chatbot's response:
 The Unity Catalog in Databricks is a feature that provides access management controls for users/groups, Metastore access controls, and compute resources within workspaces. It assists in user access data, automated auditing, management controls, lineage monitoring, data discovery and classification, and sharing of various data components such as files, tables, notebooks, model registry, and feature store.

In essence, the Unity Catalog in Databricks serves as a centralized platform that facilitates efficient data management, access control, auditing, and sharing functionalities across different data components within the workspace environment.

The information regarding the Unity Catalog was extracted from pages 6 and 62.
