In [1]:
import os 
from dotenv import load_dotenv
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import fitz
import io
import base64
import time
from litellm import completion
from PIL import Image
import instructor
from pydantic import BaseModel, Field
from openai import OpenAI
from typing import List
import weaviate
from weaviate.classes.init import Auth
from weaviate.classes.config import Configure, Property, DataType
import json

load_dotenv()

* 'fields' has been removed
  from .autonotebook import tqdm as notebook_tqdm


True

In [2]:
print(os.environ["WCD_URL"])
print(os.environ["WCD_API_KEY"])

https://yr92ueborkapz4zbetphtg.c0.asia-southeast1.gcp.weaviate.cloud
G6oHY9f1hQUOVe0LS2RtS5d2THNtKg22VJzv


In [3]:
def pdf_to_base64_images(pdf_path):
    base64_images = []

    pdf_document = fitz.open(pdf_path)
    
    def process_page(page):
        pix = page.get_pixmap()
        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        buffered = io.BytesIO()
        img.save(buffered, format="PNG")
        return base64.b64encode(buffered.getvalue()).decode()
        
    with ThreadPoolExecutor() as executor:
        pages = [pdf_document[i] for i in range(len(pdf_document))]
        base64_images = list(executor.map(process_page, pages))
    pdf_document.close()
    return base64_images

In [4]:
def process_summary_from_image(base64_str, max_retries=3, retry_delay=5):
    message = [{
    "role": "system",
    "content": """
    You will be given an image containing text. Your task is to accurately transcribe all the text from this image. 
    Pay special attention to names, tables and numbers.

    Follow these steps to complete the task:
    1. Carefully examine the entire image.
    2. Transcribe all visible text exactly as it appears in the image.
    3. If any text is unclear or illegible, do not attempt to guess or fill in information. Instead, indicate unclear text with [unclear] in your transcription.
    4. Pay particular attention to visual elements such as tables, charts, and diagrams. Ensure these are transcribed accurately and in a clear, organized manner.
    5. If the order of information in the image is not clear, think step by step about the logical flow of the content. Arrange the transcribed information in a relevant and coherent order.
    6. Do not add any information that is not present in the image.
    7. Do not include any preamble or explanation about the transcription process in your response.
    8. For Visual Elements:
        a. For tables: Transcribe headers, rows, and columns in a markdown table format, ensuring proper alignment and structure.
        b. For charts or diagrams: Provide a detailed description of the type (e.g., bar chart, flowchart), layout, and any labeled data points.
        Example Markdown Table:
        | Column 1 Header | Column 2 Header | Column 3 Header |
        |---------------- |-----------------|-----------------|
        | Row 1, Cell 1   | Row 1, Cell 2   | Row 1, Cell 3   |
        | Row 2, Cell 1   | Row 2, Cell 2   | Row 2, Cell 3   |
        | Row 3, Cell 1   | Row 3, Cell 2   | Row 3, Cell 3   |
    9. Your response should only contain the transcribed content from the image, organized in a logical manner if necessary.
    10. If you encounter any issues or if the image is not clear enough to transcribe, explain the problem instead of providing a transcription.
    """
    }, {
        "role": "user",
        "content": [{
            "type": "image_url", "image_url": {
                "url": f"data:image/jpeg;base64,{base64_str}"
            }},
            {"type": "text", "text": "Please transcribe all the text from this image, ensuring the data is in markdown format."}
        ]
    }]

    for attempt in range(max_retries):
        try:
            response = completion(
                model="openrouter/google/gemini-flash-1.5",
                messages=message,
                temperature=0.2,
            )
            return response.choices[0].message.content
        except Exception as e:
            if attempt < max_retries - 1:
                print(f"APIError occurred: {str(e)}. Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
            else:
                print(f"Max retries reached. Unable to process image.")
                return f"Error processing image: {str(e)}"

In [5]:
def process_images_to_summary(base64_images):
    def process_image_with_index(args):
        index, base64_str = args
        processed_summary = process_summary_from_image(base64_str)
        return index, processed_summary

    summary = [None] * len(base64_images)
    with ThreadPoolExecutor(max_workers=30) as executor:
        futures = [executor.submit(process_image_with_index, (i, img)) for i, img in enumerate(base64_images)]
        
        for future in tqdm(as_completed(futures), total=len(base64_images), desc="Processing images"):
            index, processed_summary = future.result()
            summary[index] = processed_summary
    
    return summary


def prepare_data_for_ingestion(summary, file_name):
    contents = []

    for page_num, result in enumerate(summary, start=1):
        page_number = f"### Page Number: [PG:{page_num}]\n ### Source Document: {file_name}\n\n"
        contents.append(page_number + result + "\n\n\n")
    
    return contents

In [6]:
def process_segmentation(contents, segmentation, file_name):
    pages = "\n".join(contents).split("### Page Number:")
    pages = ["### Page Number:" + page for page in pages[1:]] 
    pages = [page.strip() for page in pages if page.strip()]

    data = []
    for section in segmentation:

        page_content = ""
        for page in pages[section['page_range']['start']-1:section['page_range']['end']]:
            page_content += page + "\n\n"
        
        data.append({
            "heading": section['heading'],
            "page_range": section['page_range'],
            "description": section['description'],
            "content": page_content,
            "source": file_name
        })
        
    return data

In [7]:
class PageRange(BaseModel):
    start: int = Field(..., description="Starting page number")
    end: int = Field(..., description="Ending page number")

class Segment(BaseModel):
    heading: str = Field(..., description="The heading or title of the section")
    description: str = Field(..., description="A brief description or summary of the section")
    page_range: PageRange = Field(..., description="Page range for this section")
    
client = instructor.from_openai(OpenAI())

In [8]:
def process_chunk(chunk_data, client):
    i, text_chunk, step, start_page, end_page = chunk_data

    SYSTEM_PROMPT = """
    You are an advanced financial analysis AI designed to segment financial documents for use in a Retrieval-Augmented Generation (RAG) system. Your goal is to analyze and divide the document into precise, meaningful sections that can be easily indexed and retrieved based on user queries.

    Your objective is to divide this document into logical sections based on content and structure, ensuring complete coverage of all pages.

    Instructions:

    1. Read through the entire document carefully.
    2. Identify logical sections based on the broader theme of the content and structure of the document.
    3. For each section:
        a. Determine the section's content and create an appropriate title.
        b. Identify the page number range where the section is located.
        c. Format the section with its title and page citation.
    4. Ensure comprehensive coverage of the entire document, from page 1 through the final page.
    5. The page number in the document is provided at the top of each page in the format "### Page Number: [PG:X]". Always refer to these page numbers in your response to accurately reference the source of information.
    6. Do not rely on any index section or table of contents for page numbers. Always use the  "### Page Number: [PG:X]" format provided at the top of each page for accurate page references.
    7. Output only factual, documentable information with accurate page references.
    8. Make the sectioning as granular as possible, breaking down larger sections into smaller, more specific subsections where appropriate.
    """
    
    USER_PROMPT = f"""
    Please provide a detailed segmentation of the legal document for pages {start_page} to {end_page}.

    Important:
    - The page number in the document is provided at the top of each page in the format "### Page Number: [PG:X]". Always refer to these page numbers in your response to accurately reference the source of information.

    - Do not rely on any index section or table of contents for page numbers. Always use the  "### Page Number: [PG:X]" format provided at the top of each page for accurate page references.
    - **Ensure that no pages are skipped or missed in the segmentation process. Every page must be accounted for in your response.**

    <Document>
        {text_chunk}
    </Document>
    """
    
    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": USER_PROMPT}
            ],
            temperature = 0.2,
            response_model=List[Segment]
        )
        response_segments = response

        chunk_segments = []
        for segment in response_segments:
            page_range = segment.page_range
            chunk_segments.append({
                "heading": segment.heading,
                "description": segment.description,
                "page_range": {
                    "start": page_range.start,
                    "end": page_range.end
                }
            })
        return chunk_segments
    
    except Exception as e:
        print(f"Error processing chunk {i//step + 1}: {str(e)}")
        return []


def process_content_instructor(content_array, client, chunk_size=200, overlap=20, max_workers=10):
    all_segmentations = []
    step = chunk_size - overlap

    chunks = []
    total_pages = len(content_array)
    print(f"Total pages: {total_pages}")

    for i in range(0, len(content_array), step):
        chunk_end = min(i + chunk_size, len(content_array))
        text_chunk = content_array[i:chunk_end]
        start_page = (i // step) * (chunk_size - overlap) + 1
        end_page = min(start_page + chunk_size - 1, total_pages)
        
        chunks.append((i, text_chunk, step, start_page, end_page))

    ordered_results = {}

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(process_chunk, chunk_data,client): chunk_data[0] for chunk_data in chunks}
        
        with tqdm(total=len(chunks), desc="Processing chunks") as pbar:
            for future in as_completed(futures):
                chunk_index = futures[future]
                chunk_segments = future.result()
                ordered_results[chunk_index] = chunk_segments
                pbar.update(1)

    for i in range(0, len(content_array), step):
        if i in ordered_results:
            all_segmentations.extend(ordered_results[i])
    
    return all_segmentations

In [9]:
def classify_document(segment_data):
    SYSTEM_PROMPT = """
    Generate a concise description of the document based on its segments, focusing on main topics and themes.
    """

    USER_PROMPT = f"""
      Document Segments:
      {segment_data}
      Summarize the document in 100 words or less.
    """

    response = completion(
        model="openrouter/google/gemini-flash-1.5",
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": USER_PROMPT}
        ]
    )

    return response.choices[0].message.content.strip()

In [10]:
def get_weaviate_client():
    wcd_url = os.environ["WCD_URL"]
    wcd_api_key = os.environ["WCD_API_KEY"]
    print("w url =", wcd_url)
    print("w key =", wcd_api_key)
    openai_api_key = os.environ["OPENAI_API_KEY"]
    voyageai_api_key = os.environ["VOYAGEAI_API_KEY"]

    headers = {
        "X-VoyageAI-Api-Key": voyageai_api_key,
        "X-OpenAI-Api-Key": openai_api_key
    }

    client = weaviate.connect_to_weaviate_cloud(
        cluster_url=wcd_url,
        auth_credentials=Auth.api_key(wcd_api_key),
        headers=headers,
    )

    if client.is_ready():
        print("Weaviate client is ready")
        return client
    else:
        print("Weaviate client is not ready")
        return None

In [11]:
def create_or_get_collection(client, name, generative_model):
    try:
        print("Creating a new collection", name)
        return client.collections.create(
            name = name,
            vectorizer_config=[
                Configure.NamedVectors.text2vec_openai(
                    name="heading_description",
                    source_properties=["heading", "description"], 
                    vector_index_config=Configure.VectorIndex.hnsw()
                ),
            ],
            properties=[ 
                Property(name="heading", data_type=DataType.TEXT),
                Property(name="description", data_type=DataType.TEXT),
            ],
            generative_config=Configure.Generative.openai(model=generative_model)
            )

    except weaviate.exceptions.UnexpectedStatusCodeException:
        print("Collection already exists")
        return client.collections.get(name)

In [12]:
def add_data_to_collection(collection, data):
    for document in data:
        try:
            heading = document['heading']
            content = document['content']
            page_range = document['page_range']
            description = document['description']
            source = document['source']

            embed_client = OpenAI()

            def get_embedding(text):
                response = embed_client.embeddings.create(
                    input= text,
                    model="text-embedding-3-small"
                )
                return response.data[0].embedding                

            combined_text = heading + " " + description
            combined_em = get_embedding(combined_text)

            uuid = collection.data.insert(
                properties={
                    "heading": heading,
                    "description": description,
                    "page_range": page_range,
                    "content": content,
                    "source": source
                },
                vector={
                    "heading_description": combined_em,
                }
            )
        except Exception as e:
            print(f"Error inserting document: {str(e)}")

    print(f"All documents have been added to the {collection.name} collection.")

In [29]:
## Single File Flow

# data_folder_path = os.path.join(os.getcwd(), 'Data')
# file_name = "3M_2021_10K.pdf"

# pdf_path = os.path.join(data_folder_path, file_name)
# base64_images = pdf_to_base64_images(pdf_path)
# len(base64_images)
# summary = process_images_to_summary(base64_images)


# for page_num, result in enumerate(summary, start=1):
#     print(f"Page Number: {page_num}")
#     print(result)
#     print("*****")
#     print("\n\n")

# contents = prepare_data_for_ingestion(summary, file_name)

# segments = process_content_instructor(contents, client, chunk_size=20, overlap=1, max_workers=10)
# len(segments)

# final_data = process_segmentation(contents, segments, file_name)
# len(final_data)

# formatted_segments = []
# for i, segment in enumerate(segments):
#     print(f"Segment {i+1}:")
#     print(f"Heading: {segment['heading']}")
#     print(f"Description: {segment['description']}")
#     print(f"Pages: {segment['page_range']['start']}-{segment['page_range']['end']}")
#     print("\n")
#     formatted_segment = f"Segment {i+1}:\nHeading: {segment['heading']}\nDescription: {segment['description']}\nPages: {segment['page_range']['start']}-{segment['page_range']['end']}\n"
#     formatted_segments.append(formatted_segment)
    
# formatted_segments = '\n'.join(formatted_segments)

# doc_classification = classify_document(formatted_segments)
# print(doc_classification)

# workspace_name = "Test2"

# client_weaviate = get_weaviate_client()
# collection = create_or_get_collection(client_weaviate, workspace_name, "gpt-4o-mini")
# add_data_to_collection(collection, final_data)


In [None]:
"""
ACTIVISIONBLIZZARD = 9
ADOBE = 9
AMAZON = 8
AMCOR = 11
AMD = 9
AMERICANEXPRESS = 10
APPLE = 9
BESTBUY = 14
BLOCK = 9
BOEING = 8
BOSTONPROPERTIES = 1
COCACOLA = 8
CORNING = 9
COSTCO = 13
CVSHEALTH = 8
EBAY = 13
FEDEX = 2
FOOTLOCKER = 8
GENERALMILLS = 10
INTEL = 13
JOHNSON_JOHNSON = 13
JPMORGAN = 10
KRAFTHEINZ = 8
LOCKHEEDMARTIN = 13
MCDONALDS = 7
MGMRESORTS = 12
MICROSOFT = 9
NETFLIX = 9
NIKE = 9
ORACLE = 9
PAYPAL = 2
PEPSICO = 13
PFIZER = 10
PG_E = 13
SALESFORCE = 6
ULTABEAUTY = 8
VERZON = 8
WALMART = 11
"""

In [15]:
data_folder_path = os.path.join(os.getcwd(), 'Data')

pdf_files = [f for f in os.listdir(data_folder_path) if "AMAZON" in f.upper()]
print(len(pdf_files))

8


In [14]:
for file_name in pdf_files:
    print(f"Processing {file_name}")
    pdf_path = os.path.join(data_folder_path, file_name)

    base64_images = pdf_to_base64_images(pdf_path)
    summary = process_images_to_summary(base64_images)
    contents = prepare_data_for_ingestion(summary, file_name)

    segments = process_content_instructor(contents, client, chunk_size=20, overlap=1, max_workers=10)
    print(len(segments))

    formatted_segments = []
    for i, segment in enumerate(segments):
        formatted_segment = f"Segment {i+1}:\nHeading: {segment['heading']}\nDescription: {segment['description']}\nPages: {segment['page_range']['start']}-{segment['page_range']['end']}\n"
        formatted_segments.append(formatted_segment)
    formatted_segments = '\n'.join(formatted_segments)
    doc_classification = classify_document(formatted_segments)

    final_data = process_segmentation(contents, segments, file_name)
    print(len(final_data))

    workspace_name = "Test2"
    client_weaviate = get_weaviate_client()
    collection = create_or_get_collection(client_weaviate, workspace_name, "gpt-4o-mini")
    add_data_to_collection(collection, final_data)
    client_weaviate.close()

    json_file_path = 'file_data.json'
    try:
        if os.path.exists(json_file_path):
            with open(json_file_path, 'r') as json_file:
                existing_data = json.load(json_file)
        else:
            existing_data = []

        existing_data.append({
            "file_name": file_name,
            "description": doc_classification
        })

        with open(json_file_path, 'w') as json_file:
            json.dump(existing_data, json_file, indent=4)

    except Exception as e:
        print(f"Error writing to JSON file: {str(e)}")

Processing ADOBE_2015_10K.pdf


Processing images: 100%|██████████| 116/116 [01:04<00:00,  1.78it/s]


Total pages: 116


Processing chunks: 100%|██████████| 7/7 [03:14<00:00, 27.74s/it]


113
113
w url = https://yr92ueborkapz4zbetphtg.c0.asia-southeast1.gcp.weaviate.cloud
w key = G6oHY9f1hQUOVe0LS2RtS5d2THNtKg22VJzv
Weaviate client is ready
Creating a new collection Test2
Collection already exists
All documents have been added to the Test2 collection.
Processing ADOBE_2016_10K.pdf


Processing images: 100%|██████████| 112/112 [01:13<00:00,  1.53it/s]


Total pages: 112


Processing chunks: 100%|██████████| 6/6 [01:09<00:00, 11.58s/it]


97
97
w url = https://yr92ueborkapz4zbetphtg.c0.asia-southeast1.gcp.weaviate.cloud
w key = G6oHY9f1hQUOVe0LS2RtS5d2THNtKg22VJzv
Weaviate client is ready
Creating a new collection Test2
Collection already exists
All documents have been added to the Test2 collection.
Processing ADOBE_2017_10K.pdf


Processing images: 100%|██████████| 107/107 [00:56<00:00,  1.90it/s]


Total pages: 107


Processing chunks: 100%|██████████| 6/6 [01:03<00:00, 10.56s/it]


83
83
w url = https://yr92ueborkapz4zbetphtg.c0.asia-southeast1.gcp.weaviate.cloud
w key = G6oHY9f1hQUOVe0LS2RtS5d2THNtKg22VJzv
Weaviate client is ready
Creating a new collection Test2
Collection already exists
All documents have been added to the Test2 collection.
Processing ADOBE_2018_10K.pdf


Processing images: 100%|██████████| 112/112 [00:36<00:00,  3.08it/s]


Total pages: 112


Processing chunks: 100%|██████████| 6/6 [00:19<00:00,  3.32s/it]


94
94
w url = https://yr92ueborkapz4zbetphtg.c0.asia-southeast1.gcp.weaviate.cloud
w key = G6oHY9f1hQUOVe0LS2RtS5d2THNtKg22VJzv
Weaviate client is ready
Creating a new collection Test2
Collection already exists
All documents have been added to the Test2 collection.
Processing ADOBE_2019_10K.pdf


Processing images: 100%|██████████| 113/113 [00:55<00:00,  2.04it/s]


Total pages: 113


Processing chunks: 100%|██████████| 6/6 [03:28<00:00, 34.71s/it]


74
74
w url = https://yr92ueborkapz4zbetphtg.c0.asia-southeast1.gcp.weaviate.cloud
w key = G6oHY9f1hQUOVe0LS2RtS5d2THNtKg22VJzv
Weaviate client is ready
Creating a new collection Test2
Collection already exists
All documents have been added to the Test2 collection.
Processing ADOBE_2020_10K.pdf


Processing images: 100%|██████████| 117/117 [00:38<00:00,  3.04it/s]


Total pages: 117


Processing chunks: 100%|██████████| 7/7 [00:27<00:00,  3.99s/it]


108
108
w url = https://yr92ueborkapz4zbetphtg.c0.asia-southeast1.gcp.weaviate.cloud
w key = G6oHY9f1hQUOVe0LS2RtS5d2THNtKg22VJzv
Weaviate client is ready
Creating a new collection Test2
Collection already exists
All documents have been added to the Test2 collection.
Processing ADOBE_2021_10K.pdf


Processing images: 100%|██████████| 102/102 [00:57<00:00,  1.78it/s]


Total pages: 102


Processing chunks: 100%|██████████| 6/6 [05:42<00:00, 57.07s/it] 


106
106
w url = https://yr92ueborkapz4zbetphtg.c0.asia-southeast1.gcp.weaviate.cloud
w key = G6oHY9f1hQUOVe0LS2RtS5d2THNtKg22VJzv
Weaviate client is ready
Creating a new collection Test2
Collection already exists
All documents have been added to the Test2 collection.
Processing ADOBE_2022Q2_10Q.pdf


Processing images: 100%|██████████| 56/56 [00:21<00:00,  2.65it/s]


Total pages: 56


Processing chunks: 100%|██████████| 3/3 [00:42<00:00, 14.32s/it]


58
58
w url = https://yr92ueborkapz4zbetphtg.c0.asia-southeast1.gcp.weaviate.cloud
w key = G6oHY9f1hQUOVe0LS2RtS5d2THNtKg22VJzv
Weaviate client is ready
Creating a new collection Test2
Collection already exists
All documents have been added to the Test2 collection.
Processing ADOBE_2022_10K.pdf


Processing images: 100%|██████████| 99/99 [00:32<00:00,  3.08it/s]


Total pages: 99


Processing chunks: 100%|██████████| 6/6 [00:18<00:00,  3.03s/it]


82
82
w url = https://yr92ueborkapz4zbetphtg.c0.asia-southeast1.gcp.weaviate.cloud
w key = G6oHY9f1hQUOVe0LS2RtS5d2THNtKg22VJzv
Weaviate client is ready
Creating a new collection Test2
Collection already exists
All documents have been added to the Test2 collection.
