In [None]:
import fitz  # PyMuPDF
from PIL import Image
import base64
import mimetypes
from huggingface_hub import InferenceClient
import pandas as pd
import json
import os
import shutil

# Hugging Face API token passed to InferenceClient in ocr(). It is read from
# the HF_TOKEN environment variable so the secret does not have to be pasted
# into (and saved with) the notebook; when the variable is unset the value is
# the same empty placeholder as before.
hf_token = os.environ.get("HF_TOKEN", "")

# Prompt sent together with the page images in ocr(). It instructs the model to
# answer with JSON only, shaped as {"invoice_tables": [{"headers": [...],
# "rows": [[...], ...]}]} -- the structure that get_df() reads -- and to return
# an empty "invoice_tables" list when no table is found.
ocr_instruction_text = """


You are an OCR expert. Analyze the provided image and extract all invoice-related table data (e.g., item details, descriptions, quantities, prices, totals).  
The table format may vary and may have merged cells or irregular layouts.

Return only valid JSON in this structure:
{
  "invoice_tables": [
    {
      "headers": [string, ...],
      "rows": [
        [string, ...]
      ]
    }
  ]
}

Rules:
- Preserve text exactly as seen, fixing obvious OCR errors when confident.
- Do not add extra commentary or formatting.
- If no table is found, return: {"invoice_tables": []}

"""




def delete_image_folder(folder_path='images'):
    """Empty ``folder_path`` while keeping the folder itself.

    Files and symlinks directly inside the folder are removed with
    ``os.remove``; sub-directories are removed recursively with
    ``shutil.rmtree``. The function does nothing when ``folder_path`` does
    not exist or is not a directory, and it never creates the folder.

    Args:
        folder_path: Directory to empty. Defaults to ``'images'``, the folder
            used for rendered PDF pages.

    Returns:
        None. Deletion is best-effort: an entry that cannot be removed is
        reported with ``print`` and the remaining entries are still processed.
    """
    if not os.path.isdir(folder_path):
        return

    for entry_name in os.listdir(folder_path):
        entry_path = os.path.join(folder_path, entry_name)
        try:
            # Check for links first so a symlink to a directory is unlinked
            # rather than having its target tree deleted.
            if os.path.isfile(entry_path) or os.path.islink(entry_path):
                os.remove(entry_path)
            elif os.path.isdir(entry_path):
                shutil.rmtree(entry_path)
        except OSError as e:
            print(f'Failed to delete {entry_path}. Reason: {e}')

def convert_to_images(pdf_path):
    """Render every page of a PDF to a PNG file in the ``images`` folder.

    Any previous contents of ``images`` are removed first via
    ``delete_image_folder()``. Pages are rendered at 300 DPI and saved as
    ``images/page_<n>.png`` with ``n`` starting at 1.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        list[str]: Paths of the written PNG files, in page order.
    """
    delete_image_folder()
    # The cleanup step only empties the folder; it never creates it, so make
    # sure it exists before the first page is saved.
    os.makedirs("images", exist_ok=True)

    ids = []
    # The context manager closes the document even if rendering fails.
    with fitz.open(pdf_path) as doc:
        for page_number, page in enumerate(doc, start=1):
            pix = page.get_pixmap(dpi=300)  # High DPI for better quality
            # Forward slashes work on every platform and keep the saved path
            # identical to the path that is returned.
            output = f"images/page_{page_number}.png"
            pix.save(output)
            ids.append(output)
    return ids



def image_to_base64(image_path):
    """Return the file at ``image_path`` encoded as a base64 ``data:`` URL.

    The MIME type is guessed from the file name; when it cannot be guessed,
    ``image/jpeg`` is used.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()

    encoded = base64.b64encode(raw_bytes).decode('utf-8')

    guessed_type = mimetypes.guess_type(image_path)[0]
    if not guessed_type:
        guessed_type = "image/jpeg"

    return f"data:{guessed_type};base64,{encoded}"



def ocr(local_image_paths):
    """Send one or more local images with the OCR prompt to the vision model.

    Args:
        local_image_paths: A single image path or a list of image paths.

    Returns:
        The ``content`` of the first choice's message in the chat completion.
    """
    # Accept a bare path as shorthand for a one-element list.
    if isinstance(local_image_paths, str):
        paths = [local_image_paths]
    else:
        paths = local_image_paths

    # One "image_url" part per image, each carrying the file as a data URL.
    image_parts = [
        {"type": "image_url", "image_url": {"url": image_to_base64(path)}}
        for path in paths
    ]

    # A single user message: the instruction text followed by every image.
    user_message = {
        "role": "user",
        "content": [{"type": "text", "text": ocr_instruction_text}] + image_parts,
    }

    hf_client = InferenceClient(api_key=hf_token)
    response = hf_client.chat.completions.create(
        model="Qwen/Qwen2.5-VL-7B-Instruct",
        messages=[user_message],
    )

    first_choice = response.choices[0]
    return first_choice.message["content"]

def get_df(data):
    """Build a DataFrame from the first table in the parsed OCR result.

    Args:
        data: Dict with an ``'invoice_tables'`` list whose items have
            ``'headers'`` (column names) and ``'rows'`` (list of row lists).

    Returns:
        pandas.DataFrame: The first table, with ``headers`` as columns. An
        empty DataFrame is returned when ``'invoice_tables'`` is empty, which
        is what the OCR prompt asks the model to return when no table is
        found.

    Raises:
        KeyError: If ``'invoice_tables'``, ``'headers'`` or ``'rows'`` is
            missing.
    """
    tables = data['invoice_tables']
    if not tables:
        # "No table found" is a valid model answer, not an error.
        return pd.DataFrame()

    # Only the first table is converted; any further tables are ignored.
    table = tables[0]
    return pd.DataFrame(table['rows'], columns=table['headers'])


def create_json(ans):
    """Parse the JSON object embedded in a model response string.

    Text before the first ``{`` is dropped, then text after the last ``}`` of
    what remains is dropped (e.g. Markdown fences or commentary around the
    payload), and the result is decoded with ``json.loads``.
    """
    start = ans.find('{')
    payload = ans if start == -1 else ans[start:]

    # Search for the closing brace in the already-trimmed text.
    end = payload.rfind('}')
    if end != -1:
        payload = payload[:end + 1]

    return json.loads(payload)

def get_excel(df, output_path='output/result.xlsx'):
    """Write ``df`` to an Excel file without the index column.

    Args:
        df: DataFrame to export.
        output_path: Destination file. Defaults to ``'output/result.xlsx'``.

    The destination folder is created when it does not exist yet, so the
    export does not fail on a fresh checkout that has no ``output`` folder.
    """
    target_dir = os.path.dirname(output_path)
    if target_dir:  # a bare file name has no folder to create
        os.makedirs(target_dir, exist_ok=True)
    df.to_excel(output_path, index=False)  # index=False to avoid writing row numbers
    

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# End-to-end run: PDF -> page images -> OCR text -> parsed JSON -> DataFrame -> Excel.
# Note that `ans` is rebound at each step, so after this cell it holds the DataFrame.
pdf_path = "data.pdf"

# Render each PDF page to a PNG and collect the image paths.
ids = convert_to_images(pdf_path)

# Send all page images to the model; `ans` is the raw text of its reply.
ans = ocr(ids)

# Cut the JSON object out of the reply text and parse it into a dict.
ans = create_json(ans)

# Turn the first extracted table into a DataFrame.
ans = get_df(ans)

# Save the DataFrame as an Excel file.
get_excel(ans)

In [3]:
!pip install -q openpyxl


[notice] A new release of pip available: 22.3.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
ans

Unnamed: 0,ID,Name,Description
0,1,Item 1,This is a description for item 1 with some ext...
1,2,Item 2,This is a description for item 2 with some ext...
2,3,Item 3,This is a description for item 3 with some ext...
3,4,Item 4,This is a description for item 4 with some ext...
4,5,Item 5,This is a description for item 5 with some ext...
5,6,Item 6,This is a description for item 6 with some ext...
6,7,Item 7,This is a description for item 7 with some ext...
7,8,Item 8,This is a description for item 8 with some ext...
8,9,Item 9,This is a description for item 9 with some ext...
9,10,Item 10,This is a description for item 10 with some ex...
