In [1]:
# requirements for this example:
%pip install -qq docling docling-core python-dotenv langchain-text-splitters langchain-huggingface 

Note: you may need to restart the kernel to use updated packages.


In [2]:
%pip install -qU "langchain-chroma>=0.1.2"

Note: you may need to restart the kernel to use updated packages.


In [3]:
import os

from dotenv import load_dotenv

load_dotenv()

True

In [4]:
from typing import Iterator

from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document as LCDocument

from docling.document_converter import DocumentConverter

class DoclingPDFLoader(BaseLoader):

    def __init__(self, file_path: str | list[str]) -> None:
        self._file_paths = file_path if isinstance(file_path, list) else [file_path]
        self._converter = DocumentConverter()

    def lazy_load(self) -> Iterator[LCDocument]:
        for source in self._file_paths:
            dl_doc = self._converter.convert(source).document
            text = dl_doc.export_to_markdown()
            yield LCDocument(page_content=text)

In [5]:
FILE_PATH = "invoice_1item (1).pdf"  # DocLayNet paper

In [6]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

loader = DoclingPDFLoader(file_path=FILE_PATH)
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)

In [7]:
docs = loader.load()
splits = text_splitter.split_documents(docs)

  from .autonotebook import tqdm as notebook_tqdm
Fetching 9 files: 100%|██████████| 9/9 [00:00<00:00, 117232.10it/s]




In [8]:
docs

[Document(metadata={}, page_content='<!-- image -->\n\n## Sold By :\n\nASHVINKUMAR BHAGVANBHAI GOVINDA\n\n$^{*}$ Plot no. 120 X and part portion of plot no. 119 W2, Gallops Industrial Park 1, Village Rajoda, Taluka Bavla, District Ahmedabad Ahmedabad, GUJARAT, 382220 IN\n\nPAN No:\n\nCIUPG1692Q\n\n24CIUPG1692Q1ZF\n\nGST Registration No:\n\nOrder Number:\n\n403-9087323-7735518\n\nInvoice Number :\n\nAMD2-28157\n\nOrder Date:\n\n05.03.2023\n\nInvoice Details :\n\nGJ-AMD2-1369143095-2223\n\nTOTAL:\n\nInvoice Date :\n\n05.03.2023\n\n| Sl. No Description   |                                                                                                                                                                                                                              | Unit Price   | Qty     | Net Amount   | Tax Rate   | Tax Type   | Tax Amount   | Total Amount   |\n|----------------------|----------------------------------------------------------------------------------------------

In [9]:
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

HF_EMBED_MODEL_ID = "BAAI/bge-small-en-v1.5"
embeddings = HuggingFaceEmbeddings(model_name=HF_EMBED_MODEL_ID)

In [10]:
from langchain_huggingface import HuggingFaceEndpoint
from dotenv import load_dotenv
load_dotenv()
import os


HF_API_KEY = os.environ.get("HF_API_KEY")
HF_LLM_MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.3"

llm = HuggingFaceEndpoint(
    repo_id=HF_LLM_MODEL_ID,
    huggingfacehub_api_token=HF_API_KEY,
)

In [11]:
%pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [12]:
%pip install langchain-huggingface

Note: you may need to restart the kernel to use updated packages.


In [13]:
%pip install sentencepiece transformers

Note: you may need to restart the kernel to use updated packages.


In [14]:
from datetime import datetime
from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
from typing import List
from pydantic import BaseModel, Field
from langchain_community.utils.openai_functions import convert_pydantic_to_openai_function
from langchain_core.prompts import ChatPromptTemplate
from dotenv import load_dotenv
load_dotenv()

from langchain_huggingface import ChatHuggingFace, HuggingFacePipeline

llm = HuggingFacePipeline.from_model_id(
    model_id="HuggingFaceH4/zephyr-7b-beta",
    task="text-generation",
    device=0,
    pipeline_kwargs=dict(
        max_new_tokens=512,
        do_sample=False,
        repetition_penalty=1.03,
    ),
)

chat = ChatHuggingFace(llm=llm)

class Product(BaseModel):
    """Information about a Product."""
    Name: str = Field(None,description="Product Name of a product")
    Price: float = Field(None,description="Total amount of the product")
    
class Extraction_with_products(BaseModel):
    """ Extract the entities from the invoice document"""
    transaction_id :str =Field(None,description="Payment Transaction ID of the invoice document file")
    amount : float = Field(None,description="Total Invoice value of the invoice document file")
    datetime_field: datetime = Field(None,description="The date and time (24-hour format) when the payment was made.")
    mode_of_payment :str=Field(None,description="Tells about the mode of payment the user proceeeded")
    Products:List[Product] =Field(None,description="Tells about the details of list of products")

class Extraction(BaseModel):
    """ Extract the entities from the invoice document"""
    transaction_id :str =Field(None,description="Payment Transaction ID of the invoice document file")
    amount : float = Field(None,description="Total Invoice value of the invoice document file")
    datetime_field: datetime = Field(None,description="The date and time (24-hour format) when the payment was made.")
    mode_of_payment :str=Field(None,description="Tells about the mode of payment the user proceeeded") 
    
     
def extract_features(document):


    prompt_template="""You are tasked with extracting specific fields from an invoice document. The fields you need to extract are:

    1. Transaction ID
    2. Amount
    3. Date and Time (combined)
    4. Mode of Payment

    Please extract the following fields from the given invoice document. If any field cannot be extracted, set its value to `null`. 

    Invoice Document :{Document}
    """
    prompt_template = ChatPromptTemplate.from_template(prompt_template)
    llm_model = (
        prompt_template
        | chat.with_structured_output(Extraction)
        | JsonOutputFunctionsParser() 
    )
    return llm_model.invoke({
        "Document":document
    })
    
def extract_features_with_products(document):

    prompt_template="""You are tasked with extracting specific fields from an invoice document. The fields you need to extract are:

1. Transaction ID
2. Amount
3. Date and Time (combined)
4. Mode of Payment
5. List of Products (each product has a Name and Amount)

Please extract the following fields from the given invoice document. If any field cannot be extracted, set its value to `null`. 

Invoice Document :{Document}
"""
    prompt_template = ChatPromptTemplate.from_template(prompt_template)
    llm_model = (
        prompt_template
        | chat.with_structured_output(Extraction_with_products)
        | JsonOutputFunctionsParser() 
    )
    return llm_model.invoke({
        "Document":document
    })

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Loading checkpoint shards: 100%|██████████| 8/8 [00:05<00:00,  1.39it/s]


In [16]:
extract_features(splits)

ValidationError: 1 validation error for Generation
text
  Input should be a valid string [type=string_type, input_value=None, input_type=NoneType]
    For further information visit https://errors.pydantic.dev/2.9/v/string_type

In [17]:
def debug_model_response(document):
    prompt_template = """You are tasked with extracting specific fields from an invoice document. The fields you need to extract are:

    1. Transaction ID
    2. Amount
    3. Date and Time (combined)
    4. Mode of Payment

    Please extract the following fields from the given invoice document. If any field cannot be extracted, set its value to `null`. 

    Invoice Document :{Document}
    """
    prompt = prompt_template.format(Document=document)
    response = llm(prompt)  # Directly use the LLM
    print("Raw Model Response:", response)
    return response

# Test this with your document
raw_response = debug_model_response("Invoice Document Content Here")


  response = llm(prompt)  # Directly use the LLM


Raw Model Response: You are tasked with extracting specific fields from an invoice document. The fields you need to extract are:

    1. Transaction ID
    2. Amount
    3. Date and Time (combined)
    4. Mode of Payment

    Please extract the following fields from the given invoice document. If any field cannot be extracted, set its value to `null`. 

    Invoice Document :Invoice Document Content Here
    
    Expected Output:
    {
        "transactionId": "INV001",
        "amount": 1000,
        "dateTime": "2021-01-01T12:34:56",
        "modeOfPayment": "Credit Card"
    }





























































































































































































































































































































































































































