In [1]:
import getpass, os, pymongo, pprint, json
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_mongodb import MongoDBAtlasVectorSearch
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from pymongo import MongoClient
from pymongo.operations import SearchIndexModel



In [8]:
import subprocess
import json
import os

# Directory where the downloaded dataset is cached between runs.
# NOTE(review): this is an absolute, machine-specific path. It is kept unchanged
# because a later cell reads the cached file from this same location.
data_dir = '/Users/sina/Library/Mobile Documents/com~apple~CloudDocs/git_projects/Startup/mern-dashboard/langchain_service/data'

# Create the data directory if it doesn't exist
os.makedirs(data_dir, exist_ok=True)

# Cache file for the rows returned by the Hugging Face datasets server
output_file = os.path.join(data_dir, 'data.json')

# Only hit the network when there is no cached copy yet
if not os.path.exists(output_file):
    # --fail makes curl exit non-zero on HTTP 4xx/5xx instead of handing back
    # the error page as if it were data; --silent/--show-error drop the
    # progress meter but keep real error messages on stderr.
    curl_command = [
        'curl', '--fail', '--silent', '--show-error', '-X', 'GET',
        'https://datasets-server.huggingface.co/rows?dataset=mychen76%2Finvoices-and-receipts_ocr_v1&config=default&split=train&offset=0&length=100'
    ]

    # Execute the curl command and capture the output
    result = subprocess.run(curl_command, capture_output=True, text=True)
    if result.returncode != 0:
        raise RuntimeError(
            f"curl failed with exit code {result.returncode}: {result.stderr.strip()}"
        )

    # Parse the JSON data
    data = json.loads(result.stdout)

    # Validate before caching: a bad payload written to disk would be reloaded
    # on every later run because the download is skipped once the file exists.
    if not isinstance(data, dict) or 'rows' not in data:
        raise ValueError("Unexpected response from datasets server: no 'rows' key found")

    # Write the formatted JSON to a file
    with open(output_file, 'w') as f:
        json.dump(data, f, indent=2)

    print(f"Data downloaded and saved to {output_file}")
else:
    print(f"File {output_file} already exists. Loading existing data.")
    with open(output_file, 'r') as f:
        data = json.load(f)

# Display the first few rows of the data
print("\nFirst few rows of the data:")
print(json.dumps(data['rows'][:2], indent=2))

Data downloaded and saved to /Users/sina/Library/Mobile Documents/com~apple~CloudDocs/git_projects/Startup/mern-dashboard/langchain_service/data/data.json

First few rows of the data:
[
  {
    "row_idx": 0,
    "row": {
      "image": {
        "src": "https://datasets-server.huggingface.co/cached-assets/mychen76/invoices-and-receipts_ocr_v1/--/83835c87346de32ac9223bdce5264e69ef3366ad/--/default/train/0/image/image.jpg?Expires=1727491824&Signature=Mvgn1Whal7c~-NlQ30jCUgdX60HX1-evjV9VF3JwUMV~z2bMvehUSyLsEEQVdwpH1i49fDPHI~A04Djarsq08JcxZLyw~eOOTbur76DyKrNWtkTz3jT4a~~S4TL8Wa0Mv-8k78IsNysThw9yvFeALTATRGyHJKGy3DLoS0xIWSahDOpWPcxmK8VcLc6XNGCrYjbgDm~cwY8dO~aSiiGJQ4I3fAcWr0EGcKnWlD8y~gsmzgDQn92WgSN-7Beg-9J-ei-~SVBOMf3OPPfKVjjMmFH6VUASmtdBOUcswngFd7lPQ1hdt-KXhxeP~Ca9taxRYnqBETdinS5gbQOVCywwVQ__&Key-Pair-Id=K3EI6M078Z3AC3",
        "height": 3508,
        "width": 2481
      },
      "id": "0",
      "parsed_data": "{\"xml\": \"\", \"json\": \"{'header': {'invoice_no': '40378170', 'invoice_date

In [2]:
# Prompt for both secrets interactively (getpass reads input without echoing it),
# so neither value has to be written into the notebook source.
os.environ.update({"OPENAI_API_KEY": getpass.getpass("OpenAI API Key:")})
ATLAS_CONNECTION_STRING = getpass.getpass("MongoDB Atlas SRV Connection String:")

In [21]:
# Names of the database, collection and vector search index used by later cells
db_name = "langchain_db"
collection_name = "invoice_db"
vector_search_index = "invoice_vector_index"

# Open a client with the connection string entered above and take a handle
# to the collection that will hold the embedded invoice documents
client = MongoClient(ATLAS_CONNECTION_STRING)
atlas_collection = client[db_name][collection_name]

In [22]:
# Location of the cached dataset written by the download cell
data_dir = '/Users/sina/Library/Mobile Documents/com~apple~CloudDocs/git_projects/Startup/mern-dashboard/langchain_service/data'
json_file = os.path.join(data_dir, 'data.json')

# Read the cached rows back from disk
with open(json_file, 'r') as f:
    data = json.load(f)


def _row_to_document(row):
    """Turn one dataset row into a LangChain ``Document``.

    The page content holds the invoice id followed by the ``json`` entry of the
    row's ``parsed_data`` field (``parsed_data`` is itself a JSON string and is
    decoded first). The metadata carries the id and the image dimensions.
    """
    record = row['row']
    parsed = json.loads(record['parsed_data'])
    text = (
        f"invoice_id: {record['id']}\n"
        f"parsed_data: {parsed['json']}\n"
    )
    return Document(
        page_content=text,
        metadata={
            "id": record['id'],
            "image_height": record['image']['height'],
            "image_width": record['image']['width'],
        },
    )


# One Document per row, in the order the rows appear in the file
documents = [_row_to_document(row) for row in data['rows']]

# Embed the documents and store them in the Atlas collection
vector_store = MongoDBAtlasVectorSearch.from_documents(
    documents=documents,
    embedding=OpenAIEmbeddings(disallowed_special=()),
    collection=atlas_collection,
    index_name=vector_search_index,
)

print(f"Vector store created with {len(documents)} documents.")

_id or id key found in metadata. Please pop from each dict and input as separate list.Retrieving methods will include the same id as '_id' in metadata.


Vector store created with 100 documents.


In [8]:
# Separate collection that stores the image payload and raw OCR text per invoice
image_collection_name = "image_bytes"
image_collection = client[db_name][image_collection_name]

# Build one document per dataset row.
# NOTE(review): the rows printed by the download cell show only 'src', 'height'
# and 'width' under 'image'; confirm that `data` really carries an
# 'image' -> 'bytes' entry (and a 'raw_data' entry) before running this cell.
image_documents = []
for row in data['rows']:
    image_doc = {
        "invoice_id": row['row']['id'],  # Using invoice_id as a linking field
        "image_bytes": row['row']['image']['bytes'],
        "raw_ocr": row['row']['raw_data']
    }
    image_documents.append(image_doc)

# Insert documents into the image collection
# Using insert_many for better performance with multiple documents
if image_documents:
    try:
        # Unique index on invoice_id: faster lookups, and duplicates are rejected
        image_collection.create_index("invoice_id", unique=True)

        # ordered=False lets the server continue past individual failures
        result = image_collection.insert_many(image_documents, ordered=False)
        print(f"Successfully inserted {len(result.inserted_ids)} image documents")

    except pymongo.errors.BulkWriteError as e:
        # Handle potential duplicate key errors.
        # Each write error carries the offending document under 'op'; drop it so
        # the image payload is not dumped into the cell output.
        write_errors = e.details.get('writeErrors', [])
        error_summary = [
            {key: value for key, value in err.items() if key != 'op'}
            for err in write_errors
        ]
        print(f"Encountered some errors during insertion: {error_summary}")
        # 'nInserted' is the number of documents that were written; the length
        # of 'writeErrors' is the number that failed.
        print(f"Successfully inserted documents: {e.details.get('nInserted', 0)}")

    except Exception as e:
        print(f"An error occurred: {e}")

Successfully inserted 100 image documents


In [23]:
# Index definition: a cosine-similarity vector field on "embedding" plus a
# filter field on "id" so searches can be pre-filtered by invoice id.
# numDimensions must match the size of the vectors stored in "embedding".
search_index_model = SearchIndexModel(
    definition={
        "fields": [
            {
                "type": "vector",
                "path": "embedding",
                "numDimensions": 1536,
                "similarity": "cosine"
            },
            {
                "type": "filter",
                "path": "id",
            }
        ]
    },
    # Reuse the name the vector store was configured with so the two cannot drift apart
    name=vector_search_index,
    type="vectorSearch"
)

# Only create the index when no index with this name exists yet, so the cell
# can be re-run without failing on a duplicate index.
existing_indexes = list(atlas_collection.list_search_indexes(vector_search_index))
if existing_indexes:
    print(f"Search index '{vector_search_index}' already exists; skipping creation.")
else:
    created_index_name = atlas_collection.create_search_index(model=search_index_model)
    print(f"Created search index '{created_index_name}'.")

'invoice_vector_index'

### Basic Semantic Search

In [24]:
# Look up the documents most similar to a seller description
# (no k is passed, so the vector store's default result count applies)
query = "Patel, Thompson and Montgomery 356 Kyle Vista New James, MA 46228"
results = vector_store.similarity_search(query=query)

pprint.pprint(results)

[Document(metadata={'_id': '6722888f4c84a04060eab4ef', 'id': '0', 'image_height': 3508, 'image_width': 2481}, page_content='invoice_id: 0\nparsed_data: {\'header\': {\'invoice_no\': \'40378170\', \'invoice_date\': \'10/15/2012\', \'seller\': \'Patel, Thompson and Montgomery 356 Kyle Vista New James, MA 46228\', \'client\': \'Jackson, Odonnell and Jackson 267 John Track Suite 841 Jenniferville, PA 98601\', \'seller_tax_id\': \'958-74-3511\', \'client_tax_id\': \'998-87-7723\', \'iban\': \'GB77WRBQ31965128414006\'}, \'items\': [{\'item_desc\': "Leed\'s Wine Companion Bottle Corkscrew Opener Gift Box Set with Foil Cutter", \'item_qty\': \'1,00\', \'item_net_price\': \'7,50\', \'item_net_worth\': \'7,50\', \'item_vat\': \'10%\', \'item_gross_worth\': \'8,25\'}], \'summary\': {\'total_net_worth\': \'$7,50\', \'total_vat\': \'$0,75\', \'total_gross_worth\': \'$8,25\'}}\n'),
 Document(metadata={'_id': '6722888f4c84a04060eab53d', 'id': '169', 'image_height': 3508, 'image_width': 2481}, page_co

### Semantic Search with Score and Filter

In [25]:
# Alternative query text:
# query = "Patel, Thompson and Montgomery 356 Kyle Vista New James, MA 46228"
query = "Invoice no: 40378170, invoice date 10/15/2012"

# Scored search for up to 5 matches, restricted by a pre-filter to documents
# whose "id" field is "0"
results = vector_store.similarity_search_with_score(
    query=query,
    k=5,
    pre_filter={"id": {"$in": ["0"]}},
)
pprint.pprint(results)

[(Document(metadata={'_id': '6722888f4c84a04060eab4ef', 'id': '0', 'image_height': 3508, 'image_width': 2481}, page_content='invoice_id: 0\nparsed_data: {\'header\': {\'invoice_no\': \'40378170\', \'invoice_date\': \'10/15/2012\', \'seller\': \'Patel, Thompson and Montgomery 356 Kyle Vista New James, MA 46228\', \'client\': \'Jackson, Odonnell and Jackson 267 John Track Suite 841 Jenniferville, PA 98601\', \'seller_tax_id\': \'958-74-3511\', \'client_tax_id\': \'998-87-7723\', \'iban\': \'GB77WRBQ31965128414006\'}, \'items\': [{\'item_desc\': "Leed\'s Wine Companion Bottle Corkscrew Opener Gift Box Set with Foil Cutter", \'item_qty\': \'1,00\', \'item_net_price\': \'7,50\', \'item_net_worth\': \'7,50\', \'item_vat\': \'10%\', \'item_gross_worth\': \'8,25\'}], \'summary\': {\'total_net_worth\': \'$7,50\', \'total_vat\': \'$0,75\', \'total_gross_worth\': \'$8,25\'}}\n'),
  0.9009585380554199)]


### Basic RAG

This example does the following:

- Instantiates Atlas Vector Search as a retriever to query for similar documents, including the optional `k` parameter to search for only the 5 most relevant documents.

- Defines a LangChain prompt template to instruct the LLM to use these documents as context for your query. LangChain passes these documents to the {context} input variable and your query to the {question} variable.

- Constructs a chain that specifies the following:

  - Atlas Vector Search as the retriever to search for documents that are used as context by the LLM.

  - The prompt template that you constructed.

  - OpenAI's chat model as the LLM used to generate a context-aware response.

- Prompts the chain with a sample query asking for the invoice number of a given seller.

- Returns the LLM's response and the documents used as context. The generated response might vary.


In [26]:
def format_docs(docs):
    """Render retrieved documents as a single context string for the prompt.

    Every document contributes an "Invoice ID" line taken from its ``id``
    metadata and a "Content" line holding its page content; the per-document
    entries are joined with "\\n\\n".
    """
    rendered = (
        f"Invoice ID: {item.metadata['id']}\n"
        f"Content: {item.page_content}\n"
        for item in docs
    )
    return "\n\n".join(rendered)


# Retriever over the vector store: plain similarity search, top 5 documents
retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 5},
)

# Prompt that places the retrieved context ahead of the user's question
template = """
Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know. Don't try to make up an answer.
{context}
Question: {question}
"""
custom_rag_prompt = PromptTemplate.from_template(template)

llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

# Chain: retrieve and format the context, pass the question through unchanged,
# fill the prompt, call the model, and return the reply as a plain string
chain_inputs = {"context": retriever | format_docs, "question": RunnablePassthrough()}
rag_chain = chain_inputs | custom_rag_prompt | llm | StrOutputParser()

# Ask a sample question
# question = "Whos is the seller for invoice no: 40378170"
question = "what is the invoice number for the seller that has the following description: 'Patel, Thompson and Montgomery 356 Kyle Vista New James, MA 46228'"
answer = rag_chain.invoke(question)

print("Question:" + question)
print("Answer:" + answer)

# Run the retriever once more on its own to show the source documents
documents = retriever.invoke(question)
print("\nSource documents:")
pprint.pprint(documents)

Question:what is the invoice number for the seller that has the following description: 'Patel, Thompson and Montgomery 356 Kyle Vista New James, MA 46228'
Answer:The invoice number for the seller 'Patel, Thompson and Montgomery 356 Kyle Vista New James, MA 46228' is '40378170'.

Source documents:
[Document(metadata={'_id': '6722888f4c84a04060eab4ef', 'id': '0', 'image_height': 3508, 'image_width': 2481}, page_content='invoice_id: 0\nparsed_data: {\'header\': {\'invoice_no\': \'40378170\', \'invoice_date\': \'10/15/2012\', \'seller\': \'Patel, Thompson and Montgomery 356 Kyle Vista New James, MA 46228\', \'client\': \'Jackson, Odonnell and Jackson 267 John Track Suite 841 Jenniferville, PA 98601\', \'seller_tax_id\': \'958-74-3511\', \'client_tax_id\': \'998-87-7723\', \'iban\': \'GB77WRBQ31965128414006\'}, \'items\': [{\'item_desc\': "Leed\'s Wine Companion Bottle Corkscrew Opener Gift Box Set with Foil Cutter", \'item_qty\': \'1,00\', \'item_net_price\': \'7,50\', \'item_net_worth\': 

### RAG with Filtering

This example does the following:

- Instantiates Atlas Vector Search as a retriever to query for similar documents, including the following optional parameters:

  - `k` to search for only the `10` most relevant documents.

  - `score_threshold` to use only documents with a relevance score above `0.1`.

    - Note
    This parameter refers to a relevance score that Langchain uses to normalize your results, and not the relevance score used in Atlas Search queries. To use Atlas Search scores in your RAG implementation, define a custom retriever that uses the similarity_search_with_score method and filters by the Atlas Search score.

  - `pre_filter` to restrict the results to documents whose `id` field is `0`.

- Defines a LangChain prompt template to instruct the LLM to use these documents as context for your query. LangChain passes these documents to the  `{context}` input variable and your query to the `{question}` variable.

- Constructs a chain that specifies the following:

  - Atlas Vector Search as the retriever to search for documents that are used as context by the LLM.

  - The prompt template that you constructed.

  - OpenAI's chat model as the LLM used to generate a context-aware response.

- Prompts the chain with a sample query asking for the invoice number of a given seller.

- Returns the LLM's response and the documents used as context. The generated response might vary.


In [27]:
def format_docs(docs):
    """Render retrieved documents as a single context string for the prompt.

    Every document contributes an "Invoice ID" line taken from its ``id``
    metadata and a "Content" line holding its page content; the per-document
    entries are joined with "\\n\\n".
    """
    rendered = (
        f"Invoice ID: {item.metadata['id']}\n"
        f"Content: {item.page_content}\n"
        for item in docs
    )
    return "\n\n".join(rendered)


# Retriever with a relevance-score threshold: up to 10 documents scoring at
# least 0.1, pre-filtered to documents whose "id" field is "0"
retriever = vector_store.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={
        "k": 10,
        "score_threshold": 0.1,
        "pre_filter": {"id": {"$in": ["0"]}},
    },
)

# Prompt that places the retrieved context ahead of the user's question
template = """
Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
{context}
Question: {question}
"""
custom_rag_prompt = PromptTemplate.from_template(template)

llm = ChatOpenAI(model="gpt-4o")

# Chain: retrieve and format the context, pass the question through unchanged,
# fill the prompt, call the model, and return the reply as a plain string
chain_inputs = {"context": retriever | format_docs, "question": RunnablePassthrough()}
rag_chain = chain_inputs | custom_rag_prompt | llm | StrOutputParser()

# Ask a sample question
# question = "Whos is the seller for invoice no: 40378170"
question = "what is the invoice number for the seller that has the following description: 'Patel, Thompson and Montgomery 356 Kyle Vista New James, MA 46228'"
answer = rag_chain.invoke(question)

print("Question: " + question)
print("Answer: " + answer)

# Run the retriever once more on its own to show the source documents
documents = retriever.invoke(question)
print("\nSource documents:")
pprint.pprint(documents)

Question: what is the invoice number for the seller that has the following description: 'Patel, Thompson and Montgomery 356 Kyle Vista New James, MA 46228'
Answer: The invoice number for the seller 'Patel, Thompson and Montgomery 356 Kyle Vista New James, MA 46228' is 40378170.

Source documents:
[Document(metadata={'_id': '6722888f4c84a04060eab4ef', 'id': '0', 'image_height': 3508, 'image_width': 2481}, page_content='invoice_id: 0\nparsed_data: {\'header\': {\'invoice_no\': \'40378170\', \'invoice_date\': \'10/15/2012\', \'seller\': \'Patel, Thompson and Montgomery 356 Kyle Vista New James, MA 46228\', \'client\': \'Jackson, Odonnell and Jackson 267 John Track Suite 841 Jenniferville, PA 98601\', \'seller_tax_id\': \'958-74-3511\', \'client_tax_id\': \'998-87-7723\', \'iban\': \'GB77WRBQ31965128414006\'}, \'items\': [{\'item_desc\': "Leed\'s Wine Companion Bottle Corkscrew Opener Gift Box Set with Foil Cutter", \'item_qty\': \'1,00\', \'item_net_price\': \'7,50\', \'item_net_worth\': 

In [None]:
# def setup_rag_pipeline(vector_store, config: Config):
#     retriever = vector_store.as_retriever(
#         search_type="similarity",
#         search_kwargs={"k": config.RETRIEVER_K}
#     )

#     template = """
#     You are an AI assistant analyzing invoice data. Use the following context to answer the question.
#     If the answer cannot be determined from the context, say "I cannot answer this based on the available information."
    
#     Context:
#     {context}
    
#     Question: {question}
    
#     Remember to:
#     - Only use information from the provided context
#     - Be specific and cite invoice IDs when relevant
#     - Indicate if any information is unclear or missing
    
#     Answer:
#     """
#     custom_rag_prompt = PromptTemplate.from_template(template)

#     llm = ChatOpenAI(
#         model=config.LLM_MODEL, 
#         temperature=config.LLM_TEMPERATURE,
#         request_timeout=config.REQUEST_TIMEOUT,
#         max_retries=config.MAX_RETRIES
#     )

#     def format_docs(docs):
#         formatted_docs = []
#         for doc in docs:
#             formatted_doc = f"Invoice ID: {doc.metadata['id']}\n"
#             formatted_doc += f"Content: {doc.page_content}\n"
#             formatted_docs.append(formatted_doc)
#         return "\n\n".join(formatted_docs)

#     rag_chain = (
#         {"context": retriever | format_docs, "question": RunnablePassthrough()}
#         | custom_rag_prompt
#         | llm
#         | StrOutputParser()
#     )

#     return rag_chain, retriever