In [12]:
import getpass, os, pymongo, pprint, json
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_mongodb import MongoDBAtlasVectorSearch
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from pymongo import MongoClient
from pymongo.operations import SearchIndexModel

In [8]:
import subprocess
import json
import os

# Fetch the first 100 rows of the invoices-and-receipts OCR dataset from the
# Hugging Face datasets-server API and cache them as JSON on disk. Later runs
# load the cached file instead of downloading again.

# Define the data directory path
data_dir = '/Users/sina/Library/Mobile Documents/com~apple~CloudDocs/git_projects/Startup/mern-dashboard/langchain_service/data'

# Create the data directory if it doesn't exist
os.makedirs(data_dir, exist_ok=True)

# Define the output file path
output_file = os.path.join(data_dir, 'data.json')

# Check if the file already exists
if not os.path.exists(output_file):
    # --fail makes curl exit non-zero on HTTP errors instead of returning the
    # error body as if it were data; --silent/--show-error drop the progress
    # meter but keep genuine error messages on stderr.
    curl_command = [
        'curl', '--fail', '--silent', '--show-error', '-X', 'GET',
        'https://datasets-server.huggingface.co/rows?dataset=mychen76%2Finvoices-and-receipts_ocr_v1&config=default&split=train&offset=0&length=100'
    ]

    # Execute the curl command and capture the output (JSON is UTF-8 encoded)
    result = subprocess.run(curl_command, capture_output=True, text=True, encoding='utf-8')
    if result.returncode != 0:
        raise RuntimeError(
            f"curl exited with code {result.returncode}: {result.stderr.strip()}"
        )

    # Parse the JSON data
    data = json.loads(result.stdout)

    # Validate before caching: a payload without 'rows' written to disk would
    # be reloaded on every later run and break every cell that reads it.
    if not isinstance(data, dict) or 'rows' not in data:
        raise ValueError("Unexpected response from datasets-server: no 'rows' key in payload")

    # Write the formatted JSON to a file
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2)

    print(f"Data downloaded and saved to {output_file}")
else:
    print(f"File {output_file} already exists. Loading existing data.")
    with open(output_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

# Display the first few rows of the data
print("\nFirst few rows of the data:")
print(json.dumps(data['rows'][:2], indent=2))

Data downloaded and saved to /Users/sina/Library/Mobile Documents/com~apple~CloudDocs/git_projects/Startup/mern-dashboard/langchain_service/data/data.json

First few rows of the data:
[
  {
    "row_idx": 0,
    "row": {
      "image": {
        "src": "https://datasets-server.huggingface.co/cached-assets/mychen76/invoices-and-receipts_ocr_v1/--/83835c87346de32ac9223bdce5264e69ef3366ad/--/default/train/0/image/image.jpg?Expires=1727491824&Signature=Mvgn1Whal7c~-NlQ30jCUgdX60HX1-evjV9VF3JwUMV~z2bMvehUSyLsEEQVdwpH1i49fDPHI~A04Djarsq08JcxZLyw~eOOTbur76DyKrNWtkTz3jT4a~~S4TL8Wa0Mv-8k78IsNysThw9yvFeALTATRGyHJKGy3DLoS0xIWSahDOpWPcxmK8VcLc6XNGCrYjbgDm~cwY8dO~aSiiGJQ4I3fAcWr0EGcKnWlD8y~gsmzgDQn92WgSN-7Beg-9J-ei-~SVBOMf3OPPfKVjjMmFH6VUASmtdBOUcswngFd7lPQ1hdt-KXhxeP~Ca9taxRYnqBETdinS5gbQOVCywwVQ__&Key-Pair-Id=K3EI6M078Z3AC3",
        "height": 3508,
        "width": 2481
      },
      "id": "0",
      "parsed_data": "{\"xml\": \"\", \"json\": \"{'header': {'invoice_no': '40378170', 'invoice_date

In [10]:
# Ask for both secrets interactively so that neither is stored in the notebook.
# The OpenAI key is placed in the process environment; the Atlas SRV connection
# string is kept in a variable for the cell that opens the MongoDB client.
os.environ.update({"OPENAI_API_KEY": getpass.getpass("OpenAI API Key:")})
ATLAS_CONNECTION_STRING = getpass.getpass("MongoDB Atlas SRV Connection String:")

In [25]:
# Open a client for the Atlas cluster using the connection string entered earlier
client = MongoClient(ATLAS_CONNECTION_STRING)

# Names of the database, of the collection, and of the vector search index
db_name, collection_name = "langchain_db", "invoice_db"
vector_search_index = "invoice_vector_index"

# Handle to the collection the rest of the notebook works with
atlas_collection = client.get_database(db_name).get_collection(collection_name)

In [26]:
# Read the cached dataset back from disk
data_dir = '/Users/sina/Library/Mobile Documents/com~apple~CloudDocs/git_projects/Startup/mern-dashboard/langchain_service/data'
json_file = os.path.join(data_dir, 'data.json')

with open(json_file, 'r') as f:
    data = json.load(f)


def build_invoice_document(row):
    """Convert one dataset row into a ``Document``.

    The page content concatenates the invoice id, the ``json`` entry of the
    decoded ``parsed_data`` field and the ``raw_data`` field. The invoice id
    and the image URL are attached as metadata.
    """
    record = row['row']
    parsed = json.loads(record['parsed_data'])
    text = (
        f"Invoice ID: {record['id']}\n"
        f"Parsed Data: {parsed['json']}\n"
        f"Raw Data: {record['raw_data']}"
    )
    return Document(
        page_content=text,
        metadata={
            "id": record['id'],
            "image_url": record['image']['src'],
        },
    )


# One Document per dataset row
documents = [build_invoice_document(row) for row in data['rows']]

# Build the vector store from the documents, using OpenAI embeddings and the
# Atlas collection together with the configured index name
vector_store = MongoDBAtlasVectorSearch.from_documents(
    documents=documents,
    embedding=OpenAIEmbeddings(disallowed_special=()),
    collection=atlas_collection,
    index_name=vector_search_index,
)

print(f"Vector store created with {len(documents)} documents.")

_id or id key found in metadata. Please pop from each dict and input as separate list.Retrieving methods will include the same id as '_id' in metadata.


Vector store created with 100 documents.


In [27]:
# Define the Atlas Vector Search index for the invoice collection, create it if
# it is missing, and wait (bounded) until it can serve queries.
import time

search_index_model = SearchIndexModel(
    definition={
        "fields": [
            {
                "type": "vector",
                "path": "embedding",
                # NOTE(review): must match the length of the stored embedding
                # vectors; 1536 presumably corresponds to the default
                # OpenAIEmbeddings model -- confirm if the model changes.
                "numDimensions": 1536,
                "similarity": "cosine"
            },
            {
                # Makes the "id" field usable as a filter in vector searches
                "type": "filter",
                "path": "id",
            }
        ]
    },
    # Reuse the name the vector store was configured with, so the index and
    # the store cannot silently drift apart.
    name=vector_search_index,
    type="vectorSearch"
)

# Creating an index whose name already exists is rejected by Atlas, so only
# create it when absent; this keeps the cell safe to re-run.
if not list(atlas_collection.list_search_indexes(vector_search_index)):
    atlas_collection.create_search_index(model=search_index_model)

# The index is built asynchronously. Poll until Atlas reports it as queryable
# so that a search issued right after this cell does not come back empty.
index_ready_deadline = time.monotonic() + 120
while time.monotonic() < index_ready_deadline:
    matching_indexes = list(atlas_collection.list_search_indexes(vector_search_index))
    if matching_indexes and matching_indexes[0].get("queryable"):
        break
    time.sleep(5)
else:
    print(f"Index {vector_search_index!r} is not queryable yet; searches may return no results for now.")

# Display the index name as the cell result
vector_search_index

'invoice_vector_index'

In [29]:
# Free-text query: a company name followed by a postal address
query = "Patel, Thompson and Montgomery 356 Kyle Vista New James, MA 46228"

# Retrieve the stored documents closest to the query, using the library's
# default search settings
results = vector_store.similarity_search(query=query)

# Pretty-print the returned Document objects
pprint.pprint(results)

[Document(metadata={'_id': '66f784585b9fd28f7d11feff', 'id': '139', 'image_url': 'https://datasets-server.huggingface.co/cached-assets/mychen76/invoices-and-receipts_ocr_v1/--/83835c87346de32ac9223bdce5264e69ef3366ad/--/default/train/45/image/image.jpg?Expires=1727491824&Signature=agsg8gP4hLLUGtJmOf12mxlk2JTin1yw3hWVJk2qL2VwkaCA5~qyHiUKKfKhogzr6qp2iacDspHmfBz5C6A9N7LTf3YYsp2lEdNvtTHPOJ~yivyhL5zU7txF3ci-IXCDggII-gcBqlQmHcMcG9uqXe0vO~AbSgQ28Gvw5zXrST0yoiR~gZpXvlRt0eAYK9GIerx2LS6Ju0lcHtGYOsdBZgmcAl~f-UGvK237OBSJvnGyDY-B5ZbvtUjBdVZDSJoikzWvgdMqAhWI75H1HZT58UeGPS9fjmYOHk1gFBnojFYvvmeaD2VE5iTMgOS7qyWRY54CY4sI-1tLCeg8rm9geg__&Key-Pair-Id=K3EI6M078Z3AC3'}, page_content='Invoice ID: 139\nParsed Data: {\'header\': {\'invoice_no\': \'19512841\', \'invoice_date\': \'06/18/2012\', \'seller\': \'Camacho-Stewart 99489 Brown Stream Whitetown, NY 24886\', \'client\': \'Fleming Ltd 55160 Robert Way Jamesland, KY 92755\', \'seller_tax_id\': \'971-74-5288\', \'client_tax_id\': \'914-75-0393\', \'iban\': \