In [1]:
import os
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
GCS_OUTPUT_PATH = "gs://doc_parser12345/"
# PROCESSOR_NAME = "https://us-documentai.googleapis.com/v1/projects/712357922075/locations/us/processors/5d98407fb8df954c:process"
PROCESSOR_NAME = "projects/712357922075/locations/us/processors/5d98407fb8df954c"

os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = './docextraction-440803-2193a2f4556f.json'


In [3]:
from langchain_core.document_loaders.blob_loaders import Blob
from langchain_google_community import DocAIParser

In [4]:
parser = DocAIParser(
    location="us", processor_name=PROCESSOR_NAME, gcs_output_path=GCS_OUTPUT_PATH
)

In [5]:
blob = Blob(
    path="gs://doc_parser12345/document-001-115484.out.000.pdf"
)

In [6]:
docs = list(parser.lazy_parse(blob))
print(len(docs))

1


In [7]:
print(len(docs))

1


In [8]:
docs

[Document(metadata={'page': 1, 'source': 'gs://doc_parser12345/document-001-115484.out.000.pdf'}, page_content='SHARP\nGL1F20\nGL1F20\nFeatures\n1. IrDA1.0 compatible infrared emitting diode\n(Transmission rate: 2.4 to 115.2kbps)\n2. Built-in infrared emitting diode circuit\n3. Recommended use in combination with detector (IS1U20)\nInfrared Communication\n(IrDA1.0 Compatible)\nInfraredEmitting Diode\n| Outline Dimensions\nMAX. 0.6\n5.6\n(Unit: mm)\nR 1.4\nTransparent resin\nDetector center\nApplications\n1. Personal computers\n2. Portable information terminal equipment\n3. Printers\n4. Word processors\nIrDA Abbreviation of the Infrared Data Association established\nfor standardization of infrared communication specifications\n3-0.45\n1.27\nAbsolute Maximum Ratings\nParameter\nForward current\n*1 Peak forward current\nOperating temperature\nStorage temperature\n#2\nSoldering temperature\nA\n17.9\n1.0\n1.27\nMIN 0.3\n(Ta=25°C)\nSymbol Rating\nUnit\nIF\n50\nmA\nTerminal configuration\n1 B

In [9]:
type(docs[0])

langchain_core.documents.base.Document

In [10]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
from langchain_chroma import Chroma

vectorstore = Chroma.from_documents(documents=docs, embedding=GoogleGenerativeAIEmbeddings(model="models/embedding-001"))


In [26]:
from langchain import hub

# Retrieve and generate using the relevant snippets of the blog.
retriever = vectorstore.as_retriever()
prompt = hub.pull("rlm/rag-prompt")


def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)



In [30]:
from langchain_google_genai import ChatGoogleGenerativeAI

model = ChatGoogleGenerativeAI(model="gemini-1.5-flash")


In [31]:
from typing import Optional

from pydantic import BaseModel, Field

class ElectricalSpecifications(BaseModel):
    forward_current: float = Field(..., description="Forward current in amperes.")
    peak_forward_current: float = Field(..., description="Peak forward current in amperes.")
    operating_temperature: float = Field(..., description="Operating temperature in degrees Celsius.")
    storage_temperature: float = Field(..., description="Storage temperature in degrees Celsius.")
    soldering_temperature: float = Field(..., description="Soldering temperature in degrees Celsius.")

In [32]:
structured_llm = model.with_structured_output(ElectricalSpecifications)


In [33]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | structured_llm
)

In [34]:
response = rag_chain.invoke("Extract the forward current  peak forward current, operating temperature, storage temperature, soldering temperature?")

Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1


In [35]:
response

ElectricalSpecifications(forward_current=50.0, peak_forward_current=400.0, operating_temperature=70.0, storage_temperature=85.0, soldering_temperature=260.0)

In [42]:
response = rag_chain.invoke("give me the rating info?")

Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1


In [None]:
response.

ElectricalSpecifications(forward_current=50.0, peak_forward_current=400.0, operating_temperature=70.0, storage_temperature=85.0, soldering_temperature=260.0)

In [44]:
retriever.invoke("Extract the forward current  peak forward current, operating temperature, storage temperature, soldering temperature?")

Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1


[Document(metadata={'page': 1, 'source': 'gs://doc_parser12345/document-001-115484.out.000.pdf'}, page_content='SHARP\nGL1F20\nGL1F20\nFeatures\n1. IrDA1.0 compatible infrared emitting diode\n(Transmission rate: 2.4 to 115.2kbps)\n2. Built-in infrared emitting diode circuit\n3. Recommended use in combination with detector (IS1U20)\nInfrared Communication\n(IrDA1.0 Compatible)\nInfraredEmitting Diode\n| Outline Dimensions\nMAX. 0.6\n5.6\n(Unit: mm)\nR 1.4\nTransparent resin\nDetector center\nApplications\n1. Personal computers\n2. Portable information terminal equipment\n3. Printers\n4. Word processors\nIrDA Abbreviation of the Infrared Data Association established\nfor standardization of infrared communication specifications\n3-0.45\n1.27\nAbsolute Maximum Ratings\nParameter\nForward current\n*1 Peak forward current\nOperating temperature\nStorage temperature\n#2\nSoldering temperature\nA\n17.9\n1.0\n1.27\nMIN 0.3\n(Ta=25°C)\nSymbol Rating\nUnit\nIF\n50\nmA\nTerminal configuration\n1 B