In [1]:

# Dependencies for this notebook. They are kept inside a string literal, so
# running this cell installs nothing and merely echoes the string as its output.
# To install, run the two `pip install` lines in a cell of their own.
'''
# requirements
!pip install langchain
!pip install openai
'''

'\n# requirements\n!pip install langchain\n!pip install openai\n'

# PDF Loader
[Langchain PDF reference link](https://python.langchain.com/docs/modules/data_connection/document_loaders/how_to/pdf)

## Setup environment  

In [2]:
from langchain.llms import OpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma

from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA

import os

# OpenAI credential taken from the environment; None when the variable is unset.
api_key = os.environ.get("OPENAI_API_KEY")


In [3]:
# Demo: analyse a product specification sheet stored as a PDF file.
#
# Other sample documents that can be swapped in for the active one:
#   PyPDFLoader("example_data/rxi-web-panel-emerson.pdf")
#   PyPDFLoader("example_data/HEM-6232T Manual - Omron Healthcare.pdf")
#   PyPDFLoader("example_data/CONDENSED CATALOG - lotek.dk.pdf")
loader = PyPDFLoader("example_data/10-04-580-SPC - Wall-Smart.pdf")

# Read the PDF here; chunking is done separately by the text splitter further
# down (`loader.load_and_split()` would be the one-step alternative).
doc = loader.load()


## Text Splitter

[Document transformers](https://python.langchain.com/docs/modules/data_connection/document_transformers/)
<br>
超長文本的切割

In [4]:

# Break the loaded document into overlapping chunks: a target size of 2000 per
# chunk, with neighbouring chunks sharing up to 1000 so that context spanning a
# chunk boundary is not lost.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=1000,
)
pages = splitter.split_documents(doc)

## Embeddings
The Embeddings class is a class designed for interfacing with text embedding models. 
<br>
[Text embedding models(e.g. openAI, huggingface)](https://python.langchain.com/docs/modules/data_connection/text_embedding/)

In [5]:
# Embedding model client handed to the vector store below; constructed with
# its default settings (no arguments are passed).
embeddings = OpenAIEmbeddings()


## Store openai embedding vectors
One of the most common ways to store and search over unstructured data is to embed it and store the resulting embedding vectors, and then at query time to embed the unstructured query and retrieve the embedding vectors that are 'most similar' to the embedded query. A vector store takes care of storing embedded data and performing vector search for you.
<br>
Vector store 提供儲存以及檢索非結構向量資料，透過 query 檢索最相似的向量。

[other langchain vector stores](https://python.langchain.com/docs/modules/data_connection/vectorstores/)
<br>
[🦜⛓️ + Chroma](https://blog.langchain.dev/langchain-chroma/)

In [6]:
# Chroma, the AI-native open-source embedding database.
#
# Embed every chunk in `pages` and index the resulting vectors. No
# `persist_directory` is passed here; supply one to store the embeddings data.
db = Chroma.from_documents(
    documents=pages,
    embedding=embeddings,
)


In [7]:
# If the store above is built with a `persist_directory`, the persisted database can be loaded from disk and used as normal:

# persist_directory = 'chroma_db'
# db = Chroma(persist_directory=persist_directory, embedding_function=embeddings)


## Retriever
A retriever is an interface that returns documents given an unstructured query.
<br>
Chain types introduced here: `stuff`, `refine`, `map_reduce`
<br>
[Retrievers link](https://python.langchain.com/docs/modules/data_connection/retrievers/)

In [8]:
# from langchain.chains import RetrievalQA
# retriever = db.as_retriever(search_type= 'similarity')

# qa = RetrievalQA.from_chain_type(llm=OpenAI(), chain_type='refine', retriever=retriever, return_source_documents=True)

## Keyword retrieval with prompt template

定義要面板搜尋的關鍵字詞，客製 Prompt template 


In [18]:

# Custom prompt for the QA chain: it declares two placeholders, {context} and
# {question}, and instructs the model to answer in JSON.
prompt_template = """Use the following pieces of context to answer the question, if you don't know the answer, leave it blank don't try to make up an answer.
                {context}
                Question: {question}
                Answer in JSON representations
                """

PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

# Passed through to the underlying "stuff" chain so it uses PROMPT.
chain_type_kwargs = {"prompt": PROMPT}

# Retrieval-augmented QA chain over the Chroma index built above.
rqa = RetrievalQA.from_chain_type(
    llm=OpenAI(),
    chain_type="stuff",
    retriever=db.as_retriever(),
    chain_type_kwargs=chain_type_kwargs,
)




In [19]:
# Question sent to the chain; the names listed after "include:" are the fields
# requested in the answer.
query = """ What is the display specifications?
                include: company_name, product_name, size_inch,resolution,contrast,operation_temperature,sunlight_readable,antiglare,
        """
        
# Run the chain on the query; `res` holds the answer, which the next cell
# parses as JSON.
res = rqa.run(query)

Number of requested results 4 is greater than number of elements in index 2, updating n_results = 2


In [20]:
import json


def parse_json_answer(text):
    """Parse the JSON object contained in an LLM answer.

    The answer is first parsed as-is. If that fails (for example because the
    model wrapped the JSON in explanatory prose or a Markdown code fence), the
    span from the first "{" to the last "}" is parsed instead.

    Args:
        text: Raw answer string returned by the chain.

    Returns:
        The decoded JSON value (a dict for the prompt used in this notebook).

    Raises:
        json.JSONDecodeError: If the text is not valid JSON and no parsable
            JSON object can be found inside it.
    """
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        start, end = text.find("{"), text.rfind("}")
        if start == -1 or end <= start:
            # Nothing resembling an object: surface the original parse error.
            raise
        return json.loads(text[start : end + 1])


info = parse_json_answer(res)
info

{'company_name': 'Control4 Corporation',
 'product_name': 'Control4® T3 Series 7" Tabletop Touch Screen',
 'size_inch': 7,
 'resolution': '1280 × 800',
 'contrast': None,
 'operation_temperature': '32 ~ 104˚F (0˚ ~ 40˚C)',
 'sunlight_readable': None,
 'antiglare': None}