In [1]:
import easyocr
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_chroma import Chroma
from langchain_community.chat_models import ChatOllama
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from langchain_core.prompts import ChatPromptTemplate
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import CharacterTextSplitter
from PIL import Image



In [2]:
# Chat model used by the answering chain further down.
# "mistral" is the Ollama model name currently being tried; change it here to test another LLM.
llme = ChatOllama(
    model="mistral",
)

In [3]:
# OCR an image into plain text with EasyOCR (imported in the top import cell).
# The reader is initialized for English and Spanish.
# gpu=True is only a request: when neither CUDA nor MPS is available EasyOCR
# logs a notice and runs on CPU instead.
reader = easyocr.Reader(['en', 'es'], gpu=True)

# detail=0 makes readtext return only the recognized text strings,
# without bounding boxes or confidence scores.
text = reader.readtext('ooga/acetone-acs-l-2.png', detail=0)


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


In [4]:
# Load the PDF and split it into chunks for embedding.
loader = PyPDFLoader("data/acetone-acs-l.pdf")

# Parse the PDF exactly once; the splitter below works on these Document objects.
documents = loader.load()

# Character-based splitting targeting 1000-character chunks with no overlap.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(documents)

# Fail early with a clear message instead of an obscure error from the vector
# store when the PDF yields no chunks to embed.
assert docs, "No text chunks were produced from the PDF; nothing to embed."

ValueError: File path OOgaBoo.pdf is not a valid file or url

In [6]:
# Embed the chunks in `docs` and index them in a Chroma vector store,
# then expose the store as a retriever for the retrieval chain.
embeddings = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2",
)
db = Chroma.from_documents(documents=docs, embedding=embeddings)
retriever = db.as_retriever()

  from tqdm.autonotebook import tqdm, trange


AttributeError: 'str' object has no attribute 'page_content'

In [5]:
# Prompt template for the answering step.
# {context} receives the retrieved document text and {input} the user's question;
# the model is told to answer from the context only.
prompt = ChatPromptTemplate.from_template(
    "Answer the following question based only on the provided context:\n"
    "\n"
    "<context>\n"
    "{context}\n"
    "</context>\n"
    "\n"
    "Question: {input}"
)

In [6]:
# Retrieval pipeline.
# Step 1: the answering chain, which feeds the documents it is given into the
# prompt and sends the result to the LLM.
combine_docs_chain = create_stuff_documents_chain(llm=llme, prompt=prompt)

# Step 2: wrap it with the retriever so each query first fetches documents
# from the vector store and then passes them to the answering chain.
retrieval_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=combine_docs_chain,
)

In [7]:
# Run the chain: the "input" text is used as the question, and the generated
# text comes back under the "answer" key.
# NOTE(review): the request text ends mid-sentence ("clean the one which ") —
# confirm the intended wording.
response = retrieval_chain.invoke(
    {
        "input": (
            "Here is some text extracted from a PDF. "
            "Analyze it and generate structured JSON where main sections are "
            "parent keys and subsections are children. "
            "clean the one which "
        )
    }
)
print(response["answer"])

 {
      "Name": "Acetone",
      "Revision Date": "13-Oct-2023",
      "Method": "CC (closed cup)",
      "Evaporation Rate": "5.6 (Butyl Acetate = 1.0)",
      "Flammability": {
          "Not applicable": true
      },
      "Flammability or explosive limits": {
          "Upper": "12.8 vol %",
          "Lower": "2.5 vol %"
      },
      "Vapor Pressure": "247 mbar  @  20 °C",
      "Vapor Density": "2.0",
      "Specific Gravity": "0.790",
      "Solubility": "Soluble in water",
      "Partition coefficient; n-octanol/water": "No data available",
      "Autoignition Temperature": {
          "°C": "465",
          "°F": "869"
      },
      "Decomposition Temperature": "> 4°C",
      "Viscosity": "0.32 mPa.s @ 20 °C",
      "Molecular Formula": "C3 H6 O",
      "Molecular Weight": "58.08",
      "VOC Content(%)": "100",
      "Refractive index": "1.358 - 1.359",
      "Stability and reactivity": {
          "Reactive Hazard": "None known, based on information available",
        