In [1]:
!pip install PyPDF2
!pip install -qqq llama-index llama-hub langchain openai accelerate==0.21.0 bitsandbytes==0.40.2 InstructorEmbedding chromadb
!pip install sentence-transformers==2.2.2
!pip install -U langchain-community

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.9/103.9 MB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m973.7/973.7 kB[0m [31m69.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.6/320.6 kB[0m [31m35.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m26.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.5/92.5 MB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m526.8/526.8 kB[0m [31m48.4 MB/s[0m eta [36m0:00:

In [2]:
import json
import torch
from pathlib import Path

# transformers
from transformers import BitsAndBytesConfig

# llama_index
from llama_index.core.prompts import PromptTemplate
from llama_index.legacy.llms import HuggingFaceLLM
from llama_index.legacy import download_loader, Document, VectorStoreIndex, ServiceContext
from llama_index.legacy.node_parser import SentenceSplitter
from llama_index.legacy.schema import IndexNode
from langchain.embeddings import HuggingFaceInstructEmbeddings
from llama_index.legacy.response.notebook_utils import display_source_node
from llama_index.legacy.retrievers import RecursiveRetriever
from llama_index.legacy.query_engine import RetrieverQueryEngine
from llama_index.legacy.vector_stores import ChromaVectorStore
from llama_index.legacy.storage.storage_context import StorageContext
from llama_index.core.prompts.prompts import SimpleInputPrompt

# Metadata Extraction
from llama_index.legacy.extractors import (
    SummaryExtractor,
    QuestionsAnsweredExtractor,
)

# db
import chromadb

DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"

[nltk_data] Downloading package stopwords to
[nltk_data]     /usr/local/lib/python3.10/dist-
[nltk_data]     packages/llama_index/legacy/_static/nltk_cache...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     /usr/local/lib/python3.10/dist-
[nltk_data]     packages/llama_index/legacy/_static/nltk_cache...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
import PyPDF2

def extract_text_from_pdf(pdf_file_path):
    with open(pdf_file_path, "rb") as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        text = ""
        for page_num in range(len(pdf_reader.pages)):
            page = pdf_reader.pages[page_num]
            text += page.extract_text()
    return text


In [4]:
pdf_file_path="/content/Invoice 2085.pdf"

documents=[Document(text=extract_text_from_pdf(pdf_file_path))]

In [5]:
node_parser=SentenceSplitter(chunk_size=1024)

In [6]:
base_nodes= node_parser.get_nodes_from_documents(documents)
for node in base_nodes:
  print(node.id_)

777e8a0b-0fbf-4873-a8fb-f7b50534fbe5


In [7]:
from huggingface_hub import login


LLM Mistral-7B-Instruct-v0.2

In [18]:
from google.colab import userdata


access_token_read = "YOUR HUGGINGFACE ACCESS TOKEN"
login(token = access_token_read)
# quantization_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_compute_dtype=torch.float16,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_use_double_quant=True,
# )

SYSTEM_PROMPT = """

    Task: Your task is to process the Invoice PDF Text provided and extract specific information based on the following schema.
     After extracting the relevant data


    1. Invoice Date
    2. Invoice Number
    3. Total Ammount


    **Instructions:**
    Only return JSON code snippet
    1. Utilize the OCR output provided as input.
    2. Extract information corresponding to the fields in the schema above.
    3. Ensure all the required fields are extracted.
    4. Format the extracted data as per the given schema.
    5. If a required field is not found, leave the corresponding schema field empty.
    6. Provide the output in only in JSON format adhering to the specified schema.



    """





llm = HuggingFaceLLM(
  context_window=2048,
  max_new_tokens=512,
  generate_kwargs={"temperature": 0.0, "do_sample": False},
  system_prompt=SYSTEM_PROMPT ,
  tokenizer_name="meta-llama/Llama-2-7b-chat-hf",
  model_name= "meta-llama/Llama-2-7b-chat-hf",
  device_map="auto",

  # uncomment this if using CUDA to reduce memory usage
  model_kwargs={"torch_dtype": torch.float16 , "load_in_8bit":True}
)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


ValueError: 
                    Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the
                    quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules
                    in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to
                    `from_pretrained`. Check
                    https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu
                    for more details.
                    

In [10]:
embed_model = HuggingFaceInstructEmbeddings(
    model_name="hkunlp/instructor-large", model_kwargs={"device": DEVICE}
)

.gitattributes:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/270 [00:00<?, ?B/s]

2_Dense/config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.15M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/66.3k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.41k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

load INSTRUCTOR_Transformer
max_seq_length  512


In [11]:
service_context = ServiceContext.from_defaults(
    llm=llm, embed_model=embed_model
)

In [12]:
base_index = VectorStoreIndex(base_nodes, service_context=service_context)
base_retriever = base_index.as_retriever(similarity_top_k=2)



In [15]:
query_engine_base = RetrieverQueryEngine.from_args(
    base_retriever, service_context=service_context
)
response = query_engine_base.query("""
          Only give json response
          Extract values of following fields from given OCR text:
          1) Invoice Date
          2) Invoice Number No
          3) Total Ammount


          If you do not find the value then keep it empty
      """)
print(str(response))

22/01/2024

{
"properties": {
"Invoice Number": {"type": "string", "value": "AVANTEL-09122023-0001"}
}
}

{
"properties": {
"Total Ammount": {"type": "number", "format": "currency", "value": "3,83,900.00"},
}
}
