Install Packages

In [None]:
# Install the notebook's dependencies; -q keeps pip's output quiet.
!pip install -q pypdf python-dotenv transformers llama-index sentence-transformers langchain
# -U upgrades langchain-community to the newest available release.
!pip install -U langchain-community

Import Packages

In [None]:
import os
from PIL import Image
from pathlib import Path
import torch
from transformers import BitsAndBytesConfig
import json

# llama_index
from llama_index.core.prompts import PromptTemplate
from llama_index.legacy.llms import HuggingFaceLLM
from llama_index.legacy import download_loader, SimpleDirectoryReader, VectorStoreIndex, ServiceContext
from llama_index.legacy.node_parser import SentenceSplitter
from llama_index.legacy.schema import IndexNode
from langchain.embeddings import HuggingFaceInstructEmbeddings
from llama_index.legacy.response.notebook_utils import display_source_node
from llama_index.legacy.retrievers import RecursiveRetriever
from llama_index.legacy.query_engine import RetrieverQueryEngine
from llama_index.legacy.vector_stores import ChromaVectorStore
from llama_index.legacy.storage.storage_context import StorageContext
from datetime import datetime

from llama_index.core.prompts.prompts import SimpleInputPrompt

# Metadata Extraction
from llama_index.legacy.extractors import (
    SummaryExtractor,
    QuestionsAnsweredExtractor,
)

Load Model

Log in with the Hugging Face CLI

In [None]:

# Have git persist credentials using its "store" helper (global setting).
!git config --global credential.helper store
# Start the Hugging Face CLI login flow.
# NOTE(review): presumably needed to download the Mistral checkpoint — confirm.
!huggingface-cli login

In [None]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)


In [None]:
# System instruction for the LLM: extract data verbatim from the document and
# answer with a fixed sentinel when nothing relevant is found.
system_prompt = (
    "You are a data extractor. "
    "Extract the exact data from given Text document. "
    "If no information found please reply 'NO DATA FOUND'"
)
# Template that wraps a user query in role tags; {query_str} is the placeholder
# for the question text.
# NOTE(review): these <|USER|>/<|ASSISTANT|> tags may not match the chat format
# Mistral-Instruct expects — confirm before passing this prompt to the LLM.
query_wrapper_prompt = SimpleInputPrompt(
    "<|USER|>{query_str}<|ASSISTANT|>"
)

In [None]:

# Language model used for answering queries: the Mistral-7B-Instruct-v0.2
# checkpoint loaded through llama_index's Hugging Face wrapper.
llm = HuggingFaceLLM(
    # Model and tokenizer come from the same checkpoint.
    model_name="mistralai/Mistral-7B-Instruct-v0.2",
    tokenizer_name="mistralai/Mistral-7B-Instruct-v0.2",
    # Prompting.
    system_prompt=system_prompt,
    # query_wrapper_prompt=query_wrapper_prompt,
    # Context size and generation length limits.
    context_window=4096,
    max_new_tokens=256,
    # Sampling is switched off.
    # NOTE(review): temperature is presumably ignored when do_sample is False — confirm.
    generate_kwargs=dict(temperature=0.0, do_sample=False),
    # Device placement is delegated to the "auto" device map.
    device_map="auto",
    # Optional reduced-precision / 8-bit loading, currently disabled:
    # model_kwargs={"torch_dtype": torch.float16 , "load_in_8bit":True, "use_auth_token":True},
)


In [None]:

# Pin sentence-transformers to 2.2.2 (-U lets pip change the already-installed version).
# NOTE(review): presumably required by the instructor embedding model used below — confirm.
!pip install -U sentence-transformers==2.2.2
# InstructorEmbedding package; presumably needed by HuggingFaceInstructEmbeddings — TODO confirm.
!pip install InstructorEmbedding


In [None]:
# Embedding model: the "hkunlp/instructor-large" checkpoint, placed on the
# selected device, with embeddings normalised at encode time.
embed_model = HuggingFaceInstructEmbeddings(
    model_name="hkunlp/instructor-large",
    model_kwargs=dict(device=device),
    encode_kwargs=dict(normalize_embeddings=True),
)

# Bundle the LLM, the embedding model and the chunk size into one service
# context that the index is built with.
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=embed_model,
    chunk_size=256,
)

In [None]:
# Directory the reader loads its input files from.
image_data_location = "/content/"

# Read everything the directory reader finds into document objects.
directory_reader = SimpleDirectoryReader(image_data_location)
documents = directory_reader.load_data()

# Print the first loaded document as a quick sanity check.
print(documents[0])

In [None]:
# Build the vector index over the loaded documents using the configured
# service context (LLM, embedding model, chunk size).
index = VectorStoreIndex.from_documents(
    documents,
    service_context=service_context,
)

In [None]:
# Questions to ask against the indexed documents.
# NOTE(review): "beneficary" looks like a misspelling of "beneficiary"; it is
# left unchanged because it is part of the query text sent to the engine — confirm.
user_question_query=["what is date?","what is beneficary name?"]

# Build the query engine once: it does not depend on the question, so there is
# no need to recreate it on every iteration.
query_engine = index.as_query_engine(similarity_top_k=5)

for i, user_question in enumerate(user_question_query):
  response = query_engine.query(user_question)
  print(f"Question {i+1}: {user_question} and {response}")

  # Save every question/answer pair to its own file. This must happen inside
  # the loop; otherwise only the last pair is written, and an empty question
  # list would fail on undefined loop variables.
  QA= f"{user_question} + {response}"
  # Explicit encoding so model output with non-ASCII characters is written
  # the same way regardless of the platform default.
  with open(f"response_{i+1}.txt", "w", encoding="utf-8") as f:
    f.write(str(QA) )
