In [1]:
import validators
import fitz
from langchain_community.document_loaders import PyMuPDFLoader, OnlinePDFLoader
import os
from urllib.parse import urlparse
import camelot

In [2]:
class ProcessDocument:

    def __init__(self, url_or_path):
        self.url_or_path  = url_or_path

    def extract_data(self):

        images = []
        tables = []
        file_name = ""
        if validators.url(self.url_or_path):
            loader = OnlinePDFLoader(self.url_or_path)
            url_parse_obj = urlparse(url)
            file_name = os.path.basename(url_parse_obj.path)
        else:
            file_name = os.path.basename(self.url_or_path)
            loader = PyMuPDFLoader(self.url_or_path)

            # extract images
            doc = fitz.open(self.url_or_path) # open a document

            for page_index in range(len(doc)): # iterate over pdf pages
                page = doc[page_index] # get the page
                image_list = page.get_images()

                if not os.path.exists(os.path.join(os.getcwd(), file_name, 'images')):
                    os.makedirs(os.path.join(os.getcwd(), file_name, 'images'))

                for image_index, img in enumerate(image_list, start=1): # enumerate the image list
                    xref = img[0] # get the XREF of the image
                    pix = fitz.Pixmap(doc, xref) # create a Pixmap

                    if pix.n - pix.alpha > 3: # CMYK: convert to RGB first
                        pix = fitz.Pixmap(fitz.csRGB, pix)

                    image_path = "{}/{}/images/page_{}-image_{}.png".format(os.getcwd(),file_name, page_index, image_index)

                    if not os.path.exists(image_path):
                        pix.save(image_path) # save the image as png
                    
                    temp = {}
                    temp['image'] = pix
                    temp['image_path'] = image_path
                    images.append(temp)
                    pix = None

            # extract tables
            tables = camelot.read_pdf(self.url_or_path)

            tables = [table.df for table in tables]
        
        documents = loader.load()

        return documents, images, tables

In [3]:
process_doc_obj = ProcessDocument(url_or_path="/Users/satishsilveri/Downloads/echap03_vol1.pdf")

In [4]:
documents, images, tables = process_doc_obj.extract_data()

In [5]:
tables

[]

In [6]:
images

[{'image': Pixmap(DeviceRGB, (0, 0, 309, 64), 0),
  'image_path': '/Users/satishsilveri/echap03_vol1.pdf/images/page_0-image_1.png'},
 {'image': Pixmap(DeviceRGB, (0, 0, 2048, 875), 0),
  'image_path': '/Users/satishsilveri/echap03_vol1.pdf/images/page_1-image_1.png'},
 {'image': Pixmap(DeviceRGB, (0, 0, 2050, 966), 0),
  'image_path': '/Users/satishsilveri/echap03_vol1.pdf/images/page_2-image_1.png'},
 {'image': Pixmap(DeviceRGB, (0, 0, 66, 57), 0),
  'image_path': '/Users/satishsilveri/echap03_vol1.pdf/images/page_5-image_1.png'},
 {'image': Pixmap(DeviceRGB, (0, 0, 1300, 815), 0),
  'image_path': '/Users/satishsilveri/echap03_vol1.pdf/images/page_6-image_1.png'},
 {'image': Pixmap(DeviceRGB, (0, 0, 1508, 667), 0),
  'image_path': '/Users/satishsilveri/echap03_vol1.pdf/images/page_20-image_1.png'},
 {'image': Pixmap(DeviceRGB, (0, 0, 1344, 691), 0),
  'image_path': '/Users/satishsilveri/echap03_vol1.pdf/images/page_20-image_2.png'},
 {'image': Pixmap(DeviceRGB, (0, 0, 549, 61), 0),
 

In [7]:
len(documents)

37

In [8]:
import torch
from transformers import CLIPModel, CLIPProcessor
from sentence_transformers import SentenceTransformer

class EmbedData:
    def __init__(self, text_model_name: str ="sentence-transformers/all-MiniLM-L12-v2", image_model_name:str = "openai/clip-vit-base-patch32"):
        # load text model
        self.text_model = SentenceTransformer(text_model_name)
        # load image model
        self.image_model = CLIPModel.from_pretrained(image_model_name)
        self.image_processor = CLIPProcessor.from_pretrained(image_model_name)
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.image_model.to(self.device)

    def get_text_embedding(self, text):
        return self.text_model.encode(text).tolist()

    @torch.no_grad()
    def get_query_embeddings_for_image(self, query):
        inputs = self.image_processor(text=query, return_tensors="pt")
        inputs = inputs.to(self.device)
        query_features = self.image_model.get_text_features(**inputs)
        query_features /= query_features.norm(dim=-1, keepdim=True)
        return query_features.tolist()

    @torch.no_grad()
    def get_image_embeddings(self, images):
        inputs = self.processor(images=images, return_tensors="pt")
        inputs = inputs.to(self.device)
        image_embeddings = self.model.get_image_features(**inputs)
        image_embeddings /= image_embeddings.norm(dim=-1, keepdim=True)
        return image_embeddings.tolist()

In [9]:
from nltk.tokenize import sent_tokenize
from langchain_community.vectorstores import Chroma
from langchain.docstore.document import Document
import re
from langchain_community.embeddings import HuggingFaceEmbeddings
# from embed_data import EmbedData

class IndexData:

    def __init__(self):
        self.embed_data_obj = EmbedData()

    def clean_text(self, text):
        # Remove escape characters
        cleaned_text = text.encode('ascii', 'ignore').decode()
        
        # Remove unwanted spaces
        cleaned_text = ' '.join(cleaned_text.split())
        
        cleaned_text = cleaned_text.lower()
        
        return cleaned_text

    def split_text_by_token_length(self, text: str = "", token_length: int = 400):
        '''
        '''
        sentences = sent_tokenize(text)
        chunks = []
        current_chunk = []

        for sentence in sentences:
            sentence = sentence.strip()
            if not sentence:
                continue
            
            current_chunk.append(sentence)
            if sum(len(s.split()) for s in current_chunk) > token_length:
                chunks.append('. '.join(current_chunk[:-1]))
                current_chunk = [current_chunk[-1]]

        if current_chunk:
            chunks.append('. '.join(current_chunk))

        return chunks

    def split_text_by_token_length_with_sliding_window(self, text: str = "", token_length: int = 400):
        '''
        '''
        sentences = sent_tokenize(text)
        chunks = []
        current_chunk = []
        window_size = 0

        for sentence in sentences:
            sentence = sentence.strip()
            if not sentence:
                continue

            current_chunk.append(sentence)
            window_size += len(sentence.split())

            while window_size > token_length:
                chunks.append('. '.join(current_chunk[:-1]))
                window_size -= len(current_chunk[0].split())
                current_chunk.pop(0)

        if current_chunk:
            chunks.append('. '.join(current_chunk))

        return chunks

    def index_text(self, documents, token_length : int = 400, overlap: bool = False, k :int = 5):
        '''
        '''

        # split documents
        splitted_documents = []

        for document in documents:
            if overlap:
                chunks = self.split_text_by_token_length_with_sliding_window(text = document.page_content, token_length=token_length)
            else:
                chunks = self.split_text_by_token_length(text = document.page_content, token_length=token_length)

            for chunk_idx, chunk in enumerate(chunks):
                temp_metadata = document.metadata
                temp_metadata['chunk_id'] = chunk_idx
                # superficial cleaning
                cleaned_chunk = self.clean_text(text = chunk)
                temp_doc = Document(page_content = cleaned_chunk, metadata=temp_metadata)
                splitted_documents.append(temp_doc)
        
        # initialize vector database and add docs
        text_db = Chroma.from_documents(documents = splitted_documents, collection_name="documents_collection", embedding = HuggingFaceEmbeddings(model_name="all-MiniLM-L12-v2"))

        # convert to retriever
        retriever = text_db.as_retriever(search_type="similarity",search_kwargs={"k": k})

        return retriever


    def index_images(self, images):
        '''
        '''
        # generate image embeddings
        images_embeddings = []
        docs = []
        for img_obj in images:
            image_embeddings.append(self.embed_data_obj.get_image_embeddings(images = img_obj['image']))
            temp_doc = Document(page_content = "", metadata={'image_path':img_obj['image_path']})
            docs.append(temp_doc)

        # initialize image vector db and add embeddinghs
        images_db = Chroma.from_documents(documents=docs, embeddings=image_embeddings, embedding_function=self.embed_data_obj.get_query_embeddings_for_image(), collection_name="images_collection")

        return images_db

In [10]:
index_data_obj = IndexData()
text_retriever = index_data_obj.index_text(documents, token_length = 400, overlap = True, k = 5)

  return self.fget.__get__(instance, owner)()


In [11]:
text_retriever.get_relevant_documents("When was India's sovereign credit rating speculative grade?")

[Document(page_content='87 does indias sovereign credit rating reflect its fundamentals no!. table 1: indias sovereign credit rating (1998-2020) date s&p moodys fitch june 1998 ba2* october 1998 bb* march 2000 bb+* november 2001 bb* february 2003 ba1* january 2004 bb+* january 2004 baa3 february 2005 bb+* august 2006 bbb- january 2007 bbb- november 2017 baa2 june 2020 baa3 *speculative grade; green highlights ratings upgrade; red highlights ratings downgrade, black indicates first rating source: compiled from s&p global, fitch and moodys box 1: what are sovereign credit ratings?. sovereign credit ratings seek to quantify issuers ability to meet debt obligations.. when favourable, these can facilitate countries access to global capital markets and foreign investment.. table below presents what three key cras s&p, moodys and fitch, seek to measure.. what credit ratings measure fitch "credit ratings express risk in relative rank order, which is to say they are ordinal measures of credit r

In [12]:
# from langchain_openai import ChatOpenAI
# from langchain_core.prompts import ChatPromptTemplate
# from langchain.schema import StrOutputParser
# from langchain_core.runnables import RunnablePassthrough, RunnableParallel

# template = """Answer the question based only on the following context:
# {context}

# Question: {question}
# """
# prompt = ChatPromptTemplate.from_template(template)
# model = ChatOpenAI()
# output_parser = StrOutputParser()

# setup_and_retrieval = RunnableParallel(
#     {"context": text_retriever, "question": RunnablePassthrough()}
# )

# chain = setup_and_retrieval | prompt | model | output_parser

# chain.invoke("When was India's sovereign credit rating speculative grade?")


In [13]:
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain import LLMChain
from transformers import AutoModelForCausalLM, AutoTokenizer
import transformers

device = "cpu" # the device to load the model onto

model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1")
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1")

text_generation_pipeline = transformers.pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    temperature=0,
    repetition_penalty=1.1,
    return_full_text=False,
    max_new_tokens=300,
)

prompt_template = """Answer the question based only on the following context:
{context}

Question: {question}
"""

mistral_llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

# Create prompt from prompt template 
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

# Create llm chain 
llm_chain = LLMChain(llm=mistral_llm, prompt=prompt)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [14]:
from langchain_core.runnables import RunnablePassthrough

query = "When was India's sovereign credit rating speculative grade?" 

rag_chain = ( 
 {"context": text_retriever, "question": RunnablePassthrough()}
    | llm_chain
)

rag_chain.invoke(query)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


{'context': [Document(page_content='87 does indias sovereign credit rating reflect its fundamentals no!. table 1: indias sovereign credit rating (1998-2020) date s&p moodys fitch june 1998 ba2* october 1998 bb* march 2000 bb+* november 2001 bb* february 2003 ba1* january 2004 bb+* january 2004 baa3 february 2005 bb+* august 2006 bbb- january 2007 bbb- november 2017 baa2 june 2020 baa3 *speculative grade; green highlights ratings upgrade; red highlights ratings downgrade, black indicates first rating source: compiled from s&p global, fitch and moodys box 1: what are sovereign credit ratings?. sovereign credit ratings seek to quantify issuers ability to meet debt obligations.. when favourable, these can facilitate countries access to global capital markets and foreign investment.. table below presents what three key cras s&p, moodys and fitch, seek to measure.. what credit ratings measure fitch "credit ratings express risk in relative rank order, which is to say they are ordinal measures