In [17]:
# !pip install -q PyMuPDF pytesseract tesseract langchain transformers 
# !pip install pandas langchain langchain-openai langchain-community langchain-core openai faiss-cpu python-dotenv
!pip install -q pandas langchain langchain-openai langchain-community langchain-core openai faiss-cpu python-dotenv pytesseract

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


### Reference: [Improving RAG performance using rerankers](https://medium.com/data-science/improving-rag-performance-using-rerankers-6adda61b966d)

In [18]:
import fitz
from PIL import Image
import pytesseract
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
import torch

In [20]:
# Pick the compute device: CUDA when PyTorch reports it as available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')
print(f'Using device: {device}')



Using device: cpu


In [21]:
def parse_document(document_path: str, first_page: int = 1, last_page: int = 39,
                   chunk_size: int = 400, chunk_overlap: int = 40):
  """OCR a range of PDF pages and split the recognised text into chunks.

  Each selected page is rendered to a 300-dpi RGB image, run through
  pytesseract, and the resulting text is split with a
  RecursiveCharacterTextSplitter.

  Args:
    document_path: Path of the PDF to read.
    first_page: 0-based index of the first page to process. The default of 1
      skips the first page of the file.
    last_page: 0-based index one past the last page to process. It is clamped
      to the number of pages in the document, so PDFs shorter than the
      requested range are processed up to their last page instead of raising.
    chunk_size: Maximum chunk length passed to the text splitter.
    chunk_overlap: Overlap between consecutive chunks passed to the splitter.

  Returns:
    A flat list of text chunks, in page order.
  """
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
  texts = []

  pdf_document = fitz.open(document_path)
  try:
    # Clamp the requested range to pages that actually exist; a negative
    # start would otherwise be interpreted by load_page as counting from the end.
    start = max(first_page, 0)
    stop = min(last_page, len(pdf_document))
    for page_number in range(start, stop):
      page = pdf_document.load_page(page_number)
      pix = page.get_pixmap(dpi=300)
      img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
      text = pytesseract.image_to_string(img)
      texts.extend(text_splitter.split_text(text))
  finally:
    # Release the file handle even if rendering or OCR fails part-way.
    pdf_document.close()
  return texts

def setup_embedding_model(model_name: str = 'BAAI/bge-large-en-v1.5', device=None):
  """Load the tokenizer and encoder model used to embed text.

  Args:
    model_name: Model id or local path handed to `from_pretrained` for both
      the tokenizer and the model.
    device: Optional device (e.g. "cuda" or a torch.device) to move the model
      to. When None the model stays wherever `from_pretrained` placed it.

  Returns:
    A `(tokenizer, model)` tuple; the model is switched to eval mode.
  """
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  model = AutoModel.from_pretrained(model_name)
  model.eval()
  if device is not None:
    model.to(device)
  return tokenizer, model

def create_embedding(texts, tokenizer, model, batch_size: int = 32):
  """Embed texts with the given tokenizer/model and return unit-length vectors.

  For every input the vector at position 0 of the model's first output is
  taken (for BERT-style encoders this is presumably the [CLS] token) and
  L2-normalised.

  Args:
    texts: A string or an iterable of strings to embed.
    tokenizer: Callable tokenizer returning a mapping of PyTorch tensors.
    model: The encoder; called as `model(**encoded_input)`.
    batch_size: Number of texts per forward pass. Batching keeps memory use
      bounded for large corpora. None or a non-positive value processes
      everything in a single batch.

  Returns:
    A list with one embedding (list of floats) per input text; an empty list
    when no texts are given.
  """
  # A bare string must become a one-element batch, not be sliced per character.
  texts = [texts] if isinstance(texts, str) else list(texts)
  if not texts:
    return []
  if batch_size is None or batch_size <= 0:
    batch_size = len(texts)

  # Run the inputs on whatever device the model's weights live on.
  first_param = next(model.parameters(), None)
  device = first_param.device if first_param is not None else None

  embeddings = []
  with torch.no_grad():
    for start in range(0, len(texts), batch_size):
      batch = texts[start:start + batch_size]
      encoded_input = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
      if device is not None:
        encoded_input = {name: tensor.to(device) for name, tensor in encoded_input.items()}
      model_output = model(**encoded_input)
      sentence_embeddings = model_output[0][:, 0]
      sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)
      embeddings.extend(sentence_embeddings.tolist())
  return embeddings

def setup_reranker(model_name: str = 'BAAI/bge-reranker-large', device=None):
  """Load the tokenizer and sequence-classification model used for reranking.

  Args:
    model_name: Model id or local path handed to `from_pretrained` for both
      the tokenizer and the model.
    device: Optional device (e.g. "cuda" or a torch.device) to move the model
      to. When None the model stays wherever `from_pretrained` placed it.

  Returns:
    A `(tokenizer, model)` tuple; the model is switched to eval mode.
  """
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  model = AutoModelForSequenceClassification.from_pretrained(model_name)
  model.eval()
  if device is not None:
    model.to(device)
  return tokenizer, model

def run_reraker(text_pairs, tokenizer, model):
  """Score (query, passage) pairs with the reranker model.

  The function name keeps its original spelling because existing cells call it.

  Args:
    text_pairs: Sequence of `[query, passage]` pairs.
    tokenizer: Callable tokenizer returning a mapping of PyTorch tensors;
      inputs are truncated to at most 512 tokens.
    model: Model whose output exposes `.logits`; called with
      `return_dict=True`.

  Returns:
    A flat list of float scores, one per pair, in input order; an empty list
    when no pairs are given.
  """
  text_pairs = list(text_pairs)
  if not text_pairs:
    return []

  # Run the inputs on whatever device the model's weights live on.
  first_param = next(model.parameters(), None)
  device = first_param.device if first_param is not None else None

  with torch.no_grad():
    inputs = tokenizer(text_pairs, padding=True, truncation=True, return_tensors='pt', max_length=512)
    if device is not None:
      inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
    # Flatten the logits so each pair maps to a single scalar score.
    scores = model(**inputs, return_dict=True).logits.view(-1, ).float()
    return scores.tolist()

In [None]:
# 1) OCR the PDF and split it into text chunks.
chunked_texts = parse_document("data/odyssey_stories.pdf")
# 2) Load the embedding tokenizer/model pair.
embedding_tokenizer, embedding_model = setup_embedding_model()
# 3) Embed every chunk; `embeddings` is index-aligned with `chunked_texts`.
embeddings = create_embedding(
  texts=chunked_texts,
  tokenizer=embedding_tokenizer,
  model=embedding_model,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
# embeddings = create_embedding(chunked_texts, embedding_tokenizer, embedding_model)

In [None]:
# The question to retrieve passages for, embedded with the same model as the chunks.
query = "Why was Odysseus stuck with Calypso?"
query_embedding = create_embedding(
  texts=[query],
  tokenizer=embedding_tokenizer,
  model=embedding_model,
)

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity of the single query vector against every chunk embedding;
# row 0 of the result holds one score per chunk.
similarity = cosine_similarity(query_embedding, embeddings)[0]

# Rank chunk positions from most to least similar to the query.
indexed_numbers = [(position, score) for position, score in enumerate(similarity)]
sorted_indexed_numbers = sorted(indexed_numbers, key=lambda pair: pair[1], reverse=True)
sorted_indices = [pair[0] for pair in sorted_indexed_numbers]



Original query: Why was Odysseus stuck with Calypso? 



NameError: name 'texts' is not defined

In [None]:
# Show the top_k chunks in embedding-similarity order (before reranking).
top_k = 10
print(f"Original query: {query} \n")
for chunk_index in sorted_indices[:top_k]:
  retrieved_chunk = chunked_texts[chunk_index]
  print(retrieved_chunk)
  print("\n")

Original query: Why was Odysseus stuck with Calypso? 

Odysseus was by nature a very shrewd and cautious man, and he feared that Calypso was contriving some
mischief against him, in revenge for his coldness. He looked at her doubtfully, and answered: "I fear thee,
nymph, and I mistrust thy purpose. How shall a man cross this dreadful gulf, where no ship is ever seen, on a


I

We have waited long for the appearancef of Odysseus, and at last he is about to enter the scene, which he will
never leave again until the final act of the great drama is played out. Hitherto he has been pursued by the malice
of Poseidon, who wrecked his fleet, drowned all his men, and kept him confined for seven years in Calypso's
island, in vengeance for the blinding of his son Polyphemus.


When he had heard Calypso's answer, Hermes took leave of her, and returned to Olympus, and the nymph went
down to the part of the shore where she knew Odysseus was accustomed to sit. There he would remain all day,
gazing te

In [None]:
# Load the reranker's tokenizer and model (see setup_reranker); both are
# passed to run_reraker when scoring query/chunk pairs.
reranker_tokenize, reranker_model = setup_reranker()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
# Pair the query with each of the top_k retrieved chunks for the reranker.
pairs = [[query, chunked_texts[index]] for index in sorted_indices[:top_k]]

# One relevance score per pair, in the same order as `pairs`.
scores = run_reraker(pairs, reranker_tokenize, reranker_model)

# Re-order the retrieved chunk indices by descending reranker score.
paired_list = list(zip(sorted_indices[:top_k], scores))
sorted_paired_list = sorted(paired_list, key=lambda pair: pair[1], reverse=True)
reranked_indices = [pair[0] for pair in sorted_paired_list]
reranked_values = [pair[1] for pair in sorted_paired_list]

print(f"Original query: {query} \n")

for chunk_index in reranked_indices:
  print(chunked_texts[chunk_index])
  print("\n")

Original query: Why was Odysseus stuck with Calypso? 

So saying the goddess sank beneath the waves, leaving Odysseus with her veil in his hand. But that cautious
veteran did not at once act on her advice, for he feared that some treachery was intended against him. He
resolved therefore to remain on the raft as long as her timbers held together, and only to have recourse to the veil
in the last extremity.


I

We have waited long for the appearancef of Odysseus, and at last he is about to enter the scene, which he will
never leave again until the final act of the great drama is played out. Hitherto he has been pursued by the malice
of Poseidon, who wrecked his fleet, drowned all his men, and kept him confined for seven years in Calypso's
island, in vengeance for the blinding of his son Polyphemus.


Odysseus was by nature a very shrewd and cautious man, and he feared that Calypso was contriving some
mischief against him, in revenge for his coldness. He looked at her doubtfully, and ans