In [1]:
from qdrant_client import QdrantClient,models
import fitz
import re
from uuid import uuid4
import tiktoken
import shutil
import numpy as np
import bm25s
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def num_tokens_from_string(string, encoding_name = "cl100k_base") -> int:
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

print(num_tokens_from_string("Hello world, let's test tiktoken."))

9


In [3]:
def read_pdf(path):
    doc = fitz.open(path)
    page_text_lst = [page.get_text("text",sort=True) for page in doc]
    return page_text_lst

In [4]:
# s = "apple, banana; orange grape"

# # Split using re.finditer to capture delimiters along with words
# matches = re.finditer(r'[^;,\s]+[;,\s]?', s)

# # Combine the words with their respective delimiters
# res = [match.group(0) for match in matches]

# print(res)


s = """apple, banana; orange grape.and 
you for the 
total"""

def split_docs(string):
    delimiters = [',', ';', '\n\n','\n','.']  # List of delimiters 

    # Create the regex pattern dynamically
    pattern = f"[^{''.join(delimiters)}]+[{'|'.join(delimiters)}]?"

    # Split using re.finditer
    matches = re.finditer(pattern, string)

    # Combine the words with their respective delimiters
    res = [match.group(0) for match in matches]
    return res

print(split_docs(s))


['apple,', ' banana;', ' orange grape.', 'and \n', 'you for the \n', 'total']


In [5]:
def parts_to_chunk(parts, chunk_size=300,min_chunk_size=50):
    chunk_1st = []
    chunk = ""
    for i in range(len(parts)): 
        sub_part = parts[i] 
        if num_tokens_from_string(sub_part+chunk) < chunk_size: 
            chunk+=sub_part 
            if i == len(parts)-1: 
                chunk_1st.append(chunk)
                break
        else:
            chunk_1st.append(chunk)
            chunk = sub_part
            if i == len(parts)-1:
                chunk_1st.append(chunk)
                break
    if num_tokens_from_string(chunk_1st[-1]) < min_chunk_size:
        last_chunk = chunk_1st.pop()
        chunk_1st[-1] = chunk_1st[-1]+last_chunk
    return chunk_1st

In [6]:
def find_page_break_pattern(chunk, pattern):
    next_page = False
    matches = re.finditer(pattern, chunk)
    for match in matches:
        value = int(match.group(1))
        if match.start() == 0:
            next_page = True
            return value, next_page

        return value, next_page

    return -1, False

pattern = r"!@#(\d+)!@#"

find_page_break_pattern('1@#251!@# total_txt !@#2851!@#ewtrqwr',pattern)

(2851, False)

In [7]:
def find_page_num(list_of_chunk_docs):
    prev_page=1
    page_details = []
    final_chunk_1st = []
    page_break_pattern = r"!@#(\d+)!@#" 
    for i in range(len(list_of_chunk_docs)):
        chunk = list_of_chunk_docs[i]#.page_content
        chunk_without_page_break = re.sub(page_break_pattern, "", chunk)
        page_num,next_page = find_page_break_pattern(chunk, page_break_pattern)

        if page_num == -1:
            # final_chunk = Document(page_content=chunk_without_page_break, metadata {"file_name":filename, "page_details": prev_page))
            page_details.append(prev_page)
            final_chunk_1st.append(chunk_without_page_break)
        else:
            if next_page:
                page_details.append(page_num+1)
                # final_chunk = Document(page_content=chunk_without_page_break, metadata = {"file_name":filename, "page_details":page_num+1))

                final_chunk_1st.append(chunk_without_page_break)
            else:
                page_details.append(page_num)
                #final_chunk = Document(page_content=chunk_without_page_break, metadata ("file_name": filename, "page_details":page_num))
                final_chunk_1st.append(chunk_without_page_break)

            prev_page = page_num+1

    return final_chunk_1st,page_details


In [None]:
path = "/home/debasish/Debun/agent/sample_pdfs/gemini 1.5.pdf"
page_chunk_lst = read_pdf(path)
total_text = "".join(page_chunk_lst[i].strip("\n")+f"!@#{i+1}!@#\n" for i in range(len(page_chunk_lst)))
parts = split_docs(total_text)
list_of_chunk_docs = parts_to_chunk(parts)
final_chunk_1st,page_details = find_page_num(list_of_chunk_docs)
uuids = [str(uuid4()) for _ in range(len(documents))]
for i,j in zip(final_chunk_1st,page_details):
    print(j,i)
    print("--------")

In [14]:
temp = sorted([num_tokens_from_string(i) for i in final_chunk_1st])
temp[:5],temp[-5:]

([254, 264, 266, 266, 272], [299, 299, 299, 299, 300])

In [37]:
def create_docs(chunks,pages,file_name,ids):
    metadata = [] # [{'page_no':i} for i in pages]
    documents = [] # [doc for doc in chunks]
    corpus_json = []

    for doc,page_no,id in zip(chunks,pages,ids):
        documents.append(doc)
        each_metadata = {'page_no':page_no,"file_name":file_name}
        metadata.append(each_metadata)
        each_dict = {'page_content':doc,"metadata":{'page_no':page_no,"file_name":file_name,"id":id}}
        corpus_json.append(each_dict)
    return documents, metadata, corpus_json


In [42]:
def create_qdrant_dense_emd(documents,metadata,ids,emd_path,collection_name):
    if os.path.exists(emd_path): 
        shutil.rmtree(emd_path)

    client = QdrantClient(path = emd_path)
    client.set_model("BAAI/bge-base-en-v1.5")

    if not client.collection_exists("startups"):
        client.create_collection(
            collection_name=collection_name,
            vectors_config=client.get_fastembed_vector_params()
        )
    # uuids = [str(uuid4()) for _ in range(len(chunks))]

    # metadata = [{'page_no':i} for i in pages]
    # documents = [doc for doc in chunks]

    client.add(
    collection_name=collection_name,
    documents=documents,
    metadata=metadata,
    ids=ids,
    parallel=0,  # Use all available CPU cores to encode data.
    # Requires wrapping code into if __name__ == '__main__' block
    )
    return client
    
def create_bm25s_db(corpus_json):
    corpus_text = [doc['page_content'] for doc in corpus_json]
    corpus_tokens = bm25s.tokenize(corpus_text,stopwords='en')
    retriever = bm25s.BM25(corpus=corpus_json)
    retriever.index(corpus_tokens)
    return retriever

In [75]:
def weighted_rrf(rank_lists, weights, alpha=60, default_rank=1000, k=5):
    all_items = set(item for rank_list in rank_lists for item,_ in rank_list)
    item_to_index = {item: idx for idx, item in enumerate(all_items)}
    rank_matrix = np.full((len(all_items), len(rank_lists)), default_rank)
    for list_idx, rank_list in enumerate(rank_lists):
        for item, rank in rank_list:
            rank_matrix[item_to_index[item], list_idx] = rank

    weighted_rrf_scores = np.sum(weights*(1.0/(alpha + rank_matrix)), axis=1)
    sorted_indices = np.argsort(-weighted_rrf_scores) # Negative for descending order
    sorted_items = [(list(item_to_index.keys()) [idx], weighted_rrf_scores [idx]) for idx in sorted_indices]

    return sorted_items[:k]


def get_doc_and_source(rrf_retriever, retrieve_doc_dict_keyword, retrieve_doc_dict_sim_search):
    final_retrieve_lst = []
    unique_source = []
    all_source= []

    for final_retrieve_doc_with_score in rrf_retriever:
        final_retrieve_doc = final_retrieve_doc_with_score[0]
        final_retrieve_lst.append(final_retrieve_doc)

        if final_retrieve_doc in list(retrieve_doc_dict_keyword.keys()): 
            source = retrieve_doc_dict_keyword[final_retrieve_doc]
            all_source.append(source)
            if source not in unique_source:
                unique_source.append(source)

        elif final_retrieve_doc in list(retrieve_doc_dict_sim_search.keys()):
            source = retrieve_doc_dict_sim_search[final_retrieve_doc] 
            all_source.append(source) 
            if source not in unique_source:
                unique_source.append(source) 
    return final_retrieve_lst, unique_source, all_source


def prepare_retrieve_doc(final_retrieve_lst,all_source):
    i = 1
    context = ""
    for doc,source in zip(final_retrieve_lst,all_source):
        context += doc.strip("\n")
        context += "\n-----------------\n"
        i+=1
    return context.strip("\n")

In [61]:
keyword_retriever = create_bm25s_db(corpus_json)
retrieve_doc,retrieve_doc_dict = keyword_search(query,keyword_retriever)

# retrieve_doc,retrieve_doc_dict = similarity_search(query,client,collection_name,k=5)

                                                             

In [79]:
def similarity_search(query,client,collection_name,k=5):
    retrieve_doc = []
    retrieve_doc_dict = {}
    retrieve_docs = client.query(collection_name = collection_name,query_text = query,limit = k)
    rank = 1
    for doc in retrieve_docs:
        id = doc.id
        page_content = doc.metadata['document']
        metadata = {'id':id,'page_no':doc.metadata['page_no'],'file_name':doc.metadata['file_name']}
        retrieve_doc.append((page_content,rank))
        file_name = metadata['file_name']
        page_no = metadata['page_no']
        path = os.path.join("uploaded_files",file_name)
        source = f"{path}#page={page_no}"
        retrieve_doc_dict[page_content] = source
        rank+=1
    return retrieve_doc,retrieve_doc_dict

def keyword_search(query,keyword_retriever,k=5):
    # keyword_retriever = bm25s.BM25.load(path,load_corpus=True)
    query_tokens = bm25s.tokenize(query)
    results,scores = keyword_retriever.retrieve(query_tokens,k=k)
    retrieve_doc = []
    retrieve_doc_dict = {}
    rank = 1
    for doc in results[0]:
        page_content = doc['page_content']
        metadata = doc['metadata']
        retrieve_doc.append((page_content,rank))
        file_name = metadata['file_name']
        page_no = metadata['page_no']
        path = os.path.join("uploaded_files",file_name)
        source = f"{path}#page={page_no}"
        retrieve_doc_dict[page_content] = source
        rank+=1
    return retrieve_doc,retrieve_doc_dict

In [82]:
def custom_ensemble_retriever (query,k, weights,client,collection_name,keyword_retriever):
    retrieve_doc_sim_search, retrieve_doc_dict_sim_search = similarity_search(query,client,collection_name,k=5)
    retrieve_doc_keyword, retrieve_doc_dict_keyword = keyword_search(query,keyword_retriever)

    weights = np.array(weights)
    rrf_retriever = weighted_rrf([retrieve_doc_keyword, retrieve_doc_sim_search], weights, k=k)
    final_retrieve_lst, unique_source, all_source = get_doc_and_source(rrf_retriever, retrieve_doc_dict_keyword, retrieve_doc_dict_sim_search) 
    retrieve_context = prepare_retrieve_doc(final_retrieve_lst, all_source)

    return retrieve_context, unique_source,all_source


In [87]:
query = """Large language models have been showed to reproduce and amplify biases that are existing in
the training data (Sheng et al., 2019; Kurita et al.,
2019), and to generate toxic or offensive content (Gehman et al., 2020). 
"""
retrieve_context, unique_source,all_source = custom_ensemble_retriever(query=query,k=5,weights=weights,
                                                            client=client,collection_name=collection_name,
                                                            keyword_retriever=keyword_retriever)

                                                     

In [88]:
all_source

['uploaded_files/temp.pdf#page=7',
 'uploaded_files/temp.pdf#page=8',
 'uploaded_files/temp.pdf#page=7',
 'uploaded_files/temp.pdf#page=11',
 'uploaded_files/temp.pdf#page=6']

In [None]:
print(retrieve_context)

In [None]:
doc = client.query(collection_name = collection_name,query_text = query,limit = 5)
metadata = [(hit.id,hit.metadata) for hit in doc]
metadata

In [24]:
print(doc[0].metadata['document']

'output. We use the RMSNorm normalizing func-\ntion, introduced by Zhang and Sennrich (2019).      2.4  Efﬁcient implementation\nSwiGLU activation function [PaLM]. We re-  We make several optimizations to improve the train-\nplace the ReLU non-linearity by the SwiGLU ac-   ing speed of our models. First, we use an efﬁcient\ntivation function, introduced by Shazeer (2020) to   implementation of the causal multi-head attention\nimprove the performance. We use a dimension of    to reduce memory usage and runtime. This imple-\n234d instead of 4d as in PaLM.                      mentation, available in the xformers library,2 is\n                                                     inspired by Rabe and Staats (2021) and uses the\nRotary Embeddings [GPTNeo]. We remove the                                             backward from Dao et al. (2022). This is achieved\nabsolute positional embeddings, and instead, add                                          by not storing the attention weights an

In [40]:
doc[0]

QueryResponse(id='5c3cd82d54b947488694965eaff735dc', embedding=None, sparse_embedding=None, metadata={'document': 'output. We use the RMSNorm normalizing func-\ntion, introduced by Zhang and Sennrich (2019).      2.4  Efﬁcient implementation\nSwiGLU activation function [PaLM]. We re-  We make several optimizations to improve the train-\nplace the ReLU non-linearity by the SwiGLU ac-   ing speed of our models. First, we use an efﬁcient\ntivation function, introduced by Shazeer (2020) to   implementation of the causal multi-head attention\nimprove the performance. We use a dimension of    to reduce memory usage and runtime. This imple-\n234d instead of 4d as in PaLM.                      mentation, available in the xformers library,2 is\n                                                     inspired by Rabe and Staats (2021) and uses the\nRotary Embeddings [GPTNeo]. We remove the                                             backward from Dao et al. (2022). This is achieved\nabsolute positi

In [18]:
doc = client.search(collection_name="startups",query)
doc

TypeError: QdrantClient.search() missing 2 required positional arguments: 'collection_name' and 'query_vector'

In [16]:
class SemanticSearcher:
    DENSE_MODEL = "BAAI/bge-base-en-v1.5"
    # SPARSE_MODEL = "prithivida/Splade_PP_en_v1"
    def __init__(self, collection_name):
        self.collection_name = collection_name
        # initialize Qdrant client
        self.qdrant_client = QdrantClient(path="dense")
        self.qdrant_client.set_model(self.DENSE_MODEL)
        # comment this line to use dense vectors only
        # self.qdrant_client.set_sparse_model(self.SPARSE_MODEL)
    def search(self, text: str):
      search_result = self.qdrant_client.query(
          collection_name=self.collection_name,
          query_text=text,
          query_filter=None,  # If you don't want any filters for now
          limit=5,  # 5 the closest results
      )
      # `search_result` contains found vector ids with similarity scores
      # along with the stored payload

      # Select and return metadata
      metadata = [hit.metadata for hit in search_result]
      return metadata

In [17]:
query = 'We make several optimizations to improve the training speed of our models. First, we use an efficient implementation of the causal multi-head attention'
hybrid_searcher = SemanticSearcher(collection_name="startups")
doc = hybrid_searcher.search(text=query)
doc