In [1]:
from qdrant_client import QdrantClient,models
import fitz
import re
from uuid import uuid4
import tiktoken
import shutil
import numpy as np
import bm25s
import os

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
def num_tokens_from_string(string, encoding_name = "cl100k_base") -> int:
    """Return how many tokens ``string`` encodes to.

    Args:
        string: Text to measure.
        encoding_name: Name of the tiktoken encoding to look up.

    Returns:
        Length of the token sequence produced by that encoding.
    """
    return len(tiktoken.get_encoding(encoding_name).encode(string))

# Smoke test: print the token count of a short sentence under the default encoding.
print(num_tokens_from_string("Hello world, let's test tiktoken."))

9


In [7]:
def read_pdf(path):
    """Extract the text of every page of a PDF.

    Args:
        path: Location of the PDF file to open.

    Returns:
        A list with one string per page, in page order. Text is requested in
        "text" mode with ``sort=True``.
    """
    # Open the document as a context manager so the file handle is released
    # once extraction is done, even if a page fails to extract.
    with fitz.open(path) as doc:
        return [page.get_text("text", sort=True) for page in doc]

In [8]:
# Earlier experiment, kept for reference: the same "keep the delimiter with
# its fragment" split, written with a hard-coded pattern.
# s = "apple, banana; orange grape"

# # Split using re.finditer to capture delimiters along with words
# matches = re.finditer(r'[^;,\s]+[;,\s]?', s)

# # Combine the words with their respective delimiters
# res = [match.group(0) for match in matches]

# print(res)


# Sample text containing a comma, a semicolon, a period and newlines.
s = """apple, banana; orange grape.and 
you for the 
total"""

def split_docs(string):
    """Split text into fragments, each keeping the delimiter that ends it.

    The text is cut after every delimiter character (comma, semicolon,
    period, newline). A delimiter that has no text in front of it (for
    example the second newline of a blank line) is returned as its own
    fragment, so no character is dropped:
    ``"".join(split_docs(text)) == text``.

    Args:
        string: Text to split.

    Returns:
        List of fragments in their original order; empty for empty input.
    """
    delimiters = [',', ';', '\n\n', '\n', '.']  # List of delimiters

    # A character class matches single characters, so reduce the list to its
    # distinct characters and escape them; joining with '|' would add a
    # literal pipe to the class rather than act as alternation.
    delimiter_chars = "".join(dict.fromkeys("".join(delimiters)))
    char_class = re.escape(delimiter_chars)

    # Either a run of text with its optional closing delimiter, or a lone
    # delimiter that follows another delimiter directly.
    pattern = f"[^{char_class}]+[{char_class}]?|[{char_class}]"

    return [match.group(0) for match in re.finditer(pattern, string)]

# Show the fragments produced for the sample text.
print(split_docs(s))


['apple,', ' banana;', ' orange grape.', 'and \n', 'you for the \n', 'total']


In [9]:
def parts_to_chunk(parts, chunk_size=300,min_chunk_size=50, token_counter=None):
    """Greedily pack consecutive text fragments into chunks under a token budget.

    Fragments are appended to the current chunk while the combined text stays
    below ``chunk_size`` tokens; otherwise the current chunk is closed and the
    fragment starts a new one. If the final chunk is shorter than
    ``min_chunk_size`` tokens it is merged into the previous chunk (the merged
    chunk may then exceed ``chunk_size``).

    Args:
        parts: Ordered text fragments.
        chunk_size: Exclusive upper bound, in tokens, used while packing.
        min_chunk_size: Final chunks shorter than this are merged backwards.
        token_counter: Optional callable mapping a string to its token count.
            Defaults to ``num_tokens_from_string``.

    Returns:
        List of chunk strings. Empty when ``parts`` yields no text; a single
        short chunk is returned as-is because there is nothing to merge it into.
    """
    count_tokens = token_counter if token_counter is not None else num_tokens_from_string

    chunk_1st = []
    chunk = ""
    for sub_part in parts:
        # An empty chunk always accepts the fragment; otherwise an oversized
        # first fragment would push an empty string into the result.
        # The budget is measured on the text in its real order.
        if not chunk or count_tokens(chunk + sub_part) < chunk_size:
            chunk += sub_part
        else:
            chunk_1st.append(chunk)
            chunk = sub_part
    if chunk:
        chunk_1st.append(chunk)

    # Merge a too-short tail only when there is a previous chunk to receive it.
    if len(chunk_1st) > 1 and count_tokens(chunk_1st[-1]) < min_chunk_size:
        last_chunk = chunk_1st.pop()
        chunk_1st[-1] = chunk_1st[-1] + last_chunk
    return chunk_1st

In [10]:
def find_page_break_pattern(chunk, pattern):
    """Report the first page-break marker found in ``chunk``.

    Args:
        chunk: Text to scan.
        pattern: Regular expression whose first capture group holds the
            page number as digits.

    Returns:
        ``(page_number, at_start)`` for the first match, where ``at_start``
        is True only when that match begins at offset 0 of the chunk;
        ``(-1, False)`` when the chunk contains no match.
    """
    first_hit = re.search(pattern, chunk)
    if first_hit is None:
        return -1, False

    page_value = int(first_hit.group(1))
    return page_value, first_hit.start() == 0

# Page-break marker: a run of digits wrapped in the literal "!@#" sentinel.
pattern = r"!@#(\d+)!@#"

# The leading '1@#251!@#' starts with '1', not '!', so it is not a marker;
# the first real match is 2851 and it does not sit at offset 0.
find_page_break_pattern('1@#251!@# total_txt !@#2851!@#ewtrqwr',pattern)

(2851, False)

In [11]:
def find_page_num(list_of_chunk_docs):
    """Strip page-break markers from chunks and assign each chunk a page number.

    Chunks are expected to carry ``!@#<n>!@#`` markers, where a marker with
    value ``n`` closes page ``n``. For each chunk:

    * no marker: the chunk is on the page reached so far;
    * first marker at offset 0: the chunk's text starts after page ``n``
      ended, so it is labelled ``n + 1``;
    * first marker further in: the chunk starts on page ``n``.

    After a chunk with markers, the running page becomes the value of its
    LAST marker plus one, so a chunk that spans several page breaks does not
    leave the following chunks labelled with a stale page.

    Args:
        list_of_chunk_docs: Chunk strings, in document order.

    Returns:
        ``(clean_chunks, pages)``: the chunks with all markers removed, and a
        parallel list of page numbers.
    """
    page_break_pattern = r"!@#(\d+)!@#"
    prev_page = 1  # page assumed before any marker has been seen
    page_details = []
    final_chunk_1st = []

    for chunk in list_of_chunk_docs:
        markers = list(re.finditer(page_break_pattern, chunk))
        final_chunk_1st.append(re.sub(page_break_pattern, "", chunk))

        if not markers:
            page_details.append(prev_page)
            continue

        first_marker = markers[0]
        first_page = int(first_marker.group(1))
        # A marker at offset 0 means everything in this chunk follows the break.
        page_details.append(first_page + 1 if first_marker.start() == 0 else first_page)

        # Continue from the page that follows the last break in this chunk.
        prev_page = int(markers[-1].group(1)) + 1

    return final_chunk_1st, page_details


In [12]:
# Step-by-step run of the chunking stages on one PDF, printing every chunk
# next to the page number assigned to it.
path = "/home/debasish/Debun/agent/sample_pdfs/gemini 1.5.pdf"
page_chunk_lst = read_pdf(path)
# Append a "!@#<page number>!@#" marker after each page's text so page
# boundaries can be recovered after the pages are joined and re-chunked.
total_text = "".join(page_chunk_lst[i].strip("\n")+f"!@#{i+1}!@#\n" for i in range(len(page_chunk_lst)))
parts = split_docs(total_text)
list_of_chunk_docs = parts_to_chunk(parts)
# Remove the markers again and derive one page number per chunk.
final_chunk_1st,page_details = find_page_num(list_of_chunk_docs)
uuids = [str(uuid4()) for _ in range(len(final_chunk_1st))]
for i,j in zip(final_chunk_1st,page_details):
    print(j,i)
    print("--------")

1     Gemini 1.5: Unlocking multimodal
     understanding across millions of tokens of
     context
          Gemini Team, Google1
           In this report, we introduce the Gemini 1.5 family of models, representing the next generation of highly
           compute-efficient multimodal models capable of recalling and reasoning over fine-grained information
         from millions of tokens of context, including multiple long documents and hours of video and audio. The
           family includes two new models: (1) an updated Gemini 1.5 Pro, which exceeds the February version on
          the great majority of capabilities and benchmarks; (2) Gemini 1.5 Flash, a more lightweight variant
          designed for efficiency with minimal regression in quality. Gemini 1.5 models achieve near-perfect2024      recall on long-context retrieval tasks across modalities, improve the state-of-the-art in long-document
         QA, long-video QA and long-context ASR, and match or surpass Gemini 1.0 Ult

In [13]:
# Token count of every cleaned chunk, sorted ascending; display the five
# smallest and the five largest to sanity-check the chunk-size bounds.
temp = sorted([num_tokens_from_string(i) for i in final_chunk_1st])
temp[:5],temp[-5:]

([254, 264, 266, 266, 272], [299, 299, 299, 299, 300])

In [14]:
def create_docs(chunks,pages,file_name,ids):
    """Assemble parallel document/metadata lists plus a combined corpus.

    Args:
        chunks: Chunk texts.
        pages: Page number for each chunk.
        file_name: File name recorded in every metadata entry.
        ids: Identifier for each chunk.

    Returns:
        ``(documents, metadata, corpus_json)``. ``documents`` holds the chunk
        texts, ``metadata`` holds ``page_no``/``file_name`` dicts, and
        ``corpus_json`` holds ``page_content`` plus a metadata dict that also
        carries the ``id``. Inputs are zipped, so the shortest one sets the
        length of all three lists.
    """
    rows = list(zip(chunks, pages, ids))

    documents = [text for text, _, _ in rows]
    metadata = [{'page_no': page, "file_name": file_name} for _, page, _ in rows]
    corpus_json = [
        {
            'page_content': text,
            "metadata": {'page_no': page, "file_name": file_name, "id": doc_id},
        }
        for text, page, doc_id in rows
    ]
    return documents, metadata, corpus_json

In [15]:
def final_chunking_pipeline(path):
    """Turn a PDF into chunk texts, metadata and ids ready for indexing.

    Stages: read the page texts, join them with a ``!@#<page>!@#`` marker
    after each page, split into fragments, pack fragments into chunks, then
    strip the markers and derive a page number per chunk.

    Args:
        path: PDF path; it is also stored as ``file_name`` in the metadata.

    Returns:
        ``(documents, metadata, corpus_json, uuids)`` as produced by
        ``create_docs``, plus the list of generated UUID strings.
    """
    page_chunk_lst = read_pdf(path)
    total_text = "".join(
        page_text.strip("\n") + f"!@#{page_no}!@#\n"
        for page_no, page_text in enumerate(page_chunk_lst, start=1)
    )
    parts = split_docs(total_text)
    list_of_chunk_docs = parts_to_chunk(parts)
    final_chunk_1st, page_details = find_page_num(list_of_chunk_docs)
    uuids = [str(uuid4()) for _ in final_chunk_1st]
    # Index the cleaned chunks: the raw ones still contain the page-break
    # sentinels, which must not end up in the retrieved text.
    documents, metadata, corpus_json = create_docs(final_chunk_1st, page_details, path, uuids)
    return documents, metadata, corpus_json, uuids

In [16]:
def create_qdrant_dense_emd(documents,metadata,ids,emd_path,collection_name):
    """Build a fresh local Qdrant collection holding the given documents.

    Any existing directory at ``emd_path`` is deleted first, so every call
    starts from an empty store.

    Args:
        documents: Texts to embed and store.
        metadata: One metadata dict per document.
        ids: One point id per document.
        emd_path: Directory for the local Qdrant storage.
        collection_name: Collection to create and fill.

    Returns:
        The ``QdrantClient`` bound to ``emd_path``.
    """
    # Remove the previous on-disk store so old points are not mixed in.
    if os.path.exists(emd_path):
        shutil.rmtree(emd_path)

    client = QdrantClient(path=emd_path)
    client.set_model("BAAI/bge-base-en-v1.5")

    # Check the collection that is actually being created, not a fixed name.
    if not client.collection_exists(collection_name):
        client.create_collection(
            collection_name=collection_name,
            vectors_config=client.get_fastembed_vector_params()
        )

    client.add(
        collection_name=collection_name,
        documents=documents,
        metadata=metadata,
        ids=ids,
        parallel=0,  # Use all available CPU cores to encode data.
        # Requires wrapping code into if __name__ == '__main__' block
    )
    return client
    
def create_bm25s_db(corpus_json):
    """Index a corpus with BM25 for keyword retrieval.

    Args:
        corpus_json: List of dicts, each with a ``page_content`` string. The
            whole list is attached to the retriever as its corpus.

    Returns:
        A ``bm25s.BM25`` retriever indexed on the tokenized ``page_content``
        values (tokenized with ``stopwords='en'``).
    """
    texts = [entry['page_content'] for entry in corpus_json]
    tokenized = bm25s.tokenize(texts, stopwords='en')

    bm25_index = bm25s.BM25(corpus=corpus_json)
    bm25_index.index(tokenized)
    return bm25_index

In [17]:
def weighted_rrf(rank_lists, weights, alpha=60, default_rank=1000, k=5):
    """Fuse several ranked lists with weighted reciprocal rank fusion.

    Each item scores ``sum_i weights[i] / (alpha + rank_i)``, where
    ``rank_i`` is its rank in list ``i`` or ``default_rank`` when the list
    does not contain it.

    Args:
        rank_lists: Sequence of ranked lists; each list holds
            ``(item, rank)`` pairs and items must be hashable.
        weights: One weight per ranked list (array-like).
        alpha: Smoothing constant added to every rank.
        default_rank: Rank assumed for an item missing from a list.
        k: Maximum number of fused results to return.

    Returns:
        Up to ``k`` ``(item, score)`` pairs, highest score first. Items with
        equal scores keep the order in which they were first seen, so the
        result is the same on every run. Empty when there are no items.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order; a set would
    # make the order (and therefore tie-breaking) vary between runs.
    items = list(dict.fromkeys(item for rank_list in rank_lists for item, _ in rank_list))
    if not items:
        return []

    item_to_index = {item: idx for idx, item in enumerate(items)}
    rank_matrix = np.full((len(items), len(rank_lists)), default_rank)
    for list_idx, rank_list in enumerate(rank_lists):
        for item, rank in rank_list:
            rank_matrix[item_to_index[item], list_idx] = rank

    weights = np.asarray(weights)
    weighted_rrf_scores = np.sum(weights * (1.0 / (alpha + rank_matrix)), axis=1)
    # Negate for descending order; a stable sort preserves first-seen order on ties.
    sorted_indices = np.argsort(-weighted_rrf_scores, kind="stable")

    return [(items[idx], weighted_rrf_scores[idx]) for idx in sorted_indices[:k]]


def get_doc_and_source(rrf_retriever, retrieve_doc_dict_keyword, retrieve_doc_dict_sim_search):
    """Resolve the source reference of every fused document.

    Args:
        rrf_retriever: Iterable of sequences whose first element is the
            document text (e.g. ``(doc, score)`` pairs).
        retrieve_doc_dict_keyword: Mapping document text -> source from the
            keyword search; consulted first.
        retrieve_doc_dict_sim_search: Mapping document text -> source from
            the similarity search; used when the keyword mapping lacks the
            document.

    Returns:
        ``(final_retrieve_lst, unique_source, all_source)``: the documents in
        fused order, the distinct sources in first-seen order, and the source
        of each resolved document. A document found in neither mapping is
        kept in ``final_retrieve_lst`` but adds nothing to the source lists.
    """
    final_retrieve_lst = []
    unique_source = []
    all_source = []

    for scored_doc in rrf_retriever:
        doc = scored_doc[0]
        final_retrieve_lst.append(doc)

        # Direct dict membership; the keyword mapping takes precedence.
        if doc in retrieve_doc_dict_keyword:
            source = retrieve_doc_dict_keyword[doc]
        elif doc in retrieve_doc_dict_sim_search:
            source = retrieve_doc_dict_sim_search[doc]
        else:
            continue

        all_source.append(source)
        if source not in unique_source:
            unique_source.append(source)

    return final_retrieve_lst, unique_source, all_source


def prepare_retrieve_doc(final_retrieve_lst,all_source):
    """Join retrieved documents into one context string.

    Each document has surrounding newlines stripped and is followed by a
    dashed separator line. Newlines at the very start and end of the result
    are stripped, so the text ends with the separator itself.

    Args:
        final_retrieve_lst: Document texts in display order.
        all_source: Sources paired with the documents. Only its length
            matters: documents beyond it are left out, as the two are zipped.

    Returns:
        The combined context, or an empty string when nothing is paired.
    """
    separator = "\n-----------------\n"
    context = "".join(
        doc.strip("\n") + separator
        for doc, _source in zip(final_retrieve_lst, all_source)
    )
    return context.strip("\n")

In [18]:
def similarity_search(query,client,collection_name,k=5):
    """Query the dense index and rank the returned documents.

    Args:
        query: Query text.
        client: Object exposing ``query(collection_name=..., query_text=...,
            limit=...)`` whose results have ``id`` and ``metadata`` with the
            keys ``document``, ``page_no`` and ``file_name``.
        collection_name: Collection to search.
        k: Maximum number of results requested.

    Returns:
        ``(ranked, sources)``: a list of ``(text, rank)`` pairs with ranks
        starting at 1 in result order, and a dict mapping each text to
        ``"<uploaded_files/file_name>#page=<page_no>"``.
    """
    hits = client.query(collection_name = collection_name,query_text = query,limit = k)

    ranked_docs = []
    source_by_doc = {}
    for position, hit in enumerate(hits, start=1):
        info = {
            'id': hit.id,
            'page_no': hit.metadata['page_no'],
            'file_name': hit.metadata['file_name'],
        }
        text = hit.metadata['document']
        ranked_docs.append((text, position))

        location = os.path.join("uploaded_files", info['file_name'])
        source_by_doc[text] = f"{location}#page={info['page_no']}"
    return ranked_docs, source_by_doc

def keyword_search(query,keyword_retriever,k=5):
    """Query the BM25 index and rank the returned documents.

    Args:
        query: Query text; tokenized with ``bm25s.tokenize``.
        keyword_retriever: Retriever whose ``retrieve(tokens, k=...)`` returns
            ``(results, scores)``; the entries of ``results[0]`` are dicts
            with ``page_content`` and ``metadata`` (``file_name``,
            ``page_no``).
        k: Number of results requested.

    Returns:
        ``(ranked, sources)``: a list of ``(text, rank)`` pairs with ranks
        starting at 1 in result order, and a dict mapping each text to
        ``"<uploaded_files/file_name>#page=<page_no>"``.
    """
    # keyword_retriever = bm25s.BM25.load(path,load_corpus=True)
    tokens = bm25s.tokenize(query)
    hits, _scores = keyword_retriever.retrieve(tokens, k=k)

    ranked_docs = []
    source_by_doc = {}
    for position, hit in enumerate(hits[0], start=1):
        text = hit['page_content']
        info = hit['metadata']
        ranked_docs.append((text, position))

        location = os.path.join("uploaded_files", info['file_name'])
        source_by_doc[text] = f"{location}#page={info['page_no']}"
    return ranked_docs, source_by_doc

In [19]:
def custom_ensemble_retriever(query,k, weights,client,collection_name,keyword_retriever):
    """Retrieve context with a weighted fusion of keyword and dense search.

    Args:
        query: Query text.
        k: Number of fused documents to return.
        weights: Two weights; ``weights[0]`` applies to the keyword ranking
            and ``weights[1]`` to the similarity ranking.
        client: Dense-index client passed to ``similarity_search``.
        collection_name: Collection searched by ``similarity_search``.
        keyword_retriever: BM25 retriever passed to ``keyword_search``.

    Returns:
        ``(retrieve_context, unique_source, all_source)``: the joined context
        string, the distinct sources, and the source of each fused document.
    """
    # Ask each retriever for at least as many candidates as will be returned;
    # a fixed 5 would cap the fused result no matter how large k is. The
    # floor of 5 keeps the candidate pool unchanged for small k.
    candidate_count = max(k, 5)
    retrieve_doc_sim_search, retrieve_doc_dict_sim_search = similarity_search(
        query, client, collection_name, k=candidate_count
    )
    retrieve_doc_keyword, retrieve_doc_dict_keyword = keyword_search(
        query, keyword_retriever, k=candidate_count
    )

    weights = np.array(weights)
    rrf_retriever = weighted_rrf([retrieve_doc_keyword, retrieve_doc_sim_search], weights, k=k)
    final_retrieve_lst, unique_source, all_source = get_doc_and_source(
        rrf_retriever, retrieve_doc_dict_keyword, retrieve_doc_dict_sim_search
    )
    retrieve_context = prepare_retrieve_doc(final_retrieve_lst, all_source)

    return retrieve_context, unique_source, all_source


In [20]:
# Chunk the LLaMA paper and confirm that the four returned lists line up.
path = "/home/debasish/Debun/agent/sample_pdfs/LLaMA_Open_and_Efficient_Foundation_Language_Model.pdf"
documents, metadata, corpus_json,uuids = final_chunking_pipeline(path)
print(len(documents),len(metadata),len(corpus_json),len(uuids))

100 100 100 100


In [21]:
# Build both indexes over the same chunks: the dense Qdrant collection
# (stored under emd_path) and the in-memory BM25 keyword retriever.
emd_path = 'experiment'
collection_name = 'startups'
qdrant_client = create_qdrant_dense_emd(documents,metadata,uuids,emd_path,collection_name)
keyword_retriever =  create_bm25s_db(corpus_json)

                                                             

In [22]:
# Hybrid retrieval for one sample question; the cell displays the source of
# each fused result. weights[0] is the keyword weight, weights[1] the dense one.
k = 5
query = """Is there anything instruction finetuning happening
"""
weights = [0.6, 0.4]
retrieve_context, unique_source, all_source = custom_ensemble_retriever(
    query=query,
    k=k,
    weights=weights,
    client=qdrant_client,
    collection_name=collection_name,
    keyword_retriever=keyword_retriever,
)
all_source

                                                     

['/home/debasish/Debun/agent/sample_pdfs/LLaMA_Open_and_Efficient_Foundation_Language_Model.pdf#page=7',
 '/home/debasish/Debun/agent/sample_pdfs/LLaMA_Open_and_Efficient_Foundation_Language_Model.pdf#page=7',
 '/home/debasish/Debun/agent/sample_pdfs/LLaMA_Open_and_Efficient_Foundation_Language_Model.pdf#page=20',
 '/home/debasish/Debun/agent/sample_pdfs/LLaMA_Open_and_Efficient_Foundation_Language_Model.pdf#page=6',
 '/home/debasish/Debun/agent/sample_pdfs/LLaMA_Open_and_Efficient_Foundation_Language_Model.pdf#page=18']

In [9]:
# Scratch check: list.index gives the position of the first equal element.
a = [1,2,3]
b = [3,4,5]
a.index(3)

2

In [28]:
# Split a "<path>/<name>.pdf#page=<n>" source reference into the bare PDF
# file name (last path component) and the page number.
# Note: this rebinds the global `pattern` used for page-break markers earlier.
pattern = r'([^/]+\.pdf)#page=(\d+)'

# Extract the match

match = re.search(pattern, all_source[3])

if match:
    pdf_file_name = match.group(1)
    page_number = int(match.group(2))
    print(pdf_file_name,page_number)

LLaMA_Open_and_Efficient_Foundation_Language_Model.pdf 6


In [5]:
from groq import Groq
import os
from dotenv import load_dotenv
# Read a local .env file (if one exists) into the environment first;
# load_dotenv was imported but never called, so a key stored there was
# never visible to os.getenv. Existing environment variables are kept.
load_dotenv()
client = Groq(api_key=os.getenv("groq_api_key"))

In [27]:
from collections import OrderedDict

def update_qa_dict(question, answer, qa_dict, n):
    """Record a question/answer pair, keeping only the ``n`` most recent.

    The pair is stored as the newest entry; if the question is already
    present its answer is replaced and it becomes the newest entry. The
    oldest entries are then dropped until at most ``n`` remain.

    Args:
        question: Key to store.
        answer: Value to store for the question.
        qa_dict: ``OrderedDict`` holding the history; modified in place.
        n: Maximum number of entries to keep. Values of 0 or less leave the
            dictionary empty.

    Returns:
        The same ``qa_dict`` object.

    Raises:
        TypeError: If ``qa_dict`` is not an ``OrderedDict``.
    """
    if not isinstance(qa_dict, OrderedDict):
        raise TypeError("qa_dict must be an OrderedDict.")

    qa_dict[question] = answer  # Add or refresh the question-answer pair.
    qa_dict.move_to_end(question)  # A repeated question counts as the newest.

    # Evict only after inserting, so updating an existing question never
    # pushes out an unrelated entry.
    limit = max(n, 0)
    while len(qa_dict) > limit:
        qa_dict.popitem(last=False)  # Remove the first (oldest) item.

    return qa_dict

# Example usage (the guard is true when the cell is run in the notebook, as
# the printed output below the cell shows).
if __name__ == "__main__":
    qa_dict = OrderedDict()  # Initialize an empty ordered dictionary
    n = 3  # Maximum dictionary length

    # Adding some question-answer pairs
    qa_dict = update_qa_dict("What is AI?", "Artificial Intelligence", qa_dict, n)
    qa_dict = update_qa_dict("What is ML?", "Machine Learning", qa_dict, n)
    qa_dict = update_qa_dict("What is NLP?", "Natural Language Processing", qa_dict, n)
    print("Before adding a new entry:", qa_dict)

    # Adding another question-answer pair when the dictionary is full;
    # the oldest pair ("What is AI?") is expected to be dropped.
    qa_dict = update_qa_dict("What is CV?", "Computer Vision", qa_dict, n)
    print("After adding a new entry:", qa_dict)


Before adding a new entry: OrderedDict([('What is AI?', 'Artificial Intelligence'), ('What is ML?', 'Machine Learning'), ('What is NLP?', 'Natural Language Processing')])
After adding a new entry: OrderedDict([('What is ML?', 'Machine Learning'), ('What is NLP?', 'Natural Language Processing'), ('What is CV?', 'Computer Vision')])


In [35]:
# List the stored question/answer pairs, oldest first.
for q,a in qa_dict.items():
    print(q,a)

What is ML? Machine Learning
What is NLP? Natural Language Processing
What is CV? Computer Vision


In [32]:
# Display only the stored answers.
qa_dict.values()

odict_values(['Machine Learning', 'Natural Language Processing', 'Computer Vision'])

In [6]:

# Multi-turn demo: an earlier user/assistant exchange is passed along so the
# follow-up "tell me some algorithms" can be answered in context.
chat_completion = client.chat.completions.create(
    messages=[
        {"role": "system", "content": "Give the one line answer to the given question"},
        {"role": "user", "content": "What is machine learning"},
        {"role":"assistant","content":"Machine learning is a subset of artificial intelligence that involves training algorithms to learn from data and make predictions or decisions without being explicitly programmed."},
        {"role": "user", "content": "tell me some algorithms"}
    ],
    model="llama-3.3-70b-versatile",
    temperature=0.2
)

# Print the text of the first returned choice.
print(chat_completion.choices[0].message.content)

Some common machine learning algorithms include Linear Regression, Decision Trees, Random Forest, Support Vector Machines (SVM), K-Means Clustering, and Neural Networks.


In [26]:

def create_message(query,context):
    """Build the two-message chat prompt for answering from retrieved context.

    Args:
        query: The user's question.
        context: Retrieved passages, separated by dashed lines.

    Returns:
        A list with a system message carrying the answering instructions and
        a user message that embeds the query followed by the context.
    """
    system_message = {
        "role": "system",
        "content": "Your task is to analyze the users query and give the answer crisply based on the given context. If you can not find the answer politely say 'No information is available to generate the answer' ",
    }
    user_message = {
        "role": "user",
        "content": f"Here is the user query\n{query}\nHere is the relevant context separated by -----------------\n{context}",
    }
    return [system_message, user_message]
# Ask a question unrelated to the indexed paper to check the fallback reply.
# `retrieve_context` comes from the hybrid-retrieval cell, so that cell must
# have been run first.
query = "burn fire is UK"
messages = create_message(query,retrieve_context)
chat_completion = client.chat.completions.create(
    messages=messages,
    model="llama-3.3-70b-versatile",
    temperature=0.2
)

print(chat_completion.choices[0].message.content)

No information is available to generate the answer. The provided context does not contain any information related to "burn fire" in the UK.
