In [1]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
from tqdm import tqdm
from embedding_model import Embedder
import chromadb
import torch
import hashlib

import math



##### Use the `hashlib` library to assign persistent, content-derived document IDs

Reference: https://cookbook.chromadb.dev/core/document-ids/#hashes

In [2]:
def generate_sha256_hash_from_text(text):
    """Return the SHA-256 digest of ``text`` as a hexadecimal string.

    Args:
        text: String to hash; it is encoded as UTF-8 before hashing.

    Returns:
        The hex digest produced by ``hashlib.sha256`` for the encoded text.
    """
    encoded_text = text.encode('utf-8')
    return hashlib.sha256(encoded_text).hexdigest()


In [3]:
# Sanity check: hash a sample string and display the resulting hex digest.
generate_sha256_hash_from_text("Hello Worl3232d!")

'3eb787d0b3d7fe92710642d8b377187a8f320eaf6d6e4e015d2a5a150477c8bf'

In [4]:
def augment_multiple_query(query, model="gpt-3.5-turbo"):
    """Ask a chat model to expand ``query`` into several alternative queries.

    The system prompt ``SYSTEM_PROMT_QUERY_EXPANSION`` and the client
    ``openai_client`` are read from the enclosing (notebook) namespace.
    NOTE(review): neither name is defined in the visible part of this
    notebook -- they must be defined before this function is called.

    Args:
        query: The user's original question, sent as the user message.
        model: Chat model name passed to the completions endpoint.

    Returns:
        A list of non-empty, whitespace-stripped lines from the model's
        reply, in the order returned. An empty list is returned when the
        reply has no text content.
    """
    messages = [
        {
            "role": "system",
            "content": SYSTEM_PROMT_QUERY_EXPANSION,
        },
        {"role": "user", "content": query},
    ]

    response = openai_client.chat.completions.create(
        model=model,
        messages=messages,
    )
    content = response.choices[0].message.content

    # The message content may be missing; calling .split on None would raise.
    if not content:
        return []

    # Drop blank lines so callers never receive empty strings as queries.
    stripped_lines = (line.strip() for line in content.split("\n"))
    return [line for line in stripped_lines if line]

### Usage


```bash
docker run tinyrag --persistent_storage [path]  
```


```bash
docker-compose up
```
### This should deploy a RAG API with 4 endpoints:

 - /tinyrag/upload_file
 
 - /tinyrag/upload_zip
 
 - /tinyrag/query$?expand
 
 - /tinyrag/reset
 
 
 **TODO:** think more about this endpoint design.

In [5]:
class PdfChunksLoader_ChromaDB():
    """Split PDF files into text chunks and store the new ones in a collection.

    Each chunk gets an id derived from a hash of its text, so a chunk whose
    text is already stored is detected by id and not embedded or added again.
    """

    def __init__(self, collection, embedder, text_splitter=None):
        """Set up the loader.

        Args:
            collection: Collection object; this class calls its ``get`` and
                ``add`` methods.
            embedder: Object exposing ``compute_embeddings(texts)`` whose
                result provides ``.tolist()``.
            text_splitter: Splitter passed to ``PyPDFLoader.load_and_split``.
                When falsy, a ``RecursiveCharacterTextSplitter`` with
                1500-character chunks and 100-character overlap is used.
        """
        self.collection = collection
        self.embedder = embedder
        # Not read anywhere in this class; kept so existing attribute access still works.
        self.id = 0
        self.text_splitter = text_splitter if text_splitter else RecursiveCharacterTextSplitter(
            chunk_size=1500,
            chunk_overlap=100,
            separators=["\n", "\t", ".", ",", " ", ""],
        )

    def _extract_pdf_chunks(self, path):
        """Load the PDF at ``path`` and return it split into chunks."""
        loader = PyPDFLoader(path)
        return loader.load_and_split(text_splitter=self.text_splitter)

    def _get_chunk_id(self, chunk):
        """Return the id for ``chunk``: ``"chunkID_"`` + SHA-256 of its text."""
        return "chunkID_" + generate_sha256_hash_from_text(chunk.page_content)

    def filter_existing_docs(self, docs_ids_map):
        """Return the entries of ``docs_ids_map`` whose id is not stored yet.

        Args:
            docs_ids_map: Mapping of chunk id -> chunk.

        Returns:
            A new dict, in the original order, holding only the entries whose
            id is absent from ``self.collection``.
        """
        # Nothing to look up: skip querying the collection with an empty id list.
        if not docs_ids_map:
            return {}

        # Query this loader's own collection (not a notebook-level global) and
        # keep the stored ids in a set for constant-time membership checks.
        lookup = self.collection.get(ids=list(docs_ids_map.keys()))
        existing_chunk_ids = set(lookup["ids"])

        return {
            doc_id: doc
            for doc_id, doc in docs_ids_map.items()
            if doc_id not in existing_chunk_ids
        }

    def populate(self, documents):
        """Embed and add the chunks in ``documents`` that are not stored yet.

        Chunks with identical text share one id, so duplicates inside
        ``documents`` collapse to a single entry (the last one wins).

        Args:
            documents: Sequence of chunks exposing ``page_content`` and
                ``metadata``.
        """
        ##TODO: add batch size for computing embeddings

        if not documents:
            return

        docs_id_map = {self._get_chunk_id(chunk): chunk for chunk in documents}
        new_docs_id_map = self.filter_existing_docs(docs_id_map)

        if not new_docs_id_map:
            print("Documents already exist...")
            return

        new_chunks = list(new_docs_id_map.values())
        # Build the texts once; they are both stored and embedded.
        texts = [chunk.page_content for chunk in new_chunks]

        self.collection.add(
            documents=texts,
            metadatas=[chunk.metadata for chunk in new_chunks],
            embeddings=self.embedder.compute_embeddings(texts).tolist(),
            ids=list(new_docs_id_map.keys()),
        )



In [6]:

class RetrievalAugmentedGenerator():
    """Facade over a vector-store collection: ingest PDFs and run similarity queries."""

    def __init__(self, db_client, embedder, collection_name):
        """Open (or create) the collection and attach a PDF chunk loader to it.

        Args:
            db_client: Client exposing ``get_or_create_collection(name=...)``.
            embedder: Object exposing ``compute_embeddings(texts)`` whose
                result provides ``.tolist()``.
            collection_name: Name of the collection to open or create.
        """
        self.db_client = db_client
        self.embedder = embedder

        self.collection_name = collection_name
        self.collection = self.db_client.get_or_create_collection(name=self.collection_name)

        self.chunk_loader = PdfChunksLoader_ChromaDB(self.collection, embedder)

    def upload_pdf_file(self, path_file, batch_size=5):
        """Split a PDF into chunks and load them into the collection in batches.

        Args:
            path_file: Path of the PDF to ingest.
            batch_size: Number of chunks handed to the loader per batch;
                must be at least 1.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        # Zero would divide by zero, and a negative size would silently load
        # nothing while still reporting success.
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size!r}")

        docs = self.chunk_loader._extract_pdf_chunks(path_file)

        # One iteration per batch; the range yields each batch's start offset.
        batch_starts = range(0, len(docs), batch_size)
        for start in tqdm(batch_starts, desc=f"[{path_file}] loading batches:"):
            self.chunk_loader.populate(docs[start:start + batch_size])

        print(f"[{path_file}]: All batches loaded successfully...")

    def query_with_embeddings(self, embeddings, top_k):
        """Query the collection with precomputed embeddings.

        Args:
            embeddings: Query embeddings, forwarded as ``query_embeddings``.
            top_k: Number of results requested per query (``n_results``).

        Returns:
            Whatever ``self.collection.query`` returns.
        """
        return self.collection.query(query_embeddings=embeddings,
                                     n_results=top_k)

    def query_with_text(self, queries, top_k):
        """Embed ``queries`` with the embedder and query the collection.

        Args:
            queries: Texts passed to ``self.embedder.compute_embeddings``.
            top_k: Number of results requested per query.

        Returns:
            The result of ``query_with_embeddings`` for the computed embeddings.
        """
        embeddings_list = self.embedder.compute_embeddings(queries).tolist()
        return self.query_with_embeddings(embeddings_list, top_k)

    def get(self, ids, where, limit):
        """Not implemented yet: currently does nothing and returns ``None``."""
        pass

    def reset_collection(self):
        """Not implemented yet: currently does nothing and returns ``None``."""
        pass
    
    
    
    

In [20]:
# Alternative kept for reference: a PersistentClient pointed at ./persistent_storage
# instead of a server connection.
#db_client = chromadb.PersistentClient(path="./persistent_storage")

# Connect to a ChromaDB server expected at localhost:8000.
db_client = chromadb.HttpClient(host="localhost", port=8000)

#collection = db_client.get_collection("my_collection")
# The same sentence-transformers checkpoint name is passed for both the model and its tokenizer.
embedder =  Embedder(model_name='sentence-transformers/all-MiniLM-L12-v2',
                    tokenizer_name='sentence-transformers/all-MiniLM-L12-v2')



In [23]:
# Build the RAG helper on "default_collection" (the collection is created if it does not exist yet).
rag = RetrievalAugmentedGenerator(db_client, embedder, "default_collection")

In [24]:
# Ingest the sample PDF in batches (default batch_size=5); chunks whose id is already stored are skipped.
rag.upload_pdf_file("library/pthreads.pdf")

[library/pthreads.pdf] loading batches:: 100%|██| 14/14 [01:06<00:00,  4.73s/it]

[library/pthreads.pdf]: All batches loaded successfully...





In [10]:
# Show how many entries the collection currently holds.
rag.collection.count()

66

In [11]:
# Run two text queries at once and request the 3 closest chunks for each.
rag.query_with_text(["What is mutex used for in threads?", "Avoiding Data Race by locking"], top_k=3)

{'ids': [['chunkID_c3613574763888c3b28c699e2a8fa21004cfc74ac352ab83df9035ad6e9e7b7f',
   'chunkID_56dfddbcb19ae4d535a82c302dc4faa812b64bd3f3dd00cf192d7b0c3f1aceba',
   'chunkID_c9cb4bedf5d6e8b5a4ad72df13f5d80a98054dd304c1ae6ff559fd5ae912b9c7'],
  ['chunkID_8d6f1a32111a6052e0f502c611a5a4f968071da4ca9378a1a2462fd90c08b611',
   'chunkID_3bad18de465ac18c3c4bff592f1585e93c374922f724abf02c6eb91ba345f376',
   'chunkID_f21b8a38a47df7701d585fc11541f74988f9f9a32a8862e39f13e8b442923e51']],
 'distances': [[0.5447016473704277, 0.6239121965415511, 0.6412980707355009],
  [0.9459152660459222, 0.9747333002740871, 1.0722717537353825]],
 'embeddings': None,
 'metadatas': [[{'page': 18, 'source': 'library/pthreads.pdf'},
   {'page': 22, 'source': 'library/pthreads.pdf'},
   {'page': 19, 'source': 'library/pthreads.pdf'}],
  [{'page': 18, 'source': 'library/pthreads.pdf'},
   {'page': 19, 'source': 'library/pthreads.pdf'},
   {'page': 25, 'source': 'library/pthreads.pdf'}]],
 'documents': [['it to pthread_