In [8]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
from tqdm import tqdm
from embedding_model import Embedder
import chromadb
import torch

import math



In [10]:
math.ceil(3.9)

4

In [2]:
def augment_multiple_query(query, model="gpt-3.5-turbo"):
    messages = [
        {
            "role": "system",
            "content": SYSTEM_PROMT_QUERY_EXPANSION,
        },
        {"role": "user", "content": query}
    ]

    response = openai_client.chat.completions.create(
        model=model,
        messages=messages,
    )
    content = response.choices[0].message.content
    content = content.split("\n")
    return content

### Usage


```bash
docker run tinyrag --persistent_storage [path]  
```


```bash
docker-compose up
```
### This should deploys a RAG API with 4 endpoints:

 - /tinyrag/upload_file
 
 - /tinyrag/upload_zip
 
 - /tinyrag/query$?expand
 
 - /tinyrag/reset
 
 
 Think more about it

In [77]:
class PdfChunksLoader_ChromaDB():
    def __init__(self, collection, embedder, text_splitter=None):
        
        
        
        self.collection = collection
        self.embedder = embedder
        self.id = 0
        self.text_splitter = text_splitter if text_splitter else RecursiveCharacterTextSplitter(chunk_size=1500, 
                                                                           chunk_overlap=100,
                                                                           separators=["\n", "\t", ".", ",", " ", ""],)
        
        
    def _extract_pdf_chunks(self, path):
        
        loader = PyPDFLoader(path)
        
        chunks = loader.load_and_split(text_splitter=self.text_splitter)
        
        return chunks
    
    def _get_chunk_id(self, chunk):
        
        return "uri" + str(abs(hash(chunk.page_content)))
        
    def filter_existing_docs(self, docs_ids_map):
        
        
        ids_computed = list(docs_ids_map.keys())
        
            
        existing_chunks_ids = rag.collection.get(ids=ids_computed)["ids"]
        
        
        def extract_only_new_docs(keyval_tuple):
            key, value = keyval_tuple
            
            return (key not in existing_chunks_ids)
            
        filtered_docs_map = dict(filter(extract_only_new_docs,  docs_ids_map.items()))
        
        return filtered_docs_map
        
        
    
    def populate(self, documents):
        ##TODO: add batch size for computing embeddings
        
        ### try to add one by one to avoid redundant computing of embedds
        
        
        #check filter ids
        
        ids_computed = [self._get_chunk_id(chunk) for chunk in documents]
        
        docs_id_map = {uri_id : doc for uri_id, doc in zip(ids_computed, documents)}
        
        filtered_docs_id_map = self.filter_existing_docs(docs_id_map)
        
        if (filtered_docs_id_map):
            self.collection.add(
                documents=[chunk.page_content for chunk in filtered_docs_id_map.values()],

                metadatas = [chunk.metadata for chunk in filtered_docs_id_map.values()],

                embeddings = self.embedder.compute_embeddings([chunk.page_content for chunk in filtered_docs_id_map.values()]).tolist(),
                ids = [doc_id for doc_id in filtered_docs_id_map.keys()]

                )
        else:
            print("Documents already exist...")



In [78]:


#Works for single collection
class RetrievalAugmentedGenerator():
    def __init__(self, db_client, embedder, collection_name):
        
        self.db_client = db_client
        self.embedder = embedder
        
        
        
        self.collection_name = collection_name
        self.collection = self.db_client.get_or_create_collection(name=self.collection_name)
        
        self.chunk_loader = PdfChunksLoader_ChromaDB(self.collection,
                                                     embedder)

    def upload_pdf_file(self, path_file, batch_size=5):
        ##Load chunks by batches
        
        docs = self.chunk_loader._extract_pdf_chunks(path_file)
        
        for i in tqdm(range(math.ceil(len(docs) / batch_size)), desc=f"[{path_file}] loading batches:"):
            self.chunk_loader.populate(docs[i * batch_size : (i + 1) * batch_size])
        
        
        print("All batches loaded successfully...")
    
    def upload_documents_from_list(self, documents):
        pass
    
    def upload_from_dir(self, path_dir):
        ##Load chunks by batches
        pass
    
    
    
    
    def query(self, queries, top_k, query_augment=True, cross_encoder_rank=True):
        pass
    
    def get(self, ids, where, limit):
        pass
    
    def reset_collection(self):
        pass
    
    
    
    

In [73]:
db_client = chromadb.PersistentClient(path="./persistent_storage")

#collection = db_client.get_collection("my_collection")
embedder =  Embedder(model_name='sentence-transformers/all-MiniLM-L12-v2',
                    tokenizer_name='sentence-transformers/all-MiniLM-L12-v2')



In [79]:
rag = RetrievalAugmentedGenerator(db_client, embedder, "hello")

In [85]:
rag.upload_pdf_file("library/UMAP paper.pdf")

[library/UMAP paper.pdf] loading batches:: 100%|█| 22/22 [00:00<00:00, 1361.89it

Documents already exist...
Documents already exist...
Documents already exist...
Documents already exist...
Documents already exist...
Documents already exist...
Documents already exist...
Documents already exist...
Documents already exist...
Documents already exist...
Documents already exist...
Documents already exist...
Documents already exist...
Documents already exist...
Documents already exist...
Documents already exist...
Documents already exist...
Documents already exist...
Documents already exist...
Documents already exist...
Documents already exist...
Documents already exist...
All batches loaded successfully...





In [83]:
rag.collection.count()

281

In [41]:
rag.collection.get(ids=['uri8867991253060557058', 'uri7377985003227613794', "uri43"])["ids"]

['uri8867991253060557058', 'uri7377985003227613794']

In [35]:
help(rag.collection)

Help on Collection in module chromadb.api.models.Collection object:

class Collection(pydantic.main.BaseModel)
 |  Collection(client: 'API', name: str, id: uuid.UUID, embedding_function: Optional[chromadb.api.types.EmbeddingFunction] = <chromadb.utils.embedding_functions.ONNXMiniLM_L6_V2 object at 0x7fa445b5dcc0>, metadata: Optional[Dict[str, Any]] = None) -> None
 |  
 |  Method resolution order:
 |      Collection
 |      pydantic.main.BaseModel
 |      pydantic.utils.Representation
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, client: 'API', name: str, id: uuid.UUID, embedding_function: Optional[chromadb.api.types.EmbeddingFunction] = <chromadb.utils.embedding_functions.ONNXMiniLM_L6_V2 object at 0x7fa445b5dcc0>, metadata: Optional[Dict[str, Any]] = None)
 |      Create a new model by parsing and validating input data from keyword arguments.
 |      
 |      Raises ValidationError if the input data cannot be parsed to form a valid model.
 |  
 |  __r