In [1]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
from tqdm import tqdm
from embedding_model import Embedder
import chromadb
import torch

import ra



loader = PyPDFLoader("library/pthreads.pdf")
pages = loader.load_and_split(text_splitter=RecursiveCharacterTextSplitter(chunk_size=1500, 
                                                                           chunk_overlap=100,
                                                                           separators=["\n", "\t", ".", ",", " ", ""],
                                                                           ))

In [2]:
os.listdir("library/")

['Science_of_Wellbeing_JTF_1.pdf',
 'DTSA 5509 -annasanders.pdf',
 'pthreads.pdf',
 'UMAP paper.pdf']

In [38]:
class PdfChunksLoader_ChromaDB():
    def __init__(self, collection, embedder):
        
        self.collection = collection
        self.embedder = embedder
        self.id = 0
        
        
    def _extract_pdf_chunks(self, path, text_splitter):
        
        loader = PyPDFLoader(path)
        
        chunks = loader.load_and_split(text_splitter=text_splitter)
        
        return chunks
        
        
    def populate(self, documents):
        ##TODO: add batch size for computing embeddings
        
        self.collection.add(
            documents=[chunk.page_content for chunk in tqdm(documents)],
            
            metadatas = [chunk.metadata for chunk in tqdm(documents)],
            
            embeddings = self.embedder.compute_embeddings([chunk.page_content for chunk in documents]).tolist(),
            ids = ["uri" + str(abs(hash(chunk.page_content))) for chunk in tqdm(documents)]
            
            )
 
        print("Documents loaded to DB")
        

In [6]:

db_client = chromadb.PersistentClient(path="./persistent_storage")

collection = db_client.get_collection("my_collection")


In [7]:
embedder = Embedder(model_name='sentence-transformers/all-MiniLM-L12-v2',
                    tokenizer_name='sentence-transformers/all-MiniLM-L12-v2')

2024-03-15 15:08:19.766119: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [39]:
chunkloader = PdfChunksLoader_ChromaDB(collection, embedder)


docs = chunkloader._extract_pdf_chunks("library/pthreads.pdf",  RecursiveCharacterTextSplitter(chunk_size=1500, 
                                                                           chunk_overlap=100,
                                                                           separators=["\n", "\t", ".", ",", " ", ""],
                                                                           ))


chunkloader.populate(docs[:50])

100%|███████████████████████████████████████| 50/50 [00:00<00:00, 645277.54it/s]
100%|███████████████████████████████████████| 50/50 [00:00<00:00, 348364.12it/s]
100%|███████████████████████████████████████| 50/50 [00:00<00:00, 228946.72it/s]

Documents loaded to DB





In [40]:
my_query = "POSIX standards for threads in C programming language"

embedded_query = embedder.compute_embeddings([my_query])


In [41]:
collection.query(embedded_query.tolist(), n_results=5)

{'ids': [['uri1427',
   'uri2107325473294000610',
   'uri7563616457436198572',
   'uri580',
   'uri1117']],
 'distances': [[0.8717448909668059,
   0.8717448909668059,
   0.891821991243155,
   0.891821991243155,
   0.9114785380490571]],
 'metadatas': [[{'page': 1, 'source': 'library/pthreads.pdf'},
   {'page': 1, 'source': 'library/pthreads.pdf'},
   {'page': 4, 'source': 'library/pthreads.pdf'},
   {'page': 4, 'source': 'library/pthreads.pdf'},
   {'page': 1, 'source': 'library/pthreads.pdf'}]],
 'embeddings': None,
 'documents': [['62 Chapter 4 Threads\nfrom or write to that file descriptor. Because a process and all its threads can be exe-cuting only one program at a time, if any thread inside a process calls one of the \nexec\nfunctions, all the other threads are ended (the new program may, of course, create newthreads).\nGNU/Linux implements the POSIX standard thread API (known as pthreads ).All\nthread functions and data types are declared in the header file \n<pthread.h> .The\npt