In [2]:
import pandas as pd
import numpy as np
from langchain.document_loaders.pdf import PyPDFDirectoryLoader

In [3]:
DATA_PATH = r'C:\QpiAi'

In [4]:
def load_documents():
    document_loader = PyPDFDirectoryLoader(DATA_PATH)
    return document_loader.load()


In [5]:
document = load_documents()

In [6]:
len(document)

103

### Making chunks

In [7]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [8]:
def split_documents(documents: list[document]):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=80,
        length_function=len,
        is_separator_regex=False,
    )
    return text_splitter.split_documents(documents)

In [9]:
doc = split_documents(document)

In [10]:
len(doc)

508

In [11]:
stored_meta = pd.read_csv('arxiv_metadata.csv')

## Creating chunk id

In [12]:
def calculate_chunk_ids(chunks):

    # This will create IDs like "data/monopoly.pdf:6:2"
    # Page Source : Page Number : Chunk Index

    last_page_id = None
    current_chunk_index = 0

    for chunk in chunks:
        source = chunk.metadata.get("source")
        page = chunk.metadata.get("page")
        current_page_id = f"{source}:{page}"

        # If the page ID is the same as the last one, increment the index.
        if current_page_id == last_page_id:
            current_chunk_index += 1
        else:
            current_chunk_index = 0

        # Calculate the chunk ID.
        chunk_id = f"{current_page_id}:{current_chunk_index}"
        last_page_id = current_page_id

        # Add it to the page meta-data.
        chunk.metadata["id"] = chunk_id

    return chunks

In [13]:
chunk_ids =calculate_chunk_ids(doc)

In [14]:
chunk_ids[:10]

[Document(metadata={'source': 'C:\\QpiAi\\paper_2409.18119.pdf', 'page': 0, 'id': 'C:\\QpiAi\\paper_2409.18119.pdf:0:0'}, page_content='Multi-View and Multi-Scale Alignment for Contrastive\nLanguage-Image Pre-training in Mammography\nYuexi Du1, John Onofrey1,2,3, Nicha C. Dvornek1,2\n1Department of Biomedical Engineering,\n2Department of Radiology & Biomedical Imaging,3Department of Urology,\nYale University, New Haven, CT, USA\nAbstract\nContrastive Language-Image Pre-training (CLIP) shows promise in medical image\nanalysis but requires substantial data and computational resources. Due to these\nrestrictions, existing CLIP applications in medical imaging focus mainly on modal-\nities like chest X-rays that have abundant image-report data available, leaving many\nother important modalities under-explored. Here, we propose the first adaptation of\nthe full CLIP model to mammography, which presents significant challenges due to'),
 Document(metadata={'source': 'C:\\QpiAi\\paper_2409.1811

### Integrating with author and publication date

In [15]:
def metadata_adding(doc,stored_meta):
    document = doc
    stored_metadata = stored_meta
    mapped_documents = []
    for doc in document:
        # Extract the paper ID from the source filename
        source_path = doc.metadata['source']
        
        matching_metadata = stored_metadata[stored_metadata['arxiv_id'].astype(str).str.contains(source_path.split('_')[-1].replace('.pdf', ''))]
        # Find the corresponding metadata entry using pandas DataFrame filtering
        
        #print(arxiv_id)
        #print(matching_metadata)


            
        if not matching_metadata.empty:
            # Since matching_metadata is a DataFrame, we can access values by .iloc[0]
            author = matching_metadata.iloc[0]['Author']
            publication_date = matching_metadata.iloc[0]['publication_date']
            
            # Create a new dictionary with merged metadata and split document info
            new_entry = {
                'Author': author,
                'publication_date': publication_date,
                
            }
            mapped_documents.append(new_entry)
    return mapped_documents

In [16]:
mapped_doc = metadata_adding(chunk_ids,stored_meta)

In [59]:
## Create Embeddings

In [17]:
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [18]:
model = SentenceTransformer('all-MiniLM-L6-v2')



In [19]:
texts = [doc.page_content for doc in doc]


In [20]:
global_embeddings = [model.encode(txt) for txt in texts]

In [21]:
len(global_embeddings),len(mapped_doc)

(508, 508)

In [22]:
import chromadb

In [23]:
CHROMA_PATH = "chroma"
DATA_PATH = "data"

In [24]:
from chromadb.utils import embedding_functions

In [25]:
emb_func = embedding_functions.SentenceTransformerEmbeddingFunction(model_name= 'all-MiniLM-L6-v2')

In [26]:
db = chromadb.PersistentClient(path= CHROMA_PATH)

In [29]:
db = chromadb.PersistentClient(path= CHROMA_PATH)
collection =db.create_collection('sentence_em', embedding_function= emb_func)

UniqueConstraintError: Collection sentence_em already exists

In [26]:
collection.get(include=[])

{'ids': [],
 'embeddings': None,
 'metadatas': None,
 'documents': None,
 'uris': None,
 'data': None,
 'included': []}

In [27]:
exsisting_items = collection.get(include=[])

In [28]:
exsisting_ids = set(exsisting_items['ids'])

In [29]:
exsisting_ids

set()

In [31]:
collection_name = 'sentence_em'

In [32]:
if collection_name in [col.name for col in db.list_collections()]:
    print(collection_name)


sentence_em


In [37]:
len(doc)

508

In [131]:
old_docs = doc[:500]

In [132]:
new_docs = doc[500:]

In [38]:
doc

[Document(metadata={'source': 'C:\\QpiAi\\paper_2409.18119.pdf', 'page': 0, 'id': 'C:\\QpiAi\\paper_2409.18119.pdf:0:0'}, page_content='Multi-View and Multi-Scale Alignment for Contrastive\nLanguage-Image Pre-training in Mammography\nYuexi Du1, John Onofrey1,2,3, Nicha C. Dvornek1,2\n1Department of Biomedical Engineering,\n2Department of Radiology & Biomedical Imaging,3Department of Urology,\nYale University, New Haven, CT, USA\nAbstract\nContrastive Language-Image Pre-training (CLIP) shows promise in medical image\nanalysis but requires substantial data and computational resources. Due to these\nrestrictions, existing CLIP applications in medical imaging focus mainly on modal-\nities like chest X-rays that have abundant image-report data available, leaving many\nother important modalities under-explored. Here, we propose the first adaptation of\nthe full CLIP model to mammography, which presents significant challenges due to'),
 Document(metadata={'source': 'C:\\QpiAi\\paper_2409.1811

In [26]:
def add_to_chroma(chunks: list[document]):
    collection_name = 'sentence_em'
    # Load the existing database.
    db = chromadb.PersistentClient(path= CHROMA_PATH)
    # check for existing collection
    if collection_name in [col.name for col in db.list_collections()]:
        collection = db.get_collection(collection_name)
    else:
        # creates a new collection
        collection =db.create_collection('sentence_em', embedding_function= emb_func)
        print('creating_new_connection')

    # Calculate Page IDs.
    chunks_with_ids = calculate_chunk_ids(chunks)

    # Add or Update the documents.
    existing_items = collection.get(include=[])
    existing_ids = set(existing_items['ids'])

    print(f"Number of existing documents in DB: {len(existing_ids)}")

    # Only add documents that don't exist in the DB.
    new_chunks = []
    for chunk in chunks_with_ids:
        if chunk.metadata["id"] not in existing_ids:
            new_chunks.append(chunk)

    if len(new_chunks):
        print(f"üëâ Adding new documents: {len(new_chunks)}")
        new_chunk_ids = [chunk.metadata["id"] for chunk in new_chunks]
        collection.add(documents= texts, ids=new_chunk_ids)
        db.persist()
    else:
        print("‚úÖ No new documents to add")

In [27]:
def delete_all_collections():
    # Load the existing database.
    db = chromadb.PersistentClient(path=CHROMA_PATH)
    
    # List all collections.
    collections = db.list_collections()
    
    # Iterate over each collection and delete it.
    for collection in collections:
        print(f"Deleting collection: {collection.name}")
        db.delete_collection(collection.name)
    
    print("‚úÖ All collections deleted.")

In [28]:
delete_all_collections()

Deleting collection: sentence_em
‚úÖ All collections deleted.


In [29]:
db.list_collections()

[]

In [31]:
def add_to_chroma(chunks: list[document], batch_size=20):
    collection_name = 'sentence_embedi'
    
    # Load the existing database.
    db = chromadb.PersistentClient(path=CHROMA_PATH)
    
    # Check for existing collection.
    if collection_name in [col.name for col in db.list_collections()]:
        collection = db.get_collection(collection_name)
    else:
        # Create a new collection.
        collection = db.create_collection('sentence_embedi', embedding_function=emb_func)
        print('creating_new_connection')
    
    # Calculate Page IDs.
    chunks_with_ids = calculate_chunk_ids(chunks)
    
    # Add or Update the documents.
    existing_items = collection.get(include=[])
    existing_ids = set(existing_items['ids'])
    
    print(f"Number of existing documents in DB: {len(existing_ids)}")
    
    # Only add documents that don't exist in the DB.
    new_chunks = []
    for chunk in chunks_with_ids:
        if chunk.metadata["id"] not in existing_ids:
            new_chunks.append(chunk)
    
    if len(new_chunks):
        print(f"üëâ Adding new documents: {len(new_chunks)}")
        new_chunk_ids = [chunk.metadata["id"] for chunk in new_chunks]
        
        # Split new_chunks into batches to avoid exceeding batch size limit.
        for i in range(0, len(new_chunks), batch_size):
            batch = new_chunks[i:i + batch_size]
            batch_ids = [chunk.metadata["id"] for chunk in batch]
            batch_texts = [chunk.page_content for chunk in batch]
            collection.add(documents=batch_texts, ids=batch_ids)
            print(f"‚úÖ Added batch {i // batch_size + 1} of {len(new_chunks) // batch_size + 1}")
            time.sleep(1)
        
        # Persist the changes to the database.
        db.persist()
    else:
        print("‚úÖ No new documents to add")

In [30]:
import time

In [39]:
def add_to_chroma(chunks: list[document]):
    collection_name = 'sentence_embedi'
    
    # Load the existing database.
    db = chromadb.PersistentClient(path=CHROMA_PATH)
    
    # Check for existing collection.
    if collection_name in [col.name for col in db.list_collections()]:
        collection = db.get_collection(collection_name)
    else:
        # Create a new collection.
        collection = db.create_collection('sentence_em', embedding_function=emb_func)
        print('creating_new_connection')
    
    # Calculate Page IDs.
    chunks_with_ids = calculate_chunk_ids(chunks)
    
    # Add or Update the documents.
    existing_items = collection.get(include=[])
    existing_ids = set(existing_items['ids'])
    
    print(f"Number of existing documents in DB: {len(existing_ids)}")
    
    # Only add documents that don't exist in the DB.
    new_chunks = []
    for chunk in chunks_with_ids:
        if chunk.metadata["id"] not in existing_ids:
            new_chunks.append(chunk)
    
    if len(new_chunks):
        print(f"üëâ Adding new documents: {len(new_chunks)}")
        new_chunk_ids = [chunk.metadata["id"] for chunk in new_chunks]
        new_chunk_texts = [chunk.page_content for chunk in new_chunks]
        
        # Error handling and logging for the full add operation.
        try:
            collection.add(documents=new_chunk_texts, ids=new_chunk_ids)
            print(f"‚úÖ Successfully added {len(new_chunks)} new documents.")
        except Exception as e:
            print(f"‚ùå Error adding documents: {e}")
        
        # Persist the changes to the database.
        try:
            db.persist()
            print("‚úÖ Database persisted successfully.")
        except Exception as e:
            print(f"‚ùå Error persisting database: {e}")
    else:
        print("‚úÖ No new documents to add")

In [32]:
add_to_chroma(doc)

creating_new_connection
Number of existing documents in DB: 0
üëâ Adding new documents: 508
‚úÖ Added batch 1 of 26
‚úÖ Added batch 2 of 26
‚úÖ Added batch 3 of 26
‚úÖ Added batch 4 of 26


: 

In [1]:
collection.id

NameError: name 'collection' is not defined

In [85]:
client = chromadb.Client()

In [105]:
collection = client.create_collection('Base_embeddings')


In [83]:
add_to_chroma(doc)

NameError: name 'Chroma' is not defined