In [20]:
import csv

def read_csv_rows(csv_file):
    """
    Opens a CSV file and iteratively reads each row.

    Args:
    - csv_file (str): The path to the CSV file.

    Yields:
    - list: Each row of the CSV file as a list.
    """
    with open(csv_file, 'r') as file:
        csv_reader = csv.reader(file)
        # skips field column rows
        next(csv_reader)
        for row in csv_reader:
            yield row
           

# Example usage:
file_path = './../data/pmc_1mil.csv'  # Replace 'example.csv' with the path to your CSV file

count = 0
for row in read_csv_rows(file_path):
    print(row)
    count += 1
    if count == 1:
        break    




['PMC176547', "==== FrontPLoS BiolPLoS BiolpbioplosbiolPLoS Biology1544-91731545-7885Public Library of Science San Francisco, USA 10.1371/journal.pbio.0000007SynopsisEcologyEvolutionGenetics/Genomics/Gene TherapyZoologyMammalsBorneo Elephants: A High Priority for Conservation Synopsis10 2003 18 8 2003 18 8 2003 1 1 e7Copyright: © 2003 Public Library of Science.2003This is an open-access article distributed under the terms of the Creative Commons Attribution License, which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are properly credited.DNA Analysis Indicates That Asian Elephants Are Native to Borneo and Are Therefore a High Priority for Conservation==== BodyA new study settles a long-standing dispute about the genesis of an endangered species. With scant fossil evidence supporting a prehistoric presence, scientists could not say for sure where Borneo's elephants came from. Did they descend from ancient prototypes of t

In [22]:
import chromadb
client = chromadb.PersistentClient(path="./chromadb")
client.delete_collection("test_collection")

collection = client.create_collection("pmc_1mil")

ValueError: Collection test_collection does not exist.

In [2]:
token_limit = 20

from transformers import BartConfig, BartForConditionalGeneration, BartTokenizer

model_name = "alinet/bart-base-balanced-qg"

tokenizer = BartTokenizer.from_pretrained(model_name)

doc = "The sun set behind the mountains, casting a golden glow over the serene lake. Birds chirped softly as the day faded into twilight."

def trim_document_to_token_limit(document, token_limit, tokenizer):
  sentences = document.split(".")
  print(sentences)

  while len(tokenizer.tokenize(document)) > token_limit:
    sentences.pop()


len(tokenizer.tokenize("The sun set behind the mountains, casting a golden glow over the serene lake. Birds chirped softly as the day faded into twilight."))


29

In [19]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
def len_tokenize(string):
  return len(tokenizer.tokenize(string))


doc = "The sun set behind the mountains, casting a golden glow over the serene lake. Birds chirped softly as the day faded into twilight."
doc1 = "The sun set behind the mountains "


text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size=20,
    chunk_overlap=0,
    length_function=len_tokenize,
    is_separator_regex=False,
)

documents = text_splitter.create_documents([doc])

for doc in documents:
  print(doc.page_content)
  print(len(tokenizer.tokenize(doc.page_content)))






The sun set behind the mountains, casting a golden glow over the serene lake. Birds chirped softly as the day faded into twilight.
The sun set behind the mountains 
The sun set behind the mountains, casting a golden glow over the serene lake. Birds
18
chirped softly as the day faded into twilight.
11


Actual script for generate_embedding_pmc_1mil.py

In [5]:
# Imports
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import BartTokenizer
import chromadb
from angle_emb import AnglE
from datasets import load_dataset
import time

# Initialisations 
# Tokenizers - model_name needs to be passed as argument to script
model_name = "alinet/bart-base-balanced-qg"
tokenizer = BartTokenizer.from_pretrained(model_name)

# Utility functions
def len_tokenize(string):
  return len(tokenizer.tokenize(string))

def remove_until_last_fullstop(input_string):
    # Find the index of the last full stop
    last_fullstop_index = input_string.rfind('.')
    
    # If no full stop is found, return the original string
    if last_fullstop_index == -1:
        return input_string
    
    # Return the substring from the beginning of the string to the last full stop
    return input_string[:last_fullstop_index + 1]

# Text splitter
token_limit = 512
text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size=token_limit,
    chunk_overlap=0,
    length_function=len_tokenize,
    is_separator_regex=False,
)

# ChromaDB client and collection
client = chromadb.PersistentClient(path="./chromadb")
collection = client.get_or_create_collection(name="pmc_1mil")

# Embedding Model
angle = AnglE.from_pretrained('WhereIsAI/UAE-Large-V1', pooling_strategy='cls').cuda()


INFO:backoff:Backing off send_request(...) for 0.6s (requests.exceptions.ReadTimeout: HTTPSConnectionPool(host='us-api.i.posthog.com', port=443): Read timed out. (read timeout=15))


In [6]:
pmc = (
  load_dataset("pmc/open_access", split="train", streaming=True, trust_remote_code=True)
)
pmc_iter = iter(pmc)

for idx, row in enumerate(pmc_iter):
  if idx == 10:
    break

  #remove this line from we load_dataset pmc_1mil
  article = row['text'].replace('\n', '')

  start = time.time()
  documents = text_splitter.create_documents([article])

  chunk_id = ""
  for id, doc in enumerate(documents):
    filtered_page_content = remove_until_last_fullstop(doc.page_content)
    embedding = angle.encode(filtered_page_content, to_numpy=True)

    chunk_id = row['accession_id'] + "C" + str(id)
    collection.add(
      embeddings=embedding,
      documents=filtered_page_content,
      ids=chunk_id, 
    )
  end = time.time()
  print(chunk_id)
  print(end-start)
  


PMC176545C40
11.48909330368042
PMC176546C14
3.6310009956359863
PMC176547C1
0.49527502059936523
PMC176548C1
0.5955550670623779
PMC193604C20
6.057276964187622
PMC193605C23
7.762468099594116
PMC193606C1
0.6144440174102783
PMC193607C1
0.5955069065093994
PMC212319C20
5.7224040031433105
PMC212687C30
8.643115043640137
