In [None]:
# Create an environment and install the dependencies listed in the relevant requirements.txt before running this notebook

In [1]:
# Imports for the code

import json
import chromadb
from chromadb.config import Settings
from chromadb import EmbeddingFunction
from chromadb.utils import embedding_functions

#### Create a persistent local vector DB

In [2]:
# Directory in which Chroma keeps its on-disk data.
persistence_database_path_mac = "/Users/vasu/Desktop/NLP /project/chatbot_ic_NLP/lib/backend/data/chroma_data"

# Use PersistentClient so the database is actually written to disk under `path`.
# The generic chromadb.Client(...) does not persist anything unless persistence
# is explicitly enabled in its Settings, so collections would vanish with the kernel.
client = chromadb.PersistentClient(
    path=persistence_database_path_mac,
    settings=Settings(allow_reset=True),
)


### Test Code for creating chroma collections

In [3]:
# Load the dataset from its JSON file.

# Path to the JSON file to read
json_file_path = "/Users/vasu/Desktop/NLP /project/chatbot_ic_NLP/lib/backend/data/data_json/2013pubmed.json"

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default text encoding (which differs between macOS and Windows).
with open(json_file_path, 'r', encoding="utf-8") as json_file:
    dataset = json.load(json_file)

# Report how many entries were loaded
print(len(dataset))

1620


#### Single collection creation

In [4]:
persistence_database_path_windows = "chatbot_ic/lib/backend/data/chroma_data"  # not used in this cell


## Initialising a sentence transformer for chromadb
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-MiniLM-L12-v2",
    normalize_embeddings=True,
)

# get_or_create_collection (rather than create_collection) keeps this cell
# re-runnable: create_collection raises once "2013pubmed" already exists.
collection_2013 = client.get_or_create_collection(
    "2013pubmed",
    embedding_function=sentence_transformer_ef,
    metadata={"hnsw:space": "cosine"},
)

## adding data to chroma db
# upsert inserts new ids and overwrites existing ones, so running the cell a
# second time does not clash with the entries stored by the first run.
# Each document is the entry's title and abstract joined by a "<SEP>" marker.
collection_2013.upsert(
    ids=[str(entry['PMID']) for entry in dataset],
    documents=[entry['Title'] + "<SEP>" + entry['Abstract'] for entry in dataset],
    metadatas=[
        {'author': entry['Author']} for entry in dataset
    ],
)

  from .autonotebook import tqdm as notebook_tqdm
.gitattributes: 100%|██████████| 1.18k/1.18k [00:00<00:00, 493kB/s]
1_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 68.5kB/s]
README.md: 100%|██████████| 10.7k/10.7k [00:00<00:00, 16.5MB/s]
config.json: 100%|██████████| 573/573 [00:00<00:00, 329kB/s]
config_sentence_transformers.json: 100%|██████████| 116/116 [00:00<00:00, 99.4kB/s]
data_config.json: 100%|██████████| 39.3k/39.3k [00:00<00:00, 6.85MB/s]
pytorch_model.bin: 100%|██████████| 134M/134M [00:10<00:00, 12.4MB/s] 
sentence_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 123kB/s]
special_tokens_map.json: 100%|██████████| 112/112 [00:00<00:00, 111kB/s]
tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 1.60MB/s]
tokenizer_config.json: 100%|██████████| 352/352 [00:00<00:00, 573kB/s]
train_script.py: 100%|██████████| 13.2k/13.2k [00:00<00:00, 14.0MB/s]
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 82.4MB/s]
modules.json: 100%|██████████| 349/349 

### Function for creating collections of all the available data

In [None]:
# Storage locations for the persistent Chroma database (one per development machine)
persistence_database_path_mac = "/Users/vasu/Desktop/NLP /project/chatbot_ic_NLP/lib/backend/data/chroma_data"
persistence_database_path_windows = "G:/All Flutter Applications/NLP with transformers project/chatbot_ic/lib/backend/data/chroma_data"

# Client used by the collection-creation function below; it points at the Windows path
client = chromadb.PersistentClient(
    path=persistence_database_path_windows,
    settings=Settings(allow_reset=True),
)

def create_chroma_collections(
    years,
    data_dir="G:/All Flutter Applications/NLP with transformers project/chatbot_ic/lib/backend/data/data_json",
):
    """Build one Chroma collection per entry in ``years`` from ``{year}pubmed.json`` files.

    For every year label the file ``{data_dir}/{year}pubmed.json`` is loaded and its
    entries are stored in a collection named ``{year}pubmed`` on the module-level
    ``client``. Each document is the entry's title and abstract joined by ``"<SEP>"``,
    the id is the entry's ``PMID`` as a string, and the ``Author`` field is kept as
    metadata under the key ``author``.

    Collections are fetched-or-created and filled with ``upsert``, so the function
    can be re-run (for example after a failure part-way through the list) without
    raising because a collection or id already exists.

    Args:
        years: Iterable of year labels used to build both the file name and the
            collection name (e.g. ``'2013'``, ``'2016-2017'``).
        data_dir: Directory containing the ``{year}pubmed.json`` files. Defaults to
            the location that was previously hard-coded.

    Raises:
        FileNotFoundError: If a ``{year}pubmed.json`` file is missing.
        KeyError: If an entry lacks ``PMID``, ``Title``, ``Abstract`` or ``Author``.
    """
    # One embedding function shared by all collections
    sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name="multi-qa-MiniLM-L6-cos-v1",
        normalize_embeddings=True,
    )

    for year in years:
        json_file_path = f"{data_dir}/{year}pubmed.json"

        # Explicit UTF-8: the platform default (e.g. cp1252 on Windows) can fail
        # or mis-decode non-ASCII characters in the text.
        with open(json_file_path, 'r', encoding="utf-8") as json_file:
            dataset = json.load(json_file)

        # create_collection would raise if the collection already exists and
        # abort the remaining years; get_or_create_collection makes this resumable.
        collection = client.get_or_create_collection(
            f"{year}pubmed",
            embedding_function=sentence_transformer_ef,
            metadata={"hnsw:space": "cosine"},
        )

        # upsert instead of add so ids stored by an earlier run are overwritten
        collection.upsert(
            ids=[str(entry['PMID']) for entry in dataset],
            documents=[entry['Title'] + "<SEP>" + entry['Abstract'] for entry in dataset],
            metadatas=[
                {'author': entry['Author']} for entry in dataset
            ],
        )

In [None]:
# Build the collections on the persistent database, one per dataset label
years = [
    '2013',
    '2014',
    '2015',
    '2016-2017',
    '2018',
    '2019',
    '2020-1',
    '2020-2',
    '2021',
    '2022',
    '2023',
]

create_chroma_collections(years=years)

In [5]:
# Display the collections currently registered on the client
client.list_collections()

[Collection(name=2013pubmed)]