In [None]:
# Before running this notebook, create a virtual environment and install the dependencies listed in the project's requirements.txt

In [1]:
# Imports for the code

import json
import chromadb
from chromadb.config import Settings
from chromadb import EmbeddingFunction
from chromadb.utils import embedding_functions

### Test Code for creating chroma collections

In [None]:
# Load the 2013 PubMed records from their JSON export

# Path to the JSON dataset file
json_file_path = "G:/All Flutter Applications/NLP with transformers project/chatbot_ic/lib/backend/data/data_json/2013pubmed.json"

# JSON text is UTF-8 by specification. Pass the encoding explicitly so the read does
# not depend on the platform default (a legacy code page on Windows), which can raise
# UnicodeDecodeError or silently garble non-ASCII characters.
with open(json_file_path, 'r', encoding='utf-8') as json_file:
    dataset = json.load(json_file)

# Show how many records were loaded (not the records themselves)
print(len(dataset))

In [None]:
# Directory intended for Chroma's on-disk data.
# NOTE(review): in chromadb releases that provide `PersistentClient`, `chromadb.Client(...)`
# appears to build an in-memory client unless persistence is enabled explicitly, so this
# directory may be ignored here -- confirm, or switch to `chromadb.PersistentClient(path=...)`.
persistence_database_path_windows = "chatbot_ic/lib/backend/data/chroma_data"

## Creating client instance of chroma db
client = chromadb.Client(settings=Settings(persist_directory=persistence_database_path_windows, allow_reset=True))

## Initialising a sentence transformer for chromadb
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="multi-qa-MiniLM-L6-cos-v1", normalize_embeddings=True)

# get_or_create_collection (instead of create_collection) keeps this cell re-runnable:
# create_collection raises when a collection named "2013pubmed" already exists.
collection_2013 = client.get_or_create_collection(
    "2013pubmed",
    embedding_function=sentence_transformer_ef,
    metadata={"hnsw:space": "cosine"},
)

## adding data to chroma db
# Each document is the title and the abstract joined by "<SEP>"; the author goes into the
# metadata. (An earlier variant stored only the abstract as the document and kept the
# title in the metadata next to the author.)
# upsert creates missing ids and overwrites existing ones, so re-running the cell does
# not fail or duplicate records.
collection_2013.upsert(
    ids=[str(entry['PMID']) for entry in dataset],
    documents=[entry['Title'] + "<SEP>" + entry['Abstract'] for entry in dataset],
    metadatas=[
        {'author': entry['Author']} for entry in dataset
    ],
)

### Function for creating collections of all the available data

In [2]:
# Persistent Chroma client shared by the collection-building code below.
# The database files live under this directory; allow_reset=True permits `client.reset()`.
persistence_database_path_windows = "G:/All Flutter Applications/NLP with transformers project/chatbot_ic/lib/backend/data/chroma_data"
client = chromadb.PersistentClient(
    path=persistence_database_path_windows,
    settings=Settings(allow_reset=True),
)

def create_chroma_collections(years, data_dir=None):
    """Build one Chroma collection per year from the yearly PubMed JSON exports.

    For every label in ``years`` the file ``<data_dir>/<year>pubmed.json`` is loaded and
    its records are written to a collection named ``<year>pubmed`` on the module-level
    ``client``. Each record is stored as:

    * id       -- ``str(entry['PMID'])``
    * document -- ``entry['Title'] + "<SEP>" + entry['Abstract']``
    * metadata -- ``{'author': entry['Author']}``

    Existing collections are reused and records are upserted, so the function can be
    run again against an already populated database without raising.

    Args:
        years: Iterable of year labels used in both the file name and the collection
            name (e.g. ``'2013'`` or ``'2016-17'``).
        data_dir: Directory that contains the ``<year>pubmed.json`` files. When omitted,
            the project's original ``data_json`` folder is used.

    Raises:
        FileNotFoundError: If the JSON file for a year does not exist.
        KeyError: If a record lacks one of the keys 'PMID', 'Title', 'Abstract', 'Author'.
    """
    if data_dir is None:
        data_dir = "G:/All Flutter Applications/NLP with transformers project/chatbot_ic/lib/backend/data/data_json"

    # Created once and shared by every collection built in this call.
    sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="multi-qa-MiniLM-L6-cos-v1", normalize_embeddings=True)

    for year in years:
        json_file_path = f"{data_dir}/{year}pubmed.json"

        # JSON text is UTF-8 by specification; an explicit encoding avoids decoding with
        # the platform default code page (e.g. on Windows), which can fail or garble
        # non-ASCII characters.
        with open(json_file_path, 'r', encoding='utf-8') as json_file:
            dataset = json.load(json_file)

        # create_collection raises if the collection already exists (e.g. on a second
        # run against the persistent database); get_or_create_collection reuses it.
        collection = client.get_or_create_collection(
            f"{year}pubmed",
            embedding_function=sentence_transformer_ef,
            metadata={"hnsw:space": "cosine"},
        )

        # upsert creates missing ids and overwrites existing ones, keeping re-runs idempotent.
        collection.upsert(
            ids=[str(entry['PMID']) for entry in dataset],
            documents=[entry['Title'] + "<SEP>" + entry['Abstract'] for entry in dataset],
            metadatas=[
                {'author': entry['Author']} for entry in dataset
            ],
        )

In [3]:
# Year labels of the PubMed exports to index; one collection is built per label
years = ["2013", "2014", "2015", "2016-17"]

# Build the collections for all of the listed years
create_chroma_collections(years)

In [4]:
# Show the collections currently known to `client` (sanity check after the build step)
client.list_collections()

[Collection(name=2014pubmed),
 Collection(name=2015pubmed),
 Collection(name=2016-17pubmed),
 Collection(name=2013pubmed)]

In [5]:
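# Maintenance helpers (destructive) -- uncomment only when the collections must be rebuilt:
# client.delete_collection('2013pubmed')  # drops a single collection
# client.reset()  # wipes the whole database; requires Settings(allow_reset=True)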