In [43]:
# !pip install chromadb
# !pip install sentence_transformers
# !pip install ipywidgets
# !pip install pandas
# !pip install blingfire

In [44]:
import blingfire
from dotenv import load_dotenv
from pathlib import Path
import chromadb
from chromadb.utils import embedding_functions
import torch

In [45]:
# Importing the environment file
dotenv_path = Path('/home/prem/chromadb/.env')
load_dotenv(dotenv_path=dotenv_path)

True

In [46]:
# Variable names
db_path = '/home/prem/chromadb/database_folder'
st_collection_name = 'st_collection'

In [157]:
from chromadb import Documents, EmbeddingFunction, Embeddings
from transformers import AutoTokenizer, AutoModel

"""
Custom Embedding Function
Initialize this with any model that that you have created. 
Here, I have taken the example of the tokenizer for 'bert-base-cased'. 
This could be loaded from any folder that you have stored your  tokenizer in.
"""
class MyEmbeddingFunction(EmbeddingFunction):
    def __init__(self):
        self.tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
        self.model = AutoModel.from_pretrained('bert-base-cased')


    def __call__(self, input: Documents) -> Embeddings:
        # Embedding the documents
        list_emb = []

        for doc in input:
            tokens = self.tokenizer(doc,
                                    padding='max_length',
                                    return_tensors='pt')
            output = self.model(**tokens)
            embeddins = output['last_hidden_state'][0].detach().flatten().tolist()
            list_emb.append(embeddins)
        return list_emb
    
    
# Initializing my custom embedding function
st_ef = MyEmbeddingFunction()

In [161]:
# Testing the embedding function
embeds = MyEmbeddingFunction()
embedding = embeds(['This is a sample sentence.', 'This is a second sentence'])
print('Type:', type(embedding))
print('Length:', len(embedding))
print('Type:', type(embedding[0]))
print('Length:', len(embedding[0]))
print('Type:', type(embedding[0][0]))

Type: <class 'list'>
Length: 2
Type: <class 'list'>
Length: 393216
Type: <class 'float'>


In [162]:
# # Run this on the terminal to make the db available on the server
# chroma run --path /home/prem/chromadb/database_folder

In [163]:
# Create a client to connect to the DB
chroma_client = chromadb.HttpClient(host='localhost', port=8000)

# Check the client connection
print('Heartbeat:',chroma_client.heartbeat()) # returns a nanosecond heartbeat. Useful for making sure the client remains connected.

Heartbeat: 1705343766135945882


In [201]:
# Create or get the collection
try:
    st_collection = chroma_client.get_or_create_collection(name=st_collection_name,
                                                           embedding_function=st_ef,
                                                           metadata={"hnsw:space": "cosine"})
    print(f"Collection '{st_collection_name}' initialized")
except:
    print('Unable to create or get the collection.')

# Printing the total number of records in the collection
print(f'Total number of records: {st_collection.count()}')

Collection 'st_collection' initialized
Total number of records: 10


In [165]:
# Take a look at a sample of the existing data. (10)
st_collection.peek()

{'ids': ['P7880-10601781306-1-1',
  'P7880-1840061447-1-1',
  'P7880-20246074916-1-1',
  'P7880-2454695916-1-1',
  'P7880-27905619860-1-1',
  'P7880-31124221503-1-1',
  'P7880-42549319432-1-1',
  'P7880-5182718480-1-1',
  'P7880-7079500845-1-1',
  'P7880-8523204146-1-1'],
 'embeddings': [[0.42605483531951904,
   0.14331039786338806,
   0.11717253923416138,
   -0.05475122109055519,
   -0.06861328333616257,
   -0.024040888994932175,
   0.2895282506942749,
   -0.010999645106494427,
   -0.20582014322280884,
   -1.0716136693954468,
   -0.2774657905101776,
   -0.20955708622932434,
   -0.4061518609523773,
   -0.198666051030159,
   -0.25736284255981445,
   -0.16296939551830292,
   0.1941295862197876,
   -0.16959133744239807,
   -0.10016337037086487,
   -0.15254901349544525,
   0.04407411068677902,
   0.11409974843263626,
   0.5391536355018616,
   -0.21139928698539734,
   0.1791088581085205,
   -0.011043732985854149,
   0.30440664291381836,
   0.2830239236354828,
   -0.01332050934433937,
   0.0

In [202]:
# Deleting the collection
chroma_client.delete_collection('st_collection')

# Then recreate it using the same code
try:
    st_collection = chroma_client.get_or_create_collection(name=st_collection_name,
                                                           embedding_function=st_ef,
                                                           metadata={"hnsw:space": "cosine"})
    print(f"Collection '{st_collection_name}' initialized")
except:
    print('Unable to create or get the collection.')

# Printing the total number of records in the collection
print(f'Total number of records: {st_collection.count()}')

Collection 'st_collection' initialized
Total number of records: 0


In [204]:
# Another way to empty the collection but without deleting it.
if st_collection.count() > 0:
    # Getting the list of IDs so that we can clear the collection without deleting it
    result = st_collection.get()
    print(result['ids'])

    # Remove these records from the collection.
    st_collection.delete(ids=result['ids'])

    # Confirm deletion by printing the number of records
    print(f'Total number of records: {st_collection.count()}')

else:
    print('Empty collection')

Empty collection


In [205]:
# Reading the data that needs to be read into the DB
import pandas as pd
df = pd.read_csv('./data/Sample Reviews.csv')
df.head()

Unnamed: 0,review_id,author_id,rating,is_recommended,helpfulness,total_feedback_count,total_neg_feedback_count,total_pos_feedback_count,submission_time,review_text,review_title,skin_tone,eye_color,skin_type,hair_color,product_id,product_name,brand_name,price_usd
0,P7880-42549319432-1,42549319432,2,0.0,,0,0,0,2023-03-21,"I really wanted to love this, and I would’ve i...",Has fragrance,,,,,P7880,Soy Hydrating Gentle Face Cleanser,fresh,39.0
1,P7880-31124221503-1,31124221503,5,1.0,1.0,1,0,1,2023-03-19,"Makeup remover, gentle cleanser, and all aroun...",Best. Cleanser. Ever.,fair,brown,combination,blonde,P7880,Soy Hydrating Gentle Face Cleanser,fresh,39.0
2,P7880-20246074916-1,20246074916,5,1.0,1.0,2,0,2,2023-03-15,I have been using this for almost 10 years. Lo...,The cleanser I have used for 10 years,,hazel,combination,,P7880,Soy Hydrating Gentle Face Cleanser,fresh,39.0
3,P7880-5182718480-1,5182718480,1,0.0,0.0,2,2,0,2023-03-10,I wanted to love this so bad because it felt s...,,mediumTan,brown,combination,brown,P7880,Soy Hydrating Gentle Face Cleanser,fresh,39.0
4,P7880-1840061447-1,1840061447,1,0.0,0.0,1,1,0,2023-03-09,I bought this bc i wanted to have a gentle cle...,Burns and breakouts,,,,,P7880,Soy Hydrating Gentle Face Cleanser,fresh,39.0


In [206]:
"""
Do the following for each of the reviews:
    1. Split the review into sentences
    2. For each of the sentences, add the sentence along with some meta data to the collection
"""
for _, row in df.iterrows():
    # Using BlingFire's standard method of splitting text into sentences
    sentences = blingfire.text_to_sentences(row['review_text']).split('\n')

    for counter, sentence in enumerate(sentences):
        st_collection.add(
                ids=[str(row['review_id'])+'-'+str(counter+1)],
                metadatas=[{'review_id':row['review_id'], 'rating':row['rating'], 
                            'submission_time':row['submission_time']}],
                documents=[sentence]
            )
        break
        
print(f'Total number of records added to the collection: {st_collection.count()}')

Total number of records added to the collection: 10


In [207]:
st_collection.peek(1)

{'ids': ['P7880-10601781306-1-1'],
 'embeddings': [[0.42605483531951904,
   0.14331039786338806,
   0.11717253923416138,
   -0.05475122109055519,
   -0.06861328333616257,
   -0.024040888994932175,
   0.2895282506942749,
   -0.010999645106494427,
   -0.20582014322280884,
   -1.0716136693954468,
   -0.2774657905101776,
   -0.20955708622932434,
   -0.4061518609523773,
   -0.198666051030159,
   -0.25736284255981445,
   -0.16296939551830292,
   0.1941295862197876,
   -0.16959133744239807,
   -0.10016337037086487,
   -0.15254901349544525,
   0.04407411068677902,
   0.11409974843263626,
   0.5391536355018616,
   -0.21139928698539734,
   0.1791088581085205,
   -0.011043732985854149,
   0.30440664291381836,
   0.2830239236354828,
   -0.01332050934433937,
   0.07765422016382217,
   0.07963591068983078,
   0.4298568069934845,
   -0.1498190015554428,
   -0.01737874373793602,
   -0.20657546818256378,
   0.18262672424316406,
   -0.11806201934814453,
   -0.44327816367149353,
   0.10061953216791153,
 

In [208]:
## This code is to check the method of using a model for the splitting of the text into sentences

# import blingfire
# import os
# model = blingfire.load_model(os.path.join(os.path.dirname(blingfire.__file__), "uri(100k|250k|500k).bin"))
# s = "This is a temporary string. It contains two sentences. But secretly three."
# text = blingfire.text_to_sentences_with_model(model, s)
# print(text)
# blingfire.free_model(model)

In [220]:
# Querying the data
list_input_text = ['smells like roses']
results = st_collection.query(
    query_texts=list_input_text,
    n_results=1
)

print(results)

{'ids': [['P7880-10601781306-1-1']], 'distances': [[0.2381109603230379]], 'embeddings': None, 'metadatas': [[{'rating': 1, 'review_id': 'P7880-10601781306-1', 'submission_time': '2023-03-06'}]], 'documents': [['The smell is like roses and I don’t feel I get a deep down clean feel']], 'uris': None, 'data': None}
