In [None]:
# !pip install chromadb
# !pip install sentence_transformers
# !pip install ipywidgets
# !pip install pandas
# !pip install blingfire

In [2]:
import blingfire
from dotenv import load_dotenv
from pathlib import Path
import chromadb
from chromadb.utils import embedding_functions

In [3]:
# Importing the environment file
dotenv_path = Path('/home/prem/chromadb/.env')
load_dotenv(dotenv_path=dotenv_path)

True

In [23]:
db_path = '/home/prem/chromadb/database_folder'
collection_name = "retail_collection"
embedding_model_name = "all-MiniLM-L6-v2"
sentence_collection_name = "review_sentence_collection"
review_collection_name = "review_collection"
st_collection_name = 'st_collection'

In [5]:
# Run this code only once
try:
    # Creating the Databasse and adding it to the folder
    client = chromadb.PersistentClient(path=db_path)
    print('Persistant Client created')
except:
    print('Persistant Client already exists')

# Create the embedding function to be used
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=embedding_model_name)

try:
    sentence_collection = client.create_collection(name=sentence_collection_name,
                                                   embedding_function=sentence_transformer_ef)
    print(f'Collection {sentence_collection_name} created')

except:
    sentence_collection = client.get_collection(name=sentence_collection_name,
                                                embedding_function=sentence_transformer_ef)
    print(f'Collection {sentence_collection_name} initialized')


Persistant Client created
Collection review_sentence_collection initialized


In [None]:
# # Run this on the terminal to make the db available on the server
# chroma run --path /home/prem/chromadb/database_folder

In [7]:
chroma_client = chromadb.HttpClient(host='localhost', port=8000)

In [8]:
chroma_client.heartbeat() # returns a nanosecond heartbeat. Useful for making sure the client remains connected.
# chroma_client.reset()     # Empties and completely resets the database. ⚠️ This is destructive and not reversible.

1705063208027727142

In [9]:
import pandas as pd
df = pd.read_csv('./data/Sample Reviews.csv')
df

Unnamed: 0,review_id,author_id,rating,is_recommended,helpfulness,total_feedback_count,total_neg_feedback_count,total_pos_feedback_count,submission_time,review_text,review_title,skin_tone,eye_color,skin_type,hair_color,product_id,product_name,brand_name,price_usd
0,P7880-42549319432-1,42549319432,2,0.0,,0,0,0,2023-03-21,"I really wanted to love this, and I would’ve i...",Has fragrance,,,,,P7880,Soy Hydrating Gentle Face Cleanser,fresh,39.0
1,P7880-31124221503-1,31124221503,5,1.0,1.0,1,0,1,2023-03-19,"Makeup remover, gentle cleanser, and all aroun...",Best. Cleanser. Ever.,fair,brown,combination,blonde,P7880,Soy Hydrating Gentle Face Cleanser,fresh,39.0
2,P7880-20246074916-1,20246074916,5,1.0,1.0,2,0,2,2023-03-15,I have been using this for almost 10 years. Lo...,The cleanser I have used for 10 years,,hazel,combination,,P7880,Soy Hydrating Gentle Face Cleanser,fresh,39.0
3,P7880-5182718480-1,5182718480,1,0.0,0.0,2,2,0,2023-03-10,I wanted to love this so bad because it felt s...,,mediumTan,brown,combination,brown,P7880,Soy Hydrating Gentle Face Cleanser,fresh,39.0
4,P7880-1840061447-1,1840061447,1,0.0,0.0,1,1,0,2023-03-09,I bought this bc i wanted to have a gentle cle...,Burns and breakouts,,,,,P7880,Soy Hydrating Gentle Face Cleanser,fresh,39.0
5,P7880-27905619860-1,27905619860,5,1.0,1.0,2,0,2,2023-03-09,Best facial cleanser I have used! I heard grea...,Hands down best cleanser,light,hazel,combination,brown,P7880,Soy Hydrating Gentle Face Cleanser,fresh,39.0
6,P7880-8523204146-1,8523204146,3,0.0,0.0,1,1,0,2023-03-06,I found this to be just ok - definitely gentle...,,light,hazel,combination,,P7880,Soy Hydrating Gentle Face Cleanser,fresh,39.0
7,P7880-10601781306-1,10601781306,1,0.0,0.0,1,1,0,2023-03-06,The smell is like roses and I don’t feel I get...,Sorry not a love of mine,light,green,dry,gray,P7880,Soy Hydrating Gentle Face Cleanser,fresh,39.0
8,P7880-7079500845-1,7079500845,1,0.0,0.833333,6,1,5,2023-02-28,"IM SO SAD! After 3 days of using this, my face...",,fair,green,combination,brown,P7880,Soy Hydrating Gentle Face Cleanser,fresh,39.0
9,P7880-2454695916-1,2454695916,5,1.0,1.0,2,0,2,2023-02-26,This is the only nice skincare my boyfriend us...,,fair,hazel,oily,brown,P7880,Soy Hydrating Gentle Face Cleanser,fresh,39.0


In [None]:
# import blingfire
# import os
# model = blingfire.load_model(os.path.join(os.path.dirname(blingfire.__file__), "uri(100k|250k|500k).bin"))
# s = "This is a temporary string. It contains two sentences. But secretly three."
# text = blingfire.text_to_sentences_with_model(model, s)
# print(text)
# blingfire.free_model(model)

: 

In [10]:
sentence_collection.peek()

{'ids': [],
 'embeddings': [],
 'metadatas': [],
 'documents': [],
 'uris': None,
 'data': None}

In [12]:
list_sentences = []
for _, row in df.iterrows():
    sentences = blingfire.text_to_sentences(row['review_text']).split('\n')
    counter = 0
    for sentence in sentences:
        counter += 1
        sentence_collection.add(
                ids=[str(row['review_id'])+'-'+str(counter)],
                metadatas=[{'review_id':row['review_id'], 'rating':row['rating'], 
                            'submission_time':row['submission_time']}],
                documents=[sentence]
            )

In [13]:
sentence_collection.peek()

{'ids': ['P7880-10601781306-1-1',
  'P7880-1840061447-1-1',
  'P7880-1840061447-1-2',
  'P7880-1840061447-1-3',
  'P7880-1840061447-1-4',
  'P7880-20246074916-1-1',
  'P7880-20246074916-1-2',
  'P7880-20246074916-1-3',
  'P7880-20246074916-1-4',
  'P7880-20246074916-1-5'],
 'embeddings': [[-0.020814165472984314,
   -0.09669411182403564,
   0.11827018857002258,
   -0.014699163846671581,
   0.03380495309829712,
   -0.08578701317310333,
   0.0026344878133386374,
   -0.039184123277664185,
   0.12779085338115692,
   -0.04740782827138901,
   -0.055219922214746475,
   -0.03672954440116882,
   -0.05927513167262077,
   0.03273807466030121,
   0.047821201384067535,
   0.06132097542285919,
   0.007855415344238281,
   -0.012193436734378338,
   -0.05096675455570221,
   0.04192560538649559,
   0.03363519161939621,
   0.04696280509233475,
   -0.0068956585600972176,
   -0.05878660827875137,
   -0.07101194560527802,
   0.03663628548383713,
   0.04201676696538925,
   -0.04747510328888893,
   -0.06545069

In [15]:
results = sentence_collection.query(
    query_texts=["happy"],
    n_results=2
)

In [16]:
print(results)

{'ids': [['P7880-7079500845-1-1', 'P7880-1840061447-1-4']], 'distances': [[1.4634152764183157, 1.518877915319404]], 'metadatas': [[{'rating': 1, 'review_id': 'P7880-7079500845-1', 'submission_time': '2023-02-28'}, {'rating': 1, 'review_id': 'P7880-1840061447-1', 'submission_time': '2023-03-09'}]], 'embeddings': None, 'documents': [['IM SO SAD!', 'Glad i didn’t pay full price for this!']], 'uris': None, 'data': None}


In [28]:
results = review_collection.get(
    ids=['P7880-10601781306-1', 'P7880-5182718480-1'],
    include=['embeddings']
)

In [29]:
results

{'ids': ['P7880-10601781306-1', 'P7880-5182718480-1'],
 'embeddings': [[-0.020814165472984314,
   -0.09669411182403564,
   0.11827018857002258,
   -0.014699163846671581,
   0.03380495309829712,
   -0.08578701317310333,
   0.0026344878133386374,
   -0.039184123277664185,
   0.12779085338115692,
   -0.04740782827138901,
   -0.055219922214746475,
   -0.03672954440116882,
   -0.05927513167262077,
   0.03273807466030121,
   0.047821201384067535,
   0.06132097542285919,
   0.007855415344238281,
   -0.012193436734378338,
   -0.05096675455570221,
   0.04192560538649559,
   0.03363519161939621,
   0.04696280509233475,
   -0.0068956585600972176,
   -0.05878660827875137,
   -0.07101194560527802,
   0.03663628548383713,
   0.04201676696538925,
   -0.04747510328888893,
   -0.06545069813728333,
   -0.07387713342905045,
   0.10716726630926132,
   0.06909947097301483,
   -0.012769496068358421,
   -0.04970699921250343,
   -0.03548924997448921,
   0.0960300862789154,
   -0.05150236561894417,
   -0.03394

In [45]:
from chromadb import Documents, EmbeddingFunction, Embeddings
from transformers import AutoTokenizer


class MyEmbeddingFunction(EmbeddingFunction):
    def __init__(self):
        self.tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')


    def __call__(self, input: Documents) -> Embeddings:
        # embed the documents somehow
        embeddings = self.tokenizer(input)
        return embeddings['input_ids']

In [46]:
# Create the embedding function to be used
st_ef = MyEmbeddingFunction()

try:
    st_collection = client.create_collection(name=st_collection_name,
                                             embedding_function=st_ef)
    print(f'Collection {st_collection_name} created')

except:
    st_collection = client.get_collection(name=st_collection_name,
                                          embedding_function=st_ef)
    print(f'Collection {st_collection_name} initialized')



Collection st_collection initialized


In [50]:
sent = 'This is a test sentence.'
sent2 = 'This is the second test sentence'
sent3 = 'This is the third test sentence'

st_collection.add(
    ids=['1-2','1-3'],
    metadatas=[{'review_id':'1', 'rating':5, 'submission_time':'2023-10-23'}, {'review_id':'1', 'rating':5, 'submission_time':'2023-10-23'}],
    documents=[sent2, sent3]
)

In [51]:
st_collection.peek()

{'ids': ['1-1', '1-2', '1-3'],
 'embeddings': [[101.0, 1188.0, 1110.0, 170.0, 2774.0, 5650.0, 119.0, 102.0],
  [101.0, 1188.0, 1110.0, 1103.0, 1248.0, 2774.0, 5650.0, 102.0],
  [101.0, 1188.0, 1110.0, 1103.0, 1503.0, 2774.0, 5650.0, 102.0]],
 'metadatas': [{'rating': 5,
   'review_id': '1',
   'submission_time': '2023-10-23'},
  {'rating': 5, 'review_id': '1', 'submission_time': '2023-10-23'},
  {'rating': 5, 'review_id': '1', 'submission_time': '2023-10-23'}],
 'documents': ['This is a test sentence.',
  'This is the second test sentence',
  'This is the third test sentence'],
 'uris': None,
 'data': None}

In [43]:
Docs = ['This is a sample sentence.','This is a second sentence.']
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
embeddingss = tokenizer(Docs)
print(embeddingss['input_ids'])

[[101, 1188, 1110, 170, 6876, 5650, 119, 102], [101, 1188, 1110, 170, 1248, 5650, 119, 102]]


In [44]:
my_tokenizer = MyEmbeddingFunction()
my_tokenizer('This is a sample ssentence')

ValueError: text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).