In [1]:
# Prerequisite (run once, in the kernel's environment): %pip install chromadb

In [2]:
import pandas as pd

In [3]:
# Load the IMDb dataset (the previews below show `review` and `sentiment` columns).
# The CSV location can be overridden through the IMDB_DATASET_PATH environment
# variable so the notebook also runs on machines without the original drive
# layout; when the variable is unset, the original hard-coded path is used.
import os

file_path = os.environ.get('IMDB_DATASET_PATH', r'D:\AI-DATASETS\02-MISC-large\IMDB Dataset.csv')
df = pd.read_csv(file_path)

In [4]:
# Dimensions of the loaded frame as a (rows, columns) tuple
df.shape

(50000, 2)

In [5]:
# Preview 10 randomly drawn rows; no random_state is passed, so the rows differ on every run
df.sample(10)

Unnamed: 0,review,sentiment
29877,Police Story is a stunning series of set piece...,positive
20424,Over the years I've seen a bunch of these stra...,negative
1425,I can name only a few movies that I have seen ...,negative
21346,Ed Wood is eclipsed and becomes Orson Welles. ...,negative
39319,"""on our own"" is a touching story of four kids ...",positive
28636,"The first time I've seen this DVD, I was not o...",positive
35232,29 Sept 1990 marked a small but important mile...,positive
41244,"To say that Thunderbirds is a horrid, forced, ...",negative
42302,<br /><br />I recently viewed this atrocity in...,negative
19425,The movie has one nude scene: A man sitting on...,negative


In [6]:
# Draw a random subset of (up to) 1000 reviews -- a random sample, not the first 1000 rows.
# A fixed random_state makes the draw repeatable, so a rerun rebuilds the same corpus
# and the positional IDs used later keep pointing at the same review texts.
# min(...) keeps the cell working on a frame that has fewer than 1000 rows.
reviews = df['review'].sample(min(1000, len(df)), random_state=42).tolist()

#### Generate TF-IDF Vectors
- Use scikit-learn's TfidfVectorizer to generate TF-IDF vectors for the movie reviews.

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
# Build the TF-IDF vectorizer; the feature count is capped for efficiency
tfidf_feature_limit = 1000
vectorizer = TfidfVectorizer(max_features=tfidf_feature_limit)

In [9]:
# Fit the vectorizer on the sampled reviews and transform them in the same call;
# the resulting matrix has one row per review (its shape is checked in the next cell)
tfidf_matrix = vectorizer.fit_transform(reviews)

In [10]:
# Expect (number of reviews, number of TF-IDF features)
tfidf_matrix.shape

(1000, 1000)

#### Use ChromaDB (vector database)
- Store the TF-IDF vectors as embeddings

In [12]:
import chromadb

In [13]:
# Initialize the ChromaDB client with default settings
# NOTE(review): presumably an in-memory (non-persistent) client, so stored data
# would not survive a kernel restart -- confirm against the ChromaDB docs
client = chromadb.Client()

In [16]:
# List the collections known to this client.
# Depending on the installed ChromaDB release, list_collections() yields either
# Collection objects or plain name strings, so fall back to the item itself when
# it has no `.name` attribute.
# NOTE: the recorded output came from an out-of-order run (In [16] executed after
# the collection was created in In [15]); run top to bottom, expect the list to
# lack "imdb_reviews" unless it already exists.
collections = client.list_collections()
print([getattr(collection, "name", collection) for collection in collections])

['imdb_reviews']


In [15]:
# Create (or fetch, if it already exists) the collection that stores the TF-IDF vectors.
# NOTE: despite its name, `collection_name` is bound to the collection object
# returned by the client (it is later used for .count(), .add() and .query()), not to a string.
collection_name = client.get_or_create_collection("imdb_reviews")

In [17]:
# Number of items currently in the collection (0 before anything has been added)
collection_name.count()

0

In [18]:
# Convert the TF-IDF matrix to a dense array; the insertion into ChromaDB
# happens in a later cell, not here
tfidf_dense = tfidf_matrix.toarray()

In [19]:
# Shape should match tfidf_matrix.shape: the dense conversion does not change dimensions
tfidf_dense.shape

(1000, 1000)

In [21]:
# Store every review vector in ChromaDB.
# - upsert (instead of add) keeps this cell idempotent: on a rerun the entries for
#   the same IDs are overwritten, whereas add would hit already-present IDs
#   (rejected or ignored, depending on the ChromaDB version) and could leave
#   vectors from an earlier run in the index.
# - Records are sent in batches rather than one call per review, which removes
#   most of the per-call overhead.
BATCH_SIZE = 500  # NOTE(review): chosen to stay well below ChromaDB's per-call batch limit -- confirm for your version

for start in range(0, len(tfidf_dense), BATCH_SIZE):
    stop = min(start + BATCH_SIZE, len(tfidf_dense))
    collection_name.upsert(
        ids       =[str(idx) for idx in range(start, stop)],                  # Unique ID for each review
        embeddings=tfidf_dense[start:stop].tolist(),                           # The TF-IDF vectors
        metadatas =[{"review": reviews[idx]} for idx in range(start, stop)],  # Store the actual review
    )

- ids: Unique identifier for each review.
- embeddings: The TF-IDF vectors.
- metadatas: Metadata like the actual review text, which will be retrieved.

#### Querying ChromaDB with TF-IDF
- Query ChromaDB to retrieve similar reviews using a TF-IDF-based retriever.

In [23]:
def query_chromadb(query_text, top_k=5):
    """Retrieve the stored reviews most similar to ``query_text``.

    The text is vectorized with the module-level, already-fitted ``vectorizer``
    and the resulting TF-IDF vector is used as the query embedding against the
    module-level ``collection_name`` collection.

    Args:
        query_text: Free-text query string.
        top_k: Number of results to request from the collection (default 5).

    Returns:
        Whatever ``collection_name.query`` returns, passed through unchanged.

    Warns:
        UserWarning: If the TF-IDF vector of ``query_text`` is all zeros (none of
            its terms received a non-zero weight), in which case the returned
            neighbours do not reflect the query text.
    """
    import warnings

    # Convert the query to a TF-IDF vector (single row -> 1-D array)
    query_vector = vectorizer.transform([query_text]).toarray()[0]

    # An all-zero vector means the query carries no signal for the fitted
    # vocabulary; still run the search, but tell the caller.
    if not query_vector.any():
        warnings.warn(
            "Query shares no terms with the fitted TF-IDF vocabulary; "
            "returned results are not meaningful.",
            UserWarning,
            stacklevel=2,
        )

    # Perform similarity search in ChromaDB
    results = collection_name.query(
        query_embeddings=[query_vector],  # The query vector
        n_results       =top_k  # Number of results to return
    )

    return results

In [24]:
# Example query: run the retriever with its default top_k of 5
query_text = "I love movies about space adventures"
result = query_chromadb(query_text)

In [25]:
# Inspect the container type of the query result
type(result)

dict

In [26]:
# Top-level fields available in the query result
result.keys()

dict_keys(['ids', 'distances', 'metadatas', 'embeddings', 'documents', 'uris', 'data', 'included'])

In [27]:
# Distances of the returned reviews from the query, one inner list per query vector
# NOTE(review): presumably smaller means more similar -- confirm the collection's distance metric
result['distances']

[[1.619748592376709,
  1.6538150310516357,
  1.743884563446045,
  1.7535734176635742,
  1.7700294256210327]]

In [28]:
# Print the text of each returned review, separated by '---'.
# result['metadatas'][0] holds the metadata dicts for the single query that was
# issued; the enumerate index was never used, so iterate over the dicts directly.
for metadata in result['metadatas'][0]:
    print(metadata['review'])
    print('---')

I saw this film at the Chicago Reeling film festival. To pick up on the previous reviewer's remarks, the claustrophobic feel and off colors of the film is I sense quite intentional and conveys the sense of limited space, drab architecture, overall drabness that constitutes the urban environment of most people in Eastern and Central Europe. A bit shabby housing project style is how I'd describe it, and this is how many people live on the outskirts of larger cities. I can't say that I'm familiar with Bucharest, Romania where the action unfolded, but I have visited and lived in Eastern Europe for six months. <br /><br />When I visited Russia as a student for a semester, my entire group had to drag their luggage seven stories up the staircase of a shabby student dorm building, just as the heroine does when moving in with a woman, because the elevators weren't working. But, I do concur with the reviewer, that the claustrophobia and muted colors, it's overdone, for there are, to be sure, bea