In [1]:
#pip install chromadb

In [1]:
import pandas as pd

In [2]:
# Load the IMDb dataset
file_path = r'D:\AI-DATASETS\02-MISC-large\IMDB Dataset.csv'
df = pd.read_csv(file_path)

In [3]:
df.shape

(50000, 2)

In [4]:
df.sample(10)

Unnamed: 0,review,sentiment
1991,I have seen so many bad reviews on Supervivien...,positive
15070,A group of people are invited to there high sc...,negative
32002,For anyone who has seen and fallen in love wit...,negative
29739,I wasn't born until 4 years after this wonderf...,positive
19638,"Well, not yet, at least.<br /><br />It's not l...",negative
27769,It is so nice to see Bruce Willis come down of...,positive
8374,The comments for Commune make it sound like a ...,negative
32879,It is true that some fans of Peter Sellers wor...,negative
11482,Christophe Lambert once said he was still maki...,negative
8009,The thing that's truly terrifying about this i...,negative


In [5]:
# Take only the first 1000 reviews
reviews = df['review'].sample(1000).tolist()

#### Generate TF-IDF Vectors
- Use scikit-learn's TfidfVectorizer to generate TF-IDF vectors for the movie reviews.

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
# Initialize the TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=1000)  # Limiting to 1000 features for efficiency

In [9]:
# Generate TF-IDF vectors for the reviews
tfidf_matrix = vectorizer.fit_transform(reviews)

In [10]:
tfidf_matrix.shape

(1000, 1000)

#### use chromaDB (vector database)
- store the embeddings

In [12]:
import chromadb

In [13]:
# Initialize ChromaDB client
client = chromadb.Client()

In [14]:
# List all collections
collections = client.list_collections()
print([collection.name for collection in collections])

[]


In [15]:
# Create a collection to store TF-IDF vectors
collection_name = client.get_or_create_collection("imdb_reviews")

In [16]:
collection_name.count()

0

In [17]:
# Convert TF-IDF matrix to dense array and insert into ChromaDB
tfidf_dense = tfidf_matrix.toarray()

In [18]:
tfidf_dense.shape

(1000, 1000)

In [19]:
# Add each review vector into ChromaDB
for idx, vector in enumerate(tfidf_dense):
    collection_name.add(
        ids       =[str(idx)],                  # Unique ID for each review
        embeddings=[vector],                    # The TF-IDF vector
        metadatas =[{"review": reviews[idx]}],  # Store the actual review
    )

- ids: Unique identifier for each review.
- embeddings: The TF-IDF vectors.
- metadatas: Metadata like the actual review text, which will be retrieved.

#### Querying ChromaDB with TF-IDF
- query ChromaDB to retrieve similar reviews using a TF-IDF-based retriever.

In [20]:
def query_chromadb(query_text, top_k=5):
    # Convert the query to a TF-IDF vector
    query_vector = vectorizer.transform([query_text]).toarray()[0]
    
    # Perform similarity search in ChromaDB
    results = collection_name.query(
        query_embeddings=[query_vector],  # The query vector
        n_results       =top_k  # Number of results to return
    )
    
    return results

In [21]:
# Example query
query_text = "I love movies about space adventures"
result = query_chromadb(query_text)

In [22]:
type(result)

dict

In [23]:
result.keys()

dict_keys(['ids', 'distances', 'metadatas', 'embeddings', 'documents', 'uris', 'data', 'included'])

In [24]:
result['distances']             # L2 based norm

[[1.7007890939712524,
  1.7408870458602905,
  1.7800662517547607,
  1.784860372543335,
  1.792397141456604]]

In [25]:
for idx, review in enumerate(result['metadatas'][0]):
    print(review['review'])
    print('---')

First of all, even IMDb is slacking with this movie, as the list of cast is VERY "gappy". Even main characters are missing from it like Buddy for example.<br /><br />Now back to the movie. I love watching movies, but this one tortured me throughout the whole 82 or however many minutes. It was HORRID. Probably the worst movie I have ever seen. And the reason why it bothers me so much, is because I was quite excited about seeing it beforehand.<br /><br />The plot line itself is good. It could have been a great film if done properly and with careful casting. Golden Brooks was a HUGE disappointment. I now see that the only role she can be good in is the loud, fun-loving, energetic sexy chick she plays on Girlfriends. Melodramatic roles are not for her at all. She basically killed her character, Rachel.<br /><br />I love some of the cast, like Deborah Cox, Mel Jackson and Darrin Dewitt Henson, but even they couldn't save this catastrophic movie. Of course it is only my personal opinion that

#### books.csv

In [26]:
# Load the books csv
# https://www.kaggle.com/datasets/saurabhbagchi/books-dataset/data
file_path = r'D:\AI-DATASETS\02-MISC-large\books.csv'
df = pd.read_csv(file_path, encoding='ISO-8859-1', sep=';', on_bad_lines='skip', low_memory=False)

In [27]:
df.shape

(271360, 8)

In [28]:
df.drop(['Image-URL-S', 'Image-URL-M',	'Image-URL-L'], axis=1, inplace=True)

In [32]:
df.sample(10)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
105537,0451188144,Dr. Nightingale Meets Puss in Boots: A Deirdre...,Lydia Adamson,1997,Signet Book
116568,0380704846,Rosemoore,Arthur Cavanaugh,1989,Harper Mass Market Paperbacks (Mm)
205993,080501196X,"Oh, Fudge: A Celebration of America's Favorite...",Lee Edwards Benning,1990,Henry Holt &amp; Co
237702,0440404150,The Princess of the Fillmore Street School,Marjorie Weinman Sharmat,1991,Yearling Books
259136,0451178696,The Willful Widow (Signet Regency Romance),Evelyn Richardson,1994,New Amer Library (Mm)
110829,0843925035,Promises to Keep,Wendy Susans,1987,Leisure Books
35800,0751526568,Smithy,Ian Mackersey,2000,Trafalgar Square Publishing
178023,0671874888,"Goodbye, Janette",Harold Robbins,1997,Pocket Books
25021,0671526812,HIGH STAKES HR PRE (Harold Robbins Presents Se...,John Fischer,1986,Pocket
124637,0373095538,The Welcoming (Silhouette Special Edition No. ...,Nora Roberts,1989,Silhouette


In [36]:
import string,re
#import spacy
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

In [37]:
def preprocess_text(text):
    
    text = text.lower()  # Lowercasing
    
    # Remove all punctuation except '&'
    text = text.translate(str.maketrans('', '', string.punctuation.replace('&', '')))
    
    text = text.strip()  # Remove leading/trailing whitespace
    text = re.sub(r'\s+', ' ', text)  # Remove excessive whitespace using regex
    #print(f"Without excessive whitespace: {text}")
    
    # Normalize &amp; if it exists
    text = re.sub(r'&amp;', 'and', text)
    #print(f"After replacing '&amp;': {text}")
    
    # Replace any remaining & with 'and'
    text = text.replace('&', 'and')
    #print(f"After replacing '&': {text}")
    
    return text

In [38]:
# Example usage
sample_text = "This is an example   with   excessive  & whitespace!"
cleaned_text = preprocess_text(sample_text)
print(cleaned_text)

this is an example with excessive and whitespace


In [39]:
# Drop rows with any null values
df_cleaned = df.dropna()

In [40]:
%%time
# Apply the preprocessing
df_cleaned['text'] = df_cleaned['Book-Title'] + ' ' + df_cleaned['Book-Author'] + ' ' + df_cleaned['Publisher']
df_cleaned['text'] = df_cleaned['text'].apply(preprocess_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


CPU times: total: 2.42 s
Wall time: 4.23 s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [41]:
df_cleaned_samples = df_cleaned.sample(2500)

In [42]:
# Initialize the TF-IDF vectorizer
vectorizer = TfidfVectorizer(max_features=10000)

In [43]:
# Generate TF-IDF vectors for the reviews
tfidf_matrix = vectorizer.fit_transform(df_cleaned_samples.text)

In [44]:
# Initialize ChromaDB client
client = chromadb.Client()

In [45]:
# List all collections
collections = client.list_collections()
print([collection.name for collection in collections])

['imdb_reviews']


In [46]:
# Convert TF-IDF matrix to dense array and insert into ChromaDB
tfidf_dense = tfidf_matrix.toarray()

In [47]:
%%time

# Store the original indices before any resetting
original_indices = df_cleaned_samples.index.tolist()  # Store original indices

# Add each review vector into ChromaDB
for idx, vector in enumerate(tfidf_dense):

    # Use original_indices to fetch metadata
    original_idx = original_indices[idx]
    
    # Construct metadata with book information
    metadata = {
        "Title": df_cleaned_samples.loc[original_idx, 'Book-Title'],          # Book Title
        "Author": df_cleaned_samples.loc[original_idx, 'Book-Author'],        # Book Author
        "Year": df_cleaned_samples.loc[original_idx, 'Year-Of-Publication'],  # Year of Publication
        "Publisher": df_cleaned_samples.loc[original_idx, 'Publisher'],       # Publisher
    }
    
    # Add the vector and metadata to ChromaDB collection
    collection_name.add(
        ids=[str(idx)],                  # Unique ID for each book/review
        embeddings=[vector],             # The TF-IDF vector
        metadatas=[metadata]             # Store book details (metadata)
    )

InvalidDimensionException: Embedding dimension 8717 does not match collection dimensionality 1000

In [48]:
def query_chromadb(query_text, top_k=3):
    # Convert the query to a TF-IDF vector
    query_vector = vectorizer.transform([query_text]).toarray()[0]
    
    # Perform similarity search in ChromaDB
    results = collection_name.query(
        query_embeddings=[query_vector],  # The query vector
        n_results       =top_k            # Number of results to return
    )
    
    return results

#### BM25

BM25 (Best Matching 25) is a ranking function used in information retrieval systems to evaluate the relevance of documents in relation to a query. It builds upon the probabilistic information retrieval model and is particularly effective for scoring and ranking documents based on the frequency of query terms within them. Here's a breakdown of BM25 and its differences from TF-IDF:

**Relevance Scoring:** BM25 calculates a score for each document based on the presence and frequency of the terms in the query. The score reflects how well the document matches the query.

**Components:**

- **Term Frequency (TF):** Similar to TF-IDF, BM25 considers how often a term appears in a document. However, it uses a saturation function to diminish the effect of term frequency as it increases.
  
- **Inverse Document Frequency (IDF):** BM25 employs IDF to account for the rarity of terms. Rare terms contribute more to the score than common terms.
  
- **Document Length Normalization:** BM25 normalizes for document length, ensuring that longer documents do not have an unfair advantage simply because they contain more terms.
  
- **Parameters:** BM25 has parameters like `k1` (controls the impact of term frequency) and `b` (adjusts the normalization based on document length). This flexibility allows for tuning based on specific datasets.

##### Why We Need to Saturate the Term Frequency (TF)

##### 1. Diminishing Returns on Term Relevance
- As the frequency of a term in a document increases, its contribution to relevance does not increase linearly. In fact, the impact of additional occurrences of the term diminishes.
- For instance, if a document mentions a keyword 1 time, it may be relevant; if it mentions it 10 times, it doesn't necessarily mean it is 10 times more relevant. Saturation accounts for this diminishing returns effect.

##### 2. Avoiding Overemphasis on Frequent Terms
- Without saturation, documents with very high term frequencies might be unfairly prioritized, even if the actual content and relevance to the query are low.
- This is particularly important for documents that may repeat a term excessively, as they could skew the results and lead to poor search quality.

##### 3. Enhancing Precision in Ranking
- Saturation helps create a more balanced scoring system. It allows for a better distinction between documents with varying term frequencies.
- For example, a document with a term frequency of 5 might be seen as more relevant than one with a frequency of 1, but not overwhelmingly so. The use of saturation can help refine the score, ensuring the ranking is more precise and meaningful.

##### 4. Consistency Across Document Lengths
- Different documents can vary significantly in length, leading to variations in raw term frequencies. Saturation helps normalize these differences.
- This is particularly important for long documents where high term frequency might be a result of sheer length rather than actual relevance.

##### 5. Parameter Control
- Saturation provides a means to control the behavior of the scoring function through parameters like `k1`. By adjusting this parameter, users can fine-tune how quickly the effects of term frequency diminish, allowing for flexibility based on specific datasets and requirements.

##### Example of Term Frequency Saturation
Consider a term "machine learning" in two documents:

- **Document A:** "Machine learning is a fascinating field. Machine learning can change the world."
- **Document B:** "Machine learning is machine learning machine learning machine learning machine learning machine learning."

- **Without Saturation:** Document B might score much higher due to the raw count of term occurrences (5 times).
- **With Saturation:** Document A might still score higher or similarly due to its context and meaningful use of the term, even though it appears less frequently.


#### 2. Inverse Document Frequency (IDF)

Inverse Document Frequency (IDF) in BM25 is conceptually similar to IDF in TF-IDF, but there are some differences in how they are calculated and used in their respective formulas. Here’s a breakdown of the similarities and differences:

`Similarities`

`Purpose`: Both IDF measures are designed to reflect the importance of a term across a collection of documents. The primary goal is to reduce the weight of common terms and increase the weight of rare terms in the scoring process.

`Concept`: In both cases, IDF is based on the idea that terms that appear in many documents are less informative than terms that appear in fewer documents. Therefore, IDF contributes to emphasizing the significance of rarer terms.

`Differences`

1. Mathematical Formulation:
   
- `TF-IDF IDF`:

$$
\operatorname{IDF}(t)=\log \left(\frac{N}{\operatorname{df}(t)}\right)
$$


Where $N$ is the total number of documents and $\operatorname{df}(t)$ is the number of documents containing the term $t$.

- `BM25 IDF`:

$$
\operatorname{IDF}(t)=\log \left(\frac{N-\operatorname{df}(t)+0.5}{\operatorname{df}(t)+0.5}\right)
$$


In BM25, a smoothing factor (0.5) is added to both the numerator and denominator to prevent division by zero and to smooth the effect of terms that appear in very few documents.

2. Normalization:
   
- BM25 applies a more nuanced form of normalization, which makes the IDF component more robust in cases where terms are either very common or very rare. The added constants help avoid extreme values, making the model more stable.

The IDF in BM25 is calculated as the difference between the total number of documents \( N \) and the document frequency. This formulation introduces a focus on both the `rarity` of the term and its `relative presence` in the collection.

##### Intuition:

###### 1. Balance of Commonality and Rarity:
By using ( N - $\text{df}(t) $), BM25 considers how many documents do not contain the term, enhancing the importance of rare terms even further. If a term appears in very few documents, this number is high, which leads to a higher IDF score.

###### 2. Proportional Contribution:
The division by df(t) ensures that the IDF is normalized against the term's frequency. A term present in a few documents but still common in those documents will receive a lower weight than a term that is both rare and non-trivial in its presence.

###### 3. Saturation Effect:
This formulation captures the idea of term frequency saturation better, where the IDF score accounts for diminishing returns as the document frequency increases. As more documents contain the term, its ability to distinguish documents decreases.


#### 3. Document Length Normalization in BM25

BM25 incorporates document length normalization to ensure that longer documents do not have an unfair advantage in the scoring process simply because they contain more terms. 

`How Normalization is Achieved:`
1. **Length Parameters**: BM25 uses a parameter \( b \) (typically set between 0 and 1) to control the degree of normalization. A value of \( b = 1 \) applies full normalization, while \( b = 0 \) means no normalization.
2. **Length Calculation**: The document length is measured in terms of the total number of terms. BM25 compares this length against the average document length in the collection.
3. **Score Adjustment**: The normalization is applied during the score calculation. It adjusts the term frequency based on the length of the document relative to the average length, reducing the score for longer documents while increasing it for shorter ones.

#### Example:
- **Document A** (100 words) contains the term "AI" 10 times.
- **Document B** (200 words) also contains the term "AI" 20 times.

Without normalization, Document B would have a higher score due to higher term frequency. However, BM25 adjusts for this by taking into account the document lengths, ensuring that Document A's relevance is appropriately recognized despite its shorter length.


#### BM25 Formula
The BM25 scoring function can be represented as follows:

$$
\operatorname{BM} 25(d, q)=\sum_{i=1}^{|q|} I D F\left(t_i\right) \cdot \frac{T F\left(t_i, d\right) \cdot\left(k_1+1\right)}{T F\left(t_i, d\right)+k_1 \cdot\left(1-b+b \cdot \frac{|d|}{\text { avgdl }}\right)}
$$


Where:
- $d=$ document
- $q=$ query
- $t_i=$ term in the query
- $T F\left(t_i, d\right)=$ term frequency of $t_i$ in document $d$
- $|d|=$ length of the document (number of terms)
- $\operatorname{avgdl}=$ average document length across the corpus
- $I D F\left(t_i\right)=$ inverse document frequency of term $t_i$
- $k_1$ and $b=$ tuning parameters

In [49]:
#pip install rank-bm25

In [50]:
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize

In [51]:
# Sample documents (preprocessed as tokenized lists of words)
corpus = [
    "The cat in the hat",
    "The quick brown fox",
    "The lazy dog sleeps",
    "Fox is a wild animal"
]

In [52]:
# Tokenize each document in the corpus
tokenized_corpus = [word_tokenize(doc.lower()) for doc in corpus]

In [53]:
# Initialize BM25
bm25 = BM25Okapi(tokenized_corpus)

In [54]:
# Query
query = word_tokenize("fox wild".lower())

In [55]:
# Get BM25 scores
scores = bm25.get_scores(query)

In [56]:
# Output the scores
print(scores)

[0.         0.         0.         0.80695034]
