# Continuing with the CSV File

# Step - 5: Experiment with Text Vectorization
### Using Bag of Words (BOW) 

In [1]:
import pandas as pd

# Load the cleaned data from the CSV file
cleaned_data = pd.read_csv('cleaned_data.csv')


In [2]:
# Installing the necessary libraries
!pip install scikit-learn
!pip install sentence-transformers



In [3]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Initialize vectorizers
bow_vectorizer = CountVectorizer()  # For Bag-of-Words (BoW)
bow_vectors = bow_vectorizer.fit_transform(cleaned_data['cleaned_text'])


### Using TF-IDF

In [4]:
# Initialize vectorizers
# For TF-IDF
tfidf_vectorizer = TfidfVectorizer()  
tfidf_vectors = tfidf_vectorizer.fit_transform(cleaned_data['cleaned_text'])


### Using BERT based Sentence Transformation

In [5]:
from sentence_transformers import SentenceTransformer

# Initialize vectorizers
# For TF-IDF
bert_model = SentenceTransformer('bert-base-nli-mean-tokens')
bert_vectors = bert_model.encode(cleaned_data['cleaned_text'])


In [1]:
 from sklearn.metrics.pairwise import cosine_similarity

# Function to find similar records based on a query using BERT embeddings
def find_similar_records_bert(query, df, bert_model, bert_vectors):
    # Encode the query using the BERT model
    query_vector = bert_model.encode([query])
    
    # Compute cosine similarities between the query vector and all document vectors
    cosine_similarities = cosine_similarity(query_vector, bert_vectors).flatten()
    
    # Sort the indices based on cosine similarities in descending order
    similar_records_indices = cosine_similarities.argsort()[::-1]
    
    # Get the similar records based on the sorted indices
    similar_records = df.iloc[similar_records_indices]
    
    # Add a 'similarity' column to the DataFrame
    similar_records['similarity'] = cosine_similarities[similar_records_indices]
    
    return similar_records


In [4]:
import pandas as pd

# Load the cleaned data from the CSV file
cleaned_data = pd.read_csv('cleaned_data.csv')


In [9]:
from sentence_transformers import SentenceTransformer
import pandas as pd

# Load the cleaned data from the CSV file
cleaned_data = pd.read_csv('cleaned_data.csv')

# Initialize BERT model
bert_model = SentenceTransformer('bert-base-nli-mean-tokens')

# Encode cleaned text using BERT model
bert_vectors = bert_model.encode(cleaned_data['cleaned_text'])
 

# Step - 6: Data Retrieval 
### Keyboard-based Search Vectorization

In [10]:
# Define a function to prompt the user to input the query
def get_user_query():
    return input("Enter query: ")

# Prompt the user to input the query
query = get_user_query()

# Find similar records based on the query using BERT embeddings
search_results_bert = find_similar_records_bert(query, cleaned_data, bert_model, bert_vectors)

# Prompt the user to input the number of search results they want
top_n = int(input('Enter the number of search results you want: '))

# Display the top N search results for the query
print(f"Top {top_n} search results for query '{query}':")
print(search_results_bert[['name', 'similarity']].head(top_n))


Enter query: Action


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  similar_records['similarity'] = cosine_similarities[similar_records_indices]


Enter the number of search results you want: 10
Top 10 search results for query 'Action':
                                                    name  similarity
47353  il.grande.gioco.s01.e04.contropiede.(2022).eng...    0.436760
80375  tooth.pari.when.love.bites.s01.e03.episode.1.3...    0.407746
80380  tooth.pari.when.love.bites.s01.e08.episode.1.8...    0.401364
69691  shadow.and.bone.s02.e08.no.funerals.(2023).eng...    0.399929
31258   warrior.nun.s02.e08.jeremiah.2913.(2022).eng.1cd    0.398344
16902  thai.cave.rescue.s01.e05.the.parable.of.kisa.g...    0.384536
50266  vikings.valhalla.s02.e02.towers.of.faith.(2023...    0.383630
69685  shadow.and.bone.s02.e01.no.shelter.but.me.(202...    0.379811
75811  beef.s01.e06.we.draw.a.magic.circle.(2023).eng...    0.375271
56683           class.s01.e07.episode.1.7.(2023).eng.1cd    0.369109


In [11]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Function to find similar records based on a query using BERT embeddings
def find_similar_records_semantic(query, cleaned_data, bert_model, bert_vectors):
    # Encode the query using BERT model
    query_vector = bert_model.encode([query])[0]
    
    # Calculate cosine similarity between query vector and all document vectors
    similarity_scores = cosine_similarity([query_vector], bert_vectors)[0]
    
    # Sort indices based on similarity scores
    sorted_indices = np.argsort(similarity_scores)[::-1]
    
    # Get similar records
    similar_records = cleaned_data.iloc[sorted_indices]
    
    return similar_records, similarity_scores[sorted_indices]


In [12]:
# Example usage:
query = "Action"
search_results_semantic, similarity_scores = find_similar_records_semantic(query, cleaned_data, bert_model, bert_vectors)

### Semantic-based Search Vectorization

In [13]:
# Function to find similar records based on a query using BERT embeddings
def find_similar_records_bert(query, df, bert_model, bert_vectors):
    # Encode the query using BERT model
    query_vector = bert_model.encode([query])
    
    # Compute cosine similarity between the query vector and all document vectors
    similarities = cosine_similarity(query_vector, bert_vectors)[0]
    
    # Sort the indices based on similarity scores in descending order
    indices = similarities.argsort()[::-1]
    
    # Extract the top similar records
    similar_records = df.iloc[indices]
    
    return similar_records


In [18]:
# Define a function to prompt the user to input the query
def get_user_query():
    return input("Enter query: ")

# Get user input for the query
query = get_user_query()

# Find similar records based on the query using BERT embeddings
search_results_bert = find_similar_records_bert(query, cleaned_data, bert_model, bert_vectors)

# Prompt the user to input the number of search results they want
top_n = int(input('Enter the number of search results you want: '))

# Display the top N search results for the query
print(f"Top {top_n} search results for query '{query}':")
print(search_results_bert['name'].head(top_n))


Enter query: Action
Enter the number of search results you want: 10
Top 10 search results for query 'Action':
47353    il.grande.gioco.s01.e04.contropiede.(2022).eng...
80375    tooth.pari.when.love.bites.s01.e03.episode.1.3...
80380    tooth.pari.when.love.bites.s01.e08.episode.1.8...
69691    shadow.and.bone.s02.e08.no.funerals.(2023).eng...
31258     warrior.nun.s02.e08.jeremiah.2913.(2022).eng.1cd
16902    thai.cave.rescue.s01.e05.the.parable.of.kisa.g...
50266    vikings.valhalla.s02.e02.towers.of.faith.(2023...
69685    shadow.and.bone.s02.e01.no.shelter.but.me.(202...
75811    beef.s01.e06.we.draw.a.magic.circle.(2023).eng...
56683             class.s01.e07.episode.1.7.(2023).eng.1cd
Name: name, dtype: object


# Step - 7: Embedding to ChromaDB

In [19]:
import chromadb
from chromadb.utils import embedding_functions
import pandas as pd

In [20]:
# Creates a my_chromadb folder in the work directory
chroma_client = chromadb.PersistentClient(path="my_chromadb")


In [21]:
# Using the distilbert-base-nli-mean-tokens model for embedding function
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="distilbert-base-nli-mean-tokens")
collection = chroma_client.get_or_create_collection(name="my_collection", embedding_function=sentence_transformer_ef, metadata={"hnsw:space": "cosine"})


In [27]:
# Loads your DataFrame directly into memory
se_emd = pd.read_csv('cleaned_data.csv' ,nrows = 10000)


In [29]:
# Adds documents from DataFrame along with metadata and IDs to the collection
collection.add(
    documents=se_emd['name'].tolist(),
    metadatas=[{"item_id": str(idx)} for idx in range(len(se_emd))],
    ids=[str(idx) for idx in range(len(se_emd))],
)


# Step - 8: Retrieving Based on Search/Query

In [30]:
# Getting user input
user_query = input("Enter your search query: ")

# Querying the collection
results = collection.query(
    query_texts=[user_query],
    n_results=10,
    include=['documents', 'distances', 'metadatas']
)

# Displaying the user input
print(f"Your search query: {user_query}")

# Displaying output documents
for document in results['documents'][0]:
    print(f" *** {document} *** ")
    

Enter your search query: Action
Your search query: Action
 *** allegoria.().eng.1cd *** 
 *** allegoria.().eng.1cd *** 
 *** allegoria.().eng.1cd *** 
 *** we.are.gathered.here.today.().eng.1cd *** 
 *** we.are.gathered.here.today.().eng.1cd *** 
 *** epoch.(2001).eng.1cd *** 
 *** the.message.(1976).eng.1cd *** 
 *** event.horizon.(1997).eng.1cd *** 
 *** event.horizon.(1997).eng.1cd *** 
 *** event.horizon.(1997).eng.1cd *** 
