In [1]:
!pip install nltk scikit-learn
import pandas as pd
import numpy as np
import nltk
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity




In [2]:
nltk.download('stopwords')
nltk.download('punkt')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Tcs\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Tcs\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [4]:
df = pd.read_csv("Reviews.csv") 

In [5]:
# Keep only 'Text' column and remove nulls
df_reviews = df[['Text']].dropna()

# Limit to 10,000 reviews
df_reviews = df_reviews.head(10000)

df_reviews.reset_index(drop=True, inplace=True)


In [6]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))


In [7]:
import re

def preprocess_text(text):
    # a. Lowercase
    text = text.lower()
    # b. Remove punctuation/special characters
    text = re.sub(r'[^a-z\s]', '', text)
    # c. Tokenize
    tokens = nltk.word_tokenize(text)
    # d. Remove stopwords
    filtered = [word for word in tokens if word not in stop_words]
    # e. Join tokens back
    return ' '.join(filtered)


In [8]:
df_reviews['cleaned_text'] = df_reviews['Text'].apply(preprocess_text)


In [9]:
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df_reviews['cleaned_text'])


In [10]:
def retrieve_reviews(query, top_k=5):
    # a. Preprocess query
    cleaned_query = preprocess_text(query)
    # b. Convert to vector
    query_vec = vectorizer.transform([cleaned_query])
    # c. Compute similarity
    similarity_scores = cosine_similarity(query_vec, tfidf_matrix).flatten()
    # d. Get top k
    top_indices = similarity_scores.argsort()[-top_k:][::-1]
    # e. Return original + cleaned reviews
    return df_reviews.iloc[top_indices][['Text', 'cleaned_text']]


In [11]:
# Example 1
print("🔍 Query: 'great product with fast shipping'")
print(retrieve_reviews("great product with fast shipping"))

# Example 2
print("\n🔍 Query: 'disappointed'")
print(retrieve_reviews("disappointed"))


🔍 Query: 'great product with fast shipping'
                                                   Text  \
5226  Enjoyed the product and they also provided ver...   
8021  The tea is good and fresh. We enjoy it. The sh...   
7073  My daughter lives in Hawaii and sent me some g...   
6034  The energy drink is a great product. The shipp...   
9878  Fast shipping, items were packaged nicely and ...   

                                           cleaned_text  
5226  enjoyed product also provided fast shipping im...  
8021  tea good fresh enjoy shipping fast cost reason...  
7073  daughter lives hawaii sent great coffee keurig...  
6034  energy drink great product shipping price craz...  
9878  fast shipping items packaged nicely described ...  

🔍 Query: 'disappointed'
                                                   Text  \
3151  I am a bit disappointed.  The flavor was not w...   
4378  The product is very good. Way too expensive an...   
6548  Disappointed.  The big boxes had a very diffe