In [3]:
!pip install spacy pandas scikit-learn
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m96.3 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [9]:
# Imports for the whole pipeline. No import cell is visible in this notebook,
# so they are declared here to keep "Restart Kernel -> Run All" working;
# importing a module that is already loaded is a no-op.
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Tunable settings for this notebook
DATA_PATH = 'Reviews.csv'
MAX_REVIEWS = 1000  # cap on the number of reviews kept, for speed

# Load spaCy English model
nlp = spacy.load("en_core_web_sm")

# Load the dataset, drop rows whose review text is missing, keep the first
# MAX_REVIEWS remaining rows, and reset the index so labels run 0..n-1.
df = (
    pd.read_csv(DATA_PATH)
    .dropna(subset=['Text'])
    .head(MAX_REVIEWS)
    .reset_index(drop=True)
)


In [10]:
def preprocess_text(text):
    """Normalise a piece of text into a space-separated string of lemmas.

    The text is lower-cased and run through the module-level ``nlp``
    pipeline. Only tokens that are purely alphabetic and are not stop words
    are kept, and each kept token is replaced by its lemma.

    Args:
        text: The raw string to clean.

    Returns:
        The kept lemmas joined by single spaces (an empty string when no
        token survives the filter).
    """
    kept_lemmas = (
        tok.lemma_
        for tok in nlp(text.lower())
        if tok.is_alpha and not tok.is_stop
    )
    return " ".join(kept_lemmas)

# Clean every review once and keep the result next to the raw text;
# preprocess_text returns a plain string per review.
df['cleaned_text'] = df['Text'].map(preprocess_text)

In [11]:
# Learn the TF-IDF vocabulary/weights from the cleaned reviews and encode them.
# `vectorizer` is reused later to encode queries into the same feature space;
# `tfidf_matrix` holds one row per review, in the same order as `df`.
cleaned_corpus = df['cleaned_text']
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(cleaned_corpus)

In [12]:
def process_query(query):
    """Turn a raw query string into a TF-IDF vector.

    The query is cleaned with ``preprocess_text`` and then encoded with the
    already-fitted module-level ``vectorizer``.

    Args:
        query: The raw search string.

    Returns:
        Whatever ``vectorizer.transform`` returns for the single cleaned query.
    """
    return vectorizer.transform([preprocess_text(query)])

In [13]:
def get_top_k_reviews(query, k=5):
    """Return the reviews most similar to ``query``, best match first.

    The query is vectorised with ``process_query`` and compared against every
    row of the module-level ``tfidf_matrix`` using cosine similarity.

    Args:
        query: Free-text search string.
        k: Maximum number of reviews to return. Values <= 0 yield an empty
            list; values larger than the number of reviews return them all.

    Returns:
        A list of dicts, one per selected review, with the keys ``'Review'``
        (the ``'Text'`` column of ``df``), ``'Score'`` (the ``'Score'`` column
        of ``df``) and ``'Similarity'`` (the cosine similarity to the query).
    """
    # Guard first: with k == 0 the slice [-k:] below would be [0:] and select
    # every review instead of none.
    if k <= 0:
        return []

    query_vec = process_query(query)
    similarity_scores = cosine_similarity(query_vec, tfidf_matrix).flatten()
    top_k_indices = similarity_scores.argsort()[-k:][::-1]

    # The indices are row positions in the similarity array, so look the rows
    # up by position (.iloc) rather than by index label (.loc).
    texts = df['Text']
    ratings = df['Score']
    return [
        {
            'Review': texts.iloc[idx],
            'Score': ratings.iloc[idx],
            'Similarity': similarity_scores[idx],
        }
        for idx in top_k_indices
    ]

In [14]:
# Example search: fetch the five reviews closest to the query and print each
# one with its similarity and star score, followed by a separator rule.
user_query = "great taste and quality"
top_reviews = get_top_k_reviews(user_query, k=5)

separator = "=" * 80
for rank, hit in enumerate(top_reviews, start=1):
    header = f"Top {rank} Review (Similarity: {hit['Similarity']:.3f}, Score: {hit['Score']}):"
    print(header, hit['Review'], separator, sep="\n")

Top 1 Review (Similarity: 0.346, Score: 5):
I'm happy with the quality of the product and the price. Like the other reviewer, I would prefer if there was a plastic liner to preserve freshness. However, I will continue to buy this product regardless as it is quality oatmeal at a good price.<br /><br />Edit: I'm on my 4th bag, quality continues to be high.
Top 2 Review (Similarity: 0.314, Score: 5):
this has to be one of the best teas I have ever tasted... it's clean, bright, fresh...<br /><br />great delivery...again quality... just try it...
Top 3 Review (Similarity: 0.311, Score: 5):
This offer is a great price and a great taste, thanks Amazon for selling this product.<br /><br />Staral
Top 4 Review (Similarity: 0.298, Score: 5):
This  is great stuff.  Made some really tasty banana bread.  Good quality and lowest price in town.
Top 5 Review (Similarity: 0.269, Score: 4):
I think Plocky's brand as a whole is a cut above in quality but I found these to lack in taste somewhat and be a li