In [1]:
# Toy corpus: five short, hand-written customer reviews. These raw strings are
# the input to the preprocessing step and, after cleaning, to TF-IDF.
reviews = [
    "Love the product! High quality and great service.",
    "The product was okay, but the service was terrible.",
    "Not happy with the product. It broke after a week.",
    "Absolutely fantastic! Will recommend to everyone.",
    "Poor quality. Not what I expected at all."
]


In [2]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources used below: the tokenizer models needed by
# word_tokenize and the English stop-word list.
nltk.download('punkt')
# Recent NLTK releases (3.9+) load the tokenizer from 'punkt_tab' rather than
# the pickled 'punkt' models; without it word_tokenize raises LookupError on a
# fresh environment. Older releases simply report that the package is unknown.
nltk.download('punkt_tab')
nltk.download('stopwords')

# Stored as a set so the membership test in preprocess() is O(1) per token.
stop_words = set(stopwords.words('english'))

def preprocess(text):
    """Normalize a raw review into a space-separated string of content words.

    The text is lowercased, punctuation is replaced by spaces, the result is
    tokenized with NLTK's ``word_tokenize``, and every token found in the
    module-level ``stop_words`` set is dropped.

    Args:
        text: The raw review as a string.

    Returns:
        The remaining tokens joined by single spaces (an empty string when
        nothing survives the filtering).
    """
    # Lowercasing
    text = text.lower()
    # Replace punctuation with a space instead of deleting it, so words joined
    # by a hyphen, slash, or a period with no following space
    # ("high-quality", "great.would") stay separate tokens rather than being
    # fused into a single out-of-vocabulary word.
    text = re.sub(r'[^\w\s]', ' ', text)
    # Tokenization
    words = word_tokenize(text)
    # Removing stopwords
    words = [word for word in words if word not in stop_words]
    return ' '.join(words)

# Clean every raw review, keeping the same order as `reviews`.
preprocessed_reviews = list(map(preprocess, reviews))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF model with default settings on the cleaned reviews.
# fit_transform learns the vocabulary from the corpus and returns the
# document-term matrix in one call (one row per review).
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(preprocessed_reviews)

In [5]:
import numpy as np

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = np.asarray(vectorizer.get_feature_names_out())
else:
    feature_names = np.array(vectorizer.get_feature_names())

# How many of the highest-scoring terms to report per review.
TOP_N = 3

for i, review in enumerate(preprocessed_reviews):
    # Dense 1-D vector of TF-IDF scores for this review, aligned with feature_names.
    scores = tfidf_matrix[i].toarray().flatten()
    # Indices of the TOP_N largest scores, highest first.
    sorted_idx = scores.argsort()[::-1][:TOP_N]
    # Drop zero-score entries so a review with fewer than TOP_N surviving
    # terms does not report vocabulary words it never contained.
    sorted_idx = sorted_idx[scores[sorted_idx] > 0]
    top_words = feature_names[sorted_idx]
    print(f"Review: {review}")
    print(f"Top words: {', '.join(top_words)}\n")


Review: love product high quality great service
Top words: love, great, high

Review: product okay service terrible
Top words: okay, terrible, service

Review: happy product broke week
Top words: week, broke, happy

Review: absolutely fantastic recommend everyone
Top words: absolutely, recommend, everyone

Review: poor quality expected
Top words: expected, poor, quality

