In [None]:
!pip install bertopic

In [None]:
import nltk
nltk.download('punkt')       
nltk.download('stopwords') 
nltk.download('punkt_tab')  
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
nltk.download('vader_lexicon')
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import spacy
from bertopic import BERTopic

In [None]:
# Load the raw reviews table from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
reviews_df = pd.read_pickle('reviews_raw.pkl')
# Number of distinct asin values in the raw data.
reviews_df['asin'].nunique()

BERTopic attempt

In [None]:
# pick a random asin from the DataFrame
# (no random_state is passed, so a different asin may be shown on every run)
reviews_df['asin'].sample(1)

In [None]:
# make a df based on the random asin
# The asin is hard-coded here, so this cell does not follow whatever the
# sampling cell happens to print. .copy() gives a frame independent of reviews_df.
test_df = reviews_df.loc[reviews_df['asin'] == 'B00ICDB1QO'].copy()
test_df

In [None]:
# Separate working copy of the single-asin reviews, so test_df itself stays untouched.
bertopic_df = test_df.copy()

In [None]:
# remove stopwords
# NLTK's English stopword list, held as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

def remove_stopwords(data):
    """Strip stopwords from every entry of ``data``.

    Each entry is coerced with ``str()``, split on whitespace, and rebuilt
    from the tokens whose lowercase form is not in the module-level
    ``stop_words`` set. Surviving tokens keep their original casing and any
    punctuation attached to them.

    Returns a list of space-joined strings, one per entry of ``data``.
    """
    def _keep_content_words(text):
        return ' '.join(
            token for token in str(text).split()
            if token.lower() not in stop_words
        )

    return [_keep_content_words(entry) for entry in data]

In [None]:
# Pull the review texts out of the frame as a plain Python list.
reviewText = bertopic_df['reviewText'].to_list()

In [None]:
# Peek at the first few reviews instead of dumping the entire list into the output.
reviewText[:5]

In [None]:
# Replace the raw review list with its stopword-filtered version (same length, same order).
reviewText = remove_stopwords(reviewText)

In [None]:
# initialize the model
# nr_topics='auto' asks BERTopic to reduce the number of topics automatically after fitting.
# NOTE(review): no seed is configured, so the fitted topics may differ between runs — confirm whether that matters.
test_model = BERTopic(language='english', nr_topics='auto')

In [None]:
# Fit the topic model on the review list; returns a topic id per document and the associated probabilities.
topics, probabilities = test_model.fit_transform(reviewText)

In [None]:
# Recompute the topic representations with unigrams and bigrams, reusing the topic assignments from the fit.
test_model.update_topics(reviewText, topics=topics, n_gram_range=(1,2))

In [None]:
# Show how many documents fall into each topic.
test_model.get_topic_freq()

In [None]:
# Top terms (with their scores) for topic 1.
test_model.get_topic(1)

In [None]:
# Top terms (with their scores) for topic 0.
test_model.get_topic(0)

In [None]:
# Print only the top words (without their scores) for topics 0 and 1.
for topic_id in range(0, 2):
    # get_topic returns False when the topic id does not exist (possible after
    # automatic topic reduction), so fall back to an empty list instead of
    # failing on iteration.
    topic_terms = test_model.get_topic(topic_id) or []
    # The loop variable is named `term`, not `tuple`: at notebook level the
    # original name rebound the built-in `tuple` for every later cell.
    first_tuple_elements = [term[0] for term in topic_terms]
    print(first_tuple_elements)
    print("\n")

Hugging Face sentence embeddings

In [None]:
!pip install -U sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:

# Load the pretrained all-MiniLM-L6-v2 sentence-embedding model.
model = SentenceTransformer('all-MiniLM-L6-v2')
reviews_list = test_df['reviewText'].to_list()
# Encode every review of the test asin in a single call.
embeddings = model.encode(reviews_list)
# Shape of the encoded array.
embeddings.shape

In [None]:
# aggregate review embeddings per product
# option 1: average pooling
# embed each review for the product and then average those embeddings

avg_pooling_model = test_df.copy()
# Encode all reviews in one batched call instead of one model.encode() call
# per row. list(...) splits the 2-D result back into one 1-D vector per row,
# so every cell of the 'embedding' column still holds a single array.
avg_pooling_model['embedding'] = list(
    model.encode(avg_pooling_model['reviewText'].tolist())
)


In [None]:
# List the columns to confirm 'embedding' was added.
avg_pooling_model.columns

In [None]:
# Average pooling: map each asin to the element-wise mean of its review embeddings.
product_embeddings = {
    asin: np.stack(vectors.to_numpy()).mean(axis=0)
    for asin, vectors in avg_pooling_model.groupby('asin')['embedding']
}

In [None]:
# Inspect the asin -> mean-embedding mapping.
product_embeddings

In [None]:
# option 2: max pooling approach
def max_pooling(embeddings):
    """Return the element-wise maximum across a collection of vectors.

    ``embeddings`` is any sequence of equal-shaped arrays; they are stacked
    along a new leading axis and reduced over it, so the result has the shape
    of a single input vector.
    """
    return np.stack(embeddings).max(axis=0)

# Group by product and apply max pooling
# Result: dict mapping each asin to the element-wise max of its review embeddings.
max_pooled_embeddings = {
    asin: max_pooling(vectors.values)
    for asin, vectors in avg_pooling_model.groupby('asin')['embedding']
}

In [None]:
#option 3
def weighted_average_pooling(embeddings, weights):
    """Return the weighted mean of a collection of embedding vectors.

    Parameters
    ----------
    embeddings : sequence of equal-shaped arrays
        One vector per item; stacked along a new leading axis.
    weights : sequence of numbers
        One weight per embedding. They are normalised to sum to 1.

    Returns
    -------
    numpy.ndarray
        The weighted average over the leading axis. When the weights sum to
        zero there is no preference to express, so the plain (equal-weight)
        mean is returned instead of a vector of NaNs.

    Raises
    ------
    ValueError
        If ``embeddings`` is empty (from ``np.stack``) or the number of
        weights does not match the number of embeddings (from ``np.average``).
    """
    stacked = np.stack(embeddings)
    weights = np.asarray(weights, dtype=float)
    total = weights.sum()
    if total == 0:
        # Dividing by a zero total would turn every weight into NaN and
        # silently poison the pooled vector; fall back to equal weights.
        weights = np.ones_like(weights)
        total = weights.sum()
    return np.average(stacked, axis=0, weights=weights / total)

# Group and apply weighted average
# Each asin's review embeddings are averaged using the matching 'overall'
# values as weights. A comprehension is used so the per-group temporaries stay
# local: the original loop rebound the notebook-level name `embeddings`
# (the encoded array from the earlier cell) to the last group's list.
grouped = avg_pooling_model.groupby('asin')
weighted_embeddings = {
    product_id: weighted_average_pooling(
        group['embedding'].tolist(),
        group['overall'].tolist(),
    )
    for product_id, group in grouped
}


Possible final choice

In [None]:
# Work on a random subset of at most 100000 reviews. Capping n at the table
# size keeps .sample() from raising on a smaller table, and the fixed
# random_state makes a re-run pick the same rows.
final_model_df = reviews_df.sample(n=min(100000, len(reviews_df)), random_state=42)


In [None]:
# Number of distinct asin values in the sample.
final_model_df['asin'].nunique()

In [None]:
# (rows, columns) of the sample.
final_model_df.shape

In [None]:
# Keep only the four columns used from here on, then drop every row that is
# missing either its asin or its review text.
final_model_df = (
    final_model_df
    .loc[:, ['asin', 'reviewText', 'overall', 'vote']]
    .dropna(subset=['asin', 'reviewText'])
)

In [None]:
# Drop reviews whose text is empty or whitespace-only. The explicit .copy()
# makes the result an independent frame, so assigning new columns to it later
# does not trigger pandas' SettingWithCopyWarning.
final_model_df = final_model_df[final_model_df['reviewText'].str.strip() != ''].copy()
final_model_df

In [None]:
# Load the all-MiniLM-L6-v2 sentence-embedding model (same checkpoint the earlier experiment cell binds to `model`).
model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
# Encode every review in one batched call rather than one model.encode() call
# per row; per-row calls give up batching and are far slower on a sample of
# this size. list(...) splits the 2-D result into one 1-D vector per row so
# each cell of 'embedding' holds a single array.
final_model_df['embedding'] = list(
    model.encode(final_model_df['reviewText'].tolist(), show_progress_bar=False)
)

In [None]:
# Aggregate embeddings per ASIN (mean)
# Make sure every cell is a numpy array before stacking.
# NOTE(review): likely redundant if the column already holds arrays from model.encode — confirm.
final_model_df['embedding'] = final_model_df['embedding'].apply(lambda x: np.array(x))
# For each asin: stack its vectors into a 2-D array, average over the rows,
# then turn the asin index back into a regular column.
asin_embeddings = final_model_df.groupby('asin')['embedding'].apply(lambda x: np.mean(np.stack(x), axis=0)).reset_index()


In [None]:
# Inspect the per-asin mean embeddings.
asin_embeddings

In [None]:
!pip install vaderSentiment


In [None]:
# other features to consider
# avg_review_length, avg_sentence_count, punctuation_density
# capture sentiment like tone or emotion not fully captured in embedding: avg_sentiment_score

# This import rebinds SentimentIntensityAnalyzer, replacing the
# nltk.sentiment.vader class imported at the top of the notebook.
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

# Apply VADER to each review
def get_sentiment_scores(text):
    """Return the analyzer's polarity-score dict for ``text``."""
    return analyzer.polarity_scores(text)

# Build the score table directly from the list of per-review dicts (one
# DataFrame construction instead of expanding a pd.Series per row), keeping
# the original row index so it lines up with final_model_df.
sentiment_df = pd.DataFrame(
    final_model_df['reviewText'].map(get_sentiment_scores).tolist(),
    index=final_model_df.index,
)

# Add to main dataframe
# Any score columns left over from a previous execution of this cell are
# dropped first; otherwise a re-run would append duplicate columns and break
# the aggregation below.
final_model_df = pd.concat(
    [final_model_df.drop(columns=sentiment_df.columns, errors='ignore'), sentiment_df],
    axis=1,
)

# Group by ASIN and aggregate
asin_sentiment = final_model_df.groupby('asin')[['compound', 'pos', 'neu', 'neg']].mean().reset_index()

# (Optional) Add std deviation if you want:
# The std is NaN for an asin with a single review.
sentiment_std = final_model_df.groupby('asin')['compound'].std().reset_index().rename(columns={'compound': 'compound_std'})
asin_sentiment = asin_sentiment.merge(sentiment_std, on='asin', how='left')



In [None]:
# Inspect the per-asin sentiment aggregates.
asin_sentiment

In [None]:
# Inspect the per-asin mean embeddings again before merging.
asin_embeddings

In [None]:
# Join the per-asin embeddings with the per-asin sentiment aggregates on 'asin' (pandas' default inner join).
# NOTE(review): 'embedding' stays a single object column holding one array per row; it presumably
# needs expanding into numeric columns before being fed to a model — confirm.
design_matrix_test = pd.merge(asin_embeddings, asin_sentiment, on='asin')

In [None]:
# Inspect the merged per-asin feature table.
design_matrix_test