In [30]:
import nltk
nltk.download('punkt')       
nltk.download('stopwords') 
nltk.download('punkt_tab')  
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
nltk.download('vader_lexicon')
import numpy as np
import pandas as pd
from helper_functions import *
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import pipeline
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from collections import Counter
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.nn.functional import softmax



[nltk_data] Downloading package punkt to
[nltk_data]     /Users/rebekaheichberg/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rebekaheichberg/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/rebekaheichberg/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/rebekaheichberg/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/rebekaheichberg/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/rebekaheichberg/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data

Load in the Data

In [31]:
# Load in Recall data

# One CSV per product category. Each file is read with load_clean_csv
# (not defined in this notebook; presumably supplied by helper_functions).
recall_files = [
    "../Data/Current Version of Toys Incidence+Recall/Toysandchildren_ArtsandCrafts.csv",
    "../Data/Current Version of Toys Incidence+Recall/Toysandchildren_Riding_Toys.csv",
    "../Data/Current Version of Toys Incidence+Recall/Toysandchildren_Toys.csv",
]

# Stack the per-file frames; ignore_index=True gives the result a fresh 0..n-1 index
# so row labels from the separate files do not repeat.
recall_dfs = list(map(load_clean_csv, recall_files))
recalls_df = pd.concat(recall_dfs, ignore_index=True)



In [None]:
# Load in reviews data
# NOTE(review): unpickling can execute arbitrary code, so 'reviews_raw.pkl' should only be
# loaded if it was produced by a trusted step of this project.
reviews_df = pd.read_pickle('reviews_raw.pkl')
# Number of distinct products (asin values) in the raw reviews.
reviews_df['asin'].nunique()

Embed the Incident Description

In [None]:
# initialize model to create embeddings on incident description text
# The same encoder is reused further down for the review summaries and review texts.
model = SentenceTransformer('all-MiniLM-L6-v2')
# English stop-word set and WordNet lemmatizer for token clean-up.
# NOTE(review): preprocess() is defined outside this notebook; whether it reads these
# globals cannot be seen from here — confirm.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [None]:
# from the recalls data, embed the incident description
# Each description is encoded on its own and the vectors are averaged. Encoding one giant
# concatenated string would be cut off at the encoder's maximum sequence length, so only
# the first few incidents would influence the embedding.
incident_texts = recalls_df['Incident Description'].dropna().astype(str).tolist()
if not incident_texts:
    raise ValueError("No incident descriptions available to embed")

# Kept (with its original spelling) in case another cell still reads the joined text.
combined_indicent_text = " ".join(incident_texts)

per_incident_embeddings = model.encode(incident_texts, convert_to_numpy=True)
# Shape (1, dim): a single query row, which is the 2-D layout cosine_similarity expects.
incident_desc_embedding = np.asarray(per_incident_embeddings).mean(axis=0).reshape(1, -1)


From the Incident Description column, we generate a dictionary of words and their frequencies. We then use an LLM to extract the negative words.

In [None]:
# Tokenise every non-missing incident description with preprocess()
# (defined outside this notebook; used here as returning an iterable of tokens per text).
incidents = recalls_df['Incident Description'].dropna().astype(str)
all_tokens = incidents.apply(preprocess)

# Collapse the per-incident token lists into one flat list, then count occurrences
flattened_tokens = [tok for tokens in all_tokens for tok in tokens]
word_freq = Counter(flattened_tokens)
top_words = word_freq.most_common(20)
print(top_words)

# Cell output: the full vocabulary, which is handed to an LLM to pick out negative words
list(word_freq)

In [None]:
# Negative / hazard-related words that ChatGPT selected from the keys of word_freq.
# Spellings are left exactly as returned (presumably they mirror the tokens in word_freq).
# Every word is listed exactly once so that any count or loop over this list is not
# inflated by repeated entries.
negative_words = [
    'choke', 'hazard', 'dangerous', 'danger', 'dermatitis', 'bother',
    'accidentally', 'ingest', 'notorious', 'toxic', 'warn', 'cause',
    'allergic', 'reaction', 'rash', 'sensitization', 'occur',
    'seek', 'medical', 'die', 'poison', 'elevated', 'burn', 'urgent',
    'treatment', 'pinch', 'pinched', 'slice', 'lacerate', 'moldy',
    'waste', 'black', 'spot', 'bleed', 'miss', 'sharp', 'metal',
    'damage', 'difficulty', 'injure', 'inconvenience', 'serious',
    'return', 'disagree', 'concern', 'broken', 'shatter', 'remove',
    'unsafe', 'terrible', 'odor', 'infuriate', 'infection', 'irritation',
    'cough', 'irritate', 'headache', 'chemical', 'blister', 'bleeding',
    'sick', 'asthma', 'attack', 'pain', 'scar', 'nasty', 'impact',
    'accident', 'penetrate', 'trapping', 'ignite', 'overheat',
    'fire', 'fail', 'explode', 'burning', 'puncture', 'swollen',
    'wound', 'injury', 'hurt', 'sore', 'contaminate', 'vomit',
    'allergy', 'deadly', 'severe', 'dyshidrotic', 'eczema',
    'bacterial', 'disapointing', 'poorly', 'redness', 'burnt',
    'complain', 'bad', 'dangerously', 'emergency', 'hospital'
]


Take a SAMPLE of the reviews data to check the approach

In [None]:
# make a smaller reviews dataframe
# A fixed seed draws the same rows on every run, so downstream numbers are reproducible.
# The size is capped at the number of available rows because DataFrame.sample raises
# when n exceeds it (sampling is without replacement).
reviews_sample_df = reviews_df.sample(n=min(5000, len(reviews_df)), random_state=42)

In [None]:
# Load sentiment model
# Tokenizer and sequence-classification model are both loaded from the same Hugging Face checkpoint.
# NOTE(review): compute_sentiment_weight returns the probability at class index 0 of this
# model — confirm on the model card which sentiment that index stands for.
model_name = 'cardiffnlp/twitter-roberta-base-sentiment'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model_sent = AutoModelForSequenceClassification.from_pretrained(model_name)

In [None]:
def compute_sentiment_weight(text):
    """Return the sentiment model's class-0 probability for ``text``.

    Parameters
    ----------
    text : str
        Text to score (the notebook applies this to review summaries).

    Returns
    -------
    numpy floating scalar
        Softmax probability of class index 0 from ``model_sent``.
        NOTE(review): index 0 is presumably the negative class — confirm on the model card.
    """
    # max_length is passed explicitly: truncation=True can only shorten the input when a
    # length limit is known, and this avoids depending on the tokenizer config carrying one.
    inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    with torch.no_grad():  # inference only, no gradients needed
        logits = model_sent(**inputs).logits
    # One input text -> one row of class probabilities, flattened to 1-D.
    probs = softmax(logits, dim=1).numpy().flatten()
    return probs[0]



In [None]:
# TODO: should we be dropping duplicates?
# Display the rows that repeat an earlier (asin, reviewText, summary) combination.
reviews_sample_df.loc[reviews_sample_df.duplicated(subset=['asin', 'reviewText', 'summary'])]

In [None]:
# Keep only the modelling columns, then discard rows lacking an asin, review text or
# summary ('overall' is allowed to be missing).
model_columns = ['asin', 'reviewText', 'summary', 'overall']
required_columns = ['asin', 'reviewText', 'summary']
reviews_sample_df = reviews_sample_df[model_columns].copy().dropna(subset=required_columns)

In [None]:
# Drop rows whose review text or summary is empty or whitespace-only.
# .copy() makes the result an independent frame, so adding columns to it afterwards
# does not trigger pandas' SettingWithCopyWarning.
has_review_text = reviews_sample_df['reviewText'].str.strip() != ''
has_summary = reviews_sample_df['summary'].str.strip() != ''
reviews_model_df = reviews_sample_df[has_review_text & has_summary].copy()

In [None]:
# Score each review title (the 'summary' column) with the sentiment model; the score is
# stored per row in 'sentiment_weight'.
review_titles = reviews_model_df['summary']
reviews_model_df['sentiment_weight'] = review_titles.apply(compute_sentiment_weight)


In [None]:
# Encode every review summary with the sentence-embedding model
summary_texts = reviews_model_df['summary'].tolist()
summary_embeddings = model.encode(
    summary_texts,
    batch_size=32,  # 32, 64 or 128 depending on available memory
    show_progress_bar=True,
    convert_to_numpy=True,
)
# list(...) splits the 2-D result into one array per row, so each DataFrame cell
# holds a single embedding vector.
reviews_model_df['summary_embeddings'] = list(summary_embeddings)

In [None]:
def weighted_avg_embedding(group):
    """Sentiment-weighted mean of one group's summary embeddings.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows with a 'sentiment_weight' column and a 'summary_embeddings' column
        holding equal-length vectors.

    Returns
    -------
    numpy.ndarray
        Weighted average over the rows. When the weights add up to zero, every
        row is weighted equally instead.
    """
    vectors = np.stack(group['summary_embeddings'].tolist())
    row_weights = np.array(group['sentiment_weight'].tolist())
    # np.average cannot normalise by a zero weight total, so use uniform weights then.
    row_weights = np.ones_like(row_weights) if row_weights.sum() == 0 else row_weights
    return np.average(vectors, axis=0, weights=row_weights)

# One sentiment-weighted embedding per product (asin).
# Only the two columns the helper reads are handed to apply: letting groupby.apply operate
# on the grouping column as well is deprecated in recent pandas releases.
product_embeddings = (
    reviews_model_df
    .groupby('asin')[['sentiment_weight', 'summary_embeddings']]
    .apply(weighted_avg_embedding)
)

# 2-D matrix with one row per asin, in the same order as product_embeddings.index.
product_embedding_matrix = np.vstack(product_embeddings.tolist())


In [None]:
# Inspect the per-product embeddings (indexed by asin).
product_embeddings

In [None]:
# combined_incident_text = " ".join(recalls_df['Incident Description'].dropna().tolist())


# incident_desc_embedding = model.encode(
#     [combined_incident_text],  
#     convert_to_numpy=True
# )


# incident_desc_embedding = np.array(incident_desc_embedding).reshape(1, -1)


In [None]:
# from sklearn.metrics.pairwise import cosine_similarity

# # Compute cosine similarity
# similarities = cosine_similarity(product_embedding_matrix, incident_desc_embedding)

# # Format as DataFrame
# similarity_df = pd.DataFrame({
#     'asin': product_embeddings.index,
#     'similarity_to_incidents': similarities.flatten()
# })


Review Text Experimentation

In [None]:
# Encode the full review bodies with the same sentence-embedding model
review_texts = reviews_model_df['reviewText'].tolist()
reviewtext_embeddings = model.encode(
    review_texts,
    batch_size=32,  # 32, 64 or 128 depending on available memory
    show_progress_bar=True,
    convert_to_numpy=True,
)
# One embedding per row, stored in the frame's current row order
reviews_model_df['reviewtext_embeddings'] = list(reviewtext_embeddings)

In [None]:
# Cosine similarity of every review embedding to the incident-description embedding.
# The incident side is a single row, so the result has shape (1, n_reviews); keep that
# row as a 1-D array.
similarity_matrix = cosine_similarity(incident_desc_embedding, reviewtext_embeddings)
review_similarities = similarity_matrix[0]

In [None]:
# put the cosine similarities between review and incidents on the main df
# The array is assigned by position, so reviews_model_df must still be in the row order it
# had when reviewtext_embeddings was computed (recompute the similarities first if the
# frame has been sorted since).
reviews_model_df['review_cosine_sim'] = review_similarities

In [None]:
# Order reviews from most to least similar to the incident descriptions.
reviews_model_df = reviews_model_df.sort_values('review_cosine_sim', ascending=False)

In [None]:
# Copy the review text, summary and similarity columns to the system clipboard
# (side effect only; the cell displays nothing).
reviews_model_df[['reviewText', 'summary', 'review_cosine_sim']].to_clipboard()

In [None]:
# Distribution of the number of (non-null) review texts per product
reviews_per_asin = reviews_model_df.groupby('asin')['reviewText'].count()
reviews_per_asin.describe()

In [None]:
# Per product: mean and maximum similarity of its reviews to the incident descriptions
similarity_by_asin = reviews_model_df.groupby('asin')['review_cosine_sim']
aggregation_df = similarity_by_asin.agg(
    mean_similarity='mean',
    max_similarity='max',
).reset_index()

In [None]:
# Inspect the per-asin mean / max similarity table.
aggregation_df

In [None]:
# Inspect the review frame with all derived columns.
reviews_model_df

Methods We are not Using

In [None]:
# another way is to do thematic classification

from sklearn.cluster import KMeans
from collections import Counter
import re

cluster = 5  # number of KMeans clusters

# Rows without an incident description cannot be embedded, so only described rows are
# clustered; any other row ends up with NaN in the 'cluster' column.
described_incidents = recalls_df['Incident Description'].dropna().astype(str)
test_embeddings = model.encode(described_incidents.tolist())

# random_state makes the cluster ids repeatable between runs; n_init is pinned because
# its default differs between scikit-learn versions.
kmeans = KMeans(n_clusters=cluster, n_init=10, random_state=0)
# Assigning a Series aligns on the row index, which maps each label back to its own row.
recalls_df['cluster'] = pd.Series(
    kmeans.fit_predict(test_embeddings),
    index=described_incidents.index,
)

def get_top_words(texts, n=cluster):
    """Return the ``n`` most frequent lemmatised, non-stop-word tokens in ``texts``.

    Parameters
    ----------
    texts : iterable of str
        Documents to analyse; they are joined with spaces and lower-cased.
    n : int, optional
        How many words to return. The default is the value the global ``cluster``
        had when this function was defined.

    Returns
    -------
    list of str
        Lemmas ordered from most to least frequent. Tokens of one or two characters
        and English stop words are ignored.
    """
    english_stopwords = set(stopwords.words('english'))
    tokens = re.findall(r'\b\w+\b', ' '.join(texts).lower())
    # The stop-word and length filters run on the raw token, before lemmatisation.
    lemma_counts = Counter(
        lemmatizer.lemmatize(tok)
        for tok in tokens
        if tok not in english_stopwords and len(tok) > 2
    )
    return [lemma for lemma, _ in lemma_counts.most_common(n)]

# Print the most frequent words of each cluster's incident descriptions
for cluster_num in range(cluster):
    in_cluster = recalls_df['cluster'] == cluster_num
    texts = recalls_df.loc[in_cluster, 'Incident Description']
    print(f"\nTop words for cluster {cluster_num}: {get_top_words(texts)}")



In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Bag-of-words counts over the incident descriptions (English stop words removed)
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(incidents)

# 40-topic LDA; fit() returns the estimator itself, so construction and fitting are chained
lda = LatentDirichletAllocation(n_components=40, random_state=0).fit(X)

# Keywords per topic: the 10 highest-weighted terms, listed in ascending weight order.
# NOTE: top_words is rebound here, replacing the list built in the word-frequency cell.
words = vectorizer.get_feature_names_out()
for i, topic in enumerate(lda.components_):
    top_words = [words[term_idx] for term_idx in topic.argsort()[-10:]]
    print(f"Topic {i+1}: {top_words}")
