In [None]:
import nltk
nltk.download('punkt')       
nltk.download('stopwords') 
nltk.download('punkt_tab')  
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
nltk.download('vader_lexicon')
import numpy as np
import pandas as pd
from helper_functions import *
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import pipeline
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from collections import Counter

In [None]:
# Load the recall / incident data.

# One CSV per product category; all three are read with the same helper and stacked.
recall_files = [
    "../Data/Current Version of Toys Incidence+Recall/Toysandchildren_ArtsandCrafts.csv",
    "../Data/Current Version of Toys Incidence+Recall/Toysandchildren_Riding_Toys.csv",
    "../Data/Current Version of Toys Incidence+Recall/Toysandchildren_Toys.csv"
]

# load_clean_csv comes from helper_functions; its cleaning rules are not visible here.
recall_dfs = list(map(load_clean_csv, recall_files))

# ignore_index=True gives the combined frame a fresh 0..n-1 index.
recalls_df = pd.concat(recall_dfs, ignore_index=True)



In [None]:
# Load in reviews data
# NOTE(security): unpickling can execute arbitrary code -- only load 'reviews_raw.pkl'
# when it was produced by a trusted step of this project.
reviews_df = pd.read_pickle('reviews_raw.pkl')
# Cell output: number of distinct 'asin' values (later cells group reviews per product by this column).
reviews_df['asin'].nunique()

In [None]:
# initialize model to create embeddings on incident description text
# The same `model` instance is reused further down to embed the review texts as well,
# so incident and review vectors live in the same embedding space.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
# From the recalls data, build ONE reference embedding for the incident descriptions.
#
# The descriptions are encoded one by one and averaged. Encoding the single
# concatenated string instead would let the model see only its first
# `max_seq_length` word pieces (longer input is truncated), so the reference
# vector would reflect just the first few incidents.
incident_texts = recalls_df['Incident Description'].dropna().astype(str).tolist()
if not incident_texts:
    raise ValueError("recalls_df has no non-missing 'Incident Description' values to embed")

# Kept because the name is module-level state that other cells may read.
combined_indicent_text = " ".join(incident_texts)

# One row per incident description.
incident_embeddings = np.asarray(model.encode(incident_texts, convert_to_numpy=True))

# Centroid of the incident embeddings, shaped (1, dim) so it can be passed
# directly to cosine_similarity as a single-row matrix.
incident_desc_embedding = incident_embeddings.mean(axis=0).reshape(1, -1)


In [None]:
# Text-preprocessing resources: NLTK's English stop-word list (as a set, for fast
# membership tests) and a WordNet lemmatizer to reduce words to their lemmas.
# NOTE(review): no cell shown here reads these two names directly; presumably they
# mirror what `preprocess` (from helper_functions) uses -- confirm.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [None]:
# Run the preprocessing helper over every non-missing incident description.
# `preprocess` comes from helper_functions; each result is iterated as a
# sequence of tokens below.
incidents = recalls_df['Incident Description'].dropna().astype(str)
all_tokens = incidents.apply(preprocess)

# Collapse the per-description token sequences into one flat token list.
flattened_tokens = []
for token_list in all_tokens:
    flattened_tokens.extend(token_list)


In [None]:
# Optional: frequency table over all incident tokens, and its 20 most common entries.
word_freq = Counter()
word_freq.update(flattened_tokens)

top_words = word_freq.most_common(20)
print(top_words)

In [None]:
# Full vocabulary seen in the incident descriptions (Counter keys, in first-seen order).
list(word_freq)

In [None]:
# Negative / harm-related words, picked (with ChatGPT) out of the word_freq vocabulary.
# Entries are lemmas exactly as they appear in that vocabulary, so misspellings that
# occur in the data (e.g. 'disapointing') are kept on purpose.
# Each word is listed once: a repeated entry would be counted twice by any code that
# loops over this list.
negative_words = [
    'choke', 'hazard', 'dangerous', 'danger', 'dermatitis', 'bother',
    'accidentally', 'ingest', 'notorious', 'toxic', 'warn', 'cause',
    'allergic', 'reaction', 'rash', 'sensitization', 'occur',
    'seek', 'medical', 'die', 'poison', 'elevated', 'burn', 'urgent',
    'treatment', 'pinch', 'pinched', 'slice', 'lacerate', 'moldy',
    'waste', 'black', 'spot', 'bleed', 'miss', 'sharp', 'metal',
    'damage', 'difficulty', 'injure', 'inconvenience', 'serious',
    'return', 'disagree', 'concern', 'broken', 'shatter', 'remove',
    'unsafe', 'terrible', 'odor', 'infuriate', 'infection', 'irritation',
    'cough', 'irritate', 'headache', 'chemical', 'blister', 'bleeding',
    'sick', 'asthma', 'attack', 'pain', 'scar', 'nasty', 'impact',
    'accident', 'penetrate', 'trapping', 'ignite', 'overheat',
    'fire', 'fail', 'explode', 'burning', 'puncture', 'swollen',
    'wound', 'injury', 'hurt', 'sore', 'contaminate', 'vomit',
    'allergy', 'deadly', 'severe', 'dyshidrotic', 'eczema',
    'bacterial', 'disapointing', 'poorly', 'redness', 'burnt',
    'complain', 'bad', 'dangerously', 'emergency', 'hospital'
]

# Guard against duplicates creeping back in when the list is edited by hand.
assert len(negative_words) == len(set(negative_words)), "negative_words contains duplicates"


In [None]:
# another way is to do thematic classification

from sklearn.cluster import KMeans

# Cluster only rows that actually have a description. The other cells that read this
# column drop missing values first; without the same filter a missing value would reach
# the encoder as a float NaN instead of text.
has_description = recalls_df['Incident Description'].notna()
incident_texts = recalls_df.loc[has_description, 'Incident Description'].astype(str).tolist()

# One embedding row per described incident.
test_embeddings = model.encode(incident_texts)

# random_state and n_init are pinned so that cluster ids are the same on every run;
# otherwise "cluster 3" can mean a different theme after each kernel restart.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=0)
cluster_labels = pd.Series(
    kmeans.fit_predict(test_embeddings),
    index=recalls_df.index[has_description],
)

# Align labels back onto the full frame; rows without a description get a missing
# cluster. Relies on recalls_df having a unique index (it is built with ignore_index=True).
recalls_df['cluster'] = cluster_labels.reindex(recalls_df.index)



In [None]:
# Print the first five incident descriptions of each of the 5 clusters for a quick read-through.
for cluster_num in range(5):
    print(f"\nCluster {cluster_num} samples:")
    in_cluster = recalls_df['cluster'] == cluster_num
    for description in recalls_df.loc[in_cluster, 'Incident Description'].head(5):
        print("-", description)


In [None]:
from collections import Counter
import re

def get_top_words(texts, n=10, stop_words=None):
    """Return the ``n`` most frequent words across ``texts``.

    Parameters
    ----------
    texts : iterable of str, or a single str
        Documents to count words in. Entries that are not strings (``None``,
        ``NaN`` from a pandas column, ...) are skipped instead of making
        ``str.join`` raise ``TypeError``. A bare string is treated as one
        document rather than being iterated character by character.
    n : int, default 10
        Maximum number of words to return.
    stop_words : collection of str, optional
        Lower-case words to ignore. Defaults to a small built-in English list.

    Returns
    -------
    list of str
        Words ordered from most to least frequent; ties keep first-seen order.
        Words of one or two characters are never returned.
    """
    if stop_words is None:
        # add more stopwords as needed
        stop_words = {'the', 'and', 'is', 'to', 'in', 'of', 'a', 'for', 'on', 'with', 'it', 'this', 'that'}

    if isinstance(texts, str):
        texts = [texts]

    # Keep only real strings so missing values in a DataFrame column cannot break the join.
    corpus = ' '.join(text for text in texts if isinstance(text, str)).lower()
    tokens = re.findall(r'\b\w+\b', corpus)

    counts = Counter(tok for tok in tokens if tok not in stop_words and len(tok) > 2)
    return [word for word, _ in counts.most_common(n)]

# Summarise each of the 5 clusters by its most frequent words.
for cluster_num in range(5):
    cluster_rows = recalls_df['cluster'] == cluster_num
    texts = recalls_df.loc[cluster_rows, 'Incident Description']
    print(f"\nTop words for cluster {cluster_num}: {get_top_words(texts)}")


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Bag-of-words counts over the incident descriptions (English stop words removed).
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(incidents)

# Fit a 40-topic LDA model on the counts; the seed is fixed so topics are repeatable.
lda = LatentDirichletAllocation(n_components=40, random_state=0).fit(X)

# Get keywords per topic: the 10 highest-weighted vocabulary entries of each topic,
# listed from lowest to highest weight (argsort is ascending).
words = vectorizer.get_feature_names_out()
for i, topic in enumerate(lda.components_):
    top_word_ids = topic.argsort()[-10:]
    top_words = [words[word_id] for word_id in top_word_ids]
    print(f"Topic {i+1}: {top_words}")


In [None]:
# make a smaller reviews dataframe
# - n is capped at the number of available rows: DataFrame.sample raises ValueError
#   when asked for more rows than exist (sampling without replacement).
# - random_state is pinned so every run works on the same 5000 reviews.
reviews_sample_df = reviews_df.sample(n=min(5000, len(reviews_df)), random_state=0)

In [None]:
# TODO: should we be dropping duplicates?
# Show rows that repeat an earlier (asin, reviewText, summary) combination.
duplicate_mask = reviews_sample_df.duplicated(subset=['asin', 'reviewText', 'summary'])
reviews_sample_df.loc[duplicate_mask]

In [None]:
# Keep only the columns used downstream, then drop rows that are missing the
# product id, the review text or the summary ('overall' may stay missing).
review_columns = ['asin', 'reviewText', 'summary', 'overall']
required_columns = ['asin', 'reviewText', 'summary']
reviews_sample_df = (
    reviews_sample_df.loc[:, review_columns]
    .copy()
    .dropna(subset=required_columns)
)

In [None]:
# Drop reviews whose text or summary is empty or whitespace-only.
# (The values themselves are not modified; .str.strip() is only used for the test.)
has_review_text = reviews_sample_df['reviewText'].str.strip() != ''
has_summary = reviews_sample_df['summary'].str.strip() != ''

# Explicit .copy(): later cells add columns to reviews_model_df, and assigning into a
# filtered slice of another frame triggers pandas' SettingWithCopyWarning.
reviews_model_df = reviews_sample_df[has_review_text & has_summary].copy()

In [None]:
# Embed every review text with the shared sentence-embedding model.
review_texts = reviews_model_df['reviewText'].tolist()
reviews_embeddings = model.encode(
    review_texts,
    batch_size=32,    # 32, 64, 128 based on memory
    show_progress_bar=True,
    convert_to_numpy=True,
)

# Store one embedding vector per row on the frame ...
reviews_model_df['reviews_embeddings'] = list(reviews_embeddings)

# ... and keep a 2-D matrix (one row per review) for the similarity computation.
reviews_embeddings = np.vstack(np.array(reviews_embeddings))

In [None]:
# embed the summary column
# reviews_model_df['summary_embedding'] = reviews_model_df['summary'].apply(lambda x: model.encode(x, show_progress_bar=False))

In [None]:
# take cosine similarity between reviews and the incident description embedding
# incident_desc_embedding was reshaped to a single row, so the result has exactly one
# row; [0] pulls it out as a 1-D array holding one similarity per review embedding.
review_similarities = cosine_similarity(incident_desc_embedding, reviews_embeddings)[0]

In [None]:
# put the cosine similarities between review and incidents on the main df
# Assigning a bare array is positional: this relies on reviews_embeddings having been
# computed from reviews_model_df['reviewText'] in the frame's current row order.
reviews_model_df['review_cosine_sim'] = review_similarities

In [None]:
# Display the frame, which at this point also carries the 'reviews_embeddings'
# and 'review_cosine_sim' columns.
reviews_model_df

In [None]:
# check number of reviews per product
# (count of non-null reviewText values per asin, then summary statistics of those counts)
reviews_model_df.groupby('asin')['reviewText'].count().describe()

In [None]:
# Per-product summary of how similar its reviews are to the incident reference
# embedding: the average and the single highest similarity.
similarity_by_asin = reviews_model_df.groupby('asin')['review_cosine_sim']
aggregation_df = (
    similarity_by_asin
    .agg(['mean', 'max'])
    .rename(columns={'mean': 'mean_similarity', 'max': 'max_similarity'})
    .reset_index()
)

In [None]:
# Display the per-asin table of mean / max review-to-incident similarity.
aggregation_df

In [None]:
from tqdm import tqdm
# Classify the summary of every review, in small batches.
# classify_batch_all_scores comes from helper_functions.
# NOTE(review): it is assumed to return one dict of scores per input text, in input
# order -- confirm against the helper.
batch_size = 8
texts = reviews_model_df['summary'].fillna('').tolist()
original_indices = reviews_model_df.index.tolist()

# (index label, score dict) pairs, so the scores can be joined back onto the frame by
# index rather than by position.
scored_rows = []
for i in tqdm(range(0, len(texts), batch_size)):
    batch = texts[i:i+batch_size]
    batch_indices = original_indices[i:i+batch_size]

    scores = classify_batch_all_scores(batch)
    scored_rows.extend(zip(batch_indices, scores))

label_scores_df = pd.DataFrame(
    [score_dict for _, score_dict in scored_rows],
    index=[idx for idx, _ in scored_rows],
)

# Drop score columns left over from a previous run of this cell before joining;
# otherwise re-running the cell fails because join() refuses overlapping column names.
# NOTE(review): this assumes no score column shares a name with an original review column.
reviews_model_df = (
    reviews_model_df
    .drop(columns=label_scores_df.columns, errors='ignore')
    .join(label_scores_df)
)







In [None]:
# Display the frame after the classifier score columns were joined on.
reviews_model_df

In [None]:
# other features to consider
# avg_review_length, avg_sentence_count, punctuation_density
# capture sentiment like tone or emotion not fully captured in embedding: avg_sentiment_score

# NOTE(review): this rebinds SentimentIntensityAnalyzer, which the first cell already
# imports from nltk.sentiment.vader (and downloads 'vader_lexicon' for). Only one of
# the two implementations is needed -- decide which and drop the other.
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()


def get_sentiment_scores(text):
    """Return the VADER polarity scores ('neg', 'neu', 'pos', 'compound') for one text."""
    return analyzer.polarity_scores(text)


# Build the modelling frame from reviews_model_df. `final_model_df` is not defined
# by any earlier cell, so reading it here fails on a fresh kernel; rebuilding it on
# every run also keeps the concat below from adding duplicate columns on re-run.
# NOTE(review): confirm reviews_model_df is the intended source frame.
final_model_df = reviews_model_df.copy()

# Apply VADER to each review: one row of scores per review, aligned on the index.
sentiment_df = pd.DataFrame(
    final_model_df['reviewText'].map(get_sentiment_scores).tolist(),
    index=final_model_df.index,
)

# Add to main dataframe
final_model_df = pd.concat([final_model_df, sentiment_df], axis=1)

# Group by ASIN and aggregate: mean of every sentiment score per product.
reviews_by_asin = final_model_df.groupby('asin')
asin_sentiment = reviews_by_asin[['compound', 'pos', 'neu', 'neg']].mean().reset_index()

# Spread of the compound score per product (NaN for products with a single review).
sentiment_std = (
    reviews_by_asin['compound']
    .std()
    .reset_index()
    .rename(columns={'compound': 'compound_std'})
)
asin_sentiment = asin_sentiment.merge(sentiment_std, on='asin', how='left')

