In [None]:
import nltk
nltk.download('punkt')       
nltk.download('stopwords') 
nltk.download('punkt_tab')  
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
nltk.download('vader_lexicon')
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import pipeline
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from collections import Counter
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.nn.functional import softmax
from tqdm import tqdm

In [None]:
def load_clean_csv(path):
    """Load a CSV export, skipping any preamble that precedes the header row.

    The file is scanned line by line for the first line that contains the
    text 'Report No.'. That line is treated as the header row and every
    line above it is skipped when parsing.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file. The file is read as UTF-8.

    Returns
    -------
    pandas.DataFrame
        The table parsed from the header line onward.

    Raises
    ------
    ValueError
        If no line in the file contains 'Report No.'.
    """
    header_index = None

    # Stream the file and stop at the header instead of loading every line
    # into memory just to locate a single row.
    with open(path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if 'Report No.' in line:
                header_index = i
                break

    # Fail with an explicit message rather than a bare StopIteration.
    if header_index is None:
        raise ValueError(f"No header line containing 'Report No.' found in {path!r}")

    # Parse with the same encoding that was used for the header scan.
    return pd.read_csv(path, skiprows=header_index, encoding='utf-8')

In [None]:
# Recall data: the three CSV files that are combined into a single table.
recall_files = [
    "../Data/Current Version of Toys Incidence+Recall/Toysandchildren_ArtsandCrafts.csv",
    "../Data/Current Version of Toys Incidence+Recall/Toysandchildren_Riding_Toys.csv",
    "../Data/Current Version of Toys Incidence+Recall/Toysandchildren_Toys.csv",
]

# Parse each file (skipping its preamble), then stack the frames row-wise
# under a fresh 0..n-1 index.
recall_dfs = list(map(load_clean_csv, recall_files))
recalls_df = pd.concat(recall_dfs, ignore_index=True)


In [None]:
# Load the raw reviews table from a pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load this file if it
# comes from a trusted source.
reviews_df = pd.read_pickle('../rebekah-idea-testing/reviews_raw.pkl')


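Sample the Reviews Data
- The full reviews dataset is too large to process with the computing power available, so the analysis runs on a random sample of reviews.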

In [None]:
# Draw a reproducible random subset of the reviews.
# A fixed seed keeps the sample identical across kernel restarts, and capping
# n at the frame length avoids a ValueError when fewer rows are available.
N_REVIEW_SAMPLE = 5000
SAMPLE_SEED = 42
reviews_sample_df = reviews_df.sample(
    n=min(N_REVIEW_SAMPLE, len(reviews_df)),
    random_state=SAMPLE_SEED,
)

In [None]:
# Keep only the four columns used downstream, then discard any row that is
# missing the product id (asin), the review text, or the summary.
reviews_sample_df = (
    reviews_sample_df[['asin', 'reviewText', 'summary', 'overall']]
    .copy()
    .dropna(subset=['asin', 'reviewText', 'summary'])
)

In [None]:
# Drop rows whose summary is empty or whitespace-only (the summaries themselves
# are left unmodified). The explicit .copy() makes the result an independent
# frame, so adding columns to it later does not trigger SettingWithCopyWarning.
reviews_model_df = reviews_sample_df[reviews_sample_df['summary'].str.strip() != ''].copy()

Embed the Summaries Row by Row

In [None]:
# Load the sentence-embedding model used to encode the review summaries.
model = SentenceTransformer('all-MiniLM-L6-v2')
# English stop-word set and WordNet lemmatizer.
# NOTE(review): neither is referenced in the cells visible here — presumably
# used by later text preprocessing; confirm or remove.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [None]:
# Encode every review summary with the sentence-transformer model, in batches
# of 32, returning the result as a NumPy array.
summary_embeddings = model.encode(
    reviews_model_df['summary'].tolist(),
    convert_to_numpy=True,
    show_progress_bar=True,
    batch_size=32,
)

# Store the embeddings row by row: each cell of the new column holds the
# vector belonging to that review's summary.
reviews_model_df['summary_embeddings'] = [vector for vector in summary_embeddings]

Compute Sentiment of Summary 

In [None]:
# Load the pretrained sentiment classifier and its matching tokenizer.
# NOTE(review): downstream code uses the probability of class index 0 as the
# sentiment weight — presumably the "negative" label for this checkpoint;
# confirm against the model card.
model_name = 'cardiffnlp/twitter-roberta-base-sentiment'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model_sent = AutoModelForSequenceClassification.from_pretrained(model_name)

In [None]:
def batched_sentiment_weights(texts, batch_size=64, max_length=512):
    """Score texts with the sentiment model and return each text's class-0 probability.

    Texts are tokenized and run through ``model_sent`` in batches; the logits
    are converted to probabilities with softmax and the probability of class
    index 0 is collected for every text.

    Parameters
    ----------
    texts : sequence of str
        Texts to score. Any iterable is accepted; it is materialised as a list.
    batch_size : int, default 64
        Number of texts tokenized and scored per forward pass.
    max_length : int, default 512
        Token limit passed to the tokenizer. It is given explicitly so that
        ``truncation=True`` takes effect even if the tokenizer configuration
        does not define a maximum length.
        NOTE(review): 512 assumes a RoBERTa-base sized model — confirm if the
        checkpoint changes.

    Returns
    -------
    list
        One class-0 probability per input text, in input order (empty list for
        empty input).
        NOTE(review): class 0 is presumably the "negative" label — confirm
        against the model card.
    """
    # The tokenizer expects a plain list; this also lets a Series or tuple through.
    texts = list(texts)
    # Run the batches on whatever device the model currently lives on.
    device = next(model_sent.parameters()).device
    sentiment_scores = []

    for start in tqdm(range(0, len(texts), batch_size)):
        batch_texts = texts[start:start + batch_size]
        inputs = tokenizer(
            batch_texts,
            return_tensors='pt',
            truncation=True,
            padding=True,
            max_length=max_length,
        )
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
        # Inference only: skip gradient tracking to save memory and time.
        with torch.no_grad():
            logits = model_sent(**inputs).logits
        probs = softmax(logits, dim=1).cpu().numpy()
        # Column 0 holds the probability of class index 0 for each text in the batch.
        sentiment_scores.extend(probs[:, 0])

    return sentiment_scores


In [None]:
# Score every summary with the sentiment model and store the result per row;
# this column is what weighted_avg_embedding reads as the averaging weights.
reviews_model_df['sentiment_weight'] = batched_sentiment_weights(reviews_model_df['summary'].tolist())


Aggregate the Summary Embeddings

In [None]:
def weighted_avg_embedding(group):
    """Collapse a group of reviews into one sentiment-weighted mean embedding.

    Reads the 'sentiment_weight' and 'summary_embeddings' columns of ``group``
    and returns the weighted average of the embeddings along the row axis.
    When the weights sum to zero, every row is weighted equally instead.
    """
    raw_weights = np.array(group['sentiment_weight'].tolist())
    vectors = np.stack(group['summary_embeddings'].tolist())
    # A zero total cannot be normalised, so fall back to uniform weights.
    effective_weights = np.ones_like(raw_weights) if raw_weights.sum() == 0 else raw_weights
    return np.average(vectors, axis=0, weights=effective_weights)

In [None]:
# Aggregate per product: group the reviews by asin, reduce each group to a
# single vector with weighted_avg_embedding, then turn the asin index back
# into a regular column.
agg_summary_embeddings = (
    reviews_model_df
    .groupby('asin', group_keys=False)
    .apply(weighted_avg_embedding, include_groups=False)
    .reset_index()
)


In [None]:
# Display the per-asin aggregated embeddings for inspection.
agg_summary_embeddings

Make 384 Columns for the Summary Embedding

In [None]:
# Spread each aggregated embedding (stored in the column labelled 0) across
# separate columns, one per vector element, keeping the same row index so the
# result lines up with agg_summary_embeddings.
expanded_df = pd.DataFrame(agg_summary_embeddings[0].tolist(), index=agg_summary_embeddings.index)

In [None]:
# Name the columns embed_0 ... embed_{d-1}. The width d is taken from the frame
# itself instead of hard-coding 384, so switching to an embedding model with a
# different dimension does not raise a length-mismatch error.
expanded_df.columns = [f'embed_{i}' for i in range(expanded_df.shape[1])]


In [None]:
# Put the asin column next to the expanded embedding columns; the two pieces
# share the same row index, so they are joined column-wise (axis=1).
final_df = pd.concat([agg_summary_embeddings['asin'], expanded_df], axis=1)

In [None]:
# Display the final table: asin plus one column per embedding element.
final_df