# Documentation for `SummaryEmbeddingColumns.ipynb`

## Overview

This notebook processes Amazon product review summaries to generate sentiment-weighted sentence embeddings for each product. 

## Main Steps

We use the `all-MiniLM-L6-v2` SentenceTransformer model to embed each review summary into a 384-dimensional vector. We then use the `cardiffnlp/twitter-roberta-base-sentiment` model to compute sentiment scores for each summary. The negative sentiment probability is used as a weight for each summary.

For each ASIN, we then compute a sentiment-weighted average of all summary embeddings, resulting in one embedding vector per product.

## Input Files

- `../Data/amazon_reviews.json` — Raw Amazon review data.
- `../Data/asin_labels_clean_review_df.csv` — Cleaned ASIN labels.
- `../Data/cpsc_data/incident_reports/*.csv` — CPSC incident reports (for reference).

## Output Files

- `../Data/agg_summary_embeddings.pkl` — Pickle file containing the final DataFrame: one row per ASIN, with an `asin` column followed by 384 summary-embedding columns (`s_embed_0` … `s_embed_383`).



In [None]:
import nltk
nltk.download('punkt')       
nltk.download('stopwords') 
nltk.download('punkt_tab')  
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
nltk.download('vader_lexicon')
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import pipeline
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from collections import Counter
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.nn.functional import softmax
from tqdm import tqdm

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Betul\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Betul\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Betul\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Betul\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Betul\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Betul\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_pe

In [None]:
file_path = '../Data/amazon_reviews.json'  
reviews_df = pd.read_json(file_path, lines=True, compression=None)

In [3]:
def load_clean_csv(path):
    """Load a CSV whose real header row is preceded by preamble lines.

    The file is scanned for the first line containing ``'Report No.'``;
    pandas then parses the file starting at that line, so it becomes the
    header row.

    Parameters
    ----------
    path : str or path-like
        Location of a UTF-8 encoded CSV file.

    Returns
    -------
    pandas.DataFrame
        The parsed table, with every line before the header skipped.

    Raises
    ------
    ValueError
        If no line in the file contains ``'Report No.'``.
    """
    header_index = None

    # Stream the file and stop at the header instead of materialising every
    # line in memory just to locate one row.
    with open(path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if 'Report No.' in line:
                header_index = i
                break

    # Fail with a descriptive error rather than a bare StopIteration.
    if header_index is None:
        raise ValueError(f"No header line containing 'Report No.' found in {path!r}")

    # Load CSV from the header line forward
    return pd.read_csv(path, skiprows=header_index)

In [None]:
# Paths to all three files
recall_files = [
    "../Data/cpsc_data/incident_reports/Toysandchildren_ArtsandCrafts.csv",
    "../Data/cpsc_data/incident_reports/Toysandchildren_Riding_Toys.csv",
    "../Data/cpsc_data/incident_reports/Toysandchildren_Toys.csv"
]

recall_dfs = [load_clean_csv(path) for path in recall_files]
recalls_df = pd.concat(recall_dfs, ignore_index=True)

In [6]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [7]:
reviews_df = reviews_df[['asin', 'reviewText', 'summary' ,'overall']].copy()
reviews_df = reviews_df.dropna(subset=['asin','reviewText', 'summary'])

### Load Cleaned ASINs

In [None]:
asin_labels_clean_review_df = pd.read_csv('../Data/asin_labels_clean_review_df.csv')

In [9]:
asin_labels_clean_review_df.shape

(614658, 3)

In [10]:
# filter to get asins in clean asins
reviews_df = reviews_df[reviews_df['asin'].isin(asin_labels_clean_review_df['asin'])]

In [11]:
reviews_df.shape

(8172849, 4)

In [27]:
# Drop rows whose summary is empty or whitespace-only. The explicit .copy()
# makes reviews_model_df an independent frame, so the column assignments in
# later cells no longer trigger SettingWithCopyWarning.
reviews_model_df = reviews_df[reviews_df['summary'].str.strip() != ''].copy()

In [29]:
# initialize various packages to create embeddings on summary text
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()



In [33]:
# embed the summary with sentence transformers
summary_embeddings = model.encode(
    reviews_model_df['summary'].tolist(),
    batch_size=32,              
    show_progress_bar=True,
    convert_to_numpy=True        
)

Batches:   0%|          | 0/255401 [00:00<?, ?it/s]

In [34]:
reviews_model_df['summary_embeddings'] = list(summary_embeddings)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews_model_df['summary_embeddings'] = list(summary_embeddings)


## Compute Sentiment of Summary

In [35]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load model and tokenizer
model_name = 'cardiffnlp/twitter-roberta-base-sentiment'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model_sent = AutoModelForSequenceClassification.from_pretrained(model_name)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_sent = model_sent.to(device)




In [None]:
# Earlier draft of batched_sentiment_weights, kept only as an inert string
# literal (nothing inside it is executed). Unlike the live definition in the
# next cell, this draft never moves the tokenized inputs to the model's device.
"""
def batched_sentiment_weights(texts, batch_size=64):
    sentiment_scores = []

    for i in tqdm(range(0, len(texts), batch_size)):
        batch_texts = texts[i:i+batch_size]
        inputs = tokenizer(batch_texts, return_tensors='pt', truncation=True, padding=True)
        with torch.no_grad():
            logits = model_sent(**inputs).logits
        probs = softmax(logits, dim=1).cpu().numpy() 
        sentiment_scores.extend(probs[:, 0])  

    return sentiment_scores
"""

In [36]:
import torch
from torch.nn.functional import softmax
from tqdm import tqdm

def batched_sentiment_weights(texts, batch_size=64, max_length=512):
    """Return the negative-sentiment probability for every text.

    Texts are tokenized and scored in batches with the module-level
    ``tokenizer`` and ``model_sent``. For each text, the softmax probability
    at class index 0 (the negative class) is collected.

    Parameters
    ----------
    texts : list of str
        Texts to score. An empty list yields an empty result without
        touching the model.
    batch_size : int, default 64
        Number of texts scored per forward pass. Must be at least 1.
    max_length : int, default 512
        Token limit applied when truncating each text. The tokenizer reports
        no predefined maximum, so ``truncation=True`` alone is ignored and
        long inputs reach the model untruncated.
        NOTE(review): 512 is assumed to match the model's position limit;
        confirm against the model config.

    Returns
    -------
    list
        One probability per input text, in input order.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    # A negative step would make range() empty and silently return no scores.
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Nothing to score: skip device placement and the progress bar entirely.
    if len(texts) == 0:
        return []

    sentiment_scores = []

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_sent.to(device)  # Ensure model is on the correct device

    for i in tqdm(range(0, len(texts), batch_size)):
        batch_texts = texts[i:i+batch_size]

        # Tokenize with an explicit max_length so truncation actually applies,
        # then move the inputs to the same device as the model.
        inputs = tokenizer(
            batch_texts,
            return_tensors='pt',
            truncation=True,
            padding=True,
            max_length=max_length,
        )
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Run inference without tracking gradients
        with torch.no_grad():
            logits = model_sent(**inputs).logits

        # Get probability of negative sentiment (index 0)
        probs = softmax(logits, dim=1).cpu().numpy()
        sentiment_scores.extend(probs[:, 0])  # or change index for different sentiment

    return sentiment_scores


In [37]:
reviews_model_df['sentiment_weight'] = batched_sentiment_weights(reviews_model_df['summary'].tolist())

  0%|                                                                                       | 0/127701 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
100%|██████████████████████████████████████████████████████████████████████████| 127701/127701 [48:52<00:00, 43.55it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reviews_model_df['sentiment_weight'] = batched_sentiment_weights(reviews_model_df['summary'].tolist())


In [42]:
def weighted_avg_embedding(group):
    """Collapse one group's embeddings into a single weighted mean vector.

    Reads the ``sentiment_weight`` and ``summary_embeddings`` columns of
    ``group``. The embeddings are stacked row-wise and averaged along the
    first axis, using the sentiment weights. If the weights sum to zero,
    every row is weighted equally instead.
    """
    w = np.asarray(group['sentiment_weight'].tolist())
    vectors = np.stack(group['summary_embeddings'].tolist())

    # A zero total cannot be normalised by np.average; use uniform weights.
    if not w.sum():
        w = np.ones_like(w)

    return np.average(vectors, weights=w, axis=0)

In [43]:
agg_summary_embeddings = reviews_model_df.groupby('asin', group_keys=False).apply(
    weighted_avg_embedding, include_groups=False)

In [44]:
agg_summary_embeddings

asin
0000191639    [-0.023288769647479057, 0.014182460494339466, ...
0004950763    [-0.08644748479127884, -0.009291399270296097, ...
0004983289    [-0.0328253600229539, 0.013325432199417255, -0...
0005069491    [-0.06845328211784363, -0.06251281499862671, 0...
0020232233    [-0.033952606099107614, 0.035724348085384536, ...
                                    ...                        
B01HJDFWDK    [-0.051193756597999227, -0.0038128394320709443...
B01HJDGVFS    [-0.05726094457197995, -0.009238787646112264, ...
B01HJDUNRU    [-0.030438451488812853, 0.022333641545807352, ...
B01HJFAGJI    [-0.007043351280748524, 0.03934371761938393, -...
B01HJHA7GI    [-0.09151761642866257, 0.00693787157468007, 0....
Length: 614657, dtype: object

In [51]:
agg_summary_embeddings.shape

(614657,)

### Expand the Summary Embedding into 384 Columns

In [53]:
expanded_df = pd.DataFrame(agg_summary_embeddings.tolist(), index=agg_summary_embeddings.index)

In [55]:
expanded_df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
191639,-0.023289,0.014182,-0.045827,0.004536,-0.059605,0.021759,0.062154,-0.047039,-0.048791,0.025406,...,0.076976,-0.025515,0.004724,0.026689,-0.033182,0.019103,0.079272,0.12695,0.03077,0.022833
4950763,-0.086447,-0.009291,0.025128,0.071182,-0.010236,0.034107,0.092472,-0.090956,0.009852,0.018048,...,0.004744,-0.018429,-0.019864,-0.008816,0.020599,0.028047,0.087291,-0.032195,0.001828,0.052647
4983289,-0.032825,0.013325,-0.000896,0.053275,-0.046219,0.025191,0.079885,-0.070505,0.024829,0.002191,...,0.01098,-0.008946,-0.023322,0.01125,0.010961,0.017847,0.098809,-0.034606,-0.001182,0.051882
5069491,-0.068453,-0.062513,0.055283,0.04573,-0.081839,-0.045375,0.076266,-0.069091,-0.059432,-0.003694,...,-0.059335,0.031384,-0.004571,0.078,-0.053182,0.051308,0.046214,0.022991,-0.108413,-0.007681
20232233,-0.033953,0.035724,0.000368,0.000224,0.005743,0.01383,0.082713,0.00496,0.021002,-0.018459,...,0.022737,-0.028082,0.023483,0.006377,-0.031809,0.014866,0.05141,-0.06301,0.00067,0.030218


In [73]:
# Name one column per embedding dimension. The count comes from the frame
# itself rather than a hard-coded 384, so a different encoder width still works.
expanded_df.columns = [f's_embed_{i}' for i in range(expanded_df.shape[1])]

In [75]:
final_df = expanded_df.reset_index()

In [77]:
final_df.head()

Unnamed: 0,asin,s_embed_0,s_embed_1,s_embed_2,s_embed_3,s_embed_4,s_embed_5,s_embed_6,s_embed_7,s_embed_8,...,s_embed_374,s_embed_375,s_embed_376,s_embed_377,s_embed_378,s_embed_379,s_embed_380,s_embed_381,s_embed_382,s_embed_383
0,191639,-0.023289,0.014182,-0.045827,0.004536,-0.059605,0.021759,0.062154,-0.047039,-0.048791,...,0.076976,-0.025515,0.004724,0.026689,-0.033182,0.019103,0.079272,0.12695,0.03077,0.022833
1,4950763,-0.086447,-0.009291,0.025128,0.071182,-0.010236,0.034107,0.092472,-0.090956,0.009852,...,0.004744,-0.018429,-0.019864,-0.008816,0.020599,0.028047,0.087291,-0.032195,0.001828,0.052647
2,4983289,-0.032825,0.013325,-0.000896,0.053275,-0.046219,0.025191,0.079885,-0.070505,0.024829,...,0.01098,-0.008946,-0.023322,0.01125,0.010961,0.017847,0.098809,-0.034606,-0.001182,0.051882
3,5069491,-0.068453,-0.062513,0.055283,0.04573,-0.081839,-0.045375,0.076266,-0.069091,-0.059432,...,-0.059335,0.031384,-0.004571,0.078,-0.053182,0.051308,0.046214,0.022991,-0.108413,-0.007681
4,20232233,-0.033953,0.035724,0.000368,0.000224,0.005743,0.01383,0.082713,0.00496,0.021002,...,0.022737,-0.028082,0.023483,0.006377,-0.031809,0.014866,0.05141,-0.06301,0.00067,0.030218


In [79]:
final_df.shape

(614657, 385)

In [None]:
import pickle

with open("../Data/agg_summary_embeddings.pkl", "wb") as f:
    pickle.dump(final_df, f)

In [71]:
# Sanity-check the element type of an embedding column. The columns are named
# 's_embed_<i>' when they are assigned, so 'embed_0' would raise KeyError.
print(type(final_df['s_embed_0'].iloc[0]))

<class 'numpy.float64'>


In [None]:
import pandas as pd
final_df = pd.read_pickle("../Data/agg_summary_embeddings.pkl")

In [3]:
final_df.shape

(614657, 385)

In [7]:
final_df.columns

Index(['asin', 'embed_0', 'embed_1', 'embed_2', 'embed_3', 'embed_4',
       'embed_5', 'embed_6', 'embed_7', 'embed_8',
       ...
       'embed_374', 'embed_375', 'embed_376', 'embed_377', 'embed_378',
       'embed_379', 'embed_380', 'embed_381', 'embed_382', 'embed_383'],
      dtype='object', length=385)

In [9]:
# Confirm the reloaded pickle still holds scalar floats. The saved frame uses
# 's_embed_<i>' column names, so 'embed_0' would raise KeyError.
print(type(final_df['s_embed_0'].iloc[0]))

<class 'numpy.float64'>
