### Create a review-key mapping where each key is a unique id (asin_reviewerID) and each value is a list [reviewText, summary]

In [2]:
import pandas as pd
from tqdm import tqdm



In [5]:
# Do not truncate long text columns when dataframes are displayed.
pd.set_option('display.max_colwidth', None)

review_df = pd.read_csv('../Project_Data/MiniReview.csv', index_col=0)
metadata_df = pd.read_csv('../Project_Data/Metadata.csv', index_col=0)

# Attach the selected product metadata columns to each review, matching on 'asin'.
metadata_columns = ['category', 'description', 'title', 'feature', 'asin']
merged_df = review_df.merge(metadata_df[metadata_columns], on='asin')

# Discard the listed review columns, then remove rows that are exact duplicates.
unused_columns = ['overall', 'vote', 'verified', 'reviewTime', 'reviewerName', 'unixReviewTime', 'image']
merged_df.drop(columns=unused_columns, inplace=True)
merged_df.drop_duplicates(inplace=True)

# Group the reviews by product:
#   asin -> list of "<reviewerID> <reviewText> <summary>" strings.
# The reviewer id is placed first so it can be split off again at the first space.
asin_reviews = {}
review_columns = ['reviewerID', 'reviewText', 'summary']
for asin, reviews in tqdm(merged_df.groupby('asin')[review_columns]):
    asin_reviews[asin] = [
        f"{row['reviewerID']} {row['reviewText']} {row['summary']}"
        for _, row in reviews.iterrows()
    ]


In [60]:
# `pickle` is imported here because this cell comes before the notebook's only
# other `import pickle`; running the cells top to bottom would otherwise raise
# NameError at pickle.dump below.
import pickle

# Map "<asin>_<reviewerID>" -> [reviewText, summary] for every merged review row.
# If the same (asin, reviewerID) pair occurs more than once, the last row wins.
review_key_mapping = {}
# `total` lets tqdm draw a real progress bar; iterrows() alone has no length.
for ind, row in tqdm(merged_df.iterrows(), total=len(merged_df)):
    unique_id = row['asin'] + '_' + row['reviewerID']
    review_key_mapping[unique_id] = [row['reviewText'], row['summary']]

with open('../Output/Review_key_mapping.pickle', 'wb') as file:
    pickle.dump(review_key_mapping, file, protocol=pickle.HIGHEST_PROTOCOL)


98582it [00:05, 16439.71it/s]


### Perform Sentiment Analysis for each aspect on the entire review dataset

In [17]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch.nn.functional as F
from transformers import pipeline
import torch

In [18]:

# Load Aspect-Based Sentiment Analysis model.
# Use the GPU when one is available and fall back to the CPU otherwise, so that
# loading the model does not fail on machines without CUDA.
absa_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
absa_model_name = "yangheng/deberta-v3-base-absa-v1.1"
absa_tokenizer = AutoTokenizer.from_pretrained(absa_model_name)
absa_model = AutoModelForSequenceClassification \
  .from_pretrained(absa_model_name).to(absa_device)

# Load a traditional Sentiment Analysis model.
# NOTE(review): `sentiment_model` is not referenced by any cell visible here —
# confirm it is still needed before paying the download/load cost.
sentiment_model_path = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
sentiment_model = pipeline("sentiment-analysis", model=sentiment_model_path,
                          tokenizer=sentiment_model_path)

# Hand-curated trigger words for each aspect. A review is only scored for an
# aspect when at least one of that aspect's words occurs in the review text.
hardcoded_dict = {
    'quality': {
        'quality', 'material', 'durable', 'high-quality', 'low-quality', 'light', 'thin', 'thick', 'soft', 'made',
        'luxurious', 'luxury', 'fading', 'wrinkle', 'matte', 'High-quality', 'Well-made', 'Durable', 'Sturdy',
        'Strong', 'Solid', 'construction', 'craftsmanship', 'detail', 'detailed', 'fine', 'Luxurious', 'Premium',
        'Long-lasting', 'Robust', 'Substantial', 'Heavy-duty', 'Resilient', 'Dependable', 'Superior', 'Top-notch',
        'Flimsy', 'Cheaply', 'made', 'make', 'Poor', 'Shoddy', 'weak', 'Unreliable', 'Inferior',
    },
    'price': {
        'price', 'pricy', 'value', 'value-for-money', 'money', 'charge', 'charging', 'amount', 'payment',
        'rate', 'worth', 'total', 'sum', 'fare', 'expense', 'valuation', 'valuable', 'estimate', 'expense',
        'expenses', 'expenditure', 'cost', 'costly', 'Affordable', 'Inexpensive', 'Budget-friendly', 'budget',
        'deal', 'Reasonable', 'Cheap', 'Economical', 'Cost-effective', 'Expensive', 'Overpriced', 'Too pricey',
        'Steep', 'Costly', 'Outrageous', 'Pricey', 'Investment piece', 'Bargain', 'Discounted', 'discount',
        'sale', 'Clearance', 'Marked', 'fair',
    },
    'fit': {
        'fit', 'fitting', 'size', 'oversize', 'oversized', 'tight', 'loose', 'small', 'big', 'medium', 'snug',
        'stretch', 'stretchy', 'baggy', 'perfect', 'perfect-fit', 'supportive', 'support', 'supports', 'large',
        'Cropped', 'Long', 'Short', 'Petite', 'Regular', 'Tall', 'Wide', 'Narrow', 'Flattering', 'Unflattering',
        'Bulky', 'Clingy', 'Flowy', 'Shapeless', 'Figure-hugging', 'Bodycon', 'Relaxed',
    },
    'comfort': {
        'comfort', 'cozy', 'comfortable', 'comforter', 'ease', 'comfortableness', 'comfortability', 'comfy',
        'Comfortable', 'Soft', 'Cozy', 'Breathable', 'Lightweight', 'Stretchy', 'Snug', 'Roomy', 'Supportive',
        'Cushiony', 'Comfy', 'Relaxing', 'Easy to wear', 'Non-restrictive', 'Plush', 'Smooth', 'Well-fitting',
        'Comfortable fit', 'Easy to move in', 'pinching', 'rubbing', 'chafing', 'digging', 'squeezing',
        'tightness', 'slipping', 'discomfort', 'irritation', 'itching',
    },
}

# Lower-cased keyword list per aspect. Lower-casing makes entries such as
# 'durable' and 'Durable' collide, so duplicates are removed via a set; sorting
# gives a stable order that does not depend on string hashing.
aspect_dict = {
    aspect: sorted({key.lower() for key in keys})
    for aspect, keys in hardcoded_dict.items()
}

def aspect_based_sentiment_analysis_v1(asin):
    """Score every review of one product against each aspect.

    Reads the module-level globals ``asin_reviews`` (asin -> list of
    "<reviewerID> <text>" strings), ``aspect_dict`` (aspect -> keywords),
    ``absa_tokenizer`` and ``absa_model``.

    Args:
        asin: Product id; must be a key of ``asin_reviews``.

    Returns:
        ``{asin: [(review_id, [(aspect, probs), ...]), ...]}`` where ``probs``
        is the softmax output of the ABSA model rounded to 5 decimals, or
        ``[0.0, 0.0, 0.0]`` when none of the aspect's keywords occur in the
        review (the model is not called in that case).

    Raises:
        KeyError: If ``asin`` is not present in ``asin_reviews``.
    """
    global aspect_dict, asin_reviews
    aspect_words = aspect_dict
    aspects = list(aspect_words.keys())
    # All reviews stored for the requested ASIN
    reviews_data = asin_reviews[asin]

    # Perform aspect-based sentiment analysis for each aspect on every review
    scores = []
    for review in reviews_data:
        # The reviewer id is the first space-separated token of the stored string
        review_id, review_text = review.split(" ", 1)
        # Lower-case once per review instead of once per keyword
        lowered_text = review_text.lower()
        review_scores = []
        for aspect in aspects:
            # TODO : Preprocess review text
            # NOTE(review): this is a plain substring test, so e.g. 'sum' also
            # matches inside 'summer' — confirm whether that is acceptable.
            if any(word in lowered_text for word in aspect_words[aspect]):
                # Concatenate the input text with the aspect text and add special tokens
                input_text = f"[CLS] {review_text} [SEP] {aspect} [SEP]"
                # Tokenize and move the tensors to whatever device the model
                # lives on, rather than assuming "cuda"
                inputs = absa_tokenizer(
                    input_text, return_tensors="pt", padding=True, truncation=True
                ).to(absa_model.device)
                # Pass the input through the ABSA model to get the aspect-based sentiment probabilities
                with torch.no_grad():
                    outputs = absa_model(**inputs)
                    probs = F.softmax(outputs.logits, dim=1)
                    probs = probs.cpu().detach().numpy()[0]
                    probs = [float(f"{p:.5f}") for p in probs]
                review_scores.append((aspect, probs))
            else:
                # Sentinel meaning "aspect not mentioned in this review"
                review_scores.append((aspect, [0.0, 0.0, 0.0]))
        scores.append((review_id, review_scores))

    # Return the aspect-based sentiment scores
    return {asin: scores}

In [19]:
import pickle

# Score every product in turn; each entry appended to `results` is the
# single-key dict {asin: scores} returned by the scoring function.
results = []
for asin in tqdm(list(asin_reviews)):
    product_scores = aspect_based_sentiment_analysis_v1(asin=asin)
    results.append(product_scores)

# NOTE(review): the extension is spelled '.picke' while the other outputs use
# '.pickle' — looks like a typo, but confirm no reader depends on this name
# before renaming it.
with open('../Output/Review_sentiments.picke', 'wb') as file:
    pickle.dump(results, file, protocol=pickle.HIGHEST_PROTOCOL)

100%|██████████| 5543/5543 [1:17:57<00:00,  1.19it/s]


### Make a dictionary to identify, for each product, the top 2 positive and top 2 negative reviews per aspect

In [49]:
# Aspects tracked for every product, in the order they appear in the output.
TOP_REVIEW_ASPECTS = ('quality', 'price', 'fit', 'comfort')


def _empty_top_two():
    """Return a top-2 record with no review selected yet."""
    return {'score_1': 0, 'item_1': None, 'score_2': 0, 'item_2': None}


def _record_top_two(top_two, score, unique_id):
    """Insert (score, unique_id) into a top-2 record if it ranks high enough.

    A score that ties the current best is stored as the runner-up instead of
    being dropped; a score that only ties the runner-up leaves the record as is.
    """
    if score > top_two['score_1']:
        # New best: the previous best slides down to second place.
        top_two['score_2'], top_two['item_2'] = top_two['score_1'], top_two['item_1']
        top_two['score_1'], top_two['item_1'] = score, unique_id
    elif score > top_two['score_2']:
        top_two['score_2'], top_two['item_2'] = score, unique_id


top_review_dict = {}

for result in tqdm(results):
    # Each result is a single-entry dict:
    #   {asin: [(reviewer_id, [(aspect, [p0, p1, p2]), ...]), ...]}
    asin, reviews = next(iter(result.items()))

    # Deliberately NOT named `aspect_dict`: that global holds the keyword lists
    # read by aspect_based_sentiment_analysis_v1 and must not be overwritten.
    product_top_reviews = {
        aspect: {'positive': _empty_top_two(), 'negative': _empty_top_two()}
        for aspect in TOP_REVIEW_ASPECTS
    }

    for reviewer_id, aspect_details in reviews:
        unique_id = asin + '_' + reviewer_id
        for aspect, aspect_values in aspect_details:
            # [0.0, 0.0, 0.0] marks "aspect not mentioned". Test the whole
            # vector: a real prediction can have its first probability rounded
            # to exactly 0.0 and must still be counted.
            if not any(aspect_values):
                continue
            # Negative sentiment
            if aspect_values[0] > 0.5:
                _record_top_two(product_top_reviews[aspect]['negative'], aspect_values[0], unique_id)
            # Positive sentiment
            elif aspect_values[2] > 0.5:
                _record_top_two(product_top_reviews[aspect]['positive'], aspect_values[2], unique_id)

    top_review_dict[asin] = product_top_reviews

with open('../Output/Top_reviews.pickle', 'wb') as file:
    pickle.dump(top_review_dict, file, protocol=pickle.HIGHEST_PROTOCOL)

100%|██████████| 5543/5543 [00:00<00:00, 21047.98it/s]


### Create Review dict for Ease of Reading files

In [71]:
# For every product, build a flat list of 16 entries:
# 4 aspects x (positive #1, positive #2, negative #1, negative #2).
# Each entry is the [reviewText, summary] pair stored in review_key_mapping,
# or '' when no review was selected for that slot.
review_dict_for_Swati = {}

# Order in which aspects and slots are written to the output list.
REVIEW_ASPECT_ORDER = ('quality', 'comfort', 'price', 'fit')
REVIEW_SLOT_ORDER = (
    ('positive', 'item_1'),
    ('positive', 'item_2'),
    ('negative', 'item_1'),
    ('negative', 'item_2'),
)

# The loop variable is deliberately not called `aspect_dict`, so the global
# keyword dictionary of that name is left untouched.
for asin, product_top_reviews in top_review_dict.items():
    list_of_reviews = []
    for aspect in REVIEW_ASPECT_ORDER:
        for sentiment, slot in REVIEW_SLOT_ORDER:
            unique_id = product_top_reviews[aspect][sentiment][slot]
            if unique_id is None:
                list_of_reviews.append('')
            else:
                list_of_reviews.append(review_key_mapping[unique_id])

    review_dict_for_Swati[asin] = list_of_reviews

with open('../Output/Review_dict_for_Swati.pickle', 'wb') as file:
    pickle.dump(review_dict_for_Swati, file, protocol=pickle.HIGHEST_PROTOCOL)