## Preprocessing Functions

In [1]:
# Import Libraries
import pandas as pd
import re
import string

# Define Text Cleaning Function
def clean_text(text):
    if pd.isna(text):
        return ""

    # remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)

    # remove HTML
    text = re.sub(r"<.*?>", "", text)

    # remove numbers
    text = re.sub(r"\d+", "", text)

    # remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))

    # lowercase & trim
    return text.lower().strip()

# Define Sentiment Mapping Function
def map_sentiment(score):
    try:
        score = float(score)
        if score <= 2:
            return "negative"
        elif score == 3:
            return "neutral"
        else:
            return "positive"
    except:
        return None


## Load & Preprocess Each Dataset

In [3]:
# Load Amazon Review Datasets
df1 = pd.read_csv("Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products.csv")
df2 = pd.read_csv("Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products_May19.csv")

# Preprocess each dataset
def preprocess_amazon_df(df, source_name):
    df_clean = df[['reviews.text', 'reviews.rating']].copy()
    df_clean.columns = ['review_text', 'rating']
    df_clean['clean_text'] = df_clean['review_text'].apply(clean_text)
    df_clean['sentiment'] = df_clean['rating'].apply(map_sentiment)
    df_clean['source'] = source_name
    return df_clean

df1_cleaned = preprocess_amazon_df(df1, "Amazon_Products")
df2_cleaned = preprocess_amazon_df(df2, "Amazon_Products_May19")


## Combine & Filter Cleaned Data

In [4]:
# Combine datasets
combined_df = pd.concat([df1_cleaned, df2_cleaned], ignore_index=True)

# Drop missing or empty values
combined_df = combined_df.dropna(subset=['sentiment'])
combined_df = combined_df[combined_df['clean_text'].str.strip() != ""]

# Preview
combined_df.head()


Unnamed: 0,review_text,rating,clean_text,sentiment,source
0,I thought it would be as big as small paper bu...,3,i thought it would be as big as small paper bu...,neutral,Amazon_Products
1,This kindle is light and easy to use especiall...,5,this kindle is light and easy to use especiall...,positive,Amazon_Products
2,Didnt know how much i'd use a kindle so went f...,4,didnt know how much id use a kindle so went fo...,positive,Amazon_Products
3,I am 100 happy with my purchase. I caught it o...,5,i am happy with my purchase i caught it on sa...,positive,Amazon_Products
4,Solid entry level Kindle. Great for kids. Gift...,5,solid entry level kindle great for kids gifted...,positive,Amazon_Products


In [5]:
# Save cleaned dataset for use in BERT and aspect extraction
combined_df.to_csv("cleaned_amazon_reviews.csv", index=False)

## Aspect Extraction using spaCy

In [6]:

!pip install spacy

# Load English NLP model
import spacy
nlp = spacy.load("en_core_web_sm")




In [7]:
# Extract candidate aspects using noun chunks
def extract_aspects(text):
    doc = nlp(text)
    return [chunk.text for chunk in doc.noun_chunks if len(chunk.text.split()) < 4]


In [8]:
# Apply to a sample first (e.g., 2000 rows for speed)
aspect_df = combined_df[['clean_text', 'sentiment']].copy().head(2000)

# Extract aspects
aspect_df['aspects'] = aspect_df['clean_text'].apply(extract_aspects)

# Preview results
aspect_df[['clean_text', 'aspects', 'sentiment']].head(10)

Unnamed: 0,clean_text,aspects,sentiment
0,i thought it would be as big as small paper bu...,"[i, it, small paper, my palm, i, it, it, regul...",neutral
1,this kindle is light and easy to use especiall...,"[this kindle, the beach]",positive
2,didnt know how much id use a kindle so went fo...,"[id, a kindle, the lower end, i, it]",positive
3,i am happy with my purchase i caught it on sa...,"[i, my purchase, i, it, sale, i, i, who, pages...",positive
4,solid entry level kindle great for kids gifted...,"[kids, a kid, my friend, they, it, their ipads...",positive
5,this make an excellent ebook reader dont expec...,"[this, this device, basic ebooks, the good thi...",positive
6,i ordered this for my daughter as i have the b...,"[i, this, my daughter, i, the black paperwhite...",positive
7,i bought my kindle about months ago and the b...,"[i, my kindle, the battery]",positive
8,amazon kindle is always the best ebook upgrade...,"[amazon kindle, the best ebook, every new model]",positive
9,its beyond my expectation and it can even show...,"[its, my expectation, it, music score]",positive


In [9]:
from collections import Counter

all_aspects = [aspect.lower() for sublist in aspect_df['aspects'] for aspect in sublist]
aspect_counts = Counter(all_aspects)

# Show top 20 most frequent aspects
aspect_counts.most_common(20)


[('i', 1866),
 ('it', 1810),
 ('this', 486),
 ('you', 456),
 ('we', 342),
 ('she', 219),
 ('that', 192),
 ('they', 154),
 ('me', 150),
 ('music', 137),
 ('he', 122),
 ('the echo', 114),
 ('alexa', 108),
 ('them', 99),
 ('the screen', 96),
 ('the echo show', 86),
 ('the show', 85),
 ('what', 79),
 ('this product', 70),
 ('all', 61)]

## Aspect-Based Sentiment Classification using BERT

In [10]:
# Create (aspect, sentence, sentiment) triples
aspect_sentiment_pairs = []

for _, row in aspect_df.iterrows():
    for aspect in row['aspects']:
        aspect_sentiment_pairs.append({
            'aspect': aspect.lower().strip(),
            'text': row['clean_text'],
            'sentiment': row['sentiment']
        })

# Convert to DataFrame
pairs_df = pd.DataFrame(aspect_sentiment_pairs)

# Optional preview
pairs_df.head(10)


Unnamed: 0,aspect,text,sentiment
0,i,i thought it would be as big as small paper bu...,neutral
1,it,i thought it would be as big as small paper bu...,neutral
2,small paper,i thought it would be as big as small paper bu...,neutral
3,my palm,i thought it would be as big as small paper bu...,neutral
4,i,i thought it would be as big as small paper bu...,neutral
5,it,i thought it would be as big as small paper bu...,neutral
6,it,i thought it would be as big as small paper bu...,neutral
7,regular kindle,i thought it would be as big as small paper bu...,neutral
8,a paperwhite,i thought it would be as big as small paper bu...,neutral
9,this kindle,this kindle is light and easy to use especiall...,positive


In [11]:
!pip install transformers

from transformers import BertTokenizer

# Load tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encode input: aspect [SEP] review_text
def encode_inputs(row):
    return tokenizer(
        row['aspect'],
        row['text'],
        truncation=True,
        padding='max_length',
        max_length=128,
        return_tensors="pt"
    )

# Optional: test on one row
encoded_sample = encode_inputs(pairs_df.iloc[0])
encoded_sample




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

{'input_ids': tensor([[  101,  1045,   102,  1045,  2245,  2009,  2052,  2022,  2004,  2502,
          2004,  2235,  3259,  2021,  2735,  2041,  2000,  2022,  2074,  2066,
          2026,  5340,  1045,  2228,  2009,  2003,  2205,  2235,  2000,  3191,
          2006,  2009,  2025,  2200,  6625,  2004,  3180,  2785,  2571,  2052,
          5791, 16755,  1037,  3259,  2860, 16584,  2063,  2612,   102,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,  

## Model Training with BERT (Huggingface + PyTorch)

In [12]:
from sklearn.preprocessing import LabelEncoder

# Encode sentiment labels (positive = 2, neutral = 1, negative = 0)
label_encoder = LabelEncoder()
pairs_df['label'] = label_encoder.fit_transform(pairs_df['sentiment'])

# View mapping
print(dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))))


{'negative': np.int64(0), 'neutral': np.int64(1), 'positive': np.int64(2)}


In [19]:
# Use only first 1000 examples for quick prototype of the project
prototype_df = pairs_df.sample(n=1000, random_state=42).reset_index(drop=True)


In [20]:
from torch.utils.data import Dataset

class PrototypeAspectDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=64):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        encoded = tokenizer(
            row['aspect'], row['text'],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors="pt"
        )
        return {
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
            'labels': torch.tensor(row['label'], dtype=torch.long)
        }


In [21]:
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(prototype_df, test_size=0.2, random_state=42)

train_dataset = PrototypeAspectDataset(train_df, tokenizer)
test_dataset = PrototypeAspectDataset(test_df, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8)


In [23]:
from transformers import BertForSequenceClassification
from torch.optim import AdamW
import torch
from tqdm import tqdm

model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5)

# Training loop for 1 epoch
def train(model, dataloader):
    model.train()
    total_loss = 0
    for batch in tqdm(dataloader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        total_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Avg training loss: {total_loss / len(dataloader):.4f}")

train(model, train_loader)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  1%|          | 1/100 [00:14<24:29, 14.84s/it]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
  2%|▏         | 2/100 [00:20<15:14,  9.33s/it]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned 

Avg training loss: 0.2966





## Model Evaluation

In [24]:
from sklearn.metrics import classification_report

def evaluate(model, dataloader):
    model.eval()
    preds, labels = [], []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            true_labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)

            preds.extend(predictions.cpu().numpy())
            labels.extend(true_labels.cpu().numpy())

    return preds, labels

preds, true_labels = evaluate(model, test_loader)

# Decode class names from encoded labels
decoded_preds = label_encoder.inverse_transform(preds)
decoded_labels = label_encoder.inverse_transform(true_labels)

print(classification_report(decoded_labels, decoded_preds))


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

              precision    recall  f1-score   support

    negative       0.00      0.00      0.00         5
     neutral       0.00      0.00      0.00        10
    positive       0.93      1.00      0.96       185

    accuracy                           0.93       200
   macro avg       0.31      0.33      0.32       200
weighted avg       0.86      0.93      0.89       200



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## SHAP Integration for Huggingface BERT

In [25]:
!pip install shap




In [32]:
import shap
import numpy as np

# Wrap BERT model for SHAP (text classifier interface)
class BERTAspectClassifierWrapper:
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def __call__(self, inputs):
        encoded = self.tokenizer(
            inputs,
            padding='max_length',
            truncation=True,
            max_length=64,
            return_tensors='pt'
        ).to(device)

        with torch.no_grad():
            outputs = self.model(**encoded)
            logits = outputs.logits
            probs = torch.nn.functional.softmax(logits, dim=-1)

        return probs.cpu().numpy()


In [33]:
# Load a small sample for testing
sample_texts = [
    f"{row['aspect']} [SEP] {row['text']}"
    for _, row in prototype_df.sample(10, random_state=42).iterrows()
]

# Create explainer
wrapped_model = BERTAspectClassifierWrapper(model, tokenizer)
explainer = shap.Explainer(wrapped_model, sample_texts, output_names=label_encoder.classes_)


In [35]:
from transformers import TextClassificationPipeline

# Create pipeline using aspect + review text
pipe = TextClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    return_all_scores=True,
    device=0 if torch.cuda.is_available() else -1
)


Device set to use cpu


In [36]:
row = prototype_df.sample(1).iloc[0]
formatted_input = f"{row['aspect']} [SEP] {row['text']}"


In [39]:
explainer = shap.Explainer(pipe, shap.maskers.Text(tokenizer))
shap_values = explainer([formatted_input])

# Show SHAP text plot
shap.plots.text(shap_values[0])


  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer: 2it [02:40, 160.33s/it]              


In [41]:
# Select 2 random samples from the test set
samples = prototype_df.sample(2, random_state=42)

# Combine aspect and review text
inputs = [f"{row['aspect']} [SEP] {row['text']}" for _, row in samples.iterrows()]

# Generate SHAP values for batch of texts
shap_values = explainer(inputs)

# Display one as an example
shap.plots.text(shap_values[0])


  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer:  50%|█████     | 1/2 [00:00<?, ?it/s]

  0%|          | 0/498 [00:00<?, ?it/s]

PartitionExplainer explainer: 3it [10:29, 314.67s/it]


In [57]:
# Step 1: Create the pipeline
pipe = TextClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    return_all_scores=True,
    device=0 if torch.cuda.is_available() else -1
)

# Step 2: Create SHAP explainer
explainer = shap.Explainer(
    pipe,
    masker=shap.maskers.Text(tokenizer),
    output_names=label_encoder.classes_
)

# Step 3: Select samples for prediction + explanation
results = []
sample_batch = prototype_df.sample(2, random_state=42)

for _, row in sample_batch.iterrows():
    aspect = row['aspect']
    review_text = row['text']
    input_text = f"{aspect} [SEP] {review_text}"

    # Predict sentiment
    prediction_scores = pipe(input_text)[0]
    predicted_label = max(prediction_scores, key=lambda x: x['score'])['label']

    # Run SHAP explanation
    shap_values = explainer([input_text])

    # Multiclass: extract SHAP values for the predicted class only
    class_names = label_encoder.classes_
    predicted_class_idx = [p['label'] for p in prediction_scores].index(predicted_label)
    shap_token_values = shap_values[0].values[:, predicted_class_idx]
    shap_tokens = shap_values[0].data

    # Step 4: Top-K token contributions (keep most important ones)
    top_k = 3
    top_indices = np.argsort(np.abs(shap_token_values))[-top_k:]
    word_contributions = [
        (shap_tokens[i], float(shap_token_values[i]))
        for i in reversed(top_indices)
    ]

    # Store result
    results.append({
        "aspect": aspect,
        "review_text": review_text,
        "predicted_sentiment": predicted_label,
        "top_contributing_tokens": word_contributions
    })

# Step 5: Create final DataFrame
pred_explain_df = pd.DataFrame(results)

# Optional: Save to CSV
pred_explain_df.to_csv("bert_aspect_predictions_with_shap.csv", index=False)

# Preview
pred_explain_df[['aspect', 'predicted_sentiment', 'top_contributing_tokens']].head()

Device set to use cpu


  0%|          | 0/498 [00:00<?, ?it/s]



PartitionExplainer explainer: 2it [01:59, 119.14s/it]              


  0%|          | 0/498 [00:00<?, ?it/s]



PartitionExplainer explainer: 2it [08:38, 518.01s/it]              


Unnamed: 0,aspect,predicted_sentiment,top_contributing_tokens
0,i,LABEL_2,"[( i, 0.029077038168907166), ( help, 0.0172925..."
1,the sound,LABEL_2,"[( [SEP], -0.003798454999923706), ( the, -0.00..."


