<a href="https://colab.research.google.com/github/priyanka-ingale/unstructured-intelligence/blob/main/MSIS521_S3_Introduction_to_Bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Introduction to BERT

## Setting up the environment

In [1]:
!pip install transformers torch --quiet

In [2]:
import pandas as pd
from transformers import pipeline # used to import the Hugging Face transformers library pre-trained models

KeyboardInterrupt: 

## A pre-trained sentiment pipeline

In [None]:
# Build the sentiment-analysis pipeline on top of a pre-trained BERT checkpoint.
# The checkpoint is downloaded from the Hugging Face hub the first time this runs.
sentiment_model = pipeline(
    task="sentiment-analysis",
    model="nlptown/bert-base-multilingual-uncased-sentiment",
)

In [None]:
# Smoke test: run the pipeline on a single sentence and display the prediction.
# The call is the last expression of the cell, so the notebook renders its result.
sentiment_model(
    "This phone case is simply incredible"
)

### BERT sentiment analysis on example text

In [None]:
# Hand-written example reviews used to demonstrate batch prediction below.
reviews = [
    # praise for shipping and support
    "Shipping was super fast and customer support was helpful.",
    # product failure
    "The product stopped working after only two weeks. Very disappointed.",
    # mixed / lukewarm wording
    "It's okay, not great but not terrible either.",
]

In [None]:
# Passing the whole list scores every review in one pipeline call;
# the pipeline returns one prediction per input, in the same order.
results = sentiment_model(reviews)

# Show each review next to its prediction.
for text, result in zip(reviews, results):
    print(f"TEXT: {text}\nPREDICTION: {result}\n")


### BERT sentiment analysis of online reviews

In [None]:
import pandas as pd

# Load the review data; the cells below read the 'review_text' column.
df = pd.read_csv("videogame_reviews.csv")

# Fail fast with a clear message instead of a bare KeyError further down.
if "review_text" not in df.columns:
    raise KeyError(
        "videogame_reviews.csv must contain a 'review_text' column; "
        f"found columns: {list(df.columns)}"
    )

df.head()

In [None]:
# For a quick demo, let's sample a subset.
# Cap the sample size at the number of available rows so a small file does not
# make DataFrame.sample raise.
sample_size = min(50, len(df))
sample_df = df.sample(sample_size, random_state=12)

# str() conversion keeps non-string cells (e.g. missing values) from breaking
# the tokenizer.
review_texts = sample_df["review_text"].astype(str).tolist()

# Score all reviews in one call. truncation=True cuts reviews that exceed the
# model's maximum input length instead of letting the forward pass fail.
predictions = sentiment_model(review_texts, truncation=True)

# Keep only the predicted label of each review.
sample_df["bert_sentiment"] = [prediction["label"] for prediction in predictions]

sample_df.head()

### Aggregating sentiment results

In [None]:
# Number of sampled reviews per predicted label, ordered by label.
sentiment_counts = (
    sample_df["bert_sentiment"]
    .value_counts()
    .sort_index()
)
print(sentiment_counts)

# Same breakdown expressed as a percentage of all sampled reviews.
total_reviews = sentiment_counts.sum()
sentiment_pct = (sentiment_counts / total_reviews) * 100
print(sentiment_pct)

### Visualizing sentiment distribution

In [None]:
import matplotlib.pyplot as plt

# Draw the label counts as a pie chart on an explicitly created axes.
fig, ax = plt.subplots()
sentiment_counts.plot(kind="pie", ax=ax)
ax.set_xlabel("Predicted Sentiment (Stars)")
ax.set_title("Review Sentiment Distribution (BERT)")
plt.show()

## BERT for embedding and clustering

### BERT as an embedding model

In [None]:
# Import needed libraries
# torch is imported here because get_cls_embedding uses torch.no_grad();
# relying on a later cell to import it makes this cell depend on hidden state.
import torch
from transformers import AutoTokenizer, AutoModel

# Set up the tokenizer and model
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Define a function that returns the [CLS] token embedding using the tokenizer and model
def get_cls_embedding(text):
    """Return the final-layer [CLS] embedding of ``text`` as a NumPy array.

    The text is tokenized into PyTorch tensors, truncated to at most 128
    tokens, and passed through ``model`` with gradient tracking disabled.
    The hidden state at sequence position 0 (the [CLS] token) is returned.

    Parameters
    ----------
    text : str
        The text to embed. (The tokenizer presumably also accepts a list of
        strings, in which case one row per input is returned - confirm before
        relying on it.)

    Returns
    -------
    numpy.ndarray
        The [CLS] hidden state with size-1 dimensions squeezed away, so a
        single input string yields a 1-D vector.
    """
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=128
    )
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)
    # CLS token embedding is at position 0
    cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze().numpy()
    return cls_embedding

### Clustering reviews based on BERT embeddings

In [None]:
from sklearn.cluster import KMeans
import torch
import numpy as np

# Take the small subset of reviews we defined earlier.
# Cast to str first, as the sentiment cells do: a missing review is loaded by
# pandas as a float NaN, which would break both the tokenizer call and the
# text[:80] slice below.
texts = sample_df['review_text'].astype(str).tolist()

# Do embedding for all the reviews and stack them (one row per review)
embeddings = np.vstack([get_cls_embedding(t) for t in texts])

# Do Kmeans clustering based on the embeddings
kmeans = KMeans(n_clusters=3, random_state=12)
labels = kmeans.fit_predict(embeddings)

for text, label in zip(texts, labels):
    print(f"Cluster {label}: {text[:80]}...") # print the first 80 characters of texts and their cluster

## Continuous sentiment analysis (between -1 and 1) using BERT

In [None]:
from transformers import pipeline # we need the pipeline like before

# BERT-family sentiment model (binary: POSITIVE / NEGATIVE).
# It gets its own name so that the 5-star `sentiment_model` loaded earlier is
# not overwritten; re-running the star-rating cells after this one then still
# uses the model they were written for.
binary_sentiment_model = pipeline(
    task="sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english"
)

# The model above returns positive/negative labels along with a confidence.
# We turn that into a signed score using this function:
def bert_signed_score(text):
    """Return a signed sentiment score for ``text``.

    The binary pipeline predicts a label and a confidence for that label.
    The confidence is returned as-is for a positive prediction and negated
    for a negative prediction.

    Parameters
    ----------
    text : str
        The text to score; over-long inputs are truncated by the pipeline.

    Returns
    -------
    float
        ``+confidence`` for a positive label, ``-confidence`` for a negative one.

    NOTE(review): the confidence belongs to the *predicted* label, so for a
    two-class model it is presumably never below 0.5, which would leave the
    middle of the [-1, 1] range unused - confirm if a truly continuous scale
    is needed.
    """
    out = binary_sentiment_model(text, truncation=True)[0]
    label = out["label"].upper()      # 'POSITIVE' or 'NEGATIVE'
    score = out["score"]              # confidence in predicted label (0 to 1)
    if label.startswith("NEG"):
        return -score                 # map NEGATIVE to [-1, 0)
    else:
        return score                  # map POSITIVE to (0, 1]

text = "Shipping was super fast and the support team was helpful."
print(text)
print("Sentiment score:", bert_signed_score(text))  # in [-1, +1]

### Applying the continuous sentiment scores to reviews

In [None]:
# Score every sampled review with the signed BERT sentiment score.
def sentiment_score(text):
    """Convert ``text`` to a string and return its signed sentiment score."""
    review_as_str = str(text)
    return bert_signed_score(review_as_str)

review_column = sample_df["review_text"]
sample_df["sentiment_score"] = review_column.apply(sentiment_score)

sample_df.head()

### Aggregating continuous scores

In [None]:
# Aggregate by time, product, or campaign (example: by 'product_id')
# Mean sentiment score per product, sorted from lowest to highest.
scores_by_product = sample_df.groupby("product_id")["sentiment_score"]
product_sentiment = scores_by_product.mean().sort_values()

# Only 50 reviews were drawn at random, so each product_id has few reviews.
print(product_sentiment.head(10))

In [None]:
# Or simply look at the distribution overall
import matplotlib.pyplot as plt

# Histogram of the signed scores on an explicitly created axes.
fig, ax = plt.subplots()
sample_df["sentiment_score"].hist(bins=10, ax=ax)
ax.set_xlabel("Sentiment score (-1 to +1)")
ax.set_ylabel("Number of reviews")
ax.set_title("Distribution of Continuous Sentiment Scores (BERT-based)")
plt.show()