# Sentiment Analysis using Transformers (BERT)

In [None]:
# Import necessary libraries
import numpy as np
import re
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap
import emoji

# Torch ML libraries
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

# Misc.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Set initial variables and constants
# IPython magic: set the inline backend's figure format to 'retina'
%config InlineBackend.figure_format='retina'

# Graph designs: seaborn theme, custom colour palette and default figure size
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
rcParams['figure.figsize'] = 12, 8

# Random seed for reproducibility (seeds both the NumPy and the PyTorch RNG)
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
def strip_emoji(text):
    """Return ``text`` with emojis replaced by a space.

    Thin wrapper around ``emoji.replace_emoji`` called with ``replace=' '``.
    """
    without_emoji = emoji.replace_emoji(text, replace=' ')
    return without_emoji

In [None]:
# Read the CSV file containing processed sampled tweets
df = pd.read_csv('cleanprocessed_sampled_tweets.csv')

# Lowercase the text, then replace every character that is not an ASCII letter,
# digit, space or tab -- and the standalone word 'url' -- with a space.
# The pattern has to be a raw string: in a normal string literal "\b" is the
# backspace character (0x08), so the word-boundary match on 'url' never fired.
df["text"] = df["text"].str.lower().str.replace(
    r"([^0-9A-Za-z \t])|\burl\b", " ", case=False, regex=True
)

# Drop duplicate entries based on the 'text' column
df = df.drop_duplicates("text")

# Remove emojis from every tweet in one pass instead of a row-by-row
# iterrows() loop. assign() builds a new frame, so the de-duplicated frame is
# not modified in place.
# NOTE(review): the regex above already replaces every non-ASCII-alphanumeric
# character, so this step is presumably a no-op safety net -- confirm.
df = df.assign(text=df["text"].apply(strip_emoji))

# Print the number of unique tweets
print(df.shape[0])

In [None]:
# Print a summary of the cleaned DataFrame
df.info()

In [None]:
import scipy.stats as stats

# Draw a count plot for the happiness scores
sns.countplot(x=df.happiness, palette=HAPPY_COLORS_PALETTE)

# Hard-coded parameters of the reference normal curve drawn over the bars
mean = 4
height = 200
lower_bound = 0  # not used anywhere in this cell
upper_bound = 8
# Place upper_bound 2.576 standard deviations above the mean
# (2.576 is presumably the two-sided 99% z-score -- confirm)
std = (upper_bound - mean) / 2.576  # Calculate standard deviation

# Generate x values for the normal distribution (mean +/- 4 standard deviations)
x = np.linspace(mean - 4*std, mean + 4*std, 1000)

# Calculate the y values for the normal distribution
y = stats.norm.pdf(x, mean, std)

# Rescale the curve so that its peak equals `height`
y = y / np.max(y) * height

# Plot the normal distribution curve
# NOTE(review): despite the 'Fitted Curve' label nothing is fitted to the data;
# the curve is fully determined by the constants above.
plt.plot(x, y, color='red', label='Fitted Curve', lw=2)

# Add legend
plt.legend()

# Label the x-axis
plt.xlabel('Sentiment score')

In [None]:
# Function to convert Happiness to sentiment
def to_sentiment(rating):
    """Map a happiness rating to a sentiment class index.

    The rating is converted with ``int()`` and bucketed:
    4 or lower -> 0, exactly 5 -> 1, 6 or higher -> 2.
    """
    score = int(rating)

    if score > 5:
        return 2
    if score == 5:
        return 1
    return 0

# Derive the sentiment class of every row from its happiness score
df['sentiment'] = df['happiness'].apply(to_sentiment)

In [None]:
# Plot the distribution of the derived sentiment classes
# Class names are listed in class-index order (0, 1, 2)
class_names = ['negative', 'neutral', 'positive']
ax = sns.countplot(x=df.sentiment, palette=HAPPY_COLORS_PALETTE)
plt.xlabel('tweet sentiment')
# Labels are applied positionally
# NOTE(review): assumes all three classes occur so the ticks are 0, 1, 2 -- confirm
ax.set_xticklabels(class_names)

In [None]:
# Name of the pre-trained checkpoint shared by the tokenizer and the BERT models
MODEL_NAME = 'bert-base-uncased'

# Build a BERT based tokenizer from that checkpoint
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

In [None]:
# Print some of the common BERT special tokens together with their ids
print(tokenizer.sep_token, tokenizer.sep_token_id) # marker for the end of a sentence
print(tokenizer.cls_token, tokenizer.cls_token_id) # start of each sentence, so BERT knows we are doing classification
print(tokenizer.pad_token, tokenizer.pad_token_id) # special token for padding
print(tokenizer.unk_token, tokenizer.unk_token_id) # stands in for tokens not found in the vocabulary

In [None]:
# Token count of every tweet, as produced by tokenizer.encode().
# Truncation is requested explicitly: passing max_length on its own relies on
# the tokenizer's backward-compatibility fallback and logs a warning.
token_lens = [
    len(tokenizer.encode(txt, max_length=512, truncation=True))
    for txt in df.text
]

In [None]:
# Plot the distribution of tweet lengths (in tokens)
# NOTE(review): seaborn has deprecated distplot in favour of histplot/displot;
# confirm against the installed seaborn version.
sns.distplot(token_lens)
# Only show token counts between 0 and 256
plt.xlim([0, 256]);
plt.xlabel('Token count')

In [None]:
# Sequence length (in tokens) passed as max_length to the tokenizer
MAX_LEN = 170

In [None]:
class SentiTweetsDataset(Dataset):
    """Map-style dataset that tokenizes one tweet per item.

    Parameters
    ----------
    tweets : indexable sequence
        Tweet texts; each item is converted with ``str()`` before encoding.
    targets : indexable sequence
        Integer class label of each tweet, aligned with ``tweets``.
    tokenizer : object
        Tokenizer exposing ``encode_plus`` (e.g. a Hugging Face BERT tokenizer).
    max_len : int
        Length every encoded sequence is padded or truncated to.
    """

    # Constructor Function
    def __init__(self, tweets, targets, tokenizer, max_len):
        self.tweets = tweets
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    # Length magic method
    def __len__(self):
        """Return the number of tweets."""
        return len(self.tweets)

    # get item magic method
    def __getitem__(self, item):
        """Return the encoded tweet at position ``item``.

        Returns a dict with the raw text (``tweet_text``), the flattened
        ``input_ids`` and ``attention_mask`` tensors, and the label as a
        ``torch.long`` tensor (``targets``).
        """
        tweet = str(self.tweets[item])
        target = self.targets[item]

        # Padding and truncation are requested explicitly. The deprecated
        # `pad_to_max_length=True` only truncated through a backward-
        # compatibility fallback; fixed-length output is required so the
        # DataLoader can stack items into a batch.
        encoding = self.tokenizer.encode_plus(
            tweet,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'tweet_text': tweet,
            # return_tensors='pt' adds a leading batch dimension; drop it
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long)
        }

In [None]:
# Create an 80% train / 10% validation / 10% test split:
# first hold out 20% of the rows, then cut that hold-out set in half.
# NOTE(review): the split is not stratified by sentiment -- confirm that is intended.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=RANDOM_SEED)
df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=RANDOM_SEED)

# Show the (rows, columns) shape of each split
print(df_train.shape, df_val.shape, df_test.shape)

In [None]:
#Create a dataloader to release data in batches.
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False):
    """Wrap the ``text`` and ``sentiment`` columns of ``df`` in a DataLoader.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame providing a ``text`` and a ``sentiment`` column.
    tokenizer : object
        Tokenizer handed to ``SentiTweetsDataset``.
    max_len : int
        Sequence length handed to ``SentiTweetsDataset``.
    batch_size : int
        Number of items per batch.
    shuffle : bool, optional
        Reshuffle the items at every epoch. Defaults to ``False``, which keeps
        the previous fixed-order behaviour; pass ``True`` for a training loader.

    Returns
    -------
    torch.utils.data.DataLoader
    """
    ds = SentiTweetsDataset(
        tweets=df.text.to_numpy(),
        targets=df.sentiment.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len)

    return DataLoader(
        ds,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=0
    )

In [None]:
# Create train, validation and test data loaders with a common batch size
# NOTE(review): the training loader is built like the others (no shuffling),
# so batches come in the same order every epoch -- confirm that is intended.
BATCH_SIZE = 32
train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)

In [None]:
# Fetch the first training batch and inspect its structure
data = next(iter(train_data_loader))
print(data.keys())

# Tensor shapes of the batch
print(data['input_ids'].shape)
print(data['attention_mask'].shape)
print(data['targets'].shape)

## Sentiment Classification with BERT and Hugging Face

In [None]:
# Load the basic BERT model
# NOTE: SentimentClassifier loads its own BertModel; in the visible notebook this
# instance is only used to print the hidden size.
bert_model = BertModel.from_pretrained(MODEL_NAME)

In [None]:
# Build the Sentiment Classifier class
class SentimentClassifier(nn.Module):
    """Pre-trained BERT followed by dropout and a linear output layer."""

    def __init__(self, n_classes):
        """Load the ``MODEL_NAME`` BERT weights and size the output layer for ``n_classes``."""
        super().__init__()
        self.bert = BertModel.from_pretrained(MODEL_NAME)
        # Dropout (p=0.3) between the BERT output and the output layer
        self.drop = nn.Dropout(p=0.3)
        hidden_size = self.bert.config.hidden_size
        self.out = nn.Linear(hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return the output-layer scores, one row per input sequence."""
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Element 1 of the BERT outputs is the pooled output
        pooled = bert_outputs[1]
        return self.out(self.drop(pooled))

In [None]:
# Notice if output is tensor

In [None]:
# Instantiate the classifier with one output per class and move it to the selected device
model = SentimentClassifier(len(class_names))
model = model.to(device)

In [None]:
# Number of hidden units of the BERT model (input size of the classifier's output layer)
print(bert_model.config.hidden_size)

In [None]:
# Number of training epochs
EPOCHS = 15

# AdamW optimizer over all model parameters
# NOTE(review): this AdamW comes from `transformers`, not `torch.optim`;
# `correct_bias` is specific to that implementation -- confirm it exists in
# the installed transformers version.
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)

# train_epoch steps the scheduler once per batch, so the schedule spans
# (batches per epoch) * (epochs) steps
total_steps = len(train_data_loader) * EPOCHS

# Linear learning-rate schedule without warm-up steps
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps)

# Set the loss function
loss_fn = nn.CrossEntropyLoss().to(device)

In [None]:
# Function for a single training iteration
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    """Train ``model`` for one pass over ``data_loader``.

    Parameters
    ----------
    model : torch.nn.Module
        Called as ``model(input_ids=..., attention_mask=...)``.
    data_loader : iterable
        Yields dicts with ``input_ids``, ``attention_mask`` and ``targets`` tensors.
    loss_fn : callable
        Loss applied to ``(outputs, targets)``.
    optimizer, scheduler
        Both are stepped once per batch.
    device : torch.device or str
        Device the batch tensors are moved to.
    n_examples : int
        Denominator used for the accuracy.

    Returns
    -------
    (accuracy, mean_loss)
        ``accuracy`` is a 0-dim float64 tensor (correct predictions divided by
        ``n_examples``); ``mean_loss`` is the mean of the per-batch losses, or
        NaN when ``data_loader`` yields no batches.
    """
    model = model.train()
    losses = []
    # Tensor accumulator, so `.double()` below also works when the loader is
    # empty (a plain int 0 has no such method).
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["targets"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask)

        # Predicted class = index of the largest output per row
        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, targets)
        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())

        # Backward prop
        loss.backward()

        # Gradient Descent (gradient norm clipped to 1.0 before the update)
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    # Avoid np.mean([]) and its RuntimeWarning when no batch was processed
    mean_loss = np.mean(losses) if losses else float("nan")
    return correct_predictions.double() / n_examples, mean_loss

In [None]:
# Write a function to evaluate model performance
def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Evaluate ``model`` on ``data_loader`` without computing gradients.

    Parameters
    ----------
    model : torch.nn.Module
        Called as ``model(input_ids=..., attention_mask=...)``; put in eval mode.
    data_loader : iterable
        Yields dicts with ``input_ids``, ``attention_mask`` and ``targets`` tensors.
    loss_fn : callable
        Loss applied to ``(outputs, targets)``.
    device : torch.device or str
        Device the batch tensors are moved to.
    n_examples : int
        Denominator used for the accuracy.

    Returns
    -------
    (accuracy, mean_loss)
        ``accuracy`` is a 0-dim float64 tensor (correct predictions divided by
        ``n_examples``); ``mean_loss`` is the mean of the per-batch losses, or
        NaN when ``data_loader`` yields no batches.
    """
    model = model.eval()

    losses = []
    # Tensor accumulator, so `.double()` below also works when the loader is
    # empty (a plain int 0 has no such method).
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)

            # Get model outputs
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            # Predicted class = index of the largest output per row
            _, preds = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, targets)

            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())

    # Avoid np.mean([]) and its RuntimeWarning when no batch was processed
    mean_loss = np.mean(losses) if losses else float("nan")
    return correct_predictions.double() / n_examples, mean_loss

In [None]:
%%time
# Training loop: train for EPOCHS epochs, evaluating on the validation set after each one.
# NOTE: saving the best model state is currently disabled (see the end of the loop body).

# Initialize history to store training and validation metrics
history = defaultdict(list)
best_accuracy = 0

for epoch in range(EPOCHS):
    
    # Show details 
    print(f"Epoch {epoch + 1}/{EPOCHS}")
    print("-" * 10)
    
    # One optimisation pass over the training set
    train_acc, train_loss = train_epoch(
        model,
        train_data_loader,
        loss_fn,
        optimizer,
        device,
        scheduler,
        len(df_train))
    
    print(f"Train loss {train_loss} accuracy {train_acc}")
    
    # Get model performance (accuracy and loss) on the validation set
    val_acc, val_loss = eval_model(
        model,
        val_data_loader,
        loss_fn,
        device,
        len(df_val))
    
    print(f"Val   loss {val_loss} accuracy {val_acc}")
    print()
    
    # Accuracies are tensors; store them as plain Python floats
    history['train_acc'].append(train_acc.item())
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc.item())
    history['val_loss'].append(val_loss)
    
    # The statement below is a bare string literal, not executable code:
    # no checkpoint is written and best_accuracy is never updated.
    '''# If we beat prev performance
    if val_acc > best_accuracy:
        torch.save(model.state_dict(), 'best_model_state.bin')
        best_accuracy = val_acc'''

## Model Evaluation

In [None]:
# Plot training and validation accuracy, one point per epoch
plt.plot(history['train_acc'], label='train accuracy')
plt.plot(history['val_acc'], label='validation accuracy')

# Title, axis labels, legend and y-axis limits
plt.title('Training history')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend()
plt.ylim([0, 1]);

In [None]:
# Model evaluation on the held-out test set (the loss is discarded)
test_acc, _ = eval_model(
  model,
  test_data_loader,
  loss_fn,
  device,
  len(df_test)
)

# Display the test accuracy as a plain Python float
test_acc.item()

In [None]:
def get_predictions(model, data_loader, device=None):
    """Run ``model`` over ``data_loader`` and collect its predictions.

    Parameters
    ----------
    model : torch.nn.Module
        Called as ``model(input_ids=..., attention_mask=...)``; put in eval mode.
    data_loader : iterable
        Yields dicts with ``tweet_text``, ``input_ids``, ``attention_mask``
        and ``targets`` entries.
    device : torch.device or str, optional
        Device the inputs are moved to. Defaults to the device of the model's
        first parameter (CPU if the model has no parameters).

    Returns
    -------
    (tweet_texts, predictions, prediction_probs, real_values)
        ``tweet_texts`` is a list of strings; the other three are CPU tensors:
        predicted class per tweet, per-class probabilities (softmax of the
        model outputs, one row per tweet) and the true labels. The tensors are
        empty when ``data_loader`` yields no batches.
    """
    model = model.eval()

    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    tweet_texts = []
    pred_batches = []
    prob_batches = []
    target_batches = []

    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)

            # Get outputs
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            _, preds = torch.max(outputs, dim=1)

            tweet_texts.extend(d["tweet_text"])
            pred_batches.append(preds.cpu())
            # The model returns raw scores; normalise them so that the
            # returned "probs" really are probabilities.
            prob_batches.append(torch.softmax(outputs, dim=1).cpu())
            target_batches.append(d["targets"].cpu())

    # torch.cat / torch.stack reject an empty list, so handle "no batches" explicitly
    if not pred_batches:
        empty_labels = torch.empty(0, dtype=torch.long)
        return tweet_texts, empty_labels, torch.empty(0), empty_labels.clone()

    predictions = torch.cat(pred_batches)
    prediction_probs = torch.cat(prob_batches)
    real_values = torch.cat(target_batches)

    return tweet_texts, predictions, prediction_probs, real_values

In [None]:
# Collect texts, predicted classes, model outputs and true labels for the test set
y_tweet_texts, y_pred, y_pred_probs, y_test = get_predictions(
    model,
    test_data_loader
)
# Per-class metrics, labelled with the class names
print(classification_report(y_test, y_pred, target_names=class_names))

In [None]:
def show_confusion_matrix(confusion_matrix):
    """Draw ``confusion_matrix`` as an annotated blue heatmap.

    Y tick labels are kept horizontal, x tick labels are rotated by 30
    degrees, and the axes are labelled as true vs. predicted sentiment.
    """
    ax = sns.heatmap(confusion_matrix, annot=True, fmt="d", cmap="Blues")

    y_labels = ax.yaxis.get_ticklabels()
    ax.yaxis.set_ticklabels(y_labels, rotation=0, ha='right')

    x_labels = ax.xaxis.get_ticklabels()
    ax.xaxis.set_ticklabels(x_labels, rotation=30, ha='right')

    plt.ylabel('True sentiment')
    plt.xlabel('Predicted sentiment')

# Confusion matrix of the test-set predictions, labelled with the class names
cm = confusion_matrix(y_test, y_pred)
df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)
show_confusion_matrix(df_cm)

## Predicting on raw text

In [None]:
# Classify a single hand-written example
test_text = "Elon supports Trump"

# Encode exactly like the training data; padding and truncation are requested
# explicitly instead of through the deprecated `pad_to_max_length` flag.
encoded_test = tokenizer.encode_plus(
    test_text,
    max_length=MAX_LEN,
    add_special_tokens=True,
    return_token_type_ids=False,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
)

input_ids = encoded_test['input_ids'].to(device)
attention_mask = encoded_test['attention_mask'].to(device)

# Inference only: make sure dropout is disabled and skip gradient tracking
model.eval()
with torch.no_grad():
    output = model(input_ids, attention_mask)
_, prediction = torch.max(output, dim=1)

print(f'Test text: {test_text}')
print(f'Sentiment  : {class_names[prediction.item()]}')

In [None]:
# Read the data to be classified
data = pd.read_csv('classified_data_bayes.csv')

# Clean the 'text' column: lowercase, then delete every character that is not
# an ASCII letter, digit, space or tab, and the standalone word 'url'.
# NOTE(review): matches are deleted here ("") but replaced by a space (" ") in
# the training data preparation, so e.g. "don't" becomes "dont" here and
# "don t" there -- confirm this difference is intended.
data["text"] = data["text"].str.lower().str.replace(r"([^0-9A-Za-z \t])|\burl\b", "", case=False, regex=True)

# Remove emojis from every row in one pass instead of a row-by-row iterrows() loop
data["text"] = data["text"].apply(strip_emoji)
    
# Define a function for classification
def classify_text(text):
    """Return the predicted sentiment class name for a single text.

    Uses the module-level ``tokenizer``, ``model``, ``device``, ``MAX_LEN``
    and ``class_names``. ``text`` is converted with ``str()`` first, the same
    way ``SentiTweetsDataset`` treats its items.
    """
    encoded_test = tokenizer.encode_plus(
        str(text),
        max_length=MAX_LEN,
        add_special_tokens=True,
        return_token_type_ids=False,
        # Explicit replacements for the deprecated `pad_to_max_length=True`
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    input_ids = encoded_test['input_ids'].to(device)
    attention_mask = encoded_test['attention_mask'].to(device)

    # Make sure dropout is disabled regardless of what ran before this call
    model.eval()
    with torch.no_grad():  # Disable gradient calculation
        output = model(input_ids, attention_mask)

    _, prediction = torch.max(output, dim=1)  # Get the predicted class
    return class_names[prediction.item()]

# Classify each row in the 'text' column and store results in the 'sentiment' column
data['sentiment'] = data['text'].apply(classify_text)

# Save the results to a new CSV file; the same name is reused in the message
# below so the printed path always matches the file that was written
OUTPUT_CSV = 'classified_data_sentiment_.csv'
data.to_csv(OUTPUT_CSV, index=False, encoding='utf-8-sig')

print(f"Classification completed and saved as '{OUTPUT_CSV}'")

# Word shift Graph lexicon

In [None]:
import pandas as pd
import numpy as np
import collections
import itertools
import nltk
from nltk.corpus import stopwords
import re
from collections.abc import Mapping
import shifterator as sh

import matplotlib.pyplot as plt
import seaborn as sns

#sns.set(font_scale=1.5)
#sns.set_style("whitegrid")

In [None]:
# Matches either a single character that is not an ASCII letter, digit, space
# or tab, or a whole "scheme://..." URL. Written as a raw string so the regex
# escapes (\w, \S, \/) are not invalid string-literal escape sequences.
_PUNCT_OR_URL_RE = re.compile(r"([^0-9A-Za-z \t])|(\w+:\/\/\S+)")


def remove_punctuation(txt: str) -> str:
    """Strip punctuation and URLs from ``txt`` and normalise whitespace.

    Every match of the pattern above is deleted, then runs of whitespace are
    collapsed into single spaces and leading/trailing whitespace is removed.
    """
    return " ".join(_PUNCT_OR_URL_RE.sub("", txt).split())

def clean_text(txt: list) -> collections.Counter:
    """Count the non-stopword words of a collection of texts.

    Each text is stripped of punctuation and URLs with ``remove_punctuation``,
    lowercased and split on whitespace; words found in the module-level
    ``stop_words`` set are dropped and the remaining words are counted.

    Parameters
    ----------
    txt : iterable of str
        The texts to clean. (Passing a single string would iterate over its
        characters, so wrap a lone text in a list.)

    Returns
    -------
    collections.Counter
        Mapping of word -> number of occurrences across all texts.
    """
    words = (
        word
        for text in txt
        for word in remove_punctuation(text).lower().split()
        if word not in stop_words
    )
    return collections.Counter(words)

In [None]:
# Load the classified tweets written by the sentiment classification step
df = pd.read_csv('classified_data_sentiment_.csv')

In [None]:
# Number of rows with a distinct 'text' value (df itself is not modified here)
print(df.drop_duplicates("text").shape[0])

In [None]:
# Lowercase the text and delete every character that is not an ASCII letter,
# digit, space or tab, plus the standalone word 'url'. The pattern must be a
# raw string: in a normal string literal "\b" is the backspace character, so
# the word-boundary match on 'url' never fired.
df["text"] = df["text"].str.lower().str.replace(r"([^0-9A-Za-z \t])|\burl\b", "", case=False, regex=True)

In [None]:
# Keep only the first row for each distinct 'text' value
df = df.drop_duplicates("text")
print(df.shape[0])  # number of rows left after de-duplication

In [None]:
# Histogram of the predicted sentiment labels
df.sentiment.hist();

In [None]:
# Split out the tweets labelled negative and positive (neutral tweets are in neither subset)
df_neg = df[df['sentiment'] == 'negative']
df_pos = df[df['sentiment'] == 'positive']

In [None]:
# Plain Python lists of the tweet texts: all tweets, negative only, positive only
texts = df['text'].tolist()
texts_neg = df_neg['text'].tolist()
texts_pos = df_pos['text'].tolist()

In [None]:
# English stopwords from NLTK (requires the NLTK 'stopwords' corpus to be available)
stop_words = set(stopwords.words('english'))
# Extra tokens to ignore
# NOTE(review): presumably placeholders produced by an earlier preprocessing step -- confirm
stop_words.update(['puncexcl', 'puncques', "hashelection2024"])

In [None]:
# Clean up the tweet texts and count word frequencies for each group
clean_texts = clean_text(texts)
clean_texts_neg = clean_text(texts_neg)
clean_texts_pos = clean_text(texts_pos)

## Plotting

In [None]:
# Word shift graph: all tweets (system 1) vs. negative tweets (system 2),
# both scored with the 'labMT_English' lexicon.
# NOTE(review): stop_lens=[(4,6)] presumably excludes words whose score lies
# between 4 and 6 -- confirm against the shifterator documentation.
sentiment_shift = sh.WeightedAvgShift(type2freq_1 = clean_texts,
                                      type2freq_2 = clean_texts_neg,
                                      type2score_1 = 'labMT_English',
                                      type2score_2 = 'labMT_English',
                                      stop_lens=[(4,6)])
# Draw the detailed graph for the top 30 words
sentiment_shift.get_shift_graph(detailed=True,
                                top_n=30,
                                system_names=['all tweets', 'negative'])

In [None]:
# Word shift graph: negative tweets (system 1) vs. positive tweets (system 2),
# both scored with the 'labMT_English' lexicon.
sentiment_shift = sh.WeightedAvgShift(type2freq_1 = clean_texts_neg,
                                      type2freq_2 = clean_texts_pos,
                                      type2score_1 = 'labMT_English',
                                      type2score_2 = 'labMT_English',
                                      stop_lens=[(4,6)])
# top_n is not passed here, so the library default applies
sentiment_shift.get_shift_graph(detailed=True,
                                system_names=['negative', 'positive'])

In [None]:
# political

In [None]:
# Split the tweets by the 'support' column: Trump vs. everything labelled 'Others' or 'Harris'
df_t = df[df['support'] == 'Trump']
df_o = df[(df['support'] == 'Others') | (df['support'] == 'Harris')]
# Note: `texts` is rebuilt here, but `clean_texts` is not recomputed in this cell
texts = df['text'].tolist()
texts_t = df_t['text'].tolist()
texts_o = df_o['text'].tolist()
# Clean up the tweet texts and count word frequencies for each group
clean_texts_t = clean_text(texts_t)
clean_texts_o = clean_text(texts_o)
# Show how many tweets fall into each 'support' category
df['support'].value_counts()

In [None]:
# Word shift graph: all tweets (system 1) vs. the 'Others'/'Harris' group (system 2),
# both scored with the 'labMT_English' lexicon.
sentiment_shift = sh.WeightedAvgShift(type2freq_1 = clean_texts,
                                      type2freq_2 = clean_texts_o,
                                      type2score_1 = 'labMT_English',
                                      type2score_2 = 'labMT_English',
                                      stop_lens=[(4,6)])
# Draw the detailed graph for the top 30 words (detailed=1 is truthy, like True)
sentiment_shift.get_shift_graph(detailed=1,
                                top_n=30,
                                system_names=['All text', 'Others'])

In [None]:
# Word shift graph: all tweets (system 1) vs. the 'Trump' group (system 2),
# both scored with the 'labMT_English' lexicon.
sentiment_shift = sh.WeightedAvgShift(type2freq_1 = clean_texts,
                                      type2freq_2 = clean_texts_t,
                                      type2score_1 = 'labMT_English',
                                      type2score_2 = 'labMT_English',
                                      stop_lens=[(4,6)])
# Draw the detailed graph for the top 30 words (detailed=1 is truthy, like True)
sentiment_shift.get_shift_graph(detailed=1,
                                top_n=30,
                                system_names=['All text', 'Trump'])