## Sentiment Analysis with Transformers
In this notebook, we will fine-tune a pre-trained transformer model for sentiment analysis using a custom dataset of tweets.

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the dataset
df = pd.read_csv('./ML Assignment Dataset - Train.csv')
df.rename(columns={
    'tweet_text': 'text',
    'emotion_in_tweet_is_directed_at': 'brand',
    'is_there_an_emotion_directed_at_a_brand_or_product': 'emotion'
}, inplace=True)

# Map the emotion labels to categories
def map_to_categories(label):
    if label in ['Negative emotion', 'negative']:
        return 'negative'
    elif label in ['Positive emotion', 'positive']:
        return 'positive'
    else:
        return 'neutral'
df['emotion'] = df['emotion'].apply(map_to_categories)

In [None]:
df.drop('brand', axis=1, inplace=True)
df = df.dropna(subset=['text'])

In [None]:
df.to_csv('wysa.csv', index=False)

### Run the code from here once csv is saved locally

In [9]:
import tensorflow as tf
from transformers import TFAutoModel, AutoTokenizer
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [10]:
# Load CSV data
df = pd.read_csv('./wysa.csv')
df.shape

(8588, 2)

Augmentation of df using synonym replacement

In [11]:
import nltk
from nltk.corpus import wordnet
import random

nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')


# Function to augment text data using synonym replacement
def synonym_replacement(text, n):
    words = text.split()
    new_words = words.copy()
    random_word_list = list(set([word for word in words if wordnet.synsets(word)]))
    random.shuffle(random_word_list)
    num_replaced = 0
    for random_word in random_word_list:
        synonyms = wordnet.synsets(random_word)
        if len(synonyms) >= 1:
            synonym = random.choice(synonyms).lemmas()[0].name()
            new_words = [synonym if word == random_word else word for word in new_words]
            num_replaced += 1
        if num_replaced >= n:
            break

    sentence = ' '.join(new_words)
    return sentence

# Augment the dataset
augmented_texts = []
augmented_labels = []
for _, row in df.iterrows():
    augmented_texts.append(row['text'])
    augmented_labels.append(row['emotion'])
    for _ in range(1):  # Augment each entry once
        aug_text = synonym_replacement(row['text'], n=2)  # Replace up to 2 words
        augmented_texts.append(aug_text)
        augmented_labels.append(row['emotion'])

augmented_df = pd.DataFrame({'text': augmented_texts, 'emotion': augmented_labels})

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [12]:
augmented_df.shape

(17176, 2)

In [13]:


# Encode the labels (emotion) into numerical format
label_encoder = LabelEncoder()
augmented_df['label'] = label_encoder.fit_transform(augmented_df['emotion'])

# Splitting the dataset into training and testing sets
train_df, test_df = train_test_split(augmented_df, test_size=0.2)

# Load pre-trained BERT model and tokenizer
bert_model_name = "bert-base-uncased"
model = TFAutoModel.from_pretrained(bert_model_name)
tokenizer = AutoTokenizer.from_pretrained(bert_model_name)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [14]:
# Tokenization and dataset preparation
def tokenize_and_format(df):
    tokenized = tokenizer(list(df['text']), padding=True, truncation=True, max_length=512, return_tensors='tf')
    return tokenized.data, tf.convert_to_tensor(df['label'])

train_data, train_labels = tokenize_and_format(train_df)
test_data, test_labels = tokenize_and_format(test_df)

In [15]:
# Create TensorFlow datasets
BATCH_SIZE = 16
train_dataset = tf.data.Dataset.from_tensor_slices((train_data, train_labels)).shuffle(len(train_df)).batch(BATCH_SIZE)
test_dataset = tf.data.Dataset.from_tensor_slices((test_data, test_labels)).batch(BATCH_SIZE)

# Define a custom BERT-based classification model
class BERTForClassification(tf.keras.Model):
    def __init__(self, bert_model, num_classes):
        super().__init__()
        self.bert = bert_model
        self.fc = tf.keras.layers.Dense(num_classes, activation='softmax')

    def call(self, inputs):
        x = self.bert(inputs)[1]
        return self.fc(x)

# Compile and train the classifier
num_classes = 3  # Positive, Negative, Neutral
classifier = BERTForClassification(model, num_classes=num_classes)
classifier.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
                   loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                   metrics=['accuracy'])

classifier.fit(train_dataset, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x7e868fd22b30>

In [16]:
# Evaluate the model
classifier.evaluate(test_dataset)



[0.3285703957080841, 0.8696158528327942]

In [None]:
classifier.save('./bert_emotion_classifier')



In [24]:
def predict_emotion(text, model, tokenizer, label_encoder):
    # Tokenize the input text
    tokenized_input = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors='tf')

    # Predict
    prediction = model(tokenized_input)

    # Convert the prediction tensor to a numpy array and get the index of the maximum value
    # Ensure the prediction is reshaped into a suitable format if needed
    predicted_label_index = tf.argmax(prediction, axis=1).numpy()[0]

    # Convert the index to the corresponding emotion label
    predicted_label = label_encoder.inverse_transform([predicted_label_index])[0]

    return predicted_label

# Example usage
sample_text = "you are a murderer!"
predicted_emotion = predict_emotion(sample_text, classifier, tokenizer, label_encoder)
print(f"Predicted Emotion: {predicted_emotion}")

Predicted Emotion: negative
