# Part-1 **Sentiment Analysis using BERT on Twitter US-Airlines Sentiment dataset **

# **RoBERTa-base**

In [61]:
# !pip install transformers 
# !pip install sentencepiece

import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from transformers import RobertaTokenizer, XLMRobertaForSequenceClassification
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import re
import nltk
from string import punctuation
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [62]:
# Load the dataset
df = pd.read_csv('Tweets.csv')

In [63]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [64]:
# !pip install vaderSentiment
import vaderSentiment

In [65]:
# Define preprocessing functions
def remove_usernames(text):
    return re.sub(r'@[A-Za-z0-9]+', '', text)

def remove_urls(text):
    return re.sub(r'http\S+', '', text)

def preprocess_tweet_text(tweet):
    # Convert to lowercase
    tweet = tweet.lower()
    
    # Remove URLs
    tweet = remove_urls(tweet)
    
    # Tokenize the tweet
    tokens = word_tokenize(tweet)
    
    # Remove stop words and punctuation
    stop_words = set(stopwords.words('english') + list(punctuation))
    tokens = [token for token in tokens if token not in stop_words]
    
    # Lemmatize the tokens
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    # Join the tokens into a string
    tweet = ' '.join(tokens)
    
    return tweet

In [66]:
# Apply preprocessing to the 'text' column
df['text'] = df['text'].apply(remove_usernames)
df['text'] = df['text'].apply(preprocess_tweet_text)

In [68]:
# Split the dataset into training, validation, and test sets
train_text, temp_text, train_labels, temp_labels = train_test_split(df['text'], df['airline_sentiment'], 
                                                                    random_state=2018, 
                                                                    test_size=0.3, 
                                                                    stratify=df['airline_sentiment'])

val_text, test_text, val_labels, test_labels = train_test_split(temp_text, temp_labels, 
                                                                random_state=2018, 
                                                                test_size=0.5, 
                                                                stratify=temp_labels)

# Load pre-trained DistilBERT tokenizer and encode text
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

train_encodings = tokenizer(train_text.tolist(), truncation=True, padding=True)
val_encodings = tokenizer(val_text.tolist(), truncation=True, padding=True)
test_encodings = tokenizer(test_text.tolist(), truncation=True, padding=True)

df['airline_sentiment'] = df['airline_sentiment'].map({'negative': 0, 'neutral': 1, 'positive': 2})

label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_labels)
val_labels = label_encoder.transform(val_labels)
test_labels = label_encoder.transform(test_labels)

train_dataset = TensorDataset(torch.tensor(train_encodings['input_ids']),
                              torch.tensor(train_encodings['attention_mask']),
                              torch.tensor(train_labels))

val_dataset = TensorDataset(torch.tensor(val_encodings['input_ids']),
                            torch.tensor(val_encodings['attention_mask']),
                            torch.tensor(val_labels))

test_dataset = TensorDataset(torch.tensor(test_encodings['input_ids']),
                             torch.tensor(test_encodings['attention_mask']),
                             torch.tensor(test_labels))

In [69]:
# Tensorizing the data using data loaders 
def get_data_loaders(train_inputs, train_labels, val_inputs, val_labels, batch_size):
    # Convert data to PyTorch tensors
    train_inputs = torch.tensor(train_inputs)
    train_labels = torch.tensor(train_labels)
    val_inputs = torch.tensor(val_inputs)
    val_labels = torch.tensor(val_labels)
    
    # Create TensorDataset objects
    train_data = TensorDataset(train_inputs, train_labels)
    val_data = TensorDataset(val_inputs, val_labels)
    
    # Create DataLoader objects
    train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    val_dataloader = DataLoader(val_data, batch_size=batch_size, shuffle=False)
    
    return train_dataloader, val_dataloader

In [70]:
from transformers import RobertaForSequenceClassification

# Define data loaders
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=False)

# Load pre-trained roberta-base model
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels = 3)

# Move model to the device
model = model.to(device)

# Define optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)
num_warmup_steps = int(len(train_dataloader) * 0.1)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=5)
epochs = 5

# Define cross-entropy loss function
loss_fn = torch.nn.CrossEntropyLoss()

# Define early_stop
early_stop = 3
best_val_loss = float('inf')
best_epoch = 0
for epoch in range(epochs):
    # Training
    model.train()
    train_loss = 0
    train_acc = 0
    for batch in train_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()
        outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs[0]
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()
        train_acc += (outputs[1].detach().cpu().numpy().argmax(axis=1) == b_labels.cpu().numpy()).mean()
    train_loss /= len(train_dataloader)
    train_acc /= len(train_dataloader)

    # Evaluation
    model.eval()
    val_loss = 0
    val_acc = 0
    with torch.no_grad():
        for batch in val_dataloader:
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch
            outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
            loss = outputs[0]
            val_loss += loss.item()
            val_acc += (outputs[1].detach().cpu().numpy().argmax(axis=1) == b_labels.cpu().numpy()).mean()
    val_loss /= len(val_dataloader)
    val_acc /= len(val_dataloader)

    print("Epoch {} - train loss: {:.3f} - train acc: {:.3f} - val loss: {:.3f} - val acc: {:.3f}".format(epoch, train_loss, train_acc, val_loss, val_acc))

    # Save the model
    if val_loss < best_val_loss:
        torch.save(model.state_dict(), 'roberta_sentiment_model.pt')
        best_val_loss = val_loss
        best_epoch = epoch
        print("The model has been saved")

    # Stop training if the validation loss stops improving after certain epochs
    if epoch - best_epoch >= early_stop:
        print("Validation loss has not improved in {} epochs, stopping training".format(early_stop))
        break

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classi

Epoch 0 - train loss: 0.788 - train acc: 0.668 - val loss: 0.742 - val acc: 0.685
The model has been saved
Epoch 1 - train loss: 0.771 - train acc: 0.674 - val loss: 0.742 - val acc: 0.685
Epoch 2 - train loss: 0.772 - train acc: 0.674 - val loss: 0.742 - val acc: 0.685
Epoch 3 - train loss: 0.772 - train acc: 0.672 - val loss: 0.742 - val acc: 0.685
Validation loss has not improved in 3 epochs, stopping training


**Batch size 8** taking about **200 seconds** for each episode to run.

Epoch 0 - train loss: 0.788 - train acc: 0.668 - val loss: 0.742 - val acc: 0.685

Epoch 1 - train loss: 0.771 - train acc: 0.674 - val loss: 0.742 - val acc: 0.685

Epoch 2 - train loss: 0.772 - train acc: 0.674 - val loss: 0.742 - val acc: 0.685

Epoch 3 - train loss: 0.772 - train acc: 0.672 - val loss: 0.742 - val acc: 0.685

**Batch size 16** taking about **200 seconds** for each episode to run.

Epoch 0 - train loss: 0.837 - train acc: 0.606 - val loss: 0.796 - val acc: 0.629

Epoch 1 - train loss: 0.817 - train acc: 0.627 - val loss: 0.796 - val acc: 0.629

Epoch 2 - train loss: 0.817 - train acc: 0.627 - val loss: 0.796 - val acc: 0.629

Epoch 3 - train loss: 0.816 - train acc: 0.627 - val loss: 0.796 - val acc: 0.629

**Batch size 32** taking about **150 seconds** for each episode to run.

Epoch 0 - train loss: 0.914 - train acc: 0.618 - val loss: 0.890 - val acc: 0.628

Epoch 1 - train loss: 0.900 - train acc: 0.627 - val loss: 0.890 - val acc: 0.628

Epoch 2 - train loss: 0.897 - train acc: 0.627 - val loss: 0.890 - val acc: 0.628

Epoch 3 - train loss: 0.899 - train acc: 0.627 - val loss: 0.890 - val acc: 0.628





**Observations from the Roberta(base) model:**

The Roberta model achieves a training accuracy of 67.4% and a validation accuracy of 68.5%, with a loss of 0.788 after the first epoch. The model improves slightly in the second epoch but does not improve further in the subsequent epochs. The validation loss remains constant at 0.742 from the second epoch onwards. This suggests that the model might have reached the highest it can learn, based on the parameters for the data provided.

It is interesting and worth noting that the Roberta (base) model performs better than the (base) DistilBERT model in terms of accuracy, as the former achieves a higher validation accuracy. However, the Roberta model does not improve significantly in terms of accuracy after the first epoch, which suggests that the model might not be able to learn much from the data after the initial training. This could be attributed to factors such as the size and quality of the training data, as well as the complexity of the model architecture.