Colab notebook to train the ROBERTA model for sentiment analysis in the vaccine data

# Install packages

In [1]:
# Install packages

%pip install torch
%pip install transformers
%pip install datasets
%pip install transformers[torch]

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m59.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m52.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m68.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Downloading nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

# Environment settings

In [2]:
# mount the colab session to google drive

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Disabe W&B to conserve resources
import os
os.environ["WANDB_DISABLED"] = "true"

## Import packages

In [4]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

from datasets import load_dataset
from datasets import load_metric

from wordcloud import WordCloud

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, DataCollatorWithPadding
from transformers import Trainer

# Prepare dataset



In [5]:
train_df_path =  "/content/drive/MyDrive/University/NLP/Sentiment_Analysis__vaccine_data/data/train_subset.csv"
val_df_path = "/content/drive/MyDrive/University/NLP/Sentiment_Analysis__vaccine_data/data/eval_subset.csv"

train_raw_df = pd.read_csv(train_df_path)
val_raw_df = pd.read_csv(val_df_path)

In [6]:
train_raw_df.head()

Unnamed: 0,tweet_id,safe_text,label,agreement,tweet_length
0,YMRMEDME,Mickey's Measles has gone international <url>,Neutral,1.0,45
1,5GV8NEZS,S1256 [NEW] Extends exemption from charitable ...,Neutral,1.0,122
2,EI10PS46,<user> your ignorance on vaccines isn't just ...,Positive,0.666667,118
3,OM26E6DG,Pakistan partly suspends polio vaccination pro...,Neutral,1.0,93
4,NBBY86FX,In other news I've gone up like 1000 mmr,Neutral,1.0,40


In [7]:
# drop the null rows, if any

train_df = train_raw_df.dropna()
val_df = val_raw_df.dropna()

In [8]:
print("Train data shape : ", train_df.shape)
print("Val data shape", val_df.shape)

Train data shape :  (7999, 5)
Val data shape (2000, 5)


In [9]:
train_df.head()

Unnamed: 0,tweet_id,safe_text,label,agreement,tweet_length
0,YMRMEDME,Mickey's Measles has gone international <url>,Neutral,1.0,45
1,5GV8NEZS,S1256 [NEW] Extends exemption from charitable ...,Neutral,1.0,122
2,EI10PS46,<user> your ignorance on vaccines isn't just ...,Positive,0.666667,118
3,OM26E6DG,Pakistan partly suspends polio vaccination pro...,Neutral,1.0,93
4,NBBY86FX,In other news I've gone up like 1000 mmr,Neutral,1.0,40


In [10]:
val_df.head()

Unnamed: 0,tweet_id,safe_text,label,agreement,tweet_length
0,R7JPIFN7,Children's Museum of Houston to Offer Free Vac...,Positive,1.0,61
1,2DD250VN,<user> no. I was properly immunized prior to t...,Positive,1.0,102
2,ESEVBTFN,<user> thx for posting vaccinations are impera...,Positive,1.0,120
3,S17ZU0LC,This Baby Is Exactly Why Everyone Needs To Vac...,Positive,0.666667,69
4,IIN5D33V,"Meeting tonight, 8:30pm in room 322 of the stu...",Positive,1.0,118


In [11]:
# save the data

train_path = "/content/train.csv"
val_path = "/content/val.csv"

train_df.to_csv(train_path, index=False)
val_df.to_csv(val_path, index=False)

In [12]:
# Load the dataset in format for ROBERTA training

dataset = load_dataset('csv',
                        data_files={'train': train_path,
                                    'val': val_path},
                       encoding = "ISO-8859-1")


Generating train split: 0 examples [00:00, ? examples/s]

Generating val split: 0 examples [00:00, ? examples/s]

**ROBERTA tokenizer**

In [13]:
# Create a ROBERTA tokenizer
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment", num_labels=3)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

In [14]:
# Function to map labels for ROBERTA training
def map_labels(data):
  label = data['label']

  encoding = 0
  if label == 'Negative': # -1
    encoding = 0
  elif label == 'Neutral': # 0
    encoding = 1
  elif label == 'Positive': # 1
    encoding = 2

  return {'labels': encoding}


# Function to tokenize tweets
def tokenize_data(example):
  return tokenizer(example['safe_text'], padding='max_length', truncation=True, max_length=256)


# Do the tokenization of dataset
dataset = dataset.map(tokenize_data, batched=True)


# Do the final transformation
removable_columns = ['tweet_id', 'safe_text', 'label', 'agreement']
dataset = dataset.map(map_labels, remove_columns=removable_columns)

Map:   0%|          | 0/7999 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7999 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [15]:
# View the dataset
dataset

DatasetDict({
    train: Dataset({
        features: ['tweet_length', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 7999
    })
    val: Dataset({
        features: ['tweet_length', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2000
    })
})

In [16]:
# split the main dataset with train and val set

train_dataset = dataset['train'].shuffle(seed=5)
val_dataset = dataset['val'].shuffle(seed=0)

In [17]:
# create pytorch tensors with padding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Training configuration and model creation

In [18]:
weight_fol = "/content/drive/MyDrive/University/NLP/Sentiment_Analysis__vaccine_data/ROBERTA/weights"
log_fol = "/content/drive/MyDrive/University/NLP/Sentiment_Analysis__vaccine_data/ROBERTA/logs"

logging_steps = 500
epochs = 2

In [19]:
# Configure training parameters and hyper-parameters

training_args = TrainingArguments(
    weight_fol,  # folder to save weights
    logging_dir=log_fol, # Directory for logs
    logging_steps=logging_steps, # logging interval

    num_train_epochs=epochs,  # no of epochs

    load_best_model_at_end=True,  # load best model at training end
    evaluation_strategy="steps",
    save_strategy="steps",
)


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [20]:
# Define metric

metric = load_metric("accuracy")

def compute_metrics(eval_predictions):
  logits, labels = eval_predictions
  predictions = np.argmax(logits, axis=-1)

  return metric.compute(predictions=predictions, references=labels)


  metric = load_metric("accuracy")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

**ROBERTA model**

In [21]:
# Load pretrained ROBERTA model to fine tune

model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment", num_labels=3)

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [22]:
# create trainer

trainer = Trainer(
  model = model,
  tokenizer = tokenizer,
  train_dataset = train_dataset,
  eval_dataset = val_dataset,
  data_collator = data_collator,
  compute_metrics = compute_metrics,
  args = training_args,
)


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


# Train

In [23]:
# Start the training

trainer.train()

Step,Training Loss,Validation Loss,Accuracy
500,0.7366,0.603973,0.758
1000,0.6248,0.582617,0.7855
1500,0.4685,0.631452,0.797
2000,0.4235,0.586734,0.8


TrainOutput(global_step=2000, training_loss=0.563357437133789, metrics={'train_runtime': 886.2329, 'train_samples_per_second': 18.052, 'train_steps_per_second': 2.257, 'total_flos': 2104644228406272.0, 'train_loss': 0.563357437133789, 'epoch': 2.0})

# Evaluate

In [24]:
# Evaluate model

trainer.evaluate()

{'eval_loss': 0.5826169848442078,
 'eval_accuracy': 0.7855,
 'eval_runtime': 28.9899,
 'eval_samples_per_second': 68.99,
 'eval_steps_per_second': 8.624,
 'epoch': 2.0}

Evaluation accuracy : **78.55%**

# Save final weights

In [25]:
# Save the trained model and tokenizer

model.save_pretrained("/content/drive/MyDrive/University/NLP/Sentiment_Analysis__vaccine_data/ROBERTA/final_weights/model")
tokenizer.save_pretrained("/content/drive/MyDrive/University/NLP/Sentiment_Analysis__vaccine_data/ROBERTA/final_weights/tokenizer")

('/content/drive/MyDrive/University/NLP/Sentiment_Analysis__vaccine_data/ROBERT/final_weights/tokenizer/tokenizer_config.json',
 '/content/drive/MyDrive/University/NLP/Sentiment_Analysis__vaccine_data/ROBERT/final_weights/tokenizer/special_tokens_map.json',
 '/content/drive/MyDrive/University/NLP/Sentiment_Analysis__vaccine_data/ROBERT/final_weights/tokenizer/vocab.json',
 '/content/drive/MyDrive/University/NLP/Sentiment_Analysis__vaccine_data/ROBERT/final_weights/tokenizer/merges.txt',
 '/content/drive/MyDrive/University/NLP/Sentiment_Analysis__vaccine_data/ROBERT/final_weights/tokenizer/added_tokens.json',
 '/content/drive/MyDrive/University/NLP/Sentiment_Analysis__vaccine_data/ROBERT/final_weights/tokenizer/tokenizer.json')