<a href="https://colab.research.google.com/github/zen030/CourseProject/blob/main/NAIVE_LARGE_RoBERTa_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **This notebook was implemented and tested in the Google Colab Pro environment**

# 1. Colab Configuration

In [None]:
# Install the Hugging Face Transformers library and PyDrive into the runtime.
# NOTE(review): no versions are pinned; the recorded run below resolved
# transformers 3.4.0 — consider pinning if results must be reproduced.
!pip install transformers
!pip install PyDrive

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/2c/4e/4f1ede0fd7a36278844a277f8d53c21f88f37f3754abf76a5d6224f76d4a/transformers-3.4.0-py3-none-any.whl (1.3MB)
[K     |████████████████████████████████| 1.3MB 7.7MB/s 
Collecting tokenizers==0.9.2
[?25l  Downloading https://files.pythonhosted.org/packages/7c/a5/78be1a55b2ac8d6a956f0a211d372726e2b1dd2666bb537fea9b03abd62c/tokenizers-0.9.2-cp36-cp36m-manylinux1_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 30.9MB/s 
Collecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/e5/2d/6d4ca4bef9a67070fa1cac508606328329152b1df10bdf31fb6e4e727894/sentencepiece-0.1.94-cp36-cp36m-manylinux2014_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 53.6MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[

In [None]:
import torch

# Select the compute device once: CUDA when PyTorch can see a GPU, CPU otherwise.
gpu_present = torch.cuda.is_available()
if not gpu_present:
    # Fallback path: everything runs on the CPU.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # GPU path: report how many devices exist and which one is used.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: Tesla V100-SXM2-16GB


In [None]:
# To manage dataset
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Public Google Drive locations of the two dataset files:
# train.jsonl file location: https://drive.google.com/file/d/1d5lwaHPOUBAz7c-cNXXQeFn75ZV2HkUh/view?usp=sharing
# test.jsonl file location: https://drive.google.com/file/d/1vA3uyqy1TZmahgZ0PeNRFx67LuYeAkoW/view?usp=sharing

# The training dataset
# File name of the dataset (also the local name it is saved under after download)
training_file = 'train.jsonl'
# Google Drive unique file ID (the ID segment of the sharing URL above)
training_file_id = '1d5lwaHPOUBAz7c-cNXXQeFn75ZV2HkUh'


# The evaluation/testing dataset
# File name of the dataset (also the local name it is saved under after download)
evaluation_file = 'test.jsonl'
# Google Drive unique file ID (the ID segment of the sharing URL above)
test_jsonl_file_id = "1vA3uyqy1TZmahgZ0PeNRFx67LuYeAkoW"

In [None]:
# The dataset files are shared publicly.
# Log in with a Google Account to proceed, and copy-paste the code when asked.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Fetch each Drive file by its ID and store it under its local file name
# (training file first, then the evaluation file).
for drive_file_id, local_name in (
    (training_file_id, training_file),
    (test_jsonl_file_id, evaluation_file),
):
    downloaded = drive.CreateFile({'id': drive_file_id})
    downloaded.GetContentFile(local_name)

# 2. Training

In [None]:
# import modules

import pandas as pd
import json
from transformers import RobertaTokenizer
from torch.utils.data import TensorDataset
from transformers import RobertaForSequenceClassification
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW, get_linear_schedule_with_warmup
import numpy as np
from sklearn.metrics import f1_score

In [None]:
# Read the training JSON Lines file (one JSON object per line).
# The file is decoded explicitly as UTF-8 and iterated line by line, which
# splits on newline characters only; str.splitlines() would also split on
# Unicode separators (e.g. U+2028) that may occur inside a tweet and cut a
# record in half. Blank lines are skipped so they cannot break json.loads.
with open(training_file, encoding='utf-8') as f:
    lines = [line for line in f if line.strip()]
print(f'Number of lines in file: {len(lines)}')

# Parse each line and flatten the JSON records into DataFrame columns.
df = pd.json_normalize([json.loads(line) for line in lines])
print(f'Number of records in Pandas DataFrame: {len(df)}')

# Lowercase the response text explicitly so the training text has the same
# format as the evaluation text, which is lowercased before tokenization.
# NOTE(review): previously this relied on the tokenizer's do_lower_case flag;
# whether RoBERTa tokenizers honour that flag appears to depend on the
# installed transformers version — confirm before removing this line.
df['response'] = df['response'].str.lower()

Number of lines in file: 5000
Number of records in Pandas DataFrame: 5000


In [None]:
# Add the integer 'label_value' column used as the classification target.
#    0 for SARCASM
#    1 for NOT_SARCASM
label_dict = {'SARCASM': 0, 'NOT_SARCASM': 1}

# Fail early on labels missing from label_dict: Series.replace would silently
# leave them as strings and the error would only surface when building tensors.
unexpected_labels = set(df.label.unique()) - set(label_dict)
if unexpected_labels:
    raise ValueError(
        'Unexpected label(s) in training data: '
        + ', '.join(sorted(map(str, unexpected_labels)))
    )
df['label_value'] = df.label.map(label_dict).astype('int64')

# Print maximum character length of 'response'
max_response_chars = df.response.str.len().max()
print(f"Maximum character length of 'response': {max_response_chars}")

# The character count is used as a rough upper bound for the number of tokens,
# plus 5 positions of headroom for the special tokens the tokenizer adds.
# Stored as a plain Python int (pandas returns a NumPy scalar here).
max_length = int(max_response_chars) + 5

# Preview response data
df

Maximum character length of 'response': 315


Unnamed: 0,label,response,context,label_value
0,SARCASM,@USER @USER @USER I don't get this .. obviousl...,[A minor child deserves privacy and should be ...,0
1,SARCASM,@USER @USER trying to protest about . Talking ...,[@USER @USER Why is he a loser ? He's just a P...,0
2,SARCASM,@USER @USER @USER He makes an insane about of ...,[Donald J . Trump is guilty as charged . The e...,0
3,SARCASM,@USER @USER Meanwhile Trump won't even release...,[Jamie Raskin tanked Doug Collins . Collins lo...,0
4,SARCASM,@USER @USER Pretty Sure the Anti-Lincoln Crowd...,[Man ... y ’ all gone “ both sides ” the apoca...,0
...,...,...,...,...
4995,NOT_SARCASM,@USER You don't . I have purchased a lot on Am...,[@USER Apologies for the inconvenience you fac...,1
4996,NOT_SARCASM,@USER #Emotions you say 🤔 never knew that I th...,"[@USER 🤔 idk tho , I think I ’ m #hungry . But...",1
4997,NOT_SARCASM,"@USER @USER @USER You are so right ... "" Yes !...","[@USER @USER @USER Peace to you , and two coun...",1
4998,NOT_SARCASM,@USER @USER @USER Another lazy delusional vote...,[Bernie Sanders told Elizabeth Warren in priva...,1


In [None]:
# Prepare everything needed for fine-tuning: tokenized training tensors,
# the RoBERTa classification model, the data loader, optimizer and LR schedule.
# Tokenization uses the HuggingFace Transformers library
# (https://github.com/huggingface/transformers)
roberta_model = 'roberta-large'
batch_size = 5
epochs = 4

tokenizer = RobertaTokenizer.from_pretrained(roberta_model, do_lower_case=True)

# Never ask for more positions than the tokenizer reports the model supports.
encode_max_length = min(int(max_length), tokenizer.model_max_length)

# Every response is padded to the same length. truncation=True guarantees a
# rectangular batch even when a response produces more tokens than it has
# characters (max_length is derived from a character count), which would
# otherwise make tensor creation fail.
encoded_data_training = tokenizer.batch_encode_plus(
    df.response.tolist(),
    add_special_tokens=True,
    return_attention_mask=True,
    max_length=encode_max_length,
    padding='max_length',
    truncation=True,
    return_tensors='pt'
)

input_ids_training = encoded_data_training['input_ids']
attention_masks_training = encoded_data_training['attention_mask']
labels_training = torch.tensor(df.label_value.values)

# Each dataset item is the triple (input ids, attention mask, label).
dataset_training = TensorDataset(input_ids_training, attention_masks_training, labels_training)

# Pre-trained encoder with a classification head sized to the label set.
model = RobertaForSequenceClassification.from_pretrained(roberta_model,
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)
model.to(device)

# Batches are drawn in random order during training.
data_loader_train = DataLoader(dataset_training, sampler=RandomSampler(dataset_training), batch_size=batch_size)
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# The scheduler is stepped once per batch, so the schedule length is the
# number of batches per epoch times the number of epochs. Using the number of
# samples instead would stretch the schedule by a factor of batch_size and the
# learning rate would never decay to zero.
total_training_steps = len(data_loader_train) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_training_steps)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=482.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1425941629.0, style=ProgressStyle(descr…




Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.weight', 'classif

In [None]:
#######################################################
# Here we will train the model using training dataset #
#######################################################

from tqdm.notebook import tqdm # https://github.com/tqdm/tqdm
import random

# Seed every random number generator in use (Python, NumPy, PyTorch CPU and
# CUDA) so that runs are as repeatable as possible.
# Note: the model, including its newly initialised classifier head, was created
# before this point, so seeding here affects the shuffling order and other
# randomness during training, not the initial weights.
# Exact reproducibility is still not guaranteed on a GPU: the backend may use
# GPU libraries (for example Nvidia cuDNN) that introduce their own sources of
# randomness.
seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Loop over the full dataset `epochs` times.
for epoch in tqdm(range(epochs)):

    # Put the model into training mode.
    model.train()

    # Sum of the per-batch losses of this epoch (reported as an average below).
    loss_train_total = 0
    # Progress bar for the batches of the current epoch.
    progress_bar = tqdm(data_loader_train, desc='Epoch {:1d}'.format(epoch+1), leave=False, disable=False)

    # Process each batch in the current epoch.
    for batch in progress_bar:

        # Always clear any previously calculated gradients before performing a backward pass.
        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
        model.zero_grad()

        # Unpack current training batch and move it to the compute device.
        # batch contains three pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        # Forward pass; because labels are supplied, the first output is the loss.
        outputs = model(**inputs)

        # Loss of the current batch, read once as a Python float.
        loss = outputs[0]
        batch_loss = loss.item()
        loss_train_total = loss_train_total + batch_loss

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer.step()

        # Update the learning rate (the scheduler advances once per batch).
        scheduler.step()

        # Show the batch loss as is. It was previously divided by len(batch),
        # but batch is the 3-tuple of tensors, so that divided by 3 rather than
        # by the batch size.
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(batch_loss)})

    # Save the trained RoBERTa model for the current epoch iteration
    torch.save(model.state_dict(), f'NAIVE_RoBERTa_epoch_{epoch+1}.model')

    # Report the summary of the epoch, including the average batch loss.
    loss_train_avg = loss_train_total / max(len(data_loader_train), 1)
    tqdm.write(f'\nEpoch {epoch+1} is completed')
    tqdm.write(f'Average training loss: {loss_train_avg:.3f}')

tqdm.write('\n#########################')
tqdm.write('\n# Training is completed #')
tqdm.write('\n#########################')

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='Epoch 1', max=1000.0, style=ProgressStyle(description_wid…


Epoch 1 is completed


HBox(children=(FloatProgress(value=0.0, description='Epoch 2', max=1000.0, style=ProgressStyle(description_wid…


Epoch 2 is completed


HBox(children=(FloatProgress(value=0.0, description='Epoch 3', max=1000.0, style=ProgressStyle(description_wid…


Epoch 3 is completed


HBox(children=(FloatProgress(value=0.0, description='Epoch 4', max=1000.0, style=ProgressStyle(description_wid…


Epoch 4 is completed


#########################

# Training is completed #

#########################


# 3. Evaluation

This section evaluates the trained model on the test dataset.
The evaluation run of the trained model that produced the posted 'answer.txt' is available here:
https://github.com/zen030/CourseProject/blob/main/Evaluation_NAIVE_BERT_sentiment_analysis.ipynb

In [None]:
# Read the testing/evaluation JSON Lines file (one JSON object per line).
# The file is decoded explicitly as UTF-8 and iterated line by line, which
# splits on newline characters only; str.splitlines() would also split on
# Unicode separators (e.g. U+2028) that may occur inside a tweet and cut a
# record in half. Blank lines are skipped so they cannot break json.loads.
evaluation_data_file = evaluation_file
with open(evaluation_data_file, encoding='utf-8') as f:
    lines = [line for line in f if line.strip()]
print(f'Number of lines in file: {len(lines)}')

# Parse each line and flatten the JSON records into DataFrame columns.
df = pd.json_normalize([json.loads(line) for line in lines])
print(f'Number of records in Pandas DataFrame: {len(df)}')

# Lowercase the response text here, before tokenization, so the text has a
# uniform format and can be modified further if needed.
df.response = df.response.str.lower()

# Print maximum character length of 'response'
max_response_chars = df.response.str.len().max()
print(f"Maximum character length of 'response': {max_response_chars}")

# The character count is used as a rough upper bound for the number of tokens,
# plus 5 positions of headroom for the special tokens the tokenizer adds.
# Stored as a plain Python int (pandas returns a NumPy scalar here).
max_length = int(max_response_chars) + 5

# Preview the response data
df

Number of lines in file: 1800
Number of records in Pandas DataFrame: 1800
Maximum character length of 'response': 310


Unnamed: 0,id,response,context
0,twitter_1,"@user @user @user my 3 year old , that just fi...","[Well now that ’ s problematic AF <URL>, @USER..."
1,twitter_2,@user @user how many verifiable lies has he to...,[Last week the Fake News said that a section o...
2,twitter_3,@user @user @user maybe docs just a scrub of a...,[@USER Let ’ s Aplaud Brett When he deserves i...
3,twitter_4,@user @user is just a cover up for the real ha...,[Women generally hate this president . What's ...
4,twitter_5,@user @user @user the irony being that he even...,"[Dear media Remoaners , you excitedly sharing ..."
...,...,...,...
1795,twitter_1796,@user @user @user is definitely the best out t...,[I have been a business customer of MWeb @USER...
1796,twitter_1797,@user @user ye let her out run wild and infect...,[A woman refuses to have her temperature taken...
1797,twitter_1798,"@user @user @user thanks for that , i would ha...",[The reason big government wants @USER out is ...
1798,twitter_1799,@user @user @user yes also #found this on #new...,[Happy #musicmonday and #thanks for #all your ...


In [None]:
# Tokenize the evaluation responses and wrap them in a sequential data loader.
roberta_model = 'roberta-large'
batch_size = 5

tokenizer = RobertaTokenizer.from_pretrained(roberta_model, do_lower_case=True)

# Show the tokenizer configuration (vocabulary size, special tokens, ...).
print(tokenizer)

# Never ask for more positions than the tokenizer reports the model supports.
encode_max_length = min(int(max_length), tokenizer.model_max_length)

# Every response is padded to the same length. truncation=True guarantees a
# rectangular batch even when a response produces more tokens than it has
# characters (max_length is derived from a character count), which would
# otherwise make tensor creation fail.
encoded_data_evaluation = tokenizer.batch_encode_plus(
    df.response.tolist(),
    add_special_tokens=True,
    return_attention_mask=True,
    max_length=encode_max_length,
    padding='max_length',
    truncation=True,
    return_tensors='pt'
)

input_ids_evaluation = encoded_data_evaluation['input_ids']
attention_masks_evaluation = encoded_data_evaluation['attention_mask']

# No labels here: each dataset item is the pair (input ids, attention mask).
dataset_evaluation = TensorDataset(input_ids_evaluation, attention_masks_evaluation)

# Sequential sampling keeps predictions in the same order as the rows of df.
dataloader_eval = DataLoader(dataset_evaluation, sampler=SequentialSampler(dataset_evaluation), batch_size=batch_size)

# Spot-check the token ids of the first two encoded responses.
print(input_ids_evaluation[0])
print(input_ids_evaluation[1])


dataset_evaluation

PreTrainedTokenizer(name_or_path='roberta-large', vocab_size=50265, model_max_len=512, is_fast=False, padding_side='right', special_tokens={'bos_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'sep_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'cls_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=True)})
tensor([    0,  1039, 12105,   787, 12105,   787, 12105,   127,   155,    76,
          793,  2156,    14,    95,  1550,  2600,   295,  5810, 49446,     8,
          172,   553, 

<torch.utils.data.dataset.TensorDataset at 0x7fd94f9ff780>

In [None]:
import torch.nn.functional as F

# Select the compute device again so this cell can run on its own.
# If GPU is available.
if torch.cuda.is_available():
    # PyTorch to use the GPU
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
# If GPU is not available. Use the CPU.
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

# Rebuild the classification model and load the weights saved after epoch 4.
label_dict = {'SARCASM': 0, 'NOT_SARCASM': 1}
roberta_model = 'roberta-large'
model = RobertaForSequenceClassification.from_pretrained(roberta_model,
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)
model.to(device)
model.load_state_dict(torch.load('NAIVE_RoBERTa_epoch_4.model', map_location=device))

# Put the model into evaluation mode.
model.eval()
predictions = []

# Inference only: no gradients are needed.
# The per-batch debug prints of inputs and logits were removed because they
# flooded the cell output.
with torch.no_grad():
    for batch in dataloader_eval:
        # batch contains two pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        batch = tuple(b.to(device) for b in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1]}

        # Without labels the first output holds the logits (one row per example).
        output = model(**inputs)
        logits = output[0]
        predictions.append(logits.detach().cpu().numpy())

# Stack the per-batch logits and take the highest-scoring class per example.
predictions = np.concatenate(predictions, axis=0)
preds_flat = np.argmax(predictions, axis=1).flatten()

print(preds_flat)

print('Evaluation is done')

There are 1 GPU(s) available.
We will use the GPU: Tesla V100-SXM2-16GB


Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.weight', 'classif

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
        [-2.4927,  1.7502],
        [ 1.5573, -1.0457]], device='cuda:0')
tensor([[    0,  1039, 12105,  ...,     1,     1,     1],
        [    0,  1039, 12105,  ...,     1,     1,     1],
        [    0,  1039, 12105,  ...,     1,     1,     1],
        [    0,  1039, 12105,  ...,     1,     1,     1],
        [    0,  1039, 12105,  ...,     1,     1,     1]], device='cuda:0')
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
tensor([[-2.5553,  1.7609],
        [ 1.5573, -1.0457],
        [-2.5555,  1.7610],
        [-2.5525,  1.7631],
        [ 1.5573, -1.0457]], device='cuda:0')
tensor([[    0,  1039, 12105,  ...,     1,     1,     1],
        [    0,  1039, 12105,  ...,     1,     1,     1],
        [    0,  1039, 12105,  ...,     1,     1,     1],
        [    0,  1039, 12105,  ..

In [None]:
# Sanity check: look up which vocabulary token the id 12105 stands for
# (it recurs in the encoded inputs printed earlier).
tokenizer.convert_ids_to_tokens([12105])

['user']

In [None]:
# Print the content for 'answer.txt': one "twitter_<n>,<label>" line per
# prediction, numbered from 1 in prediction order.
# Class id 0 is printed as SARCASM, anything else as NOT_SARCASM.
for row_number, class_id in enumerate(preds_flat, start=1):
    text = 'SARCASM' if class_id == 0 else 'NOT_SARCASM'
    print('twitter_{0},{1}'.format(row_number, text))

twitter_1,NOT_SARCASM
twitter_2,SARCASM
twitter_3,SARCASM
twitter_4,NOT_SARCASM
twitter_5,SARCASM
twitter_6,SARCASM
twitter_7,NOT_SARCASM
twitter_8,SARCASM
twitter_9,NOT_SARCASM
twitter_10,SARCASM
twitter_11,SARCASM
twitter_12,NOT_SARCASM
twitter_13,NOT_SARCASM
twitter_14,NOT_SARCASM
twitter_15,SARCASM
twitter_16,NOT_SARCASM
twitter_17,SARCASM
twitter_18,SARCASM
twitter_19,SARCASM
twitter_20,NOT_SARCASM
twitter_21,NOT_SARCASM
twitter_22,SARCASM
twitter_23,NOT_SARCASM
twitter_24,SARCASM
twitter_25,SARCASM
twitter_26,SARCASM
twitter_27,NOT_SARCASM
twitter_28,NOT_SARCASM
twitter_29,NOT_SARCASM
twitter_30,NOT_SARCASM
twitter_31,SARCASM
twitter_32,NOT_SARCASM
twitter_33,NOT_SARCASM
twitter_34,SARCASM
twitter_35,NOT_SARCASM
twitter_36,SARCASM
twitter_37,SARCASM
twitter_38,SARCASM
twitter_39,SARCASM
twitter_40,SARCASM
twitter_41,SARCASM
twitter_42,NOT_SARCASM
twitter_43,SARCASM
twitter_44,NOT_SARCASM
twitter_45,NOT_SARCASM
twitter_46,NOT_SARCASM
twitter_47,SARCASM
twitter_48,SARCASM
twitter_4

# 4. Summary

Final model and evaluation result are available here: 
https://github.com/zen030/CourseProject/blob/main/Evaluation_NAIVE_BERT_sentiment_analysis.ipynb