<a href="https://colab.research.google.com/github/zen030/CourseProject/blob/main/CONTEXT_LARGE_BERT_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **This notebook was implemented and tested in the Google Colab Pro environment**

# 1. Colab Configuration

In [11]:
!pip install transformers
!pip install PyDrive



In [12]:
import torch

# Choose the compute device: CUDA when PyTorch can see a GPU, otherwise CPU.
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")

# Report which device was selected.
if not use_gpu:
    print('No GPU available, using the CPU instead.')
else:
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

No GPU available, using the CPU instead.


In [14]:
# To manage dataset
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Both dataset files are shared publicly on Google Drive:
#   train.jsonl: https://drive.google.com/file/d/1d5lwaHPOUBAz7c-cNXXQeFn75ZV2HkUh/view?usp=sharing
#   test.jsonl:  https://drive.google.com/file/d/1vA3uyqy1TZmahgZ0PeNRFx67LuYeAkoW/view?usp=sharing
#
# Each dataset is described by a pair: the file name it is stored under and
# the unique Google Drive file ID it is fetched by.

# Training dataset (file name, Drive file ID).
training_file, training_file_id = "train.jsonl", "1d5lwaHPOUBAz7c-cNXXQeFn75ZV2HkUh"

# Evaluation/testing dataset (file name, Drive file ID).
evaluation_file, test_jsonl_file_id = "test.jsonl", "1vA3uyqy1TZmahgZ0PeNRFx67LuYeAkoW"

In [16]:
# The dataset files are shared publicly, but Colab still asks for a Google
# account login: follow the prompt and copy-paste the verification code.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Fetch each Drive file by its ID and save it under its local file name
# (training set first, then the evaluation set).
for drive_file_id, local_name in (
    (training_file_id, training_file),
    (test_jsonl_file_id, evaluation_file),
):
    downloaded = drive.CreateFile({'id': drive_file_id})
    downloaded.GetContentFile(local_name)

# 2. Training

In [18]:
# import modules

import pandas as pd
import json
from transformers import BertTokenizer
from torch.utils.data import TensorDataset
from transformers import BertForSequenceClassification
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW, get_linear_schedule_with_warmup
import numpy as np
from sklearn.metrics import f1_score

In [20]:
# Read the training JSON Lines file (one JSON object per line).
# - The encoding is explicit so the emoji / non-ASCII text in the tweets is
#   decoded the same way regardless of the platform default.
# - Iterating over the file object splits on real newlines only, whereas
#   `str.splitlines()` also splits on characters such as U+2028 that may
#   legally appear inside a JSON string and would cut a record in half.
# - Blank lines are skipped instead of crashing `json.loads`.
with open(training_file, encoding='utf-8') as f:
    lines = [line for line in (raw.strip() for raw in f) if line]
print(f'Number of lines in file: {len(lines)}')

# Parse every record and flatten it into DataFrame columns.
df = pd.json_normalize([json.loads(line) for line in lines])
print(f'Number of records in Pandas DataFrame: {len(df)}')

# The response text is deliberately NOT lowercased here: the tokenizer is
# created with do_lower_case=True and lowercases during tokenization.
# Uncomment the line below only if the text has to be normalised up front,
# e.g. for additional data cleaning before training.
# df.response = df.response.str.lower()

Number of lines in file: 5000
Number of records in Pandas DataFrame: 5000


In [22]:
# Add the integer 'label_value' column used as the classification target.
#    0 for SARCASM
#    1 for NOT_SARCASM
label_dict = {'SARCASM': 0, 'NOT_SARCASM': 1}

# Fail fast on labels missing from label_dict. `Series.replace` would silently
# leave them as strings, and the failure would only surface later when the
# column is converted to a tensor.
unknown_labels = set(df.label.unique()) - set(label_dict)
if unknown_labels:
    raise ValueError(
        f'Unexpected label(s) in training data: {sorted(map(str, unknown_labels))}'
    )
df['label_value'] = df.label.map(label_dict).astype('int64')

# Print maximum character length of 'response'
max_response_chars = df.response.str.len().max()
print(f"Maximum character length of 'response': {max_response_chars}")

# Sequence length (in tokens, not characters) that the tokenizer pads and
# truncates to. 512 is the longest input the pretrained BERT checkpoints
# accept; response + context are cut to fit.
max_length = 512

# Preview response data
df

Maximum character length of 'response': 315


Unnamed: 0,label,response,context,label_value
0,SARCASM,@USER @USER @USER I don't get this .. obviousl...,[A minor child deserves privacy and should be ...,0
1,SARCASM,@USER @USER trying to protest about . Talking ...,[@USER @USER Why is he a loser ? He's just a P...,0
2,SARCASM,@USER @USER @USER He makes an insane about of ...,[Donald J . Trump is guilty as charged . The e...,0
3,SARCASM,@USER @USER Meanwhile Trump won't even release...,[Jamie Raskin tanked Doug Collins . Collins lo...,0
4,SARCASM,@USER @USER Pretty Sure the Anti-Lincoln Crowd...,[Man ... y ‚Äô all gone ‚Äú both sides ‚Äù the apoca...,0
...,...,...,...,...
4995,NOT_SARCASM,@USER You don't . I have purchased a lot on Am...,[@USER Apologies for the inconvenience you fac...,1
4996,NOT_SARCASM,@USER #Emotions you say ü§î never knew that I th...,"[@USER ü§î idk tho , I think I ‚Äô m #hungry . But...",1
4997,NOT_SARCASM,"@USER @USER @USER You are so right ... "" Yes !...","[@USER @USER @USER Peace to you , and two coun...",1
4998,NOT_SARCASM,@USER @USER @USER Another lazy delusional vote...,[Bernie Sanders told Elizabeth Warren in priva...,1


In [24]:
# Combine response and context into a single text per record:
#   "<response>[SEP] <last context utterance> ... <first context utterance>"
# The context list is walked in reverse order and every utterance is prefixed
# with a single space.
#
# This is done as one vectorised column assignment. The previous per-row
# chained assignment (df['response'][index] = ...) raises
# SettingWithCopyWarning and does not modify df at all when pandas
# Copy-on-Write is enabled.
# NOTE: this cell is not idempotent - re-running it appends the context again.
df['response'] = (
    df['response']
    + '[SEP]'
    + df['context'].apply(
        lambda utterances: ''.join(' ' + utterance for utterance in reversed(utterances))
    )
)

# Check first record
df['response'].iloc[0]

"@USER @USER @USER I don't get this .. obviously you do care or you would've moved right along .. instead you decided to care and troll her ..[SEP] @USER If your child isn't named Barron ... #BeBest Melania couldn't care less . Fact . üíØ A minor child deserves privacy and should be kept out of politics . Pamela Karlan , you should be ashamed of your very angry and obviously biased public pandering , and using a child to do it ."

In [26]:
# Prepare everything needed for training: tokenized inputs, the model, the
# data loader, the optimizer and the learning-rate scheduler.
# Tokenization uses the HuggingFace Transformers library
# (https://github.com/huggingface/transformers)
bert_model = 'bert-large-uncased'
batch_size = 1
epochs = 4

tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=True)

# Encode every "response[SEP] context" text, padded/truncated to max_length
# tokens and returned as PyTorch tensors.
encoded_data_training = tokenizer.batch_encode_plus(
    df.response.values,
    add_special_tokens=True,
    return_attention_mask=True,
    max_length=max_length,
    padding='max_length',
    return_tensors='pt',
    truncation=True
)

input_ids_training = encoded_data_training['input_ids']
attention_masks_training = encoded_data_training['attention_mask']
labels_training = torch.tensor(df.label_value.values)

# Each dataset item is (input_ids, attention_mask, label).
dataset_training = TensorDataset(input_ids_training, attention_masks_training, labels_training)

# Pretrained BERT with a classification head sized to the number of labels.
model = BertForSequenceClassification.from_pretrained(bert_model,
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)
model.to(device)

# Batches are drawn in random order during training.
data_loader_train = DataLoader(dataset_training, sampler=RandomSampler(dataset_training), batch_size=batch_size)
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# The scheduler is stepped once per batch, so the total number of steps is
# (batches per epoch) * epochs. Using len(dataset_training) here is only
# correct while batch_size == 1; with larger batches the learning rate would
# never decay to zero.
total_training_steps = len(data_loader_train) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_training_steps)

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint a

In [None]:
#######################################################
# Here we will train the model using training dataset #
#######################################################

from tqdm.notebook import tqdm # https://github.com/tqdm/tqdm
import random

# Seed every random number generator in use (Python, NumPy, PyTorch CPU and
# CUDA) so the order of the training data is repeatable.
# Caveats:
# - The model was already instantiated in the previous cell, i.e. before this
#   seed is set, so the seed does not cover the initialisation of the newly
#   added classifier weights.
# - On a GPU the backend libraries (e.g. Nvidia cuDNN) may introduce their own
#   sources of randomness, which can prevent exact reproducibility.
seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Loop over the full dataset `epochs` times.
for epoch in tqdm(range(epochs)):

    # Put the model into training mode.
    model.train()

    # Running sum of the per-batch training loss for this epoch.
    loss_train_total = 0
    # Progress bar for the batches of the current epoch.
    progress_bar = tqdm(data_loader_train, desc='Epoch {:1d}'.format(epoch+1), leave=False, disable=False)

    # Process each batch in the current epoch.
    for batch in progress_bar:

        # Always clear any previously calculated gradients before performing a backward pass.
        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
        model.zero_grad()

        # Move the batch to the training device and unpack it.
        # batch contains three pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        # Forward pass; because labels are supplied, the first output is the loss.
        outputs = model(**inputs)

        # Current training loss.
        loss = outputs[0]
        batch_loss = loss.item()
        # Current total training loss.
        loss_train_total = loss_train_total + batch_loss

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer.step()

        # Update the learning rate.
        scheduler.step()

        # Show the loss of the current batch. It must not be divided by
        # len(batch): `batch` is the 3-tuple (input ids, attention masks,
        # labels), so that length is always 3 and unrelated to the batch size.
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(batch_loss)})

    # Save the trained BERT model for the current epoch iteration
    torch.save(model.state_dict(), f'CONTEXT_BASE_BERT_epoch_{epoch+1}.model')

    # Report the summary of epoch iteration
    tqdm.write(f'\nEpoch {epoch+1} is completed')

tqdm.write('\n#########################')
tqdm.write('\n# Training is completed #')
tqdm.write('\n#########################')

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='Epoch 1', max=5000.0, style=ProgressStyle(description_wid‚Ä¶

# 3. Evaluation

This section evaluates the trained model on the test dataset.
The evaluation run of the trained model that produced the submitted 'answer.txt' is available here:
https://github.com/zen030/CourseProject/blob/main/Evaluation_NAIVE_BERT_sentiment_analysis.ipynb

In [None]:
# Read the testing/evaluation JSON Lines file (one JSON object per line).
# - The encoding is explicit so the emoji / non-ASCII text in the tweets is
#   decoded the same way regardless of the platform default.
# - Iterating over the file object splits on real newlines only, whereas
#   `str.splitlines()` also splits on characters such as U+2028 that may
#   legally appear inside a JSON string and would cut a record in half.
# - Blank lines are skipped instead of crashing `json.loads`.
evaluation_data_file = evaluation_file
with open(evaluation_data_file, encoding='utf-8') as f:
    lines = [line for line in (raw.strip() for raw in f) if line]
print(f'Number of lines in file: {len(lines)}')

# Parse every record and flatten it into DataFrame columns.
# NOTE: this rebinds `df`; from here on it holds the evaluation data.
df = pd.json_normalize([json.loads(line) for line in lines])
print(f'Number of records in Pandas DataFrame: {len(df)}')

# Lowercase the response text up front to have a uniform text format, in case
# the text needs to be modified before tokenization.
df.response = df.response.str.lower()

# Print maximum character length of 'response'
max_response_chars = df.response.str.len().max()
print(f"Maximum character length of 'response': {max_response_chars}")

# Sequence length (in tokens, not characters) that the tokenizer pads and
# truncates to. 512 is the longest input the pretrained BERT checkpoints
# accept; response + context are cut to fit.
max_length = 512

# Preview the response data
df

In [None]:
# Combine response and context into a single text per record:
#   "<response>[SEP] <last context utterance> ... <first context utterance>"
# The context list is walked in reverse order and every utterance is prefixed
# with a single space.
#
# This is done as one vectorised column assignment. The previous per-row
# chained assignment (df['response'][index] = ...) raises
# SettingWithCopyWarning and does not modify df at all when pandas
# Copy-on-Write is enabled.
# NOTE: this cell is not idempotent - re-running it appends the context again.
df['response'] = (
    df['response']
    + '[SEP]'
    + df['context'].apply(
        lambda utterances: ''.join(' ' + utterance for utterance in reversed(utterances))
    )
)

# Check first record
df['response'].iloc[0]

In [None]:
# Tokenize the evaluation texts and wrap them in a sequential DataLoader.
bert_model = 'bert-large-uncased'
batch_size = 5

tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=True)

print(tokenizer)

# Encoding options: add special tokens, return attention masks, and
# pad/truncate every text to max_length tokens as PyTorch tensors.
encode_options = dict(
    add_special_tokens=True,
    return_attention_mask=True,
    max_length=max_length,
    padding='max_length',
    return_tensors='pt',
    truncation=True,
)
encoded_data_evaluation = tokenizer.batch_encode_plus(df.response.values, **encode_options)

input_ids_evaluation = encoded_data_evaluation['input_ids']
attention_masks_evaluation = encoded_data_evaluation['attention_mask']

# There are no labels here: each dataset item is (input_ids, attention_mask).
dataset_evaluation = TensorDataset(input_ids_evaluation, attention_masks_evaluation)

# SequentialSampler keeps the predictions in the same order as the input file.
dataloader_eval = DataLoader(
    dataset_evaluation,
    sampler=SequentialSampler(dataset_evaluation),
    batch_size=batch_size,
)

# Show the token ids of the first two records as a sanity check.
for sample_ids in (input_ids_evaluation[0], input_ids_evaluation[1]):
    print(sample_ids)


dataset_evaluation

In [None]:
import torch.nn.functional as F

# Select the compute device again so this cell also works when the evaluation
# is run on its own.
# If GPU is available.
if torch.cuda.is_available():
    # PyTorch to use the GPU
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
# If GPU is not available. Use the CPU.
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

# Rebuild the classification model and load the weights saved after epoch 4.
label_dict = {'SARCASM': 0, 'NOT_SARCASM': 1}
bert_model = 'bert-large-uncased'
model = BertForSequenceClassification.from_pretrained(bert_model,
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)
model.to(device)
# map_location lets a checkpoint saved on a GPU be loaded on a CPU-only runtime.
model.load_state_dict(torch.load('CONTEXT_BASE_BERT_epoch_4.model', map_location=device))

# Put the model into evaluation mode.
model.eval()
predictions = []

# Gradients are not needed for inference.
with torch.no_grad():
    for batch in dataloader_eval:
        # batch contains two tensors: [0] input ids, [1] attention masks.
        batch = tuple(b.to(device) for b in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1]}

        # No labels are passed, so the first output holds the logits.
        logits = model(**inputs)[0]
        predictions.append(logits.cpu().numpy())

# Stack the per-batch logits and take the highest-scoring class per record.
predictions = np.concatenate(predictions, axis=0)
preds_flat = np.argmax(predictions, axis=1).flatten()

print(preds_flat)

print('Evaluation is done')

In [None]:
# Look up the vocabulary token for a single token id (ad-hoc inspection).
# NOTE(review): 12105 is presumably an id seen in the printed input ids - confirm.
token_id_to_inspect = 12105
tokenizer.convert_ids_to_tokens([token_id_to_inspect])

In [None]:
# Print the predictions in the 'answer.txt' format, one line per test record:
#   twitter_<n>,<LABEL>   (n starts at 1; class 0 is SARCASM, otherwise NOT_SARCASM)
# NOTE: the lines are only printed; this cell does not write a file.
# enumerate(..., start=1) supplies the 1-based record number directly instead
# of maintaining a separate manual counter.
for i, pred in enumerate(preds_flat, start=1):
    text = 'SARCASM' if pred == 0 else 'NOT_SARCASM'
    print('twitter_{0},{1}'.format(i, text))

# 4. Summary

Final model and evaluation result are available here: 
https://github.com/zen030/CourseProject/blob/main/Evaluation_NAIVE_BERT_sentiment_analysis.ipynb