<a href="https://colab.research.google.com/github/zen030/CourseProject/blob/main/NAIVE_LARGE_BERT_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **This notebook is implemented and tested in Google Colab PRO environment**

The BERT model is trained using a <b>NAIVE</b> approach. Twitter tweet sentiment analysis is done only by considering the 'Response' text when training and evaluation the model. The 'Context' text is completely ignored.

F1 Score is calculated by submitting the 'answers' to the LiveDataLab Leaderboard.

# 1. Colab Configuration
- Install transformers module
- Install PyDrive module

In [1]:
!pip install transformers
!pip install PyDrive



# 2. Modules import
Import all the required modules for the notebook.

In [2]:
import torch
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
import pandas as pd
import json
from transformers import BertTokenizer
from torch.utils.data import TensorDataset
from transformers import BertForSequenceClassification
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW, get_linear_schedule_with_warmup
import numpy as np
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
from tqdm.notebook import tqdm
import random
import torch.nn.functional as F

# 3. Dataset preparation & Google Drive Mounting
- Copy train.jsonl and test.jsonl from Google Drive to Colab current session
- Mount Google Drive to the Colab current session

In [3]:
# train.json file location: https://drive.google.com/file/d/1d5lwaHPOUBAz7c-cNXXQeFn75ZV2HkUh/view?usp=sharing
# test.jsonl file location: https://drive.google.com/file/d/1vA3uyqy1TZmahgZ0PeNRFx67LuYeAkoW/view?usp=sharing

# The training dataset
# Google Drive file name
training_file = 'train.jsonl'
# Google Drive unique file ID
training_file_id = '1d5lwaHPOUBAz7c-cNXXQeFn75ZV2HkUh'


# The evaluation/testing dataset
# Google Drive file name
evaluation_file = 'test.jsonl'
# Google Drive unique file ID
test_jsonl_file_id = "1vA3uyqy1TZmahgZ0PeNRFx67LuYeAkoW"

In [4]:
# The files are shared to public.
# Login using Google Account to proceed.
# Copy-past the code.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

downloaded = drive.CreateFile({'id':training_file_id})
downloaded.GetContentFile(training_file)

downloaded = drive.CreateFile({'id':test_jsonl_file_id})
downloaded.GetContentFile(evaluation_file)

In [5]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/uiuc')

Drive already mounted at /content/uiuc; to attempt to forcibly remount, call drive.mount("/content/uiuc", force_remount=True).


# 4. The Notebook Class
Class methods:
- read_dataset_file: Read json line file and store it in Panda DataFrame
- create_bert_model: Creating BERT Large Uncased pre-trained model
- run_training: Training the model with training dataset
- evaluate_model: Evaluate the trained-model with testing/evaluation dataset
- write_answers: Store the answers (sentiment analysis) to a file in the current colab session

In [6]:
class BERT_Model:
  def __init__(self):
    self.model_name = 'bert-large-uncased'
    self.label_dict = {'SARCASM': 0, 'NOT_SARCASM': 1}
    self.batch_size = 5
    self.epochs = 4
    self.tokenizer = BertTokenizer.from_pretrained(self.model_name, do_lower_case=True)
    if torch.cuda.is_available():    
      self.device = torch.device('cuda')
    else:
      self.device = torch.device('cpu')

  def read_dataset_file(self, the_file):
    with open(the_file) as f:
      # creating array of json
      lines = f.read().splitlines()
      df = pd.json_normalize(pd.DataFrame(lines)[0].apply(json.loads))
      df.response = df.response.str.lower()
      if 'label' in df:
        df['label_value'] = df.label.replace(self.label_dict)
    return df

  def create_bert_model(self, df, lr, eps):
    encoded_data = self.tokenizer.batch_encode_plus(
        df.response.values,
        add_special_tokens=True,
        return_attention_mask=True,
        max_length=df.response.str.len().max()+5,
        padding='max_length',
        return_tensors='pt'
    )

    self.training_dataset = TensorDataset(encoded_data['input_ids'], 
                            encoded_data['attention_mask'], 
                            torch.tensor(df.label_value.values))

    self.model = BertForSequenceClassification.from_pretrained(self.model_name,
                                                          num_labels=len(self.label_dict),
                                                          output_attentions=False,
                                                          output_hidden_states=False)
    self.model.to(self.device)
    self.optimizer = AdamW(self.model.parameters(), lr=lr, eps=eps)
    self.scheduler = get_linear_schedule_with_warmup(self.optimizer, num_warmup_steps=0, 
                                                     num_training_steps=len(self.training_dataset)*self.epochs)

  def run_training(self, file_save, seed_val=17):
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)
    data_loader = DataLoader(self.training_dataset, sampler=RandomSampler(self.training_dataset), batch_size=self.batch_size)

    # loop over the full dataset for a number of epochs times.
    for epoch in tqdm(range(self.epochs)):
        
        # To set the model into a training mode.
        self.model.train()
        
        # Measure the total training loss for each epoch.
        loss_train_total = 0
        # Progressbar to show the progress of the current epoch.
        progress_bar = tqdm(data_loader, desc='Epoch {:1d}'.format(epoch+1), leave=False, disable=False)
        
        # Process each batch in the current epoch.
        for batch in progress_bar:

            # Always clear any previously calculated gradients before performing a backward pass. 
            # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
            self.model.zero_grad()
            
            # Unpack current training batch.
            # batch contains three pytorch tensors:
            #   [0]: input ids 
            #   [1]: attention masks
            #   [2]: labels 
            batch = tuple(b.to(self.device) for b in batch)
            
            inputs = {'input_ids':      batch[0],
                      'attention_mask': batch[1],
                      'labels':         batch[2],
                    }       

            # This is the actual learning.
            outputs = self.model(**inputs)
            
            # Current training loss.
            loss = outputs[0]
            # Current total training loss.
            loss_train_total = loss_train_total + loss.item()

            # Perform a backward pass to calculate the gradients.
            loss.backward()

            # Clip the norm of the gradients to 1.0.
            # This is to help prevent the "exploding gradients" problem.
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)

            # Update parameters and take a step using the computed gradient.
            # The optimizer dictates the "update rule"--how the parameters are
            # modified based on their gradients, the learning rate, etc.
            self.optimizer.step()

            # Update the learning rate.
            self.scheduler.step()
            
            progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
            
        # Save the trained BERT model for the current epoch iteration    
        torch.save(self.model.state_dict(), f'{file_save}_{epoch+1}.model')

        # Report the summary of epoch iteration
        tqdm.write(f'\nEpoch {epoch+1} is completed')
    tqdm.write(f'\n#########################')
    tqdm.write(f'\n# Training is completed #')
    tqdm.write(f'\n#########################')


  def evaluate_model(self, file_name, df):
    self.model.load_state_dict(torch.load(file_name, map_location=torch.device(self.device)))
    self.model.eval()
    loss_val_total = 0
    predictions = []

    encoded_data = self.tokenizer.batch_encode_plus(
        df.response.values,
        add_special_tokens=True,
        return_attention_mask=True,
        max_length=df.response.str.len().max()+5,
        padding='max_length',
        return_tensors='pt'
    )

    dataset = TensorDataset(encoded_data['input_ids'], encoded_data['attention_mask'])
    dataloader = DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=self.batch_size)

    for batch in dataloader:
      batch = tuple(b.to(self.device) for b in batch)
      inputs = {'input_ids': batch[0], 'attention_mask': batch[1]}

      with torch.no_grad():
        # evaluate the validation dataset
        output = self.model(**inputs)
        logits = output[0]
        # print(output)
        logits = logits.detach().cpu().numpy()
        predictions.append(logits)

    predictions = np.concatenate(predictions, axis=0)
    preds_flat = np.argmax(predictions, axis=1).flatten()

    print('######################')
    print('# Evaluation is done #')
    print('######################')

    return preds_flat

  def write_answers(self, file_name, preds_flat):
    f= open(file_name,"w")
    i = 1
    for pred in enumerate(preds_flat):
      if pred[1] == 0:
        text = 'SARCASM'
      else:
        text = 'NOT_SARCASM'
      f.write('twitter_{0},{1}'.format(i, text))
      i = i + 1
    f.close()

# 5. Model Creation, Training and Evaluation
Create, train and evaluate the BERT model using the following parameters: 
- Learning rates, each for 4 epoch iterations:
  - 2e-5
  - 3e-5
  - 5e-5
- Optimizer Epsilon value: 1e-8
- Random Seed value: 17
- Evaluation is done only for model epoch # 4

In [7]:
the_model = BERT_Model()

the_model.df_train = the_model.read_dataset_file(training_file)
the_model.df_eval = the_model.read_dataset_file(evaluation_file)

print(the_model.df_train)
print(the_model.df_eval)

            label  ... label_value
0         SARCASM  ...           0
1         SARCASM  ...           0
2         SARCASM  ...           0
3         SARCASM  ...           0
4         SARCASM  ...           0
...           ...  ...         ...
4995  NOT_SARCASM  ...           1
4996  NOT_SARCASM  ...           1
4997  NOT_SARCASM  ...           1
4998  NOT_SARCASM  ...           1
4999  NOT_SARCASM  ...           1

[5000 rows x 4 columns]
                id  ...                                            context
0        twitter_1  ...  [Well now that ’ s problematic AF <URL>, @USER...
1        twitter_2  ...  [Last week the Fake News said that a section o...
2        twitter_3  ...  [@USER Let ’ s Aplaud Brett When he deserves i...
3        twitter_4  ...  [Women generally hate this president . What's ...
4        twitter_5  ...  [Dear media Remoaners , you excitedly sharing ...
...            ...  ...                                                ...
1795  twitter_1796  ...  [I ha

## 5.1. Training & Evaluation #1
- Learning rate: 2e-5
- Epsilon: 1e-8
- Seed value: 17

In [None]:
epoch_to_evaluate = 4

the_model.create_bert_model(the_model.df_train, 2e-5, 1e-8)
the_model.run_training('lr_2e-5_1e-8_17', 17)
preds_flat = the_model.evaluate_model(f'lr_2e-5_1e-8_17_{epoch_to_evaluate}.model', the_model.df_eval)
the_model.write_answers(f'lr_2e-5_1e-8_17_{epoch_to_evaluate}_answer.txt', preds_flat)

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint a

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='Epoch 1', max=1000.0, style=ProgressStyle(description_wid…


Epoch 1 is completed


HBox(children=(FloatProgress(value=0.0, description='Epoch 2', max=1000.0, style=ProgressStyle(description_wid…


Epoch 2 is completed


HBox(children=(FloatProgress(value=0.0, description='Epoch 3', max=1000.0, style=ProgressStyle(description_wid…

## 5.2. Training & Evaluation #2
- Learning rate: 3e-5
- Epsilon: 1e-8
- Seed value: 17

In [None]:
epoch_to_evaluate = 4

the_model.create_bert_model(the_model.df_train, 3e-5, 1e-8)
the_model.run_training('lr_3e-5_1e-8_17', 17)
preds_flat = the_model.evaluate_model(f'lr_3e-5_1e-8_17_{epoch_to_evaluate}.model', the_model.df_eval)
the_model.write_answers(f'lr_3e-5_1e-8_17_{epoch_to_evaluate}_answer.txt', preds_flat)

## 5.3. Training & Evaluation #3
- Learning rate: 5e-5
- Epsilon: 1e-8
- Seed value: 17

In [None]:
epoch_to_evaluate = 4

the_model.create_bert_model(the_model.df_train, 2e-5, 1e-8)
the_model.run_training('lr_5e-5_1e-8_17', 17)
preds_flat = the_model.evaluate_model(f'lr_5e-5_1e-8_17_{epoch_to_evaluate}.model', the_model.df_eval)
the_model.write_answers(f'lr_5e-5_1e-8_17_{epoch_to_evaluate}_answer.txt', preds_flat)

# 6. Save Results to Google Drive

## 6.1. Save Training & Evaluation #1 Results
- Learning rate: 2e-5
- Epsilon: 1e-8
- Seed value: 17

In [None]:
!cp 'lr_2e-5_1e-8_17_4_answer.txt' 'uiuc/My Drive/cs410/final_project/lr_2e-5_1e-8_17_4_answer.txt'
!cp 'lr_2e-5_1e-8_17_4.model' 'uiuc/My Drive/cs410/final_project/lr_2e-5_1e-8_17_4.model'

## 6.2. Save Training & Evaluation #2 Results
- Learning rate: 3e-5
- Epsilon: 1e-8
- Seed value: 17

In [None]:
!cp 'lr_3e-5_1e-8_17_4_answer.txt' 'uiuc/My Drive/cs410/final_project/lr_3e-5_1e-8_17_4_answer.txt'
!cp 'lr_3e-5_1e-8_17_4.model' 'uiuc/My Drive/cs410/final_project/lr_3e-5_1e-8_17_4.model'

## 6.3. Save Training & Evaluation #3 Results
- Learning rate: 5e-5
- Epsilon: 1e-8
- Seed value: 17

In [None]:
!cp 'lr_5e-5_1e-8_17_4_answer.txt' 'uiuc/My Drive/cs410/final_project/lr_5e-5_1e-8_17_4_answer.txt'
!cp 'lr_5e-5_1e-8_17_4.model' 'uiuc/My Drive/cs410/final_project/lr_5e-5_1e-8_17_4.model'

# 7. Summary
Best F1 score, <b>0.757905138339921</b>, is achieved using:
- Learning rate: 2e-8
- Epoch: # 4
- Epsilon: 1e-8
- Seed value: 17