<a href="https://colab.research.google.com/github/zen030/CourseProject/blob/main/NAIVE_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Colab Configuration

In [None]:
# Install the Hugging Face `transformers` package; later cells import
# BertTokenizer, BertForSequenceClassification, AdamW and
# get_linear_schedule_with_warmup from it.
# NOTE(review): the version is not pinned — confirm that the installed release
# still exports every name imported below.
!pip install transformers



In [None]:
import torch

# Pick the compute device once: CUDA when PyTorch can see a GPU, CPU otherwise.
has_gpu = torch.cuda.is_available()
device = torch.device("cuda" if has_gpu else "cpu")

if not has_gpu:
    # CPU fallback.
    print('No GPU available, using the CPU instead.')
else:
    # Report how many GPUs are visible and the name of the first one.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: Tesla V100-SXM2-16GB


# 2. Training

In [None]:
# import modules
# F1 baseline: 0.723

import pandas as pd
import json
from transformers import BertTokenizer
from torch.utils.data import TensorDataset
from transformers import BertForSequenceClassification
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW, get_linear_schedule_with_warmup
import numpy as np
from sklearn.metrics import f1_score


# Helper functions
def calculate_f1_score(preds, labels):
    """Return the weighted F1 score of the arg-max predictions against the labels.

    `preds` is reduced with an arg-max over axis 1 and `labels` is flattened
    before both are handed to scikit-learn's `f1_score` with
    `average='weighted'`.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    score = f1_score(true_classes, predicted_classes, average='weighted')
    return score


def accuracy_per_category(preds, labels):
    """Print, for each class present in `labels`, how many of its samples were predicted correctly.

    Class names come from inverting the module-level `label_dict`
    (name -> integer id), which must be defined before this function is called.
    Nothing is returned; the per-class counts are only printed.
    """
    # Invert name -> id into id -> name for display.
    id_to_name = dict((class_id, name) for name, class_id in label_dict.items())
    predicted = np.argmax(preds, axis=1).flatten()
    actual = labels.flatten()
    for class_id in np.unique(actual):
        # Restrict both arrays to the samples whose true class is `class_id`.
        in_class = actual == class_id
        class_predictions = predicted[in_class]
        n_correct = len(class_predictions[class_predictions == class_id])
        n_total = len(actual[in_class])
        print(f'Class: {id_to_name[class_id]}')
        print(f'Accuracy: {n_correct}/{n_total}\n')


In [None]:
# 1. Read the JSON-lines training file: one JSON object per line.
#    The encoding is given explicitly because the text contains emoji and
#    typographic quotes; relying on the platform default can fail to decode them.
train_data_file = 'sample_data/train.jsonl'
with open(train_data_file, encoding='utf-8') as f:
    # Iterate the file instead of using str.splitlines(): splitlines() also breaks
    # on Unicode separators that may legally appear inside a JSON string.
    # Blank lines are dropped so a trailing empty line cannot break json.loads.
    lines = [line for line in f if line.strip()]
print(f'Number of lines in file: {len(lines)}')

# 2. Parse every line and flatten the records into DataFrame columns.
df = pd.json_normalize([json.loads(line) for line in lines])
print(f'Number of records in Pandas DataFrame: {len(df)}')

# 2.1 The text is not lower-cased here; the tokenizer for the
#     'bert-large-uncased' model is created with do_lower_case=True.

Number of lines in file: 5000
Number of records in Pandas DataFrame: 5000


In [None]:
# 4. Add the integer 'label_value' column used as the training target.
#    0 for SARCASM
#    1 for NOT_SARCASM
label_dict = {'SARCASM': 0, 'NOT_SARCASM': 1}
df['label_value'] = df['label'].map(label_dict)

# Fail early on labels missing from label_dict instead of training on bad targets.
unknown_labels = df.loc[df['label_value'].isna(), 'label'].unique()
if len(unknown_labels) > 0:
    raise ValueError(f'Unexpected label(s) in training data: {list(unknown_labels)}')
df['label_value'] = df['label_value'].astype('int64')

# Remove the user-mention and URL placeholders from the 'response' column.
# The training text is not lower-cased and the raw data spells the placeholders
# in upper case ('@USER', '<URL>'), so the match must ignore case; a
# case-sensitive '@user ' / '<url>' pattern would remove nothing here, while the
# evaluation text (lower-cased before its clean-up) would have them removed.
df['response'] = df['response'].str.replace('@user ', '', case=False, regex=True)
df['response'] = df['response'].str.replace('<url>', '', case=False, regex=True)

# 3. Print maximum character length of 'response'
max_response_chars = df['response'].str.len().max()
print(f"Maximum character length of 'response': {max_response_chars}")

# Adding 5 extra characters in case special token is needed by model.
# Converted to a plain int so the tokenizer receives a built-in integer.
max_length = int(max_response_chars) + 5

df

Maximum character length of 'response': 315


Unnamed: 0,label,response,context,label_value
0,SARCASM,@USER @USER @USER I don't get this .. obviousl...,[A minor child deserves privacy and should be ...,0
1,SARCASM,@USER @USER trying to protest about . Talking ...,[@USER @USER Why is he a loser ? He's just a P...,0
2,SARCASM,@USER @USER @USER He makes an insane about of ...,[Donald J . Trump is guilty as charged . The e...,0
3,SARCASM,@USER @USER Meanwhile Trump won't even release...,[Jamie Raskin tanked Doug Collins . Collins lo...,0
4,SARCASM,@USER @USER Pretty Sure the Anti-Lincoln Crowd...,[Man ... y ’ all gone “ both sides ” the apoca...,0
...,...,...,...,...
4995,NOT_SARCASM,@USER You don't . I have purchased a lot on Am...,[@USER Apologies for the inconvenience you fac...,1
4996,NOT_SARCASM,@USER #Emotions you say 🤔 never knew that I th...,"[@USER 🤔 idk tho , I think I ’ m #hungry . But...",1
4997,NOT_SARCASM,"@USER @USER @USER You are so right ... "" Yes !...","[@USER @USER @USER Peace to you , and two coun...",1
4998,NOT_SARCASM,@USER @USER @USER Another lazy delusional vote...,[Bernie Sanders told Elizabeth Warren in priva...,1


In [None]:
# 6. Prepare the BERT inputs, model, optimizer and LR schedule for training.
#    Tokenizing uses the HuggingFace Transformers library
#    (https://github.com/huggingface/transformers)
bert_model = 'bert-large-uncased'
batch_size = 5
epochs = 4

tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=True)

# Every response is padded to the same fixed length so the encodings can be
# stacked into tensors. Truncation is requested explicitly so that any text
# longer than max_length is cut instead of producing rows of unequal length.
encoded_data_training = tokenizer.batch_encode_plus(
    df.response.values.tolist(),
    add_special_tokens=True,
    return_attention_mask=True,
    max_length=int(max_length),
    padding='max_length',
    truncation=True,
    return_tensors='pt'
)

input_ids_training = encoded_data_training['input_ids']
attention_masks_training = encoded_data_training['attention_mask']
labels_training = torch.tensor(df.label_value.values)

# One dataset item = (input ids, attention mask, label).
dataset_training = TensorDataset(input_ids_training, attention_masks_training, labels_training)

# Sequence classifier with one output per entry of label_dict.
model = BertForSequenceClassification.from_pretrained(bert_model,
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)
model.to(device)

# Batches are drawn in random order during training.
data_loader_train = DataLoader(dataset_training, sampler=RandomSampler(dataset_training), batch_size=batch_size)

optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# The training loop calls scheduler.step() once per batch, so the schedule length
# is (batches per epoch) * epochs. Using the number of samples instead would make
# the schedule batch_size times too long, and the linear decay would never
# reach its end within the run.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=len(data_loader_train) * epochs)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=434.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1344997306.0, style=ProgressStyle(descr…




Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint a

In [None]:
#######################################################
# Here we will train the model using training dataset #
#######################################################

from tqdm.notebook import tqdm # https://github.com/tqdm/tqdm
import random

# Seed every random number generator in use (Python, NumPy, PyTorch CPU and CUDA)
# so that repeated runs of this cell behave the same way.
# NOTE(review): the model is created in an earlier cell, before these seeds are
# set — confirm whether seeding should happen before model creation as well.
seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Loop over the full dataset `epochs` times.
for epoch in tqdm(range(epochs)):

    # Put the model into training mode.
    model.train()

    # Running sum of the per-batch training loss for this epoch.
    loss_train_total = 0
    # Progress bar for the batches of the current epoch.
    progress_bar = tqdm(data_loader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)

    # Process each batch in the current epoch.
    for batch in progress_bar:

        # Always clear any previously calculated gradients before performing a backward pass.
        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
        model.zero_grad()

        # Unpack current training batch and move it to the compute device.
        # batch contains three pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        # Forward pass; because labels are supplied, the first output is the loss.
        outputs = model(**inputs)

        # Current training loss as a Python float.
        loss = outputs[0]
        batch_loss = loss.item()
        # Current total training loss.
        loss_train_total = loss_train_total + batch_loss

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and take a step using the computed gradient.
        optimizer.step()

        # Update the learning rate.
        scheduler.step()

        # Show the loss of this batch as-is. `batch` is the tuple of three tensors,
        # so dividing by len(batch) would divide by 3, not by the batch size.
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(batch_loss)})

    # Save the trained BERT model for the current epoch iteration
    torch.save(model.state_dict(), f'finetuned_NAIVE_BERT_epoch_{epoch}.model')

    # Report the summary of the epoch: its index and the mean per-batch loss.
    tqdm.write(f'\nEpoch {epoch}')
    loss_train_avg = loss_train_total / max(len(data_loader_train), 1)
    tqdm.write(f'Training loss: {loss_train_avg}')

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='Epoch 0', max=1000.0, style=ProgressStyle(description_wid…


Epoch 0


HBox(children=(FloatProgress(value=0.0, description='Epoch 1', max=1000.0, style=ProgressStyle(description_wid…


Epoch 1


HBox(children=(FloatProgress(value=0.0, description='Epoch 2', max=1000.0, style=ProgressStyle(description_wid…


Epoch 2


HBox(children=(FloatProgress(value=0.0, description='Epoch 3', max=1000.0, style=ProgressStyle(description_wid…


Epoch 3



# 3. Evaluation

In [None]:
# 1. Read the JSON-lines evaluation file: one JSON object per line.
#    The encoding is given explicitly because the text contains emoji and
#    typographic quotes; relying on the platform default can fail to decode them.
evaluation_data_file = 'sample_data/test.jsonl'
with open(evaluation_data_file, encoding='utf-8') as f:
    # Iterate the file instead of using str.splitlines(): splitlines() also breaks
    # on Unicode separators that may legally appear inside a JSON string.
    # Blank lines are dropped so a trailing empty line cannot break json.loads.
    lines = [line for line in f if line.strip()]
print(f'Number of lines in file: {len(lines)}')

# 2. Parse every line and flatten the records into DataFrame columns.
df = pd.json_normalize([json.loads(line) for line in lines])
print(f'Number of records in Pandas DataFrame: {len(df)}')

# 2.1 Lower-case the text.
#     We will use the 'bert-large-uncased' model.
df['response'] = df['response'].str.lower()
df

Number of lines in file: 1800
Number of records in Pandas DataFrame: 1800


Unnamed: 0,id,response,context
0,twitter_1,"@USER @USER @USER My 3 year old , that just fi...","[Well now that ’ s problematic AF <URL>, @USER..."
1,twitter_2,@USER @USER How many verifiable lies has he to...,[Last week the Fake News said that a section o...
2,twitter_3,@USER @USER @USER Maybe Docs just a scrub of a...,[@USER Let ’ s Aplaud Brett When he deserves i...
3,twitter_4,@USER @USER is just a cover up for the real ha...,[Women generally hate this president . What's ...
4,twitter_5,@USER @USER @USER The irony being that he even...,"[Dear media Remoaners , you excitedly sharing ..."
...,...,...,...
1795,twitter_1796,@USER @USER @USER is definitely the best out t...,[I have been a business customer of MWeb @USER...
1796,twitter_1797,@USER @USER Ye let her out run wild and infect...,[A woman refuses to have her temperature taken...
1797,twitter_1798,"@USER @USER @USER Thanks for that , I would ha...",[The reason big government wants @USER out is ...
1798,twitter_1799,@USER @USER @USER Yes also #found this on #new...,[Happy #musicmonday and #thanks for #all your ...


In [None]:
# Strip the '@user ' mention placeholder and the '<url>' placeholder from the
# 'response' column (the text was lower-cased when it was loaded).
df['response'] = df['response'].str.replace('@user ', '').str.replace('<url>', '')

# Longest cleaned response, measured in characters.
max_response_chars = df['response'].str.len().max()
print(f"Maximum character length of 'response': {max_response_chars}")

# Leave room for 5 extra positions in case the model needs special tokens.
max_length = 5 + max_response_chars

df

Maximum character length of 'response': 310


Unnamed: 0,id,response,context
0,twitter_1,"@USER @USER @USER My 3 year old , that just fi...","[Well now that ’ s problematic AF <URL>, @USER..."
1,twitter_2,@USER @USER How many verifiable lies has he to...,[Last week the Fake News said that a section o...
2,twitter_3,@USER @USER @USER Maybe Docs just a scrub of a...,[@USER Let ’ s Aplaud Brett When he deserves i...
3,twitter_4,@USER @USER is just a cover up for the real ha...,[Women generally hate this president . What's ...
4,twitter_5,@USER @USER @USER The irony being that he even...,"[Dear media Remoaners , you excitedly sharing ..."
...,...,...,...
1795,twitter_1796,@USER @USER @USER is definitely the best out t...,[I have been a business customer of MWeb @USER...
1796,twitter_1797,@USER @USER Ye let her out run wild and infect...,[A woman refuses to have her temperature taken...
1797,twitter_1798,"@USER @USER @USER Thanks for that , I would ha...",[The reason big government wants @USER out is ...
1798,twitter_1799,@USER @USER @USER Yes also #found this on #new...,[Happy #musicmonday and #thanks for #all your ...


In [None]:
# Tokenize the evaluation responses and wrap them in a sequential DataLoader.
bert_model = 'bert-large-uncased'
batch_size = 5

tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=True)

# Every response is padded to the same fixed length so the encodings can be
# stacked into tensors. Truncation is requested explicitly so that any text
# longer than max_length is cut instead of producing rows of unequal length.
encoded_data_evaluation = tokenizer.batch_encode_plus(
    df.response.values.tolist(),
    add_special_tokens=True,
    return_attention_mask=True,
    max_length=int(max_length),
    padding='max_length',
    truncation=True,
    return_tensors='pt'
)

input_ids_evaluation = encoded_data_evaluation['input_ids']
attention_masks_evaluation = encoded_data_evaluation['attention_mask']

# No labels here: one dataset item = (input ids, attention mask).
dataset_evaluation = TensorDataset(input_ids_evaluation, attention_masks_evaluation)

# Sequential sampling keeps predictions in the same order as the rows of df.
dataloader_eval = DataLoader(dataset_evaluation, sampler=SequentialSampler(dataset_evaluation), batch_size=batch_size)

In [None]:
import torch.nn.functional as F 

# Select the compute device: CUDA when a GPU is visible, CPU otherwise.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

# Rebuild the classifier and load the weights saved after the last training
# epoch (epoch index 3).
label_dict = {'SARCASM': 0, 'NOT_SARCASM': 1}
bert_model = 'bert-large-uncased'
model = BertForSequenceClassification.from_pretrained(
    bert_model,
    num_labels=len(label_dict),
    output_attentions=False,
    output_hidden_states=False,
)
model.to(device)
checkpoint_state = torch.load('finetuned_NAIVE_BERT_epoch_3.model', map_location=torch.device(device))
model.load_state_dict(checkpoint_state)

# Switch the model to evaluation mode before running inference.
model.eval()
loss_val_total = 0
predictions = []

for batch in dataloader_eval:
    # Each batch holds (input ids, attention masks); move both to the device.
    input_ids, attention_mask = (tensor.to(device) for tensor in batch)

    # No gradients are needed for inference.
    with torch.no_grad():
        output = model(input_ids=input_ids, attention_mask=attention_mask)

    # The first output holds the logits; collect them as NumPy arrays on the CPU.
    predictions.append(output[0].detach().cpu().numpy())

# Stack the per-batch logits and take the highest-scoring class per row.
predictions = np.concatenate(predictions, axis=0)
preds_flat = predictions.argmax(axis=1).flatten()

print(preds_flat)

print('Evaluation is done')

There are 1 GPU(s) available.
We will use the GPU: Tesla V100-SXM2-16GB


Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint a

[1 0 0 ... 1 1 1]
Evaluation is done


In [None]:
# Print the number of predictions, then one "twitter_<n>,<label>" line per
# prediction, numbering from 1. Class id 0 is SARCASM; anything else is NOT_SARCASM.
print(len(preds_flat))
for i, pred in enumerate(preds_flat, start=1):
    text = 'SARCASM' if pred == 0 else 'NOT_SARCASM'
    print('twitter_{0},{1}'.format(i, text))

1800
twitter_1,NOT_SARCASM
twitter_2,SARCASM
twitter_3,SARCASM
twitter_4,NOT_SARCASM
twitter_5,SARCASM
twitter_6,SARCASM
twitter_7,NOT_SARCASM
twitter_8,SARCASM
twitter_9,NOT_SARCASM
twitter_10,SARCASM
twitter_11,NOT_SARCASM
twitter_12,SARCASM
twitter_13,SARCASM
twitter_14,NOT_SARCASM
twitter_15,SARCASM
twitter_16,SARCASM
twitter_17,SARCASM
twitter_18,SARCASM
twitter_19,SARCASM
twitter_20,NOT_SARCASM
twitter_21,NOT_SARCASM
twitter_22,SARCASM
twitter_23,NOT_SARCASM
twitter_24,SARCASM
twitter_25,SARCASM
twitter_26,SARCASM
twitter_27,NOT_SARCASM
twitter_28,NOT_SARCASM
twitter_29,SARCASM
twitter_30,NOT_SARCASM
twitter_31,SARCASM
twitter_32,NOT_SARCASM
twitter_33,NOT_SARCASM
twitter_34,SARCASM
twitter_35,NOT_SARCASM
twitter_36,SARCASM
twitter_37,SARCASM
twitter_38,SARCASM
twitter_39,SARCASM
twitter_40,SARCASM
twitter_41,SARCASM
twitter_42,NOT_SARCASM
twitter_43,SARCASM
twitter_44,NOT_SARCASM
twitter_45,SARCASM
twitter_46,NOT_SARCASM
twitter_47,SARCASM
twitter_48,SARCASM
twitter_49,NOT_SARCA

# 4. Summary

- f1 = 0.757905138339921
- recall = 0.8522222222222222
- precision = 0.6823843416370107

Note: The baseline score for each of F1, recall, and precision is 0.723.