## INTRO

Author : Gheddi

Data : 3084 records of tweets withs its sentiments as labels

Objective : Fine tuning BERT for sentiment classification

Result : 89.9% F1 score on 6 class target variable

In [32]:
import torch
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import f1_score

In [2]:
df = pd.read_csv(
    '/content/drive/MyDrive/# DATA SCIENCE/my project/LLMs/smile-annotations-final.csv',
    names = ['id','text', 'category'])
# set column id as the dataframe index
df.set_index('id', inplace=True)

## EDA AND PREPROCESSING

In [4]:
df.head()

Unnamed: 0_level_0,text,category
id,Unnamed: 1_level_1,Unnamed: 2_level_1
611857364396965889,@aandraous @britishmuseum @AndrewsAntonio Merc...,nocode
614484565059596288,Dorian Gray with Rainbow Scarf #LoveWins (from...,happy
614746522043973632,@SelectShowcase @Tate_StIves ... Replace with ...,happy
614877582664835073,@Sofabsports thank you for following me back. ...,happy
611932373039644672,@britishmuseum @TudorHistory What a beautiful ...,happy


In [5]:
# check the target class distribution
df.category.value_counts()

nocode               1572
happy                1137
not-relevant          214
angry                  57
surprise               35
sad                    32
happy|surprise         11
happy|sad               9
disgust|angry           7
disgust                 6
sad|disgust             2
sad|angry               2
sad|disgust|angry       1
Name: category, dtype: int64

In [6]:
# removing records with multiple sentiment
df = df[~df.category.str.contains('\|')]

In [7]:
# removing records with nocode target value because its sentinment is undocumented
df = df[df.category != 'nocode']
df.category.value_counts()

happy           1137
not-relevant     214
angry             57
surprise          35
sad               32
disgust            6
Name: category, dtype: int64

In [8]:
possible_labels = df.category.unique()

label_dict = {}
for index,possible_labels in enumerate(possible_labels):
  label_dict[possible_labels] = index

label_dict

{'happy': 0,
 'not-relevant': 1,
 'angry': 2,
 'disgust': 3,
 'sad': 4,
 'surprise': 5}

In [9]:
df['label'] = df.category.replace(label_dict)
df.head(4)

Unnamed: 0_level_0,text,category,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
614484565059596288,Dorian Gray with Rainbow Scarf #LoveWins (from...,happy,0
614746522043973632,@SelectShowcase @Tate_StIves ... Replace with ...,happy,0
614877582664835073,@Sofabsports thank you for following me back. ...,happy,0
611932373039644672,@britishmuseum @TudorHistory What a beautiful ...,happy,0


## TRAIN-VALIDATION SPLIT


In [10]:
X_train, X_val, Y_train, Y_val = train_test_split(
    df.index.values,
    df.label.values,
    test_size = 0.15,
    random_state = 33,
    # because class imbalance, using statified approach to ensure each class had some representation in each set
    stratify = df.label.values)

In [11]:
df['data_type'] = ['not_set']*df.shape[0]
df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'

In [12]:
df.groupby(['category', 'label', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,text
category,label,data_type,Unnamed: 3_level_1
angry,2,train,48
angry,2,val,9
disgust,3,train,5
disgust,3,val,1
happy,0,train,966
happy,0,val,171
not-relevant,1,train,182
not-relevant,1,val,32
sad,4,train,27
sad,4,val,5


## LOADING TOKENIZER AND ENCODING

In [13]:
from transformers import BertTokenizer
from torch.utils.data import TensorDataset

In [14]:
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case = True
)

In [15]:
from transformers import BertForSequenceClassification

In [16]:
encoded_data_train = tokenizer.batch_encode_plus(
    df[df.data_type == 'train'].text.values,
    add_special_tokens=True,
    return_attention_mask = True,
    pad_to_max_length = True,
    max_length = 256,
    return_tensors = 'pt'
)

encoded_data_val = tokenizer.batch_encode_plus(
    df[df.data_type == 'val'].text.values,
    add_special_tokens=True,
    return_attention_mask = True,
    pad_to_max_length = True,
    max_length = 256,
    return_tensors = 'pt'
)

input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(df[df.data_type=='train'].label.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(df[df.data_type=='val'].label.values)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [17]:
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

In [18]:
len(dataset_train)

1258

In [19]:
len(dataset_val)

223

## DEFINING PRETRAINED BERT

In [20]:
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels = len(label_dict),
    output_attentions = False,
    output_hidden_states = False
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## SETTING DATA LOADER

In [21]:
batch_size = 32

dataloader_train = DataLoader(
    dataset_train,
    sampler = RandomSampler(dataset_train),
    batch_size = 16
)

dataloader_val = DataLoader(
    dataset_val,
    sampler = RandomSampler(dataset_val),
    batch_size = batch_size
)

## SETTING UP OPTIMIZER AND SCHEDULER

In [22]:
optimizer = AdamW(
    model.parameters(),
    lr = 1e-5,
    eps = 1e-8
)



In [23]:
epochs = 10

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps = 0,
    num_training_steps = len(dataloader_train)*epochs
)

## SETTING THE EVALUATION METRIC

In [33]:
def f1_score_func(preds, labels):
  preds_flat = np.argmax(preds, axis=1).flatten()
  labels_flat = labels.flatten()
  return f1_score(labels_flat, preds_flat, average = 'weighted')

In [45]:
def accuracy_per_class(preds, labels):
  label_dict_inverse = {v:k for k,v in label_dict.items()}

  preds_flat = np.argmax(preds, axis=1).flatten()
  labels_flat = labels.flatten()

  for label in np.unique(labels_flat):
    y_preds = preds_flat[labels_flat==label]
    y_true = labels_flat[labels_flat==label]
    print(f'Class: {label_dict_inverse[label]}')
    print(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)}')
    print('\n')

## MODEL TRAINING

In [26]:
import random
import numpy as np

seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [27]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

print(device)

cuda


In [28]:
def evaluate(dataloader_val):

    model.eval()

    loss_val_total = 0
    predictions, true_vals = [], []

    for batch in dataloader_val:

        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():
            outputs = model(**inputs)

        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)

    loss_val_avg = loss_val_total/len(dataloader_val)

    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)

    return loss_val_avg, predictions, true_vals


In [35]:
for epoch in tqdm(range(1, epochs+1)):
    model.train()
    loss_train_total = 0
    progress_bar = tqdm(dataloader_train, desc='epoch:{:1d}'.format(epoch), leave=False, disable = False)

    for batch in progress_bar:
      model.zero_grad()

      batch = tuple(b.to(device) for b in batch)

      inputs = {
          'input_ids' : batch[0],
          'attention_mask' : batch[1],
          'labels' : batch[2]

      }

      outputs = model(**inputs)

      loss = outputs[0]
      loss_train_total += loss.item()
      loss.backward()
      ## clipping the gradient
      torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)


      optimizer.step()
      scheduler.step()

      progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})

    torch.save(model.state_dict(), f'models/Bert_ft_epoch{epoch}.model')

    tqdm.write(f'\nEpoch {epoch}')

    loss_train_avg = loss_train_total/len(dataloader_train)
    tqdm.write(f'Training loss : {loss_train_avg}')

    val_loss, predictions, true_vals = evaluate(dataloader_val)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'validation loss {val_loss}')
    tqdm.write(f'f1 score weighted : {val_f1}')

  0%|          | 0/10 [00:00<?, ?it/s]

epoch:1:   0%|          | 0/79 [00:00<?, ?it/s]


Epoch 1
Training loss : 0.19817326199027557
validation loss 0.3522493200642722
f1 score weighted : 0.8819759104963674


epoch:2:   0%|          | 0/79 [00:00<?, ?it/s]


Epoch 2
Training loss : 0.17371403786672068
validation loss 0.3332562212433134
f1 score weighted : 0.8862903645152731


epoch:3:   0%|          | 0/79 [00:00<?, ?it/s]


Epoch 3
Training loss : 0.15601805002204602
validation loss 0.33050586708954405
f1 score weighted : 0.8950270330017744


epoch:4:   0%|          | 0/79 [00:00<?, ?it/s]


Epoch 4
Training loss : 0.14451354527492313
validation loss 0.33191896975040436
f1 score weighted : 0.8995989104952167


epoch:5:   0%|          | 0/79 [00:00<?, ?it/s]


Epoch 5
Training loss : 0.1453372061818461
validation loss 0.33105200103351046
f1 score weighted : 0.8995989104952167


epoch:6:   0%|          | 0/79 [00:00<?, ?it/s]


Epoch 6
Training loss : 0.1454109217782941
validation loss 0.330449956868376
f1 score weighted : 0.8995989104952167


epoch:7:   0%|          | 0/79 [00:00<?, ?it/s]


Epoch 7
Training loss : 0.14763343763313716
validation loss 0.33074142251695904
f1 score weighted : 0.8995989104952167


epoch:8:   0%|          | 0/79 [00:00<?, ?it/s]


Epoch 8
Training loss : 0.14098102744975233
validation loss 0.33182708493300844
f1 score weighted : 0.8995989104952167


epoch:9:   0%|          | 0/79 [00:00<?, ?it/s]


Epoch 9
Training loss : 0.14422244694131084
validation loss 0.3319186291524342
f1 score weighted : 0.8995989104952167


epoch:10:   0%|          | 0/79 [00:00<?, ?it/s]


Epoch 10
Training loss : 0.14577700422864548
validation loss 0.3328483785901751
f1 score weighted : 0.8995989104952167


## MODEL EVALUATION

In [36]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [37]:
model.to(device)
pass

In [38]:
# selecting the model on 8th epoch as the best performing model
model.load_state_dict(
    torch.load('models/Bert_ft_epoch8.model',
               map_location = torch.device('cpu')))

<All keys matched successfully>

In [39]:
_, predictions, true_vals = evaluate(dataloader_val)

In [46]:
# displaying correct prediction per class
accuracy_per_class(predictions, true_vals)

Class: happy
Accuracy: 165/171


Class: not-relevant
Accuracy: 27/32


Class: angry
Accuracy: 7/9


Class: disgust
Accuracy: 0/1


Class: sad
Accuracy: 0/5


Class: surprise
Accuracy: 4/5




## CONCLUSION

1. Leveraging LLMs helped me to achieve good performance while having so little training dataset.

2. Models performance is suboptimal on certain target class because class imbalance problem.

3. additional training data might be needed to handle class imbalance.