# BERT: Fine-Tuning for Sentence Classification
* BERT Fine-Tuning Tutorial with PyTorch [Link](https://colab.research.google.com/drive/1pTuQhug6Dhl9XalKB0zUGf4FIdYFlpcX), [Link](https://mccormickml.com/2019/07/22/BERT-fine-tuning/), [YT Link 1](https://www.youtube.com/watch?v=x66kkDnbzi4), [YT Link 2](https://colab.research.google.com/corgiredirector?site=https%3A%2F%2Fyoutu.be%2FHnvb9b7a_Ps)

In [None]:
# Optional one-time environment setup (kept commented out on purpose):
# pip install tensorflow-gpu   # TensorFlow versions above 2.10 do not support the GPU on native Windows
# python -c "import tensorflow as tf; print(tf.config.list_physical_devices('GPU'))"   # verify the install
# !pip install -q transformers

## Import Library

In [1]:
import tensorflow as tf

In [11]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras_preprocessing.sequence import pad_sequences #from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel, BertConfig, AdamW, BertForSequenceClassification, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline

UsageError: Line magic function `%` not found.


### Check GPU availability

In [2]:
# Ask TensorFlow for the name of the GPU it can see and stop the notebook early
# when the expected device name is not reported.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')
# Troubleshooting note for a missing cudnn64_8.dll on Windows:
#   https://stackoverflow.com/questions/66083545/could-not-load-dynamic-library-cudnn64-8-dll-dlerror-cudnn64-8-dll-not-found
#   Download cudnn64_8.dll and put it in C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.x\bin,
#   then try: python -c "import tensorflow as tf; print(tf.config.list_physical_devices('GPU'))"

Found GPU at: /device:GPU:0


### Specifying CUDA as the device for Torch

In [3]:
# Select the device PyTorch should use: CUDA when available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# Only query the GPU name when a CUDA device exists: get_device_name(0) raises
# without one, which would defeat the CPU fallback selected above.
gpu_name = torch.cuda.get_device_name(0) if n_gpu > 0 else "No CUDA device found (using CPU)"
gpu_name

'NVIDIA GeForce GTX 1060'

## Import Dataset

In [4]:
# CoLA dataset download: https://nyu-mll.github.io/CoLA/cola_public_1.1.zip
# The file is tab-separated and has no header row, so column names are supplied here.
# A forward slash keeps the relative path valid on Windows, Linux and macOS;
# a backslash separator only works on Windows and "\i" is an invalid string escape.
df = pd.read_csv(
    "data/in_domain_train.tsv",
    delimiter='\t',
    header=None,
    names=['sentence_source', 'label', 'label_notes', 'sentence'],
)
df.shape

(8551, 4)

In [5]:
# Preview 10 random rows; the fixed seed makes the preview identical on every run.
df.sample(10, random_state=2018)

Unnamed: 0,sentence_source,label,label_notes,sentence
2593,l-93,1,,Jessica sprayed paint over the table.
5974,c_13,1,,Jeff must not have eaten the deep fried muffin.
6591,g_81,1,,People are said to do such crazy things at suc...
3743,ks08,1,,It could be more detrimental.
6831,m_02,1,,Ayala sent the diamond necklace back.
6841,m_02,0,*,The manager presented the foreman a gold watch.
3882,ks08,0,*,They eager to leave the meeting.
658,bc01,1,,John got the book from Bill.
1746,r-67,1,,"Handsome though Dick is, I'm still going to ma..."
5943,c_13,1,,The medal was given to the soldier by Phillip.


## Data Preprocessing

In [6]:
# Targets (y) are the label column; inputs (X) are the raw sentence strings.
labels = df["label"].to_numpy()
sentences = df["sentence"].to_numpy()

# Wrap every sentence in the special tokens BERT expects:
# [CLS] at the beginning and [SEP] at the end.
sentences = ["[CLS] " + text + " [SEP]" for text in sentences]

# Pretrained, lower-casing tokenizer (ref: https://huggingface.co/bert-base-uncased).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
tokenized_texts = list(map(tokenizer.tokenize, sentences))
# Show the tokens of the first sentence as a sanity check.
print (tokenized_texts[0])

Downloading (…)solve/main/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 1.54MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 9.24kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 570/570 [00:00<00:00, 190kB/s]


['[CLS]', 'our', 'friends', 'won', "'", 't', 'buy', 'this', 'analysis', ',', 'let', 'alone', 'the', 'next', 'one', 'we', 'propose', '.', '[SEP]']


In [7]:
# Fixed sequence length. The longest sequence here is reported as 42 tokens and the
# original paper used 512; 128 leaves headroom.
MAX_LEN = 128
# Map every token to its index in the BERT vocabulary, then pad (or truncate) each
# sequence at the end so that all rows have exactly MAX_LEN entries.
input_ids = pad_sequences(
    [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized_texts],
    maxlen=MAX_LEN,
    dtype="long",
    truncating="post",
    padding="post",
)

In [8]:
# Build one attention mask per sequence: 1.0 where the token id is greater than 0
# (a real token) and 0.0 where it is 0 (padding).
attention_masks = [[float(token_id > 0) for token_id in row] for row in input_ids]

In [9]:
# Split inputs, labels and attention masks into train (90%) and validation (10%) sets.
# Passing all three collections to a single call applies one shared shuffle, so the
# rows stay aligned by construction instead of relying on two separate calls that
# happen to use the same random_state.
(train_inputs, validation_inputs,
 train_labels, validation_labels,
 train_masks, validation_masks) = train_test_split(
    input_ids, labels, attention_masks, random_state=2018, test_size=0.1
)

In [10]:
# Convert every split into a torch tensor. No device argument is passed here, so
# this cell only changes the container type; it does not move data to the GPU.
train_inputs, train_masks, train_labels = map(
    torch.tensor, (train_inputs, train_masks, train_labels)
)
validation_inputs, validation_masks, validation_labels = map(
    torch.tensor, (validation_inputs, validation_masks, validation_labels)
)

In [None]:
# Batch size used by both loaders (the tutorial author recommends 16 or 32).
batch_size = 32

# Bundle (input ids, attention mask, label) triples into datasets.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)

# Training batches are drawn in random order; validation batches keep dataset order.
train_sampler = RandomSampler(train_data)
validation_sampler = SequentialSampler(validation_data)

# DataLoaders hand out one batch per iteration instead of the whole set at once.
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)
validation_dataloader = DataLoader(validation_data, batch_size=batch_size, sampler=validation_sampler)


## Build BERT Model

In [12]:
# Build a BertModel from a default-constructed BertConfig (config only, no
# from_pretrained call in this cell), then read the config back from the model
# and print it for inspection.
model = BertModel(BertConfig())
configuration = model.config
print(configuration)

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.33.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



In [13]:
# Load the pretrained bert-base-uncased checkpoint from Hugging Face with a
# sequence-classification head configured for 2 labels.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
# Move the model to the `device` selected earlier rather than calling .cuda()
# unconditionally, so the CPU fallback is honoured when no CUDA device exists.
model.to(device)

Downloading model.safetensors: 100%|██████████| 440M/440M [01:16<00:00, 5.72MB/s] 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
# Optimizer parameter groups.
# Parameters whose name contains an entry of `no_decay` (biases and LayerNorm
# weights) are excluded from weight decay; every other parameter is decayed.
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.weight']
# The per-group key must be 'weight_decay': that is the option AdamW reads from each
# group. A key such as 'weight_decay_rate' is not recognised, so the intended decay
# values would silently never be applied.
optimizer_grouped_parameters = [
    # Only the parameter tensors are stored in each group, not their names.
    # Group 1: parameters whose name matches none of the `no_decay` entries.
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.1},

    # Group 2: parameters whose name matches at least one `no_decay` entry.
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]

In [None]:
# Hyperparameters for the training loop.
epochs = 4

# Use PyTorch's AdamW: the AdamW class shipped with transformers is deprecated in
# favour of torch.optim.AdamW.
optimizer = torch.optim.AdamW(
    optimizer_grouped_parameters,
    lr=2e-5,   # args.learning_rate - default is 5e-5, our notebook had 2e-5
    eps=1e-8,  # args.adam_epsilon  - default is 1e-8
    # Fallback for parameter groups that carry no 'weight_decay' entry of their own.
    # 0.0 matches the default of the transformers optimizer used previously, whereas
    # torch.optim.AdamW would otherwise default to 0.01.
    weight_decay=0.0,
)
# Total number of training steps: batches per epoch (len of the dataloader) times epochs.
total_steps = len(train_dataloader) * epochs
# Linear learning-rate decay with no warmup (num_warmup_steps = 0 is the run_glue.py default).
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

In [None]:
# Accuracy measurement helper
def flat_accuracy(preds, labels):
    """Return the fraction of predictions that match the labels.

    Args:
        preds: 2-D array-like of per-class scores, one row per example; the
            predicted class of a row is the index of its largest score.
        labels: NumPy array of true class ids; it is flattened before comparison.

    Returns:
        Number of matching predictions divided by the number of labels.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    n_correct = (predicted_classes == true_classes).sum()
    return n_correct / len(true_classes)

## Train BERT Model

In [None]:
#The Training Loop