# Install the necessary packages

In [1]:
!pip install pytorch_pretrained_bert pytorch-nlp
!pip install tokenizers



In [2]:
!wget https://raw.githubusercontent.com/sayankotor/BERT_botcamp19/master/bert-base-uncased-vocab.txt

--2020-01-25 22:13:18--  https://raw.githubusercontent.com/sayankotor/BERT_botcamp19/master/bert-base-uncased-vocab.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 231508 (226K) [text/plain]
Saving to: ‘bert-base-uncased-vocab.txt.2’


2020-01-25 22:13:19 (5.52 MB/s) - ‘bert-base-uncased-vocab.txt.2’ saved [231508/231508]



# Check hardware specs

In [3]:
!nvidia-smi

Sat Jan 25 22:13:20 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.44       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   52C    P8    10W /  70W |      0MiB / 15079MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

In [4]:
!lscpu |grep 'Model name'

Model name:          Intel(R) Xeon(R) CPU @ 2.30GHz


In [5]:
!lscpu | grep 'Thread'

Thread(s) per core:  2


# Finetuning Hands-on

In [6]:
import torch
import torchtext
import random
import numpy as np
from keras.preprocessing.sequence import pad_sequences
import torch.nn.functional as F
from dataclasses import dataclass
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from pytorch_pretrained_bert import BertModel, BertTokenizer
import os
import time
import random
from torch.nn.utils import clip_grad_norm_
from sklearn.metrics import classification_report

Using TensorFlow backend.


In [0]:
random.seed(10)
np.random.seed(10)
torch.manual_seed(10)
torch.cuda.manual_seed(10)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

## Parameter description

Here are some of the main parameters you will want to consider when finetuning BERT:

* **Gradient Clipping**: If the norm of the gradient gets above max_grad_norm, We divide the gradient by its L2 norm. This gradient clipping method avoids exploiding gradients.
* **Learning rate**: The `learning_rate` parameter is very important as it controls how we update the already trained parameters from the Language Modelling Task. If this parameter is too high, we will notice a forgetting of the previous task. It needs to be carefully tuned.
* **Sequence length**: The attention mechanism scales in O(L^2). So you should avoid handling sequences larger than what you really need. 

In [0]:
@dataclass
class ArgsBert:
    max_seq_length: int = 256 # The maximum total input sequence length after WordPiece tokenization.
    learning_rate: float = 3e-6 # Initial Learning rate for Adam
    num_train_epochs: int = 3 # epochs
    batch_size: int = 4
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    loss = torch.nn.BCEWithLogitsLoss().cuda()
    clip_gradient_max_norm: float = 1.0 

In [0]:
args = ArgsBert()

## Preparing the Data

The dataset we will be using is the IMDB movie review sentiment analysis dataset. The data looks like this:

  - **Review**: 'This movie caught me by surprise . For years I have avoided many of Harold Lloyd \'s sound pictures ( as well as those of Keaton ) because they have a generally well - deserved reputation for being lousy compared to the silent films because the basic formula has been lost . However , when I saw this film I was pleasantly surprised to find I actually liked it, ... 

  - **Sentiment**: "pos"

In [0]:
TEXT = torchtext.data.Field(tokenize = 'spacy', include_lengths = True) # helper to tokenize using spacy
LABEL = torchtext.data.LabelField(dtype = torch.float)

def get_dataloader_bert(tokens_ids, masks, lbls, random=True, batch_size=64):
    """""
    Returns a dataloader to iterate over the data. 
    Arguments:
    - tokens_ids:
    - masks: 
    - lbls: 
    """""
    tokens_tensor = torch.tensor(tokens_ids)
    y_tensor = torch.tensor(lbls.reshape(-1, 1)).float()
    masks_tensor = torch.tensor(masks)

    dataset = TensorDataset(tokens_tensor, masks_tensor, y_tensor)
    if random:
      sampler = RandomSampler(dataset)
      dataloader = DataLoader(dataset, sampler=sampler, batch_size=batch_size)
    else:
      sampler = SequentialSampler(dataset)
      dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    return dataloader

def fetch_imdb_data():
    """""
    Returns the imdb dataset
    """""
    full_train_data, val_data_ = torchtext.datasets.IMDB.splits(TEXT, LABEL)
    return full_train_data, val_data_

In [0]:
# Download the IMDB data
full_train_data, full_test_data = fetch_imdb_data()

# We randomly subsample the IMDB dataset (the training would take too long with the full dataset)
train_data = [full_train_data[random.randint(1,24000)] for _ in range(1000)]
test_data = [full_test_data[random.randint(1,24000)] for _ in range(3000)]

In [0]:
bert_train_texts = list(" ".join(train_data[i].text) for i in range(len(train_data))) 
train_labels = list(train_data[i].label for i in range(len(train_data)))

bert_test_texts = list(" ".join(test_data[i].text) for i in range(len(test_data)))
test_labels = list(test_data[i].label for i in range(len(test_data)))

In [0]:
train_data = torchtext.data.Dataset(train_data, full_train_data.fields)
test_data = torchtext.data.Dataset(test_data, full_test_data.fields)

## Tokenization

- The input to the bert model are word-pieces ([Original paper](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/37842.pdf)). Standard tokens are broken down into word pieces through the use of a WordPiece tokenizer. 

- A WordPiece tokenizer breaks the unknown words into multiple subwords.
For example, if the word "chore" does not belong to the vocabulary as a single piece, it might get split into two pieces belonging to the vocabulary: 'cho' and '##re'. 

- All the subwords start with the "#" symbol except for the first subword in the word. Imagine the words "played", "playing" are rare words and thus would not occur in a normal vocabulary. These words would be considered into the wordpiece tokenizer into this form: [`play`, `##ed`] and [`play`, `##ing`]. 

- You can have a look at the file `bert-base-uncased-vocab.txt` in your environnment to have an idea of the words present in the vocabular

- Wordpiece tokenizers tends to be quite slow, however some efficient implementations exist: tokenizers from the [huggingface Library](https://github.com/huggingface/tokenizers) are much faster than the standard naive implementations (Implemented in Rust with python bindings).

In [14]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Here is an example of splitting a rare token into wordpieces !
tokenizer.tokenize("supercalifragilisticexpialidocious")

['super',
 '##cal',
 '##if',
 '##rag',
 '##ilis',
 '##tic',
 '##ex',
 '##pia',
 '##lid',
 '##oc',
 '##ious']

In [0]:
train_tokens = [['[CLS]'] + tokenizer.tokenize(t)[:args.max_seq_length] + ['[SEP]'] for t in  bert_train_texts] # tokenize reviews in train
test_tokens = [['[CLS]'] + tokenizer.tokenize(t)[:args.max_seq_length] + ['[SEP]'] for t in  bert_test_texts] # tokenize reviews in test

## Padding and preparing tensors

In [0]:
bert_train_tokens_ids = [tokenizer.convert_tokens_to_ids(review) for review in train_tokens] # wordpieces to ids
bert_test_tokens_ids = [tokenizer.convert_tokens_to_ids(review) for review in test_tokens] # wordpieces to ids

# Pad up to max_seq_length
bert_train_tokens_ids = pad_sequences(bert_train_tokens_ids, maxlen=args.max_seq_length, truncating="post", padding="post", dtype="int")
bert_test_tokens_ids = pad_sequences(bert_test_tokens_ids, maxlen=args.max_seq_length, truncating="post", padding="post", dtype="int")

In [0]:
train_y = np.array(train_labels) == 'pos' # gives a vector of bool [True, False, False, ...]
test_y = np.array(test_labels) == 'pos'

In [0]:
# Attention masking for not attending padded tokens
bert_train_masks = [[float(token_id > 0) for token_id in sent_token_ids] for sent_token_ids in bert_train_tokens_ids]
bert_test_masks = [[float(token_id > 0) for token_id in sent_token_ids] for sent_token_ids in bert_test_tokens_ids]

- Below is the output of the preprocessing (tokenization, padding etc). You can see that we have a tensor of IDs for the first sentence pointing to our vocabulary.
- The first id is always 101 refering to the **[CLS]** token in the vocabulary.
- The padding is done by the id=0 corresponding to **[PAD]** token in the vocabulary.

In [19]:
bert_train_tokens_ids[0]

array([  101,  1045,  2323,  2031,  6618,  2008,  2151,  3185,  2007,
        1996, 14955,  3334,  3351,  2923,  3203,  1999,  2009,  2003,
        1050,  1005,  1056,  2183,  2000,  2022,  2204,  1012,  2009,
        2941,  4627,  2041,  3100,  1010,  2021,  2076,  1996,  2034,
        4028,  3496,  2017,  2424,  2041,  2008,  1996,  3185,  2017,
        1005,  2128,  3666,  2003,  1037,  3185,  2503,  1997,  1037,
        3185,  1012,  2045,  1005,  1055,  2111,  3564,  1999,  1037,
        3185,  3004,  3666,  2008,  3185,  1012,  2028,  2611,  1999,
        1996,  4378,  2003,  2061, 15703,  2008,  1045,  2052,  2031,
        2357,  2105,  1998, 21384,  2014,  1012,  1037,  2978,  4326,
        1010,  2021,  2521,  2013,  2204,  1012,   102,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,

- The labels are presented below

In [20]:
test_y[0:5]

array([False,  True,  True,  True,  True])

## Finetuning Bert

- Let's define our model `BertFinetune`
  - This model will be composed of the uncased version of bert + a dense layer on top of it for binary classification.
  
- Here are the steps for finetuning:
  - We load our pretrained LM transformer model with already trained weights
  - Add a new linear layer on top of the trained model
  - Finetune the parameters of the newly defined model on the downstream task
  - Evaluate


In [0]:
class BertFinetune(torch.nn.Module):
    def __init__(self):
        super(BertFinetune, self).__init__()

        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.linear = torch.nn.Linear(768, 1)
    
    def forward(self, tokens, masks):
        # pooled_output will just consider the hidden state of the first token (i.e., the [CLS])
        _, pooled_output = self.bert(tokens, attention_mask=masks, output_all_encoded_layers=False)
        logits = self.linear(pooled_output)
        return logits, pooled_output

bert = BertFinetune()
bert = bert.cuda() # We push our model on the GPU!

In [0]:
train_dataloader = get_dataloader_bert(bert_train_tokens_ids, bert_train_masks, train_y, batch_size=args.batch_size)
test_dataloader = get_dataloader_bert(bert_test_tokens_ids, bert_test_masks, test_y, random=False, batch_size=args.batch_size)

- To have an idea of the form of pooled output, its shape is [4, 768] which is the [CLS] hidden states for the 4 sentences in the batch

In [23]:
for data in train_dataloader: # Just doing one forward pass to check the output of our BERT based network
  token_ids, masks, labels = tuple(t.to(args.device) for t in data)
  pb, cls = bert(token_ids, masks)
  print(cls.shape)
  break

torch.Size([4, 768])


**Finally! Let's fine-tune our model !**

In [24]:
optimizer = torch.optim.Adam(bert.parameters(), lr=args.learning_rate)

losses = []
bert.train()

for epoch in range(args.num_train_epochs):
    train_loss = 0
    for step_num, data in enumerate(train_dataloader):
        token_ids, masks, labels = tuple(t.to(args.device) for t in data) # Moving the input tensors to the GPU

        logits, cls = bert(token_ids, masks) # Forward pass
        loss = args.loss(logits, labels) # compute our classification loss

        train_loss += loss.item()  
        
        bert.zero_grad()
        loss.backward() 
        clip_grad_norm_(parameters=bert.parameters(), max_norm=args.clip_gradient_max_norm)

        optimizer.step()
        
    print(f'Epoch: {epoch}  --- loss:  {train_loss/(step_num + 1)}')

Epoch: 0  --- loss:  0.6436396466493607
Epoch: 1  --- loss:  0.4031721138656139
Epoch: 2  --- loss:  0.2642683334052563


In [25]:
!nvidia-smi

Sat Jan 25 22:18:25 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.44       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   71C    P0    42W /  70W |   5821MiB / 15079MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
+-------

In [26]:
def get_test_scores(model, test_dataloader, args, test_y):
  s = time.time()
  model.eval()
  model.to(args.device) # make sure the model is on the right device
  bert_predicted = []
  with torch.no_grad(): # no need for gradient computation for a simple eval
      for batch_data in test_dataloader:
          token_ids, masks, labels = tuple(t.to(args.device) for t in batch_data) # sending tensor to the right device
          logits, _ = model(token_ids, masks) # Forward pass with our BERT model
          bert_predicted += list(torch.sigmoid(logits.cpu().detach()[:, 0]).numpy() > 0.5) # Get predictions
  model.train()

  print("Time (Seconds) ________________", (time.time() - s))
  print("___")
  print(classification_report(test_y, bert_predicted))

get_test_scores(bert, test_dataloader, args, test_y)

Time (Seconds) ________________ 56.39476752281189
___
              precision    recall  f1-score   support

       False       0.89      0.87      0.88      1466
        True       0.88      0.89      0.89      1534

    accuracy                           0.88      3000
   macro avg       0.88      0.88      0.88      3000
weighted avg       0.88      0.88      0.88      3000



# Distillation Hands-on

As we saw in the slides, distillation can be done using the logits of the teacher and the student. In our case the teacher is our fine-tuned BERT model. The following function is computing the logits on the training set. We will use those logits to do the distillation.

In [0]:
def get_training_logits(bert, bert_train_tokens_ids, bert_train_masks, train_y, args):
  """""
  Function to get the training logits from the already trained bert model
  """""
  bert.eval()
  all_logits = []
  train_logits_loader = get_dataloader_bert(bert_train_tokens_ids, bert_train_masks, train_y, random=False, batch_size=args.batch_size)
  lbls = []
  with torch.no_grad():
      for batch in train_logits_loader:
          token_ids, masks, labels = tuple(t.to(args.device) for t in batch)
          log, _ = bert(token_ids, masks)
          all_logits.extend(torch.flatten(log).cpu().numpy())
          lbls.extend(torch.flatten(labels).cpu().numpy())
  bert.train()
  return torch.Tensor(tuple(all_logits)), torch.Tensor(tuple(lbls))

In [0]:
all_logits, lbls = get_training_logits(bert, bert_train_tokens_ids, bert_train_masks, train_y, args)
logits_loader = DataLoader(all_logits, batch_size=64, shuffle=False) # Dataloader for logits

In [29]:
all_logits.shape

torch.Size([1000])

## Prepare the data

In [0]:
TEXT.build_vocab(train_data, max_size = 25000)
LABEL.build_vocab(train_data,)

In [0]:
@dataclass
class ArgsClf:
    num_train_epochs: int = 5 # epochs
    batch_size: int = 64 
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    loss = torch.nn.BCELoss()
    clip_gradient_max_norm: float = 5.0
    learning_rate=0.01

In [0]:
args_clf = ArgsClf()

In [0]:
train_iterator= torchtext.data.BucketIterator(
    train_data, 
    sort_key=None,
    shuffle=False,
    batch_size = args_clf.batch_size,
    device = args_clf.device)

test_iterator= torchtext.data.BucketIterator(
    test_data, 
    sort_key=None,
    shuffle=False,
    batch_size = args_clf.batch_size,
    device = args_clf.device)

## Model Definition

- Very simple model with 1 embedding layer and a Linear layer

In [0]:
import torch.nn as nn
import torch.nn.functional as F

class BinaryCLF(nn.Module):
    def __init__(self, vocab_size, embedding_dim, output_dim, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.fc = nn.Linear(embedding_dim, output_dim)
    def forward(self, text):        
        embedded = self.embedding(text).permute(1, 0, 2)        
        return self.fc(F.avg_pool2d(embedded, (embedded.shape[1], 1)).squeeze(1))

In [0]:
#@title
##### Uncomment if you want to get all the logits for the 25k datapoints #####

def get_full_logits():
  """""
  Returns the logits on the full IMDB Dataset
  Can be used for data augmentation and distillation
  """""
  # format data input 
  train_texts_full = list(" ".join(full_train_data[i].text) for i in range(len(full_train_data))) # get the sentences
  train_tokens_ids_full = list(tokenizer.encode(t) for t in train_texts_full) # use wordpiece tokenizer  train_tokens_ids_full = [tok.ids[:args.max_seq_length-1] for tok in train_tokens_ids_full] # truncate to max_seq_length
  train_tokens_ids_full = pad_sequences(train_tokens_ids_full, maxlen=args.max_seq_length, 
                                        truncating="post", padding="post", dtype="int") # pad sequences
  train_masks_full = [[float(i > 0) for i in ii] for ii in train_tokens_ids_full] 
  train_masks_tensor_full = torch.tensor(train_tokens_ids_full)
  train_masks_tensor_full = torch.tensor(train_masks_full)
  dataset = TensorDataset(torch.tensor(train_tokens_ids_full), train_masks_tensor_full)
  sampler = SequentialSampler(dataset)
  dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=False)
  bert.eval()
  bert.cuda()
  all_logits = []
  with torch.no_grad():
      for batch in dataloader:
          token_ids, masks = tuple(t.to(args.device) for t in batch)
          log, _ = bert(token_ids, masks)
          all_logits.extend(torch.flatten(log).cpu().numpy())

  return torch.Tensor(tuple(all_logits))

#logits = get_full_logits()

## Helper Distillation

- We provided for you the bert logits for the training set iterator, `train_iterator` which are in the `logits_loader`
- We also provided a function `get_full_logits` that you can use to get the logits on all the training set.



In [36]:
net = BinaryCLF(vocab_size=len(TEXT.vocab), embedding_dim=100, output_dim=1, pad_idx=TEXT.vocab.stoi[TEXT.pad_token])

criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=args_clf.learning_rate)

net.cuda()
net.train()

for epoch in range(10):
    train_loss=0
    step_num=0
    for batch, logits_bert in zip(train_iterator, logits_loader):
        net.zero_grad()
        logits_bert = torch.tensor(logits_bert).squeeze()
        output = net(batch.text[0]).squeeze(1)
        ############# IMPLEMENT DISTILLATION HERE ##################
        ############################################################
        # Use logits_bert
        loss = criterion(torch.sigmoid(output), batch.label.float())
        ############# IMPLEMENT DISTILLATION HERE ##################
        ############################################################
        loss.backward()
        train_loss += loss.item()
        nn.utils.clip_grad_norm_(net.parameters(), args_clf.clip_gradient_max_norm)
        optimizer.step()
        step_num+=1
    print(f'Epoch: {epoch}  --- loss:  {train_loss/(step_num + 1)}')


  


Epoch: 0  --- loss:  0.6508550924413344
Epoch: 1  --- loss:  0.6310171169393203
Epoch: 2  --- loss:  0.5791842727100148
Epoch: 3  --- loss:  0.48900363725774426
Epoch: 4  --- loss:  0.38708231554311867
Epoch: 5  --- loss:  0.2989107081118752
Epoch: 6  --- loss:  0.22979897611281452
Epoch: 7  --- loss:  0.17941275501952453
Epoch: 8  --- loss:  0.14212224807809382
Epoch: 9  --- loss:  0.1142382082693717


In [37]:
def get_test_score_binaryclf(model, test_iterator, args):
  s = time.time()
  model.eval()
  model.to(args.device)
  binary_clf_predicted = []
  with torch.no_grad():
      for batch in test_iterator:
          pb = model(batch.text[0].to("cuda")).squeeze(1)
          binary_clf_predicted += list(torch.sigmoid(pb.cpu().detach()).numpy() > 0.5)

  print("___")
  print("Time (in seconds) ________________",time.time() - s)
  print("___")
  print(classification_report(test_y, binary_clf_predicted))

get_test_score_binaryclf(net, test_iterator, args_clf)

___
Time (in seconds) ________________ 0.48473691940307617
___
              precision    recall  f1-score   support

       False       0.81      0.82      0.81      1466
        True       0.82      0.82      0.82      1534

    accuracy                           0.82      3000
   macro avg       0.82      0.82      0.82      3000
weighted avg       0.82      0.82      0.82      3000



# Quantization

In [0]:
import torch.quantization

def print_size_of_model(model):
    """""
    Get the size on disk of the model
    """""
    torch.save(model.state_dict(), "temp.p")
    print('Size (MB):', os.path.getsize("temp.p")/1e6)
    os.remove('temp.p')


bert.eval()
bert.cpu()

# As simple as that:
quantized_bert = torch.quantization.quantize_dynamic(
    bert, {nn.Linear}, dtype=torch.qint8
)

- Let's see the size on disk

In [39]:
print_size_of_model(quantized_bert)

Size (MB): 181.426038


In [40]:
print_size_of_model(bert)

Size (MB): 437.977457


- Let's make a small dataset to compare the speed and the accuracy of both models

In [0]:
data_quantization_benchmark = [full_train_data[random.randint(1,24000)] for _ in range(100)]

quantize_texts = list(" ".join(data_quantization_benchmark[i].text) for i in range(len(data_quantization_benchmark))) 
quantize_labels = list(data_quantization_benchmark[i].label for i in range(len(data_quantization_benchmark)))
quantize_tokens = [['[CLS]'] + tokenizer.tokenize(t)[:args.max_seq_length] + ['[SEP]'] for t in  quantize_texts] # tokenize reviews in quantization dataset

quantize_tokens_ids = [tokenizer.convert_tokens_to_ids(review) for review in quantize_tokens] # wordpieces to ids
quantize_tokens_ids = pad_sequences(quantize_tokens_ids, maxlen=args.max_seq_length, truncating="post", padding="post", dtype="int")
quantize_labels = list(data_quantization_benchmark[i].label for i in range(len(data_quantization_benchmark)))

quantize_y = np.array(quantize_labels) == 'pos'
quantize_masks = [[float(token_id > 0) for token_id in sent_token_ids] for sent_token_ids in quantize_tokens_ids]

quantize_dataloader = get_dataloader_bert(quantize_tokens_ids, quantize_masks, quantize_y, batch_size=args.batch_size)

In [42]:
args.device="cpu"
get_test_scores(quantized_bert, quantize_dataloader, args, quantize_y)

Time (Seconds) ________________ 53.21387577056885
___
              precision    recall  f1-score   support

       False       0.35      0.33      0.34        39
        True       0.59      0.61      0.60        61

    accuracy                           0.50       100
   macro avg       0.47      0.47      0.47       100
weighted avg       0.50      0.50      0.50       100



In [43]:
get_test_scores(bert, quantize_dataloader, args , quantize_y)

Time (Seconds) ________________ 69.69603300094604
___
              precision    recall  f1-score   support

       False       0.29      0.31      0.30        39
        True       0.53      0.51      0.52        61

    accuracy                           0.43       100
   macro avg       0.41      0.41      0.41       100
weighted avg       0.44      0.43      0.43       100

