# Using FakeBERT Architecture for Sequence Classification

[Original FakeBERT Paper](https://link.springer.com/content/pdf/10.1007/s11042-020-10183-2.pdf)

FakeBERT Architecture:

![FakeBERT](fakebert.PNG "FakeBERT Architecture")

Summary:

Questions:



In [1]:
# Mount into drive

from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
%cd drive/MyDrive/Colab\ Notebooks/dl_project

/content/drive/MyDrive/Colab Notebooks/dl_project


In [3]:
%ls

bert_cnn_test.ipynb  fakebert.PNG  requirements.txt  test.csv  train.csv


In [4]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 7.1 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 57.5 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 62.3 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 27.7 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 7.9 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Fo

In [5]:
!pip install -r 'requirements.txt'

Collecting torchtext==0.8.1
  Downloading torchtext-0.8.1-cp37-cp37m-manylinux1_x86_64.whl (7.0 MB)
[K     |████████████████████████████████| 7.0 MB 5.4 MB/s 
[?25hCollecting torch==1.7.1
  Downloading torch-1.7.1-cp37-cp37m-manylinux1_x86_64.whl (776.8 MB)
[K     |████████████████████████████████| 776.8 MB 15 kB/s 
[?25hCollecting spacy==2.3.5
  Downloading spacy-2.3.5-cp37-cp37m-manylinux2014_x86_64.whl (10.4 MB)
[K     |████████████████████████████████| 10.4 MB 3.6 MB/s 
Collecting thinc<7.5.0,>=7.4.1
  Downloading thinc-7.4.5-cp37-cp37m-manylinux2014_x86_64.whl (1.0 MB)
[K     |████████████████████████████████| 1.0 MB 53.5 MB/s 
Installing collected packages: torch, thinc, torchtext, spacy
  Attempting uninstall: torch
    Found existing installation: torch 1.10.0+cu111
    Uninstalling torch-1.10.0+cu111:
      Successfully uninstalled torch-1.10.0+cu111
  Attempting uninstall: thinc
    Found existing installation: thinc 7.4.0
    Uninstalling thinc-7.4.0:
      Successfull

In [6]:
import logging
import time
from platform import python_version

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import torch
import torch.nn as nn
import torch.nn.functional as F
import transformers
from torch.autograd import Variable
from transformers import BertTokenizer, BertModel
from torch.optim import Adam, Adadelta
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from platform import python_version

In [7]:
print("python version==%s" % python_version())
print("pandas==%s" % pd.__version__)
print("numpy==%s" % np.__version__)
print("torch==%s" % torch.__version__)
print("sklearn==%s" % sklearn.__version__)
print("transformers==%s" % transformers.__version__)
print("matplotlib==%s" % matplotlib.__version__)

python version==3.7.13
pandas==1.3.5
numpy==1.21.5
torch==1.7.1
sklearn==1.0.2
transformers==4.18.0
matplotlib==3.2.2


In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device
# n_gpu = torch.cuda.device_count()
# torch.cuda.get_device_name(0)

device(type='cuda')

In [9]:
# inputs
train_fp = 'train.csv'
test_fp = 'test.csv'
pretrained_fp = 'bert-base-uncased'

### load data

In [10]:
train = pd.read_csv(train_fp, usecols=['text', 'label'])
test = pd.read_csv(test_fp, usecols=['text', 'label'])
df = pd.concat([train, test], ignore_index=True)
df.head()

Unnamed: 0,text,label
0,ATHENS (Reuters) - Turkish President Tayyip Er...,1
1,"Ted Cruz would be fair, honest and most of all...",0
2,WASHINGTON (Reuters) - White House Chief of St...,1
3,DUBAI (Reuters) - Saudi Arabia welcomed the ne...,1
4,"SIGONELLA, Italy (Reuters) - U.S. President Do...",1


In [11]:
print(f'full data: {df.shape[0]} rows, {df.shape[1]} features')

full data: 44898 rows, 2 features


In [12]:
# target skew?
df.label.value_counts(normalize=True)

0    0.522985
1    0.477015
Name: label, dtype: float64

In [13]:
# what text lengths (# characters)?
print(f'avg text length (chars): {df.text.str.len().mean():0.2f}')
print(f'median text length (chars): {df.text.str.len().median()}')
print(f'min text length (chars): {df.text.str.len().min():0.2f}')
print(f'max text length (chars): {df.text.str.len().max():0.2f}')

avg text length (chars): 2469.11
median text length (chars): 2186.0
min text length (chars): 1.00
max text length (chars): 51794.00


In [14]:
# what text lengths (# words)?
print(f'avg text length (words): {df.text.str.split().str.len().mean():0.2f}')
print(f'median text length (words): {df.text.str.split().str.len().median()}')
print(f'min text length (words): {df.text.str.split().str.len().min():0.2f}')
print(f'max text length (words): {df.text.str.split().str.len().max():0.2f}')

avg text length (words): 405.28
median text length (words): 362.0
min text length (words): 0.00
max text length (words): 8135.00


### preprocess data

In [15]:
# remove rows with fewer than 150 characters
min_chars = 150
df = df[df.text.str.len() > min_chars]
df.text.str.len().describe()

count    43595.000000
mean      2541.617938
std       2162.322732
min        151.000000
25%       1349.000000
50%       2231.000000
75%       3148.000000
max      51794.000000
Name: text, dtype: float64

In [16]:
# clip first 30 characters to eliminate location and source information
clip_idx = 30
df.text = df.text.str[clip_idx:]
df.head()

Unnamed: 0,text,label
0,sident Tayyip Erdogan said on Thursday that U....,1
1,"and most of all, he would follow the law. He ...",0
2,ouse Chief of Staff John Kelly’s comment that ...,1
3,welcomed the new U.S. policy toward Iran and ...,1
4,.S. President Donald Trump arrived in Sicily f...,1


In [17]:
# recheck data skew
# it's a little more balanced this way (for better or worse)
df.label.value_counts(normalize=True)

0    0.508751
1    0.491249
Name: label, dtype: float64

In [18]:
# only include file notes less than 500 words
max_words = 500
print(f'original # rows: {df.shape[0]}')
df = df[df.text.str.split().str.len() < max_words]
print(f'clipped data: {df.shape[0]} rows')
df.label.value_counts(normalize=True)

original # rows: 43595
clipped data: 31951 rows


0    0.509155
1    0.490845
Name: label, dtype: float64

In [19]:
df.text.str.split().str.len().describe()

count    31951.000000
mean       275.225877
std        134.219270
min         13.000000
25%        155.000000
50%        300.000000
75%        388.000000
max        499.000000
Name: text, dtype: float64

In [20]:
df.head()

Unnamed: 0,text,label
0,sident Tayyip Erdogan said on Thursday that U....,1
1,"and most of all, he would follow the law. He ...",0
2,ouse Chief of Staff John Kelly’s comment that ...,1
3,welcomed the new U.S. policy toward Iran and ...,1
4,.S. President Donald Trump arrived in Sicily f...,1


In [21]:
print(f'processed data: {df.shape[0]} rows, {df.shape[1]} features')

processed data: 31951 rows, 2 features


In [22]:
# start with smaller sample
# samp_pct = 0.1
# samp = df.sample(frac=samp_pct)
samp_size = 1000
samp = df.sample(n=samp_size)
samp.shape

(1000, 2)

In [23]:
samp.label.value_counts(normalize=True)

0    0.511
1    0.489
Name: label, dtype: float64

### get embeddings using BERT

Each file note becomes a 2D tensor:
* Each row is a token or subtoken in the sequence
* Each column is a value in the embedding (vector) for that token

In [24]:
tokenizer = BertTokenizer.from_pretrained(pretrained_fp)
bert_model = BertModel.from_pretrained(pretrained_fp)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [25]:
texts = samp.text
labels = samp.label

In [26]:
texts.head()

14848    liens cheerleader Rep. Luis Gutierrez: Every t...
8876      Melania commemorate the 9-11 terror attacks a...
36501    . Senate on Thursday blocked an effort to prev...
28735    eign Minister Sergei Lavrov urged world powers...
18738     John Bel Edwards on Tuesday urged state legis...
Name: text, dtype: object

In [27]:
# enc = tokenizer.encode(X_train.values[0], add_special_tokens=True)
enc = tokenizer.encode(texts.values[0], add_special_tokens=True)
print(f'encoded file note dimensions: {len(enc)}')
# enc

encoded file note dimensions: 363


In [29]:
max_seq_len = 200

def tokenize_text(text_arr, max_seq):
    return [tokenizer.encode(text, add_special_tokens=True)[:max_seq] for text in text_arr.values]

def pad_text(tokenized_text, max_seq):
    return np.array([el + [0] * (max_seq - len(el)) for el in tokenized_text])

def tokenize_and_pad_text(text_arr, max_seq):
    tokenized_text = tokenize_text(text_arr, max_seq)
    padded_text = pad_text(tokenized_text, max_seq)
    return torch.tensor(padded_text)

def targets_to_tensor(label_arr):
    return torch.tensor(label_arr.values, dtype=torch.long)

In [None]:
# sequence length for first text = 116
# encoded length = 134
# padded encoded shape = (1,134)
# tokenized and padded text should be shape (n_samples, max_seq_len)
# ex. for X_train, tokenized and padded text = (7609, 100)

In [None]:
ex = texts[:5]
# print(f'sequence length: {len(ex.split())}')
# enc = tokenizer.encode(ex, add_special_tokens=True)
# enc = np.array([el + [0] * (max_seq_len - len(el)) for el in [enc]])
enc = tokenize_and_pad_text(ex, max_seq_len)
enc.shape

torch.Size([5, 300])

In [30]:
# warning comes up because sequences are longer,
# but this function also clips them to max_seq_len,
# so it won't be a problem in the model
input_idxs = tokenize_and_pad_text(texts, max_seq_len)

Token indices sequence length is longer than the specified maximum sequence length for this model (841 > 512). Running this sequence through the model will result in indexing errors


In [31]:
input_idxs.shape
# train_indices.shape

torch.Size([1000, 200])

In [32]:
# get contextualized embeddings from bert model
from time import time
start = time()
with torch.no_grad():
    bert_embeddings = bert_model(input_idxs)[0]
    # X_train_bert = bert_model(train_indices)[0]  # Models outputs are tuples
    # X_val_bert = bert_model(val_indices)[0]
    # X_test_bert = bert_model(test_indices)[0]
end = time()
elapsed = end - start
if elapsed < 180:
    print(f'code took {elapsed:0.2f} seconds to execute')
else:
    print(f'code took {elapsed / 60:0.2f} minutes to execute')

code took 8.82 minutes to execute


In [34]:
bert_embeddings.shape
# X_train_bert[0].shape

torch.Size([1000, 200, 768])

In [35]:
bert_labels = targets_to_tensor(labels)
# y_train_bert = targets_to_tensor(y_train)
# y_val_bert = targets_to_tensor(y_val)
# y_test_bert = targets_to_tensor(y_test)

In [36]:
bert_labels[0].dtype

torch.int64

### build cnn for classification

In [37]:
# https://chriskhanhtran.github.io/posts/cnn-sentence-classification/

In [38]:
if torch.cuda.is_available():       
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
Device name: Tesla T4


In [39]:
bert_embeddings.shape

torch.Size([1000, 200, 768])

In [40]:
BATCH_SIZE = 64
N_SEQUENCES, NUM_TOKENS, EMBEDDING_SIZE = bert_embeddings.shape
# N_SEQUENCES = 500
# NUM_TOKENS = 50
# EMBEDDING_SIZE = 512
# FILTER_SIZES = [2, 3, 4]
# NUM_FILTERS = [3, 3, 3]
FILTER_SIZES = [128, 128, 128]
NUM_FILTERS = [3, 4, 5]
NUM_CLASSES = 2
DROPOUT = 0.2
LR = 0.1

In [None]:
# # randomly initialize tensor of embeddings for testing cnn architecture
# # in true model run, these will be contextualized embeddings from BERT
# X = torch.randn((N_SEQUENCES, NUM_TOKENS, EMBEDDING_SIZE))
# X.shape

In [None]:
# # randomly initialize tensor of labels for testing cnn architecture
# labels = torch.randint(0, 2, size=(X.shape[0],))
# labels.shape

In [41]:
from torch.utils.data import (TensorDataset, DataLoader, RandomSampler,
                              SequentialSampler)

def get_data_loader(train_inputs, val_inputs, train_labels, val_labels,
                batch_size=64):
    """Convert train and validation sets to torch.Tensors and load them to
    DataLoader.
    """

    # Convert data type to torch.Tensor
    train_inputs, val_inputs, train_labels, val_labels =\
    tuple(torch.tensor(data) for data in
          [train_inputs, val_inputs, train_labels, val_labels])

    # Create DataLoader for training data
    train_data = TensorDataset(train_inputs, train_labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

    # Create DataLoader for validation data
    val_data = TensorDataset(val_inputs, val_labels)
    val_sampler = SequentialSampler(val_data)
    val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

    return train_dataloader, val_dataloader

In [42]:
from sklearn.model_selection import train_test_split

# Train Test Split
# train_inputs, val_inputs, train_labels, val_labels = train_test_split(
#     X, labels, test_size=0.1, random_state=42)
train_inputs, val_inputs, train_labels, val_labels = train_test_split(bert_embeddings, bert_labels, test_size=0.1, random_state=42)

# Load data to PyTorch DataLoader
train_dataloader, val_dataloader = get_data_loader(train_inputs, val_inputs, train_labels, val_labels, batch_size=BATCH_SIZE)

  # This is added back by InteractiveShellApp.init_path()


In [43]:
train_features, train_labels = next(iter(train_dataloader))
print(f"Feature batch shape: {train_features.size()}")
print(f"Labels batch shape: {train_labels.size()}")

Feature batch shape: torch.Size([64, 200, 768])
Labels batch shape: torch.Size([64])


In [51]:
tens = torch.randn((2, 3, 8))
tens.shape

torch.Size([2, 3, 8])

In [57]:
(453 - 5 + 1) / 5

89.8

In [44]:
class FakeBERTCNN(nn.Module):
    # def __init__(self, pretrained_embedding, emb_dim, filter_sizes=[3, 4, 5], num_filters=[100, 100, 100], num_classes=2, dropout_p=0.2):
    def __init__(self, emb_dim, filter_sizes=[3, 4, 5], num_filters=[100, 100, 100], num_classes=2, dropout_p=0.2):
        super(FakeBERTCNN, self).__init__()

        # self.embedding = nn.Embedding.from_pretrained(pretrained_embedding)
        # maybe add embedding code here instead of earlier
        # getting embeddings for each batch (vs. full data 1st) could be more efficience for memory

        # CNN
        self.conv1d_list = nn.ModuleList([
            nn.Conv1d(in_channels=emb_dim,
                      out_channels=num_filters[i],
                      kernel_size=filter_sizes[i])
            for i in range(len(filter_sizes))
        ])
        # Fully-connected layer and Dropout
        self.fc1 = nn.Linear(np.sum(num_filters), num_classes)

        # # add parameters for these layers
        # self.conv1d1 = nn.Conv1d()
        # self.pool1 = nn.MaxPool1d()
        # self.conv1d2 = nn.Conv1d()
        # self.pool2 = nn.MaxPool1d()
        # self.flat = nn.Flatten()
        # self.fc2 = nn.Linear()
        # self.fc3 = nn.Linear()

        self.dropout = nn.Dropout(p=dropout_p)
        self.relu = nn.ReLU()

    # def forward(self, input_ids):
    def forward(self, x_embed):
            # # Get embeddings from `input_ids`. Output shape: (b, max_len, embed_dim)
            # x_embed = self.embedding(input_ids).float()

            # Permute `x_embed` to match input shape requirement of `nn.Conv1d`.
            # Output shape: (b, embed_dim, max_len)
            x_reshaped = x_embed.permute(0, 2, 1)

            # Apply CNN and ReLU. Output shape: (b, num_filters[i], L_out)
            x_conv_list = [self.relu(conv1d(x_reshaped)) for conv1d in self.conv1d_list]

            # Max pooling. Output shape: (b, num_filters[i], 1)
            x_pool_list = [F.max_pool1d(x_conv, kernel_size=x_conv.shape[2])
                for x_conv in x_conv_list]
            
            # Concatenate x_pool_list to feed the fully connected layer.
            # Output shape: (b, sum(num_filters))
            x_fc1 = torch.cat([x_pool.squeeze(dim=2) for x_pool in x_pool_list],
                            dim=1)
            
            # Compute logits. Output shape: (b, n_classes)
            logits = self.fc1(self.dropout(x_fc1))

            return logits

In [45]:
# def initialize_model(pretrained_embedding, embed_dim, filter_sizes, num_filters,
#                     num_classes, dropout=0.2, learning_rate=0.01):
def initialize_model(embed_dim, filter_sizes, num_filters, num_classes,
                     dropout=0.2, learning_rate=0.01):
    """Instantiate a CNN model and an optimizer."""

    assert (len(filter_sizes) == len(num_filters)), "filter_sizes and \
    num_filters need to be of the same length."

    # Instantiate CNN model
    cnn_model = FakeBERTCNN(emb_dim=embed_dim,
                            filter_sizes=filter_sizes,
                            num_filters=num_filters,
                            num_classes=num_classes,
                            dropout_p=dropout)
    
    # Send model to `device` (GPU/CPU)
    cnn_model.to(device)

    # Instantiate Adadelta optimizer
    # optimizer = Adadelta(cnn_model.parameters(), lr=learning_rate, rho=0.95)
    optimizer = Adam(cnn_model.parameters(), lr=learning_rate)

    return cnn_model, optimizer

### run training data through BERT and CNN

In [46]:
import random
import time

# Specify loss function
loss_fn = nn.CrossEntropyLoss()

def set_seed(seed_value=42):
    """Set seed for reproducibility."""

    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)

def train(model, optimizer, train_dataloader, val_dataloader=None, epochs=5):
    """Train the CNN model."""
    
    # Tracking best validation accuracy
    best_accuracy = 0

    # Start training loop
    print("Start training...\n")
    print(f"{'Epoch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {\
    'Val Acc':^9} | {'Elapsed':^9}")
    print("-"*60)

    for epoch_i in range(epochs):
        # =======================================
        #               Training
        # =======================================

        # Tracking time and loss
        t0_epoch = time.time()
        total_loss = 0

        # Put the model into the training mode
        model.train()

        for step, batch in enumerate(train_dataloader):
            # Load batch to GPU
            b_input_ids, b_labels = tuple(t.to(device) for t in batch)

            # Zero out any previously calculated gradients
            model.zero_grad()

            # Perform a forward pass. This will return logits.
            logits = model(b_input_ids)

            # Compute loss and accumulate the loss values
            loss = loss_fn(logits, b_labels)
            total_loss += loss.item()

            # Perform a backward pass to calculate gradients
            loss.backward()

            # Update parameters
            optimizer.step()

        # Calculate the average loss over the entire training data
        avg_train_loss = total_loss / len(train_dataloader)

        # =======================================
        #               Evaluation
        # =======================================
        if val_dataloader is not None:
            # After the completion of each training epoch, measure the model's
            # performance on our validation set.
            val_loss, val_accuracy = evaluate(model, val_dataloader)

            # Track the best accuracy
            if val_accuracy > best_accuracy:
                best_accuracy = val_accuracy

            # Print performance over the entire training data
            time_elapsed = time.time() - t0_epoch
            print(f"{epoch_i + 1:^7} | {avg_train_loss:^12.6f} | {\
            val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")
            
    print("\n")
    print(f"Training complete! Best accuracy: {best_accuracy:.2f}%.")

def evaluate(model, val_dataloader):
    """After the completion of each training epoch, measure the model's
    performance on our validation set.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled
    # during the test time.
    model.eval()

    # Tracking variables
    val_accuracy = []
    val_loss = []

    # For each batch in our validation set...
    for batch in val_dataloader:
        # Load batch to GPU
        b_input_ids, b_labels = tuple(t.to(device) for t in batch)

        # Compute logits
        with torch.no_grad():
            logits = model(b_input_ids)

        # Compute loss
        loss = loss_fn(logits, b_labels)
        val_loss.append(loss.item())

        # Get the predictions
        preds = torch.argmax(logits, dim=1).flatten()

        # Calculate the accuracy rate
        accuracy = (preds == b_labels).cpu().numpy().mean() * 100
        val_accuracy.append(accuracy)

    # Compute the average accuracy and loss over the validation set.
    val_loss = np.mean(val_loss)
    val_accuracy = np.mean(val_accuracy)

    return val_loss, val_accuracy

In [47]:
set_seed(42)
cnn_bert, optimizer = initialize_model(embed_dim=EMBEDDING_SIZE, filter_sizes=FILTER_SIZES,
                                       num_filters=NUM_FILTERS, num_classes=NUM_CLASSES,
                                       learning_rate=LR, dropout=DROPOUT)
train(cnn_bert, optimizer, train_dataloader, val_dataloader, epochs=10)

Start training...

 Epoch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
------------------------------------------------------------
   1    |  69.864890   |  0.694186  |   53.12   |   0.90   
   2    |   0.695972   |  0.694529  |   46.88   |   0.66   
   3    |   0.691898   |  0.691278  |   53.12   |   0.65   
   4    |   0.697111   |  0.697022  |   46.88   |   0.66   
   5    |   0.703107   |  0.695680  |   46.88   |   0.66   
   6    |   0.690975   |  0.691384  |   53.12   |   0.66   
   7    |   0.699163   |  0.693581  |   46.88   |   0.66   
   8    |   0.697821   |  0.693864  |   46.88   |   0.66   
   9    |   0.694450   |  0.693360  |   46.88   |   0.66   
  10    |   0.694108   |  0.694379  |   46.88   |   0.66   


Training complete! Best accuracy: 53.12%.


### evaluate model performance on test data

In [58]:
evaluate(cnn_bert, val_dataloader)

(0.6943790316581726, 46.875)

### References

https://romanorac.github.io/machine/learning/2019/12/02/identifying-hate-speech-with-bert-and-cnn.html