# Transfer Learning with BERT

References:

- https://www.tensorflow.org/text/tutorials/classify_text_with_bert
- https://towardsdatascience.com/text-classification-with-bert-in-pytorch-887965e5820f


In [300]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import metrics
from transformers import (
    BertTokenizer,
    TFBertModel,
    AutoTokenizer,
    BertTokenizerFast,
    AutoModel,
    DistilBertTokenizerFast,
)
from keras.layers import Input, GlobalAveragePooling1D, Dense, Dropout
from keras.models import Model
import regex as re
import tensorflow_hub as hub


In [301]:
df = pd.read_csv("data/covid19_articles_20201231_reduced.csv")

In [302]:
df.head()


Unnamed: 0.1,Unnamed: 0,content,topic_area
0,28241,The coronavirus crisis has almost certainly en...,business
1,210240,Latest Report Shows a 15.3% Week-Over-Week Dec...,business
2,77957,FORESIGHT VCT PLC (Company) Publication of Sup...,business
3,207961,Technavio has been monitoring the global mater...,business
4,252956,Outdoor pop-up classes will be held in parks a...,business


In [303]:
# DELETE ME
_, X_train, __, y_train = train_test_split(
    df["content"],
    df["topic_area"],
    test_size=0.005,
    stratify=df["topic_area"],
    random_state=69,
)


In [304]:
## lightweight version of bert
## https://huggingface.co/docs/transformers/preprocessing
BERT_MODEL_NAME = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(
    BERT_MODEL_NAME, do_lower_case=True
)


In [305]:
MAX_LENGTH = 100


def bert_encode(corpus, max_length):
    res = [[], [], []]
    for sentence in corpus:
        sentence = (
            ["[CLS]"]
            + [
                " ".join(tokenizer.tokenize(x))
                for x in (re.sub(r"[^\w\s]+|\n", "", sentence))
                .lower()
                .strip()
                .split(" ")
                if x.strip() != ""
            ][: max_length // 4]
            + ["[SEP]"]
        )

        tokens = [
            inner
            for outer in [tokenizer.encode(x)[1:-1] for x in sentence]
            for inner in outer
        ]

        ## padding
        tokens = tokens + [0] * max(0, max_length - len(tokens))

        mask = []
        token_type_id = []
        m, s = 1, 0
        for t in tokens:
            if t == 102:
                m, s = 0, 1
            mask.append(m)
            token_type_id.append(s)
        res[0].append(np.array(tokens))
        res[1].append(np.array(mask))
        res[2].append(np.array(token_type_id))
    return np.array([np.array(x) for x in res])


In [306]:
X_train_encoded = bert_encode(X_train, MAX_LENGTH)


In [313]:
X_train_encoded.shape

(3, 236, 100)

In [307]:
maxLengths = [0, 0, 0]
for i in range(len(X_train_encoded)):
    for j in range(3):
        maxLengths[j] = max(maxLengths[j], len(X_train_encoded[j][i]))
maxLengths

[100, 100, 100]

In [326]:
idx = Input((MAX_LENGTH,), dtype="int32")
masks = Input((MAX_LENGTH,), dtype="int32")
token_type_ids = Input((MAX_LENGTH,), dtype="int32")

custom_input = [idx, masks, token_type_ids]

bert_model = TFBertModel.from_pretrained(BERT_MODEL_NAME)
bert_output_layer = bert_model(custom_input)

layer = GlobalAveragePooling1D()(bert_output_layer["last_hidden_state"])
layer = Dropout(0.2)(layer)
layer = Dense(64, activation="relu")(layer)
output_layer = Dense(len(np.unique(y_train)), activation="softmax")(layer)

model = Model(custom_input, output_layer)

for layer in model.layers[:4]:
    layer.trainable = False

model.compile(
    loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
)

model.summary()


Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "model_8"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_55 (InputLayer)          [(None, 100)]        0           []                               
                                                                                                  
 input_56 (InputLayer)          [(None, 100)]        0           []                               
                                                                                                  
 input_57 (InputLayer)          [(None, 100)]        0           []                               
                                                                                                  
 tf_bert_model_18 (TFBertModel)  TFBaseModelOutputWi  109482240  ['input_55[0][0]',               
                                thPoolingAndCrossAt               'input_56[0][0]',         

In [327]:
dic_y_mapping = {n: label for n, label in enumerate(np.unique(y_train))}
inverse_dic = {v: k for k, v in dic_y_mapping.items()}
y_train_dummy = np.array([inverse_dic[y] for y in y_train])


In [329]:
y_train_dummy.shape

(236,)

In [330]:
hist = model.fit(x=X_train_encoded, y=y_train_dummy, batch_size=10, epochs=1)


ValueError: Data cardinality is ambiguous:
  x sizes: 3
  y sizes: 236
Make sure all arrays contain the same number of samples.