# Transfer Learning with BERT

References:

- https://www.tensorflow.org/text/tutorials/classify_text_with_bert
- https://towardsdatascience.com/text-classification-with-bert-in-pytorch-887965e5820f


In [45]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import metrics
from transformers import (
    BertTokenizer,
    TFBertModel,
    AutoTokenizer,
    BertTokenizerFast,
    AutoModel,
    DistilBertTokenizerFast,
)
from keras.layers import Input, GlobalAveragePooling2D, Dense
import regex as re
import tensorflow_hub as hub


In [46]:
# Load the reduced COVID-19 articles CSV into a DataFrame (path is relative
# to the notebook's working directory).
df = pd.read_csv("data/covid19_articles_20201231_reduced.csv")

In [47]:
# Preview the first rows to check the loaded columns.
df.head()


Unnamed: 0.1,Unnamed: 0,content,topic_area
0,28241,The coronavirus crisis has almost certainly en...,business
1,210240,Latest Report Shows a 15.3% Week-Over-Week Dec...,business
2,77957,FORESIGHT VCT PLC (Company) Publication of Sup...,business
3,207961,Technavio has been monitoring the global mater...,business
4,252956,Outdoor pop-up classes will be held in parks a...,business


In [64]:
# DELETE ME
# Temporary down-sampling for quick experiments: a stratified split with
# test_size=0.005 is taken and only its small "test" side is kept, under the
# names X_train / y_train. The large side is discarded into `_` and `__`.
# NOTE(review): if this runs in IPython/Jupyter, assigning to `_` / `__`
# may interfere with the shell's last-output variables - confirm.
_, X_train, __, y_train = train_test_split(
    df["content"],
    df["topic_area"],
    test_size=0.005,
    stratify=df["topic_area"],  # keep topic_area proportions in the sample
    random_state=69,  # fixed seed so the sample is reproducible
)


In [49]:
## Load the pretrained tokenizer for the checkpoint named by BERT_MODEL_NAME.
## https://huggingface.co/docs/transformers/preprocessing
## NOTE(review): this cell was originally labelled "lightweight version of
## bert", but the checkpoint below is "bert-base-cased" - confirm which model
## is intended.
## NOTE(review): do_lower_case=True is combined with a *cased* checkpoint -
## confirm this mismatch is deliberate.
## NOTE(review): padding/truncation are normally arguments of the tokenizer
## call rather than of from_pretrained - verify they have any effect here.
BERT_MODEL_NAME = "bert-base-cased"

tokenizer = AutoTokenizer.from_pretrained(
    BERT_MODEL_NAME, do_lower_case=True, padding=True, truncation=True
)


In [235]:
# Fixed sequence length passed to bert_encode as its max_length argument.
MAX_LENGTH = 100


def bert_encode(corpus, max_length):
    """Encode raw texts into fixed-length BERT input lists.

    Each text is stripped of punctuation, lower-cased, split on whitespace and
    capped at ``max_length // 4`` words. The words are broken into wordpieces
    with the module-level ``tokenizer``, wrapped in ``[CLS]`` ... ``[SEP]``,
    truncated so the whole sequence fits in ``max_length``, and right-padded
    with 0.

    Args:
        corpus: Iterable of strings (for example a pandas Series of texts).
        max_length: Exact length of every returned sequence; must be >= 2 so
            that ``[CLS]`` and ``[SEP]`` always fit.

    Returns:
        A list ``[input_ids, attention_masks, segment_ids]``. Each element is
        a list with one ``max_length``-long list of ints per input text.
        Attention masks are 1 for real tokens (including ``[CLS]`` and
        ``[SEP]``) and 0 for padding; segment ids are all 0 because every
        input is a single segment.

    Raises:
        ValueError: If ``max_length`` is smaller than 2.
    """
    if max_length < 2:
        raise ValueError("max_length must be at least 2 to fit [CLS] and [SEP]")

    # Compiled once; newlines are left in place so that split() treats them
    # as word separators instead of fusing the neighbouring words.
    punctuation = re.compile(r"[^\w\s]+")
    word_budget = max_length // 4
    piece_budget = max_length - 2  # room left after [CLS] and [SEP]
    cls_id = tokenizer.cls_token_id
    sep_id = tokenizer.sep_token_id

    input_ids, attention_masks, segment_ids = [], [], []
    for sentence in corpus:
        words = punctuation.sub("", sentence).lower().split()[:word_budget]
        pieces = [piece for word in words for piece in tokenizer.tokenize(word)]

        # Map wordpieces straight to ids. Re-encoding their text would split
        # the "##" continuation prefix into separate "#" tokens.
        body_ids = tokenizer.convert_tokens_to_ids(pieces[:piece_budget])
        ids = [cls_id] + body_ids + [sep_id]

        pad = max_length - len(ids)
        input_ids.append(ids + [0] * pad)
        attention_masks.append([1] * len(ids) + [0] * pad)
        segment_ids.append([0] * max_length)
    return [input_ids, attention_masks, segment_ids]


In [236]:
# Encode the sampled texts; the result is the three-element list returned by
# bert_encode, indexed as [feature][sample].
X_train_encoded = bert_encode(X_train, MAX_LENGTH)


In [237]:
# Sanity check: longest sequence in each encoded feature list, taken over
# every sample. X_train_encoded is indexed [feature][sample], so the rows of
# each feature are scanned directly; len(X_train_encoded) is the number of
# features, not the number of samples.
maxLengths = [
    max((len(row) for row in feature), default=0) for feature in X_train_encoded
]
maxLengths

[100, 100, 100]