In [7]:
import tensorflow as tf
from keras.layers import Dense, GlobalAveragePooling1D, Dropout, Input, Flatten
import numpy as np
import pandas as pd
from transformers import (
    AutoTokenizer,
    TFBertModel,
    TFAutoModelForSequenceClassification,  ## https://huggingface.co/transformers/v3.0.2/model_doc/auto.html#tfautomodelforsequenceclassification
    TFBertForSequenceClassification,  ## https://huggingface.co/docs/transformers/model_doc/bert#transformers.TFBertForSequenceClassification
    AdamW,
)
import re
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer


Reference from Hugging Face  
https://huggingface.co/course/chapter3/3?fw=tf


In [8]:
df = pd.read_csv("data/covid19_articles_20201231_reduced.csv")


In [9]:
df.head()


Unnamed: 0.1,Unnamed: 0,content,topic_area
0,28241,The coronavirus crisis has almost certainly en...,business
1,210240,Latest Report Shows a 15.3% Week-Over-Week Dec...,business
2,77957,FORESIGHT VCT PLC (Company) Publication of Sup...,business
3,207961,Technavio has been monitoring the global mater...,business
4,252956,Outdoor pop-up classes will be held in parks a...,business


In [10]:
STOP_WORDS = set(nltk.corpus.stopwords.words("english"))


def clean_text(text, stemm=False, lemm=True):
    text = re.sub(r"[^a-z\s]", "", text.lower().strip())
    text = [x for x in text.split() if x not in STOP_WORDS]

    if stemm:
        stemmer = PorterStemmer()
        text = [stemmer.stem(x) for x in text]

    if lemm:
        lemmatizer = WordNetLemmatizer()
        text = [lemmatizer.lemmatize(x) for x in text]

    return " ".join(text)


In [11]:
## https://stackoverflow.com/questions/44114463/stratified-sampling-in-pandas
def stratified_sample_df(df, col, n_samples, random_state=69):
    n = min(n_samples, df[col].value_counts().min())
    df_ = df.groupby(col).apply(lambda x: x.sample(n, random_state=random_state))
    df_.index = df_.index.droplevel(0)
    return df_


In [12]:
df = stratified_sample_df(df, "topic_area", 100)
df["content"] = df["content"].apply(lambda x: clean_text(x))
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 700 entries, 5256 to 18168
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  700 non-null    int64 
 1   content     700 non-null    object
 2   topic_area  700 non-null    object
dtypes: int64(1), object(2)
memory usage: 21.9+ KB


In [13]:
df['content'].head()

5256    craft beer live music lodging featured renovat...
7272    flight canceled around world bar restaurant sh...
1323    million american filed initial unemployment cl...
4924    q mediobanca banca di credito finanziario spa ...
5845    agence francepressegetty image covid may disru...
Name: content, dtype: object

In [14]:
BERT_MODEL_NAME = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL_NAME, truncate=True)
sequences = df["content"].tolist()
batch = dict(tokenizer(sequences, padding=True, truncation=True, return_tensors="tf"))


2022-03-30 20:27:52.166320: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [15]:
TENSOR_LENGTH = 0
for i, r in enumerate(batch["input_ids"]):
    TENSOR_LENGTH = max(len(r), TENSOR_LENGTH)
print(TENSOR_LENGTH)


512


In [16]:
print(
    f"""
Sample sentence:    {df["content"].tolist()[0][:100]}...
input ids:          {batch["input_ids"][0][:100]}...
token type ids:     {batch["token_type_ids"][0][:100]}...
attention mask:     {batch["attention_mask"][0][:100]}...
"""
)



Sample sentence:    craft beer live music lodging featured renovation plan takuya shimbo aging tokyo bathhouse hoping re...
input ids:          [  101  7477  5404  2444  2189 26859  2956 10525  2933 27006 26230  2050
 11895 13344 12520  5522  7198  4580  5327  5343 14059  3068 14446 27788
 26703  4145 15029 17573  3117 21887 23350  4930  2231  8357  2900  3588
  7198  4580  4187  2270 19548  7303  2994  2330  2051 11434  2111  2994
  2188  2110  5057  4652  3659  2522 17258  2730  2105  2887  3465  2066
  3095 10808  2123  2102  2689  2521  8491  8013  2056 11895 13344  2095
 11614  2353  6914 16754  3954 18765  3683  5283 10513  8763  5522 25912
  8013  2247 10961  9535  3436 17738 26477  2178 11703  2890 23270  7198
  4580 10971  2404  3653]...
token type ids:     [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]...
attention mask:

In [17]:
topic_area_map = {k: i for k, i in enumerate(df["topic_area"].unique())}
reversed_topic_area_map = {v: k for k, v in topic_area_map.items()}
labels = tf.convert_to_tensor(
    [reversed_topic_area_map[k] for k in df["topic_area"].tolist()]
)


In [18]:
model = TFBertForSequenceClassification.from_pretrained(
    BERT_MODEL_NAME, num_labels=len(df.topic_area.unique())
)

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)


All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
hist = model.fit(x=batch, y=labels, batch_size=10, epochs=10, verbose=1)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [20]:
BERT_MODEL_FILEPATH = "saved_models/bert"
model.save(BERT_MODEL_FILEPATH)
model = tf.keras.models.load_model(BERT_MODEL_FILEPATH)


2022-03-31 00:00:08.710601: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: saved_models/bert/assets


INFO:tensorflow:Assets written to: saved_models/bert/assets


In [21]:
test = ["this is a super test", "this is a boring article"]
batch = dict(tokenizer(test, padding=True, truncation=True, return_tensors="tf"))
y_pred = model.predict(batch)


ValueError: in user code:

    File "/usr/local/lib/python3.9/site-packages/keras/engine/training.py", line 1801, in predict_function  *
        return step_function(self, iterator)
    File "/usr/local/lib/python3.9/site-packages/keras/engine/training.py", line 1790, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/usr/local/lib/python3.9/site-packages/keras/engine/training.py", line 1783, in run_step  **
        outputs = model.predict_step(data)
    File "/usr/local/lib/python3.9/site-packages/keras/engine/training.py", line 1751, in predict_step
        return self(x, training=False)
    File "/usr/local/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 67, in error_handler
        raise e.with_traceback(filtered_tb) from None

    ValueError: Exception encountered when calling layer "tf_bert_for_sequence_classification" (type TFBertForSequenceClassification).
    
    Could not find matching concrete function to call loaded from the SavedModel. Got:
      Positional arguments (11 total):
        * {'input_ids': <tf.Tensor 'input_ids_1:0' shape=(None, 7) dtype=int32>, 'token_type_ids': <tf.Tensor 'input_ids_2:0' shape=(None, 7) dtype=int32>, 'attention_mask': <tf.Tensor 'input_ids:0' shape=(None, 7) dtype=int32>}
        * None
        * None
        * None
        * None
        * None
        * None
        * None
        * None
        * None
        * False
      Keyword arguments: {}
    
     Expected these arguments to match one of the following 2 option(s):
    
    Option 1:
      Positional arguments (11 total):
        * {'input_ids': TensorSpec(shape=(None, 5), dtype=tf.int32, name='input_ids/input_ids')}
        * None
        * None
        * None
        * None
        * None
        * None
        * None
        * None
        * None
        * False
      Keyword arguments: {}
    
    Option 2:
      Positional arguments (11 total):
        * {'input_ids': TensorSpec(shape=(None, 5), dtype=tf.int32, name='input_ids/input_ids')}
        * None
        * None
        * None
        * None
        * None
        * None
        * None
        * None
        * None
        * True
      Keyword arguments: {}
    
    Call arguments received:
      • args=({'input_ids': 'tf.Tensor(shape=(None, 7), dtype=int32)', 'token_type_ids': 'tf.Tensor(shape=(None, 7), dtype=int32)', 'attention_mask': 'tf.Tensor(shape=(None, 7), dtype=int32)'},)
      • kwargs={'training': 'False'}


In [None]:
y_pred.to_tuple()[0].tolist()


[[-0.8829320669174194,
  1.4541959762573242,
  1.1529698371887207,
  3.1338181495666504,
  2.0458924770355225,
  -0.9047433733940125,
  -1.2239004373550415],
 [-0.8821392059326172,
  1.4532511234283447,
  1.1529271602630615,
  3.132197380065918,
  2.0450141429901123,
  -0.9033188819885254,
  -1.2226083278656006]]