In [12]:
import tensorflow as tf
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification


Reference from Hugging Face  
https://huggingface.co/course/chapter3/3?fw=tf

In [13]:
df = pd.read_csv("data/covid19_articles_20201231_reduced.csv")


In [14]:
df.head()


Unnamed: 0.1,Unnamed: 0,content,topic_area
0,28241,The coronavirus crisis has almost certainly en...,business
1,210240,Latest Report Shows a 15.3% Week-Over-Week Dec...,business
2,77957,FORESIGHT VCT PLC (Company) Publication of Sup...,business
3,207961,Technavio has been monitoring the global mater...,business
4,252956,Outdoor pop-up classes will be held in parks a...,business


In [15]:
## https://stackoverflow.com/questions/44114463/stratified-sampling-in-pandas
def stratified_sample_df(df, col, n_samples, random_state=69):
    n = min(n_samples, df[col].value_counts().min())
    df_ = df.groupby(col).apply(lambda x: x.sample(n, random_state=random_state))
    df_.index = df_.index.droplevel(0)
    return df_


In [16]:
df = stratified_sample_df(df, "topic_area", 10)
df.info()


In [17]:
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint, truncate=True, max_length=100)
sequences = df["content"].tolist()
batch = dict(tokenizer(sequences, padding=True, truncation=True, return_tensors="tf"))


In [18]:
print(
    f"""
Sample sentence:    {df["content"].tolist()[0][:100]}...
input ids:          {batch["input_ids"][0][:100]}...
token type ids:     {batch["token_type_ids"][0][:100]}...
attention mask:     {batch["attention_mask"][0][:100]}...
"""
)



Sample sentence:     - Craft beer, live music and lodging featured in renovation plans that Takuya Shimbo had for an agi...
input ids:          [  101  1011  7477  5404  1010  2444  2189  1998 26859  2956  1999 10525
  3488  2008 27006 26230  2050 11895 13344  2018  2005  2019 12520  5522
  7198  4580  1010  5327  2000  5343  1037 14059  3068  2013 14446  2011
 27788 26703  1996  4145  1997 15029 17573  1012  2059  1996  3117 21887
 23350  4930  1012  1996  2231  8357  2900  1005  1055  2261  3588  7198
 15666  2004  4187  2005  2270 19548  2061  2009  7303  2027  2994  2330
  1010  2096  2012  1996  2168  2051 11434  2111  2000  2994  2012  2188
  2076  1037  2110  1997  5057  2000  4652  1996  3659  1997  2522 17258
  1011  2539  2029  2038]...
token type ids:     [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]...
attention mask:

In [19]:
topic_area_map = {k: i for k, i in enumerate(df["topic_area"].unique())}
reversed_topic_area_map = {v: k for k, v in topic_area_map.items()}
labels = tf.convert_to_tensor(
    [reversed_topic_area_map[k] for k in df["topic_area"].tolist()]
)


In [20]:
model = TFAutoModelForSequenceClassification.from_pretrained(
    checkpoint, num_labels=len(df.topic_area.unique())
)

model.compile(
    optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)

hist = model.fit(
    x=batch,
    y=labels,
    batch_size=20,
    epochs=5,
    verbose=1,
    validation_split=0.2,
)


All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [23]:
BERT_MODEL_FILEPATH = "saved_models/bert"
model.save(BERT_MODEL_FILEPATH)
model = tf.keras.models.load_model(BERT_MODEL_FILEPATH)


2022-03-30 12:04:50.061784: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: saved_models/bert/assets


INFO:tensorflow:Assets written to: saved_models/bert/assets


In [21]:
test = ["this is a super test", "this is a boring article"]
batch = dict(tokenizer(test, padding=True, truncation=True, return_tensors="tf"))
y_pred = model.predict(batch)


In [22]:
y_pred.to_tuple()[0].tolist()


[[-0.8829320669174194,
  1.4541959762573242,
  1.1529698371887207,
  3.1338181495666504,
  2.0458924770355225,
  -0.9047433733940125,
  -1.2239004373550415],
 [-0.8821392059326172,
  1.4532511234283447,
  1.1529271602630615,
  3.132197380065918,
  2.0450141429901123,
  -0.9033188819885254,
  -1.2226083278656006]]