In [45]:
from transformers import BertTokenizer, TFBertModel
import tensorflow as tf
from pandas import read_parquet, DataFrame, Series, concat
from data import file
from tqdm import tqdm
from preprocessing.categorical import binarizer

In [29]:
# apply progress bar on pandas operations
tqdm.pandas()

In [30]:
tokenizer = BertTokenizer.from_pretrained("bert-base-german-cased")

In [31]:
data_train = read_parquet(file.news_articles_cleaned_train)
data_test = read_parquet(file.news_articles_cleaned_test)
data_val = read_parquet(file.news_articles_cleaned_val)

In [33]:
print("train size:", data_train.shape, "categories:", len(data_train.label.unique()))
print("test size :", data_test.shape, "categories:", len(data_test.label.unique()))
print("val size  :", data_val.shape, "categories:", len(data_val.label.unique()))

train size: (7191, 9) categories: 9
test size : (2054, 9) categories: 9
val size  : (1028, 9) categories: 9


In [34]:
MAXLEN = 192

def tokenize(review):

  encoded = tokenizer.encode_plus(
      text=review,
      add_special_tokens=True,     # Add `[CLS]` and `[SEP]`
      max_length=MAXLEN,           # Max length to truncate/pad
      padding='max_length',        # Pad sentence to max length
      return_attention_mask=False, # attention mask not needed for our task
      return_token_type_ids=False,
      truncation=True)
  return encoded['input_ids']

In [35]:
data_hf_tokenized_train = concat([data_train, data_train.text_original.progress_map(tokenize).rename('hf_tokenized')], axis=1)
data_hf_tokenized_train.to_parquet(path=file.news_articles_hf_tokenized_train)

100%|██████████| 7191/7191 [00:52<00:00, 135.87it/s]


In [36]:
data_hf_tokenized_test = concat([data_test, data_test.text_original.progress_map(tokenize).rename('hf_tokenized')], axis=1)
data_hf_tokenized_test.to_parquet(path=file.news_articles_hf_tokenized_test)

100%|██████████| 2054/2054 [00:15<00:00, 131.58it/s]


In [37]:
data_hf_tokenized_val = concat([data_val, data_val.text_original.progress_map(tokenize).rename('hf_tokenized')], axis=1)
data_hf_tokenized_val.to_parquet(path=file.news_articles_hf_tokenized_val)

100%|██████████| 1028/1028 [00:07<00:00, 131.21it/s]


In [41]:
hf_data_train = read_parquet(file.news_articles_hf_tokenized_train)
hf_data_test = read_parquet(file.news_articles_hf_tokenized_test)
hf_data_val = read_parquet(file.news_articles_hf_tokenized_val)

print("train size:", hf_data_train.shape, "categories:", len(hf_data_train.label.unique()))
print("test size :", hf_data_test.shape, "categories:", len(hf_data_test.label.unique()))
print("val size  :", hf_data_val.shape, "categories:", len(hf_data_val.label.unique()))

train size: (7191, 10) categories: 9
test size : (2054, 10) categories: 9
val size  : (1028, 10) categories: 9


In [52]:
label_binarizer=binarizer(hf_data_train.label)
label_bin_train = label_binarizer.transform(hf_data_train.label)
label_bin_test = label_binarizer.transform(hf_data_test.label)

print("train label size:", label_bin_train.shape, "categories:", len(hf_data_train.label.unique()))
print("test label size :", label_bin_test.shape, "categories:", len(hf_data_test.label.unique()))

train label size: (7191, 9) categories: 9
test label size : (2054, 9) categories: 9


In [152]:
BATCH_SIZE = 16
EPOCHS = 8
LEARNING_RATE = 1e-5

train_dataset = (tf.data.Dataset.from_tensor_slices((hf_data_train.hf_tokenized.map(lambda x:x.tolist()).tolist(), label_bin_train))
                .batch(BATCH_SIZE)
                .prefetch(tf.data.AUTOTUNE))

test_dataset = (tf.data.Dataset.from_tensor_slices((hf_data_test.hf_tokenized.map(lambda x:x.tolist()).tolist(), label_bin_test))
                .batch(BATCH_SIZE)
                .prefetch(tf.data.AUTOTUNE))


In [147]:
def build_model(output_classes, max_len=MAXLEN):
    """ add binary classification to pretrained model
    """

    input_word_ids = tf.keras.layers.Input(
        shape=(max_len,), dtype=tf.int32, name="input_word_ids"
    )

    bert_model = TFBertModel.from_pretrained("bert-base-german-cased")
    encoder_outputs = bert_model(input_word_ids)

    pooler_output = encoder_outputs[1]
    cls_embedding = pooler_output

    stack = tf.keras.layers.Dense(output_classes)(cls_embedding)
    output = tf.keras.layers.Activation('softmax')(stack)

    ##########################
    ## YOUR CODE HERE END ##
    ##########################

    model = tf.keras.models.Model(inputs=input_word_ids, outputs=output)
    
    return model


In [149]:
model = build_model(len(data_train.label.unique()), max_len=MAXLEN)
model.summary()

Some layers from the model checkpoint at bert-base-german-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-german-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 192)]             0         
_________________________________________________________________
tf_bert_model_2 (TFBertModel TFBaseModelOutputWithPool 109081344 
_________________________________________________________________
dense_2 (Dense)              (None, 9)                 6921      
_________________________________________________________________
activation_1 (Activation)    (None, 9)                 0         
Total params: 109,088,265
Trainable params: 109,088,265
Non-trainable params: 0
_________________________________________________________________


In [151]:
optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
loss = loss="binary_crossentropy"

model.compile(optimizer, loss=loss, metrics=["accuracy"])

In [153]:
np.floor((len(hf_data_train) / BATCH_SIZE))

NameError: name 'np' is not defined

In [None]:
hist = model.fit(
    train_dataset,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    steps_per_epoch=int(np.floor((len(hf_data_train) / BATCH_SIZE))),
    validation_data=test_dataset,
    verbose=1,
    callbacks=[
               tf.keras.callbacks.EarlyStopping(monitor="val_loss", verbose=1, patience=1, restore_best_weights=True),
               tf.keras.callbacks.TensorBoard(f'logs/{datetime.now()}')
               ],
)