In [148]:
PATH_TO_DATASET_FOLDER = "C:\\Users\\Vojta\\Desktop\\diploma\\data"

In [149]:
from enum import Enum
import os.path
import tensorflow as tf
import pandas as pd
from tqdm import tqdm

In [150]:
class DataSet(Enum):
    Gutenberg = "gutenberg"

In [151]:
class DataSetType(Enum):
    Sentence = "Sentence"
    Article = "Article"

In [152]:
DATA_NAME = 'data.csv'

In [153]:
def create_path(directory, dataset, dataset_type, k=None):
    is_sentence_type = dataset_type == DataSetType.Sentence 
    if is_sentence_type and k is None:
        raise Exception(f"Sentence should be specified with k argument!")
    
    return os.path.join(directory, dataset.value, dataset_type.value + str(k), DATA_NAME) if is_sentence_type else os.path.join(directory, dataset.value, dataset_type.value, DATA_NAME)

In [154]:
create_path(PATH_TO_DATASET_FOLDER, DataSet.Gutenberg, DataSetType.Sentence, 10)

'C:\\Users\\Vojta\\Desktop\\diploma\\data\\gutenberg\\Sentence10\\data.csv'

In [155]:
def process_text(text):
    return text.lower()

In [156]:
def get_dataset_object_from_path(csv_filename, delim, text_pipeline_func=None):
    dataset = tf.data.TextLineDataset(filenames=csv_filename)
    
    def parse_csv(line):
        csv_line = bytes.decode(line.numpy())
        text, author = csv_line.split(delim)
        if text_pipeline_func is not None:
            text = text_pipeline_func(text)
        return text, author 

    dataset = dataset.map(lambda tpl: tf.py_function(parse_csv, [tpl], [tf.string, tf.string]))
    return dataset

In [157]:
ds = get_dataset_object_from_path(create_path(PATH_TO_DATASET_FOLDER, DataSet.Gutenberg, DataSetType.Sentence, 3), ';', process_text)

# BERT testing

https://github.com/huggingface/transformers/tree/master/examples/tensorflow/text-classification

In [158]:
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from datasets import load_dataset
import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification
from transformers import create_optimizer

In [159]:
import transformers
import logging

In [160]:
for line in ds:
    print(line)
    break

(<tf.Tensor: shape=(), dtype=string, numpy=b"the tragedy of pudd'nhead wilson  by mark twain     a whisper to the reader       _there is no character, howsoever good and fine, but it can      be destroyed by ridicule, howsoever poor and witless. observe the ass, for instance: his character is about      perfect, he is the choicest spirit among all the humbler      animals, yet see what ridicule has brought him to. instead      of feeling complimented when we are called an ass, we are      left in doubt._ --pudd'nhead wilson's calendar  a person who is ignorant of legal matters is always liable to make mistakes when he tries to photograph a court scene with his pen and so i was not willing to let the law chapters in this book go to press without first subjecting them to rigid and exhausting revision and correction by a trained barrister--if that is what they are called.">, <tf.Tensor: shape=(), dtype=string, numpy=b'Twain, Mark'>)


In [161]:
imdb = load_dataset("imdb")



  0%|          | 0/3 [00:01<?, ?it/s]

In [162]:
imdb["train"][0]

{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

In [163]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, padding='max_length', max_length=512)

In [164]:
tokenized_imdb = imdb.map(preprocess_function, batched=True)

  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

In [165]:
small_train_dataset = tokenized_imdb["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_imdb["test"].shuffle(seed=42).select(range(1000))



full_train_dataset = tokenized_imdb["train"]
full_eval_dataset = tokenized_imdb["test"]

In [166]:
for x in small_train_dataset:
    print(x)
    break

{'text': 'There is no relation at all between Fortier and Profiler but the fact that both are police series about violent crimes. Profiler looks crispy, Fortier looks classic. Profiler plots are quite simple. Fortier\'s plot are far more complicated... Fortier looks more like Prime Suspect, if we have to spot similarities... The main character is weak and weirdo, but have "clairvoyance". People like to compare, to judge, to evaluate. How about just enjoying? Funny thing too, people writing Fortier looks American but, on the other hand, arguing they prefer American series (!!!). Maybe it\'s the language, or the spirit, but I think this series is more English than American. By the way, the actors are really good and funny. The acting is not superficial at all...', 'label': 1, 'input_ids': [101, 1247, 1110, 1185, 6796, 1120, 1155, 1206, 3144, 2852, 1105, 26890, 1197, 1133, 1103, 1864, 1115, 1241, 1132, 2021, 1326, 1164, 5973, 6969, 119, 26890, 1197, 2736, 19501, 1183, 117, 3144, 2852, 273

In [167]:
import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification

model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)


tf_train_dataset = small_train_dataset.remove_columns(["text"]).with_format("tensorflow")
tf_eval_dataset = small_eval_dataset.remove_columns(["text"]).with_format("tensorflow")

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [168]:
for x in tf_train_dataset:
    print(x)
    break

{'label': <tf.Tensor: shape=(), dtype=int64, numpy=1>, 'input_ids': <tf.Tensor: shape=(512,), dtype=int64, numpy=
array([  101,  1247,  1110,  1185,  6796,  1120,  1155,  1206,  3144,
        2852,  1105, 26890,  1197,  1133,  1103,  1864,  1115,  1241,
        1132,  2021,  1326,  1164,  5973,  6969,   119, 26890,  1197,
        2736, 19501,  1183,   117,  3144,  2852,  2736,  5263,   119,
       26890,  1197, 15836,  1132,  2385,  3014,   119,  3144,  2852,
         112,   188,  4928,  1132,  1677,  1167,  8277,   119,   119,
         119,  3144,  2852,  2736,  1167,  1176,  3460, 15463, 20629,
         117,  1191,  1195,  1138,  1106,  3205, 12672,   119,   119,
         119,  1109,  1514,  1959,  1110,  4780,  1105,  6994,  1186,
         117,  1133,  1138,   107,   172, 20293, 12716,  3923,   107,
         119,  2563,  1176,  1106, 14133,   117,  1106,  3942,   117,
        1106, 17459,   119,  1731,  1164,  1198,  8965,   136, 16819,
        1645,  1315,   117,  1234,  2269,  314

In [169]:
train_features = {x: tf.cast(tf_train_dataset[x], dtype=tf.int32) for x in tokenizer.model_input_names}
train_tf_dataset = tf.data.Dataset.from_tensor_slices((train_features, tf_train_dataset["label"]))


# train_tf_dataset = tf_train_dataset.shuffle(len(tf_train_dataset)).batch(8)

eval_features = {x: tf_eval_dataset[x] for x in tokenizer.model_input_names}
eval_tf_dataset = tf.data.Dataset.from_tensor_slices((eval_features, tf_eval_dataset["label"]))
# eval_tf_dataset = eval_tf_dataset.batch(8)

In [170]:
for x in train_tf_dataset:
    print(x)
    break

({'input_ids': <tf.Tensor: shape=(512,), dtype=int32, numpy=
array([  101,  1247,  1110,  1185,  6796,  1120,  1155,  1206,  3144,
        2852,  1105, 26890,  1197,  1133,  1103,  1864,  1115,  1241,
        1132,  2021,  1326,  1164,  5973,  6969,   119, 26890,  1197,
        2736, 19501,  1183,   117,  3144,  2852,  2736,  5263,   119,
       26890,  1197, 15836,  1132,  2385,  3014,   119,  3144,  2852,
         112,   188,  4928,  1132,  1677,  1167,  8277,   119,   119,
         119,  3144,  2852,  2736,  1167,  1176,  3460, 15463, 20629,
         117,  1191,  1195,  1138,  1106,  3205, 12672,   119,   119,
         119,  1109,  1514,  1959,  1110,  4780,  1105,  6994,  1186,
         117,  1133,  1138,   107,   172, 20293, 12716,  3923,   107,
         119,  2563,  1176,  1106, 14133,   117,  1106,  3942,   117,
        1106, 17459,   119,  1731,  1164,  1198,  8965,   136, 16819,
        1645,  1315,   117,  1234,  2269,  3144,  2852,  2736,  1237,
        1133,   117,  1113,  

In [171]:
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy'],
)

In [172]:
model.summary()

Model: "tf_bert_for_sequence_classification_18"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  108310272 
                                                                 
 dropout_795 (Dropout)       multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 108,311,810
Trainable params: 108,311,810
Non-trainable params: 0
_________________________________________________________________


In [173]:
# %%time

# model.fit(train_tf_dataset.batch(8), validation_data=eval_tf_dataset, epochs=2, batch_size=8)

In [30]:
res = model.predict(train_tf_dataset.take(100).batch(10))

https://www.analyticsvidhya.com/blog/2021/06/why-and-how-to-use-bert-for-nlp-text-classification/

In [3]:
import tensorflow_datasets as tfds

In [4]:
(ds_train, ds_test), ds_info = tfds.load('imdb_reviews', 
          split = (tfds.Split.TRAIN, tfds.Split.TEST),
          as_supervised=True,
          with_info=True)
print('info', ds_info)

info tfds.core.DatasetInfo(
    name='imdb_reviews',
    full_name='imdb_reviews/plain_text/1.0.0',
    description="""
    Large Movie Review Dataset.
    This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.
    """,
    config_description="""
    Plain text
    """,
    homepage='http://ai.stanford.edu/~amaas/data/sentiment/',
    data_path='C:\\Users\\Vojta\\tensorflow_datasets\\imdb_reviews\\plain_text\\1.0.0',
    download_size=80.23 MiB,
    dataset_size=129.83 MiB,
    features=FeaturesDict({
        'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=2),
        'text': Text(shape=(), dtype=tf.string),
    }),
    supervised_keys=('text', 'label'),
    disable_shuffling=False,
    splits={
        'test': <SplitInfo num_examples=25000, num_shards=1>,
        'tr

In [5]:
for review, label in tfds.as_numpy(ds_train.take(5)):
    print('review', review.decode()[0:50], label)

review This was an absolutely terrible movie. Don't be lu 0
review I have been known to fall asleep during films, but 0
review Mann photographs the Alberta Rocky Mountains in a  0
review This is the kind of film for a snowy Sunday aftern 1
review As others have mentioned, all the women that go nu 1


In [23]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [24]:
def convert_example_to_feature(review):
    return tokenizer(review, truncation=True, padding='max_length', max_length=512)

In [30]:
batch_size = 32

In [33]:
# map to the expected input to TFBertForSequenceClassification, see here 
def map_example_to_dict(input_ids, attention_masks, token_type_ids, label):
    return {
      "input_ids": input_ids,
      "token_type_ids": token_type_ids,
      "attention_mask": attention_masks,
    }, label

def encode_examples(ds, limit=-1):
    # prepare list, so that we can build up final TensorFlow dataset from slices.
    input_ids_list = []
    token_type_ids_list = []
    attention_mask_list = []
    label_list = []
    if (limit > 0):
        ds = ds.take(limit)

    for review, label in tfds.as_numpy(ds):
        bert_input = convert_example_to_feature(review.decode())
        
        
        input_ids_list.append(bert_input['input_ids'])
        token_type_ids_list.append(bert_input['token_type_ids'])
        attention_mask_list.append(bert_input['attention_mask'])
        label_list.append([label])
    return tf.data.Dataset.from_tensor_slices((input_ids_list, attention_mask_list, token_type_ids_list, label_list)).map(map_example_to_dict)

In [34]:
%%time

# train dataset
ds_train_encoded = encode_examples(ds_train).shuffle(10000).batch(batch_size)
# test dataset
ds_test_encoded = encode_examples(ds_test).batch(batch_size)

CPU times: total: 3min 47s
Wall time: 3min 44s


In [35]:
from transformers import TFBertForSequenceClassification
import tensorflow as tf

In [36]:
# recommended learning rate for Adam 5e-5, 3e-5, 2e-5
learning_rate = 2e-5
# we will do just 1 epoch for illustration, though multiple epochs might be better as long as we will not overfit the model
number_of_epochs = 1

# model initialization
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')

# optimizer Adam recommended
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=1e-08)

# we do not have one-hot vectors, we can use sparce categorical cross entropy and accuracy
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/511M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
%%time


bert_history = model.fit(ds_train_encoded.take(100), epochs=number_of_epochs, validation_data=ds_test_encoded.take(100))

In [2]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertModel.from_pretrained('bert-base-uncased')

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [3]:
model.summary()

Model: "tf_bert_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
Total params: 109,482,240
Trainable params: 109,482,240
Non-trainable params: 0
_________________________________________________________________


In [4]:
sentence = "Hello, my dog is cute"

encoded = tokenizer.encode("Hello, <MASK> dog is cute")

encoded1 = tokenizer("Hello, my dog is cute", truncation=True, padding='max_length', max_length=512)
print(encoded)
print(encoded1)

[101, 7592, 1010, 1026, 7308, 1028, 3899, 2003, 10140, 102]
{'input_ids': [101, 7592, 1010, 2026, 3899, 2003, 10140, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [61]:
input_ids = tf.constant(encoded)[None, :]  # Batch size 1


print(input_ids)

tf.Tensor([[b'input_ids' b'token_type_ids' b'attention_mask']], shape=(1, 3), dtype=string)


AttributeError: 'list' object has no attribute 'shape'

In [None]:
input_ids = tf.keras.layers.Input(shape=(max_sequence,), dtype=tf.int32, name='input_ids')
input_ids = tf.keras.layers.Input(shape=(max_sequence,), dtype=tf.int32, name='attention_mask')
input_ids = tf.keras.layers.Input(shape=(max_sequence,), dtype=tf.int32, name='token_type_ids')

In [52]:
outputs = model(input_ids)

print(outputs)

TFBaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=<tf.Tensor: shape=(1, 8, 768), dtype=float32, numpy=
array([[[-0.11437128,  0.19371368,  0.1249586 , ..., -0.38269076,
          0.21065912,  0.54070807],
        [ 0.53082436,  0.32074896,  0.3664591 , ..., -0.00360684,
          0.7578602 ,  0.0388434 ],
        [-0.48765117,  0.88492435,  0.42556354, ..., -0.697621  ,
          0.44583344,  0.12309441],
        ...,
        [-0.70027906, -0.18150637,  0.32969648, ..., -0.48379344,
          0.06802348,  0.890084  ],
        [-1.0354631 , -0.2566779 , -0.03165286, ...,  0.31974417,
          0.39990202,  0.17954747],
        [ 0.6079918 ,  0.2609707 , -0.31307253, ...,  0.03109779,
         -0.6282723 , -0.19942449]]], dtype=float32)>, pooler_output=<tf.Tensor: shape=(1, 768), dtype=float32, numpy=
array([[-7.19458342e-01, -2.14449644e-01, -2.95759737e-01,
         3.66029590e-01,  2.79677331e-01,  2.21835729e-02,
         5.72991431e-01,  6.23311587e-02,  5.95868900e-0

In [53]:
last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple

In [57]:
print(last_hidden_states)

tf.Tensor(
[[[-0.11437128  0.19371368  0.1249586  ... -0.38269076  0.21065912
    0.54070807]
  [ 0.53082436  0.32074896  0.3664591  ... -0.00360684  0.7578602
    0.0388434 ]
  [-0.48765117  0.88492435  0.42556354 ... -0.697621    0.44583344
    0.12309441]
  ...
  [-0.70027906 -0.18150637  0.32969648 ... -0.48379344  0.06802348
    0.890084  ]
  [-1.0354631  -0.2566779  -0.03165286 ...  0.31974417  0.39990202
    0.17954747]
  [ 0.6079918   0.2609707  -0.31307253 ...  0.03109779 -0.6282723
   -0.19942449]]], shape=(1, 8, 768), dtype=float32)


In [58]:
clse = last_hidden_states[0, 0, :]

In [59]:
clse.shape


TensorShape([768])

In [60]:
clse

<tf.Tensor: shape=(768,), dtype=float32, numpy=
array([-1.14371277e-01,  1.93713680e-01,  1.24958605e-01, -2.03833938e-01,
       -1.43783808e-01, -4.01328981e-01,  3.77295054e-02,  2.35692024e-01,
       -6.07896745e-02, -1.99522629e-01,  5.90228811e-02,  3.77066880e-02,
        3.21126491e-01,  4.20141160e-01, -2.22825021e-01, -5.37456945e-04,
       -2.28127822e-01,  5.35356939e-01, -8.61098990e-03,  1.03082299e-01,
        3.26674841e-02, -1.71210825e-01, -1.52831823e-01, -2.00476810e-01,
        8.08305815e-02,  6.31937832e-02,  1.06357351e-01,  2.71277726e-01,
       -5.52153178e-02,  1.35896146e-01, -1.47843897e-01,  8.43284428e-02,
        4.82296288e-01,  5.44626676e-02,  2.98799843e-01, -1.00577407e-01,
        1.90387905e-01,  1.64552256e-02, -4.87703085e-01,  5.64810783e-02,
       -7.77095705e-02,  5.17870579e-03,  1.72790978e-02, -1.61808014e-01,
       -2.03584030e-01, -2.50444412e-01, -3.30151510e+00, -5.04574627e-02,
       -2.23637670e-02, -5.34201443e-01,  3.79213661

In [64]:
# Max length of encoded string(including special tokens such as [CLS] and [SEP]):
MAX_SEQUENCE_LENGTH = 64 

# Standard BERT model with lowercase chars only:
PRETRAINED_MODEL_NAME = 'bert-base-uncased' 

# Batch size for fitting:
BATCH_SIZE = 16 

# Number of epochs:
EPOCHS=5

In [124]:
def create_model(max_sequence, model_name, num_labels):
    bert_model = TFBertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
    
    # This is the input for the tokens themselves(words from the dataset after encoding):
    input_ids = tf.keras.layers.Input(shape=(max_sequence,), dtype=tf.int32, name='input_ids')

    # attention_mask - is a binary mask which tells BERT which tokens to attend and which not to attend.
    # Encoder will add the 0 tokens to the some sequence which smaller than MAX_SEQUENCE_LENGTH, 
    # and attention_mask, in this case, tells BERT where is the token from the original data and where is 0 pad token:
    attention_mask = tf.keras.layers.Input((max_sequence,), dtype=tf.int32, name='attention_mask')
    
    # Use previous inputs as BERT inputs:
    output = bert_model([input_ids, attention_mask])[0]
    
    print(output)

    # We can also add dropout as regularization technique:
    #output = tf.keras.layers.Dropout(rate=0.15)(output)

    # Provide number of classes to the final layer:
    output = tf.keras.layers.Dense(num_labels, activation='softmax')(output)

    # Final model:
    model = tf.keras.models.Model(inputs=[input_ids, attention_mask], outputs=output)
    return model

In [125]:
model = create_model(MAX_SEQUENCE_LENGTH, PRETRAINED_MODEL_NAME, 2)

opt = tf.keras.optimizers.Adam(learning_rate=3e-5)
model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


KerasTensor(type_spec=TensorSpec(shape=(None, 2), dtype=tf.float32, name=None), name='tf_bert_for_sequence_classification_7/classifier/BiasAdd:0', description="created by layer 'tf_bert_for_sequence_classification_7'")


In [126]:
def batch_encode(X, tokenizer):
    return tokenizer.batch_encode_plus(
    X,
    max_length=MAX_SEQUENCE_LENGTH, # set the length of the sequences
    add_special_tokens=True, # add [CLS] and [SEP] tokens
    return_attention_mask=True,
    return_token_type_ids=False, # not needed for this type of ML task
    pad_to_max_length=True, # add 0 pad tokens to the sequences less than max_length
    return_tensors='tf',
    truncation=True
)

In [100]:
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

In [6]:
import numpy as np

In [7]:
X_train = [sentence, sentence]
y_train = np.array([1, 0])
X_val = [sentence, sentence]
y_val = np.array([1, 0])

In [8]:
X_train = batch_encode(X_train, tokenizer)
X_val = batch_encode(X_val, tokenizer)

NameError: name 'batch_encode' is not defined

In [104]:
X_train.values()

dict_values([<tf.Tensor: shape=(2, 64), dtype=int32, numpy=
array([[  101,  7592,  1010,  2026,  3899,  2003, 10140,   102,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0],
       [  101,  7592,  1010,  2026,  3899,  2003, 10140,   102,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
 

In [105]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 64)]         0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 64)]         0           []                               
                                                                                                  
 tf_bert_for_sequence_classific  TFSequenceClassifie  109483778  ['input_ids[0][0]',              
 ation_2 (TFBertForSequenceClas  rOutput(loss=None,               'attention_mask[0][0]']         
 sification)                    logits=(None, 2),                                                 
                                 hidden_states=None                                         

In [106]:
model.fit(
    x=X_train.values(),
    y=y_train,
    validation_data=(X_val.values(), y_val),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x2ed1f0e2b20>

In [146]:
from transformers import BertTokenizer, TFBertForSequenceClassification
import tensorflow as tf

tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
model = TFBertForSequenceClassification.from_pretrained("bert-base-cased")

inputs = tokenizer("Hello, my dog is cute", return_tensors="tf",      
                    max_length=MAX_SEQUENCE_LENGTH, # set the length of the sequences
    add_special_tokens=True, # add [CLS] and [SEP] tokens
    return_attention_mask=True,
    return_token_type_ids=False, # not needed for this type of ML task
    pad_to_max_length=True, # add 0 pad tokens to the sequences less than max_length
                  )
inputs["labels"] = tf.reshape(tf.constant(1), (-1, 1))  # Batch size 1




output = model(inputs, output_hidden_states=True)

loss = output.loss
logits = output.logits

print(output)


output = tf.keras.layers.Dense(2, activation='softmax')(tf.reshape(output[0], [-1, 1]))




All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


TFSequenceClassifierOutput(loss=<tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.7734848], dtype=float32)>, logits=<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[ 0.00591445, -0.14878385]], dtype=float32)>, hidden_states=(<tf.Tensor: shape=(1, 64, 768), dtype=float32, numpy=
array([[[ 0.4495986 ,  0.09766434, -0.20736787, ...,  0.05780864,
          0.04061905, -0.09512451],
        [-0.77356714,  0.49493614,  0.40814084, ...,  0.11958291,
         -0.31714237,  0.25474828],
        [-1.143637  ,  0.41187298, -0.28890803, ...,  0.5126003 ,
         -1.2320071 , -0.52256787],
        ...,
        [-0.14134963, -1.0802779 ,  0.8814317 , ...,  0.5289587 ,
         -0.5264092 ,  0.6608591 ],
        [ 0.20395406, -1.2452701 ,  0.910847  , ...,  0.40185875,
         -0.5353736 ,  0.82868004],
        [-0.04057743, -1.1146938 ,  0.71299577, ...,  0.28208214,
         -0.648682  ,  0.5204784 ]]], dtype=float32)>, <tf.Tensor: shape=(1, 64, 768), dtype=float32, numpy=
array([[[ 0.2

In [147]:
loss

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.7734848], dtype=float32)>

In [145]:
output

<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[0.43494776, 0.5650522 ]], dtype=float32)>

In [123]:
print(inputs)

{'input_ids': <tf.Tensor: shape=(1, 64), dtype=int32, numpy=
array([[  101,  8667,   117,  1139,  3676,  1110, 10509,   102,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0]])>, 'attention_mask': <tf.Tensor: shape=(1, 64), dtype=int32, numpy=
array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])>, 'labels': <tf.Tensor: shape=(1, 1), dtype=int32, numpy=array([[1]])>}


In [127]:
print(loss)

tf.Tensor([0.29079425], shape=(1,), dtype=float32)


In [128]:
print(outputs[2])

(<tf.Tensor: shape=(1, 64, 768), dtype=float32, numpy=
array([[[ 0.4495986 ,  0.09766434, -0.20736787, ...,  0.05780864,
          0.04061905, -0.09512451],
        [-0.77356714,  0.49493614,  0.40814084, ...,  0.11958291,
         -0.31714237,  0.25474828],
        [-1.143637  ,  0.41187298, -0.28890803, ...,  0.5126003 ,
         -1.2320071 , -0.52256787],
        ...,
        [-0.14134963, -1.0802779 ,  0.8814317 , ...,  0.5289587 ,
         -0.5264092 ,  0.6608591 ],
        [ 0.20395406, -1.2452701 ,  0.910847  , ...,  0.40185875,
         -0.5353736 ,  0.82868004],
        [-0.04057743, -1.1146938 ,  0.71299577, ...,  0.28208214,
         -0.648682  ,  0.5204784 ]]], dtype=float32)>, <tf.Tensor: shape=(1, 64, 768), dtype=float32, numpy=
array([[[ 0.2717536 , -0.04192287, -0.09590326, ...,  0.04059108,
          0.04587968, -0.06677815],
        [-1.0057184 ,  0.8226264 ,  0.08506905, ...,  0.0218928 ,
         -0.17773648, -0.13762966],
        [-0.6432052 ,  0.01664471, -0.18631

In [121]:
print(len(outputs[2]))

13


In [129]:
print(outputs[2][0])

tf.Tensor(
[[[ 0.4495986   0.09766434 -0.20736787 ...  0.05780864  0.04061905
   -0.09512451]
  [-0.77356714  0.49493614  0.40814084 ...  0.11958291 -0.31714237
    0.25474828]
  [-1.143637    0.41187298 -0.28890803 ...  0.5126003  -1.2320071
   -0.52256787]
  ...
  [-0.14134963 -1.0802779   0.8814317  ...  0.5289587  -0.5264092
    0.6608591 ]
  [ 0.20395406 -1.2452701   0.910847   ...  0.40185875 -0.5353736
    0.82868004]
  [-0.04057743 -1.1146938   0.71299577 ...  0.28208214 -0.648682
    0.5204784 ]]], shape=(1, 64, 768), dtype=float32)


In [9]:
import tensorflow as tf
from transformers import TFAutoModel, AutoTokenizer
import pandas as pd
import numpy as np

In [10]:
sentence = "Hello, my dog is cute"

In [11]:
SEQ_LEN = 50

In [12]:
Xids = np.zeros((2, SEQ_LEN))
Xmask = np.zeros((2, SEQ_LEN))

In [13]:
Xids

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.]])

In [14]:
for i, sentence in enumerate([sentence, sentence]):
    Xids[i, :], Xmask[i, :] = tokenize(sentence)

NameError: name 'tokenize' is not defined

In [93]:
def tokenize(input_sent):
    tokens = tokenizer.encode_plus(input_sent, max_length=SEQ_LEN,
                                   truncation=True, padding='max_length',
                                   add_special_tokens=True, return_attention_mask=True,
                                   return_token_type_ids=False, return_tensors='tf')
    return tokens['input_ids'], tokens['attention_mask']

In [94]:
Xids

array([[  101.,  8667.,   117.,  1139.,  3676.,  1110., 10509.,   102.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.],
       [  101.,  8667.,   117.,  1139.,  3676.,  1110., 10509.,   102.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.]])

In [18]:
dataset = tf.data.Dataset.from_tensor_slices((Xids, Xmask, [1, 1]))

# restructure dataset format for BERT
def map_func(input_ids, masks, labels):
    print(input_ids)
    return {'input_ids': input_ids, 'attention_mask': masks}, 1
  
dataset = dataset.map(map_func)  # apply the mapping function

Tensor("args_0:0", shape=(50,), dtype=float64)


In [21]:
for x in dataset:
    print(x)
    break

({'input_ids': <tf.Tensor: shape=(50,), dtype=float64, numpy=
array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])>, 'attention_mask': <tf.Tensor: shape=(50,), dtype=float64, numpy=
array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])>}, <tf.Tensor: shape=(), dtype=int32, numpy=1>)


In [32]:
bert = TFAutoModel.from_pretrained("bert-base-cased")
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [15]:
inputs = tokenizer("Hello, my dog is cute", return_tensors="tf",      
                    max_length=SEQ_LEN, # set the length of the sequences
    add_special_tokens=True, # add [CLS] and [SEP] tokens
    return_attention_mask=True,
    return_token_type_ids=False, # not needed for this type of ML task
    pad_to_max_length=True, # add 0 pad tokens to the sequences less than max_length
                  )
inputs["labels"] = tf.reshape(tf.constant(1), (-1, 1))  # Batch size 1


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [16]:
inputs

{'input_ids': <tf.Tensor: shape=(1, 50), dtype=int32, numpy=
array([[  101,  7592,  1010,  2026,  3899,  2003, 10140,   102,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0]])>, 'attention_mask': <tf.Tensor: shape=(1, 50), dtype=int32, numpy=
array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0]])>, 'labels': <tf.Tensor: shape=(1, 1), dtype=int32, numpy=array([[1]])>}

In [17]:
input_ids = tf.keras.layers.Input(shape=(50,), name='input_ids', dtype='int32')
mask = tf.keras.layers.Input(shape=(50,), name='attention_mask', dtype='int32')

embeddings = bert(input_ids, attention_mask=mask)  # we only keep tensor 0 (last_hidden_state)

# X = tf.keras.layers.GlobalMaxPool1D()(embeddings)  # reduce tensor dimensionality
# X = tf.keras.layers.BatchNormalization()(X)
# X = tf.keras.layers.Dense(128, activation='relu')(X)
# X = tf.keras.layers.Dropout(0.1)(X)
# y = tf.keras.layers.Dense(5, activation='softmax', name='outputs')(X)  # adjust based on number of sentiment classes

model = tf.keras.Model(inputs=[input_ids, mask], outputs=embeddings)

# # freeze the BERT layer
# model.layers[2].trainable = False

NameError: name 'bert' is not defined

In [227]:
embeddings = bert(inputs)[0]

In [228]:
embeddings

<tf.Tensor: shape=(1, 50, 768), dtype=float32, numpy=
array([[[ 0.5132389 ,  0.50970554,  0.19912985, ..., -0.38999215,
          0.4052692 , -0.23153389],
        [ 0.5394626 , -0.3658086 ,  0.6667344 , ..., -0.39200184,
          0.25045055,  0.02019719],
        [ 0.7766629 ,  0.6822611 ,  0.7109607 , ..., -0.0420045 ,
         -0.37177944,  0.3748228 ],
        ...,
        [ 0.394462  ,  0.13875982,  0.56905514, ...,  0.01595568,
          0.51019484, -0.08350345],
        [ 0.4279501 ,  0.19366696,  0.20105973, ...,  0.11090432,
          0.4984797 , -0.04191069],
        [ 0.24493073,  0.2278865 ,  0.6375439 , ..., -0.0600812 ,
          0.10338356, -0.02667893]]], dtype=float32)>

In [222]:
embeddings = bert(inputs)[0]  # we only keep tensor 0 (last_hidden_state)


X = tf.keras.layers.GlobalMaxPool1D()(embeddings)  # reduce tensor dimensionality

In [223]:
X

<tf.Tensor: shape=(1, 768), dtype=float32, numpy=
array([[ 1.29032290e+00,  1.03555655e+00,  7.10960686e-01,
         1.37796175e+00,  6.69964492e-01,  2.58656532e-01,
         6.63879871e-01,  4.89272952e-01,  4.74286199e-01,
         2.62343884e-01,  1.63592830e-01,  9.18961346e-01,
         4.71095532e-01,  9.40465391e-01,  4.23889682e-02,
         4.41716045e-01,  3.66609573e-01,  5.24260283e-01,
         5.53627014e-01,  1.08773804e+00,  1.00412657e-02,
        -1.65543094e-01,  6.14793241e-01,  8.58429447e-02,
         2.58111030e-01,  5.22225238e-02,  1.06781805e+00,
         1.90751648e+00,  1.78271145e-01,  8.79378259e-01,
         4.80118811e-01,  7.05948472e-01,  9.09945011e-01,
         8.20003569e-01,  2.32826993e-02,  5.24063110e-01,
         5.66205025e-01,  8.27183545e-01,  1.25916183e-01,
         6.86729014e-01,  4.44131285e-01,  2.33704790e-01,
         4.91281956e-01,  1.56880721e-01,  8.58769596e-01,
        -1.53291151e-02,  1.20558053e-01,  3.13506842e-01,
      

In [233]:
embeddings1 = bert(inputs)[0]  # we only keep tensor 0 (last_hidden_state)

print(type(embeddings1))

X1 = tf.keras.layers.GlobalMaxPool1D()(embeddings1)  # reduce tensor dimensionality
X1 = tf.keras.layers.BatchNormalization()(X1)

<class 'tensorflow.python.framework.ops.EagerTensor'>


In [232]:
X1

<tf.Tensor: shape=(1, 768), dtype=float32, numpy=
array([[ 1.28967798e+00,  1.03503895e+00,  7.10605383e-01,
         1.37727308e+00,  6.69629633e-01,  2.58527249e-01,
         6.63548052e-01,  4.89028424e-01,  4.74049151e-01,
         2.62212753e-01,  1.63511068e-01,  9.18502092e-01,
         4.70860094e-01,  9.39995348e-01,  4.23677824e-02,
         4.41495270e-01,  3.66426349e-01,  5.23998260e-01,
         5.53350329e-01,  1.08719444e+00,  1.00362469e-02,
        -1.65460363e-01,  6.14485979e-01,  8.58000442e-02,
         2.57982016e-01,  5.21964245e-02,  1.06728435e+00,
         1.90656316e+00,  1.78182051e-01,  8.78938735e-01,
         4.79878843e-01,  7.05595672e-01,  9.09490228e-01,
         8.19593728e-01,  2.32710633e-02,  5.23801208e-01,
         5.65922022e-01,  8.26770127e-01,  1.25853255e-01,
         6.86385810e-01,  4.43909317e-01,  2.33587995e-01,
         4.91036415e-01,  1.56802312e-01,  8.58340383e-01,
        -1.53214540e-02,  1.20497800e-01,  3.13350141e-01,
      

In [229]:
embeddings = bert(input_ids, attention_mask=mask)[0]

In [230]:
type(embeddings)

keras.engine.keras_tensor.KerasTensor

In [75]:
input_ids = tf.keras.layers.Input(shape=(50, ), name='input_ids', dtype='int32')
mask = tf.keras.layers.Input(shape=(50, ), name='attention_mask', dtype='int32')

embeddings = bert(input_ids, attention_mask=mask)[0]  # we only keep tensor 0 (last_hidden_state)

X = tf.keras.layers.GlobalMaxPool1D()(embeddings)  # reduce tensor dimensionality
X = tf.keras.layers.BatchNormalization()(X)
X = tf.keras.layers.Dense(128, activation='relu')(X)
X = tf.keras.layers.Dropout(0.1)(X)
y = tf.keras.layers.Dense(1, activation='sigmoid', name='outputs')(X)  # adjust based on number of sentiment classes

model = tf.keras.Model(inputs=[input_ids, mask], outputs=y)

# # freeze the BERT layer
# model.layers[2].trainable = False

In [83]:
model.summary()

Model: "model_9"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 50)]         0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 50)]         0           []                               
                                                                                                  
 tf_bert_model_1 (TFBertModel)  TFBaseModelOutputWi  108310272   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 50,                                          

In [76]:
model.summary()

Model: "model_9"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 50)]         0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 50)]         0           []                               
                                                                                                  
 tf_bert_model_1 (TFBertModel)  TFBaseModelOutputWi  108310272   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 50,                                          

In [66]:
model.layers[2].trainable = False

In [67]:
# optimizer = tf.keras.optimizers.Adam(0.01)
# loss = tf.keras.losses.CategoricalCrossentropy()
# acc = tf.keras.metrics.CategoricalAccuracy('accuracy')

In [86]:
#model.compile(metrics=[acc])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [69]:
# model.compile(optimizer=optimizer, loss=loss, metrics=[acc])

In [84]:
for x in dataset.batch(1):
    print(x)
    break

({'input_ids': <tf.Tensor: shape=(1, 50), dtype=float64, numpy=
array([[  101.,  8667.,   117.,  1139.,  3676.,  1110., 10509.,   102.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
            0.,     0.]])>, 'attention_mask': <tf.Tensor: shape=(1, 50), dtype=float64, numpy=
array([[1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.]])>}, <tf.Tensor: shape=(1,), dtype=int32, numpy=array([1])>)


In [71]:
history = model.fit(dataset, epochs=2)

Epoch 1/2


ValueError: in user code:

    File "c:\Users\Vojta\Desktop\diploma\venv\lib\site-packages\keras\engine\training.py", line 878, in train_function  *
        return step_function(self, iterator)
    File "c:\Users\Vojta\Desktop\diploma\venv\lib\site-packages\keras\engine\training.py", line 867, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\Vojta\Desktop\diploma\venv\lib\site-packages\keras\engine\training.py", line 860, in run_step  **
        outputs = model.train_step(data)
    File "c:\Users\Vojta\Desktop\diploma\venv\lib\site-packages\keras\engine\training.py", line 808, in train_step
        y_pred = self(x, training=True)
    File "c:\Users\Vojta\Desktop\diploma\venv\lib\site-packages\keras\utils\traceback_utils.py", line 67, in error_handler
        raise e.with_traceback(filtered_tb) from None

    ValueError: Exception encountered when calling layer "tf_bert_model_1" (type TFBertModel).
    
    in user code:
    
        File "c:\Users\Vojta\Desktop\diploma\venv\lib\site-packages\transformers\models\bert\modeling_tf_bert.py", line 1127, in call  *
            outputs = self.bert(
        File "c:\Users\Vojta\Desktop\diploma\venv\lib\site-packages\keras\utils\traceback_utils.py", line 67, in error_handler  **
            raise e.with_traceback(filtered_tb) from None
    
        ValueError: Exception encountered when calling layer "bert" (type TFBertMainLayer).
        
        in user code:
        
            File "c:\Users\Vojta\Desktop\diploma\venv\lib\site-packages\transformers\models\bert\modeling_tf_bert.py", line 773, in call  *
                batch_size, seq_length = input_shape
        
            ValueError: not enough values to unpack (expected 2, got 1)
        
        
        Call arguments received:
          • input_ids=tf.Tensor(shape=(50,), dtype=int32)
          • attention_mask=tf.Tensor(shape=(50,), dtype=int32)
          • token_type_ids=None
          • position_ids=None
          • head_mask=None
          • inputs_embeds=None
          • encoder_hidden_states=None
          • encoder_attention_mask=None
          • past_key_values=None
          • use_cache=True
          • output_attentions=False
          • output_hidden_states=False
          • return_dict=True
          • training=True
          • kwargs=<class 'inspect._empty'>
    
    
    Call arguments received:
      • input_ids=tf.Tensor(shape=(50,), dtype=int32)
      • attention_mask=tf.Tensor(shape=(50,), dtype=int32)
      • token_type_ids=None
      • position_ids=None
      • head_mask=None
      • inputs_embeds=None
      • encoder_hidden_states=None
      • encoder_attention_mask=None
      • past_key_values=None
      • use_cache=None
      • output_attentions=None
      • output_hidden_states=None
      • return_dict=None
      • training=True
      • kwargs=<class 'inspect._empty'>


In [59]:
history.history

{'loss': [0.6928972005844116, 0.6918980479240417], 'accuracy': [0.5, 1.0]}

In [62]:
test = model.predict(dataset)



ValueError: in user code:

    File "c:\Users\Vojta\Desktop\diploma\venv\lib\site-packages\keras\engine\training.py", line 1621, in predict_function  *
        return step_function(self, iterator)
    File "c:\Users\Vojta\Desktop\diploma\venv\lib\site-packages\keras\engine\training.py", line 1611, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\Vojta\Desktop\diploma\venv\lib\site-packages\keras\engine\training.py", line 1604, in run_step  **
        outputs = model.predict_step(data)
    File "c:\Users\Vojta\Desktop\diploma\venv\lib\site-packages\keras\engine\training.py", line 1572, in predict_step
        return self(x, training=False)
    File "c:\Users\Vojta\Desktop\diploma\venv\lib\site-packages\keras\utils\traceback_utils.py", line 67, in error_handler
        raise e.with_traceback(filtered_tb) from None

    ValueError: Exception encountered when calling layer "tf_bert_model_1" (type TFBertModel).
    
    in user code:
    
        File "c:\Users\Vojta\Desktop\diploma\venv\lib\site-packages\transformers\models\bert\modeling_tf_bert.py", line 1127, in call  *
            outputs = self.bert(
        File "c:\Users\Vojta\Desktop\diploma\venv\lib\site-packages\keras\utils\traceback_utils.py", line 67, in error_handler  **
            raise e.with_traceback(filtered_tb) from None
    
        ValueError: Exception encountered when calling layer "bert" (type TFBertMainLayer).
        
        in user code:
        
            File "c:\Users\Vojta\Desktop\diploma\venv\lib\site-packages\transformers\models\bert\modeling_tf_bert.py", line 773, in call  *
                batch_size, seq_length = input_shape
        
            ValueError: not enough values to unpack (expected 2, got 1)
        
        
        Call arguments received:
          • input_ids=tf.Tensor(shape=(50,), dtype=int32)
          • attention_mask=tf.Tensor(shape=(50,), dtype=int32)
          • token_type_ids=None
          • position_ids=None
          • head_mask=None
          • inputs_embeds=None
          • encoder_hidden_states=None
          • encoder_attention_mask=None
          • past_key_values=None
          • use_cache=True
          • output_attentions=False
          • output_hidden_states=False
          • return_dict=True
          • training=False
          • kwargs=<class 'inspect._empty'>
    
    
    Call arguments received:
      • input_ids=tf.Tensor(shape=(50,), dtype=int32)
      • attention_mask=tf.Tensor(shape=(50,), dtype=int32)
      • token_type_ids=None
      • position_ids=None
      • head_mask=None
      • inputs_embeds=None
      • encoder_hidden_states=None
      • encoder_attention_mask=None
      • past_key_values=None
      • use_cache=None
      • output_attentions=None
      • output_hidden_states=None
      • return_dict=None
      • training=False
      • kwargs=<class 'inspect._empty'>


In [61]:
test

array([[0.57419837],
       [0.57419837]], dtype=float32)

In [256]:
test['last_hidden_state'].shape

(2, 50, 768)

In [29]:
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 50)]         0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 50)]         0           []                               
                                                                                                  
 tf_bert_model (TFBertModel)    TFBaseModelOutputWi  108310272   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 50,                                          

In [31]:
# compile the model
optimizer = tf.keras.optimizers.Adam(0.01)
loss = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')

model.compile(optimizer=optimizer, loss=loss, metrics=[acc])

# and train it


Epoch 1/20


ValueError: in user code:

    File "c:\Users\Vojta\Desktop\diploma\venv\lib\site-packages\keras\engine\training.py", line 878, in train_function  *
        return step_function(self, iterator)
    File "c:\Users\Vojta\Desktop\diploma\venv\lib\site-packages\keras\engine\training.py", line 867, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\Vojta\Desktop\diploma\venv\lib\site-packages\keras\engine\training.py", line 860, in run_step  **
        outputs = model.train_step(data)
    File "c:\Users\Vojta\Desktop\diploma\venv\lib\site-packages\keras\engine\training.py", line 809, in train_step
        loss = self.compiled_loss(
    File "c:\Users\Vojta\Desktop\diploma\venv\lib\site-packages\keras\engine\compile_utils.py", line 201, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "c:\Users\Vojta\Desktop\diploma\venv\lib\site-packages\keras\losses.py", line 141, in __call__
        losses = call_fn(y_true, y_pred)
    File "c:\Users\Vojta\Desktop\diploma\venv\lib\site-packages\keras\losses.py", line 245, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "c:\Users\Vojta\Desktop\diploma\venv\lib\site-packages\keras\losses.py", line 1664, in categorical_crossentropy
        return backend.categorical_crossentropy(
    File "c:\Users\Vojta\Desktop\diploma\venv\lib\site-packages\keras\backend.py", line 4994, in categorical_crossentropy
        target.shape.assert_is_compatible_with(output.shape)

    ValueError: Shapes (None,) and (None, 50, 768) are incompatible


In [87]:
history = model.fit(dataset.batch(1), epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
