In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import tensorflow_hub as hub
import tensorflow_text as text


from tensorflow.keras import layers
from tensorflow.keras import losses
from official.nlp import optimization  # to create AdamW optimizer

from sklearn.model_selection import train_test_split

In [2]:
# !pip install datasets
# !pip install -U "tensorflow-text==2.13.*"
# !pip install "tf-models-official==2.13.*"

In [3]:
# downloaded from here:
# https://huggingface.co/datasets/TimKoornstra/synthetic-financial-tweets-sentiment/tree/main/data
# synthetic = pd.read_parquet("train-00000-of-00001.parquet")

# Label definition, according to the dataset documentation:
# a numerical label indicating the sentiment of the tweet,
# with '1' for bullish, '2' for bearish, and '0' for neutral sentiments.

In [4]:
# TF Hub handle of the BERT encoder that gets fine-tuned.
# Alternative small_bert variants (swap in to change the encoder size):
#   "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-768_A-12/1"
#   "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-512_A-8/1"
tfhub_handle_encoder = "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1"

# TF Hub handle of the text preprocessing model (provides tokenization and input packing).
tfhub_handle_preprocess = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"

In [5]:
# def get_encoding_model(tfhub_handle_encoder):
#     input_layer = tf.keras.layers.Input(shape=(), dtype=tf.string, name='input_text')
    
#     input_encoded = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')(input_layer)
    
#     return tf.keras.Model(input_layer, input_encoded)

In [6]:
# encoding_model = get_encoding_model(tfhub_handle_encoder)

In [7]:
# from tqdm import tqdm

In [8]:
# Load the full IMDB review dataset; the model only uses the `text` and `label` columns.
# NOTE(review): the file also carries two leftover index columns
# ("Unnamed: 0.1", "Unnamed: 0"), presumably from earlier to_csv round-trips.
imdb_csv_path = "./imdb_full_dataset.csv"
df = pd.read_csv(imdb_csv_path)
df.head()

Unnamed: 0.1,Unnamed: 0,text,label
0,14149,I had two reasons for watching this swashbuckl...,0
1,8946,"This is, in my opinion, a very good film, espe...",1
2,22378,I knew this film was supposed to be so bad it ...,0
3,12162,"When the US entered World War I, the governmen...",1
4,4879,Few movies can be viewed almost 60 years later...,1


In [9]:
def preprocess_text(input_text: str) -> str:
    """Lower-case a review and normalize a few markup/punctuation patterns.

    Substitutions are applied one after another, in the order listed:
    the HTML line break ``<br />`` becomes a space, back-ticks and acute
    accents become apostrophes, an apostrophe preceded by a space becomes a
    double quote, hyphens are padded with spaces, and slashes and underscores
    become spaces.

    Args:
        input_text: Raw review text.

    Returns:
        The normalized text.
    """
    # Order matters: back-ticks/accents must already be apostrophes before the
    # " '" rule runs, and "<br />" must be consumed before "/" is blanked out.
    substitutions = (
        ('<br />', ' '),
        ('`', "'"),
        ('´', "'"),
        (" '", ' "'),
        ("-", " - "),
        ("/", " "),
        ("_", " "),
    )

    cleaned = input_text.lower()
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    return cleaned

In [10]:
# Normalize the review text.
# The untouched text is kept in "text_raw" and is always used as the input, so
# re-running this cell does not re-apply the non-idempotent substitutions
# (e.g. "-" -> " - ") on top of text that has already been cleaned.
if "text_raw" not in df.columns:
    df["text_raw"] = df["text"]
df["text"] = df["text_raw"].map(preprocess_text)

In [11]:
# Hold out 20% of the reviews for validation; the fixed random_state keeps the
# split reproducible across runs.
text_train, text_test, y_train, y_test = train_test_split(
    df["text"], df["label"], test_size=0.2, random_state=1
)

In [12]:
# corpus = list(df["text"])

In [13]:
# def batchify(lst, batch_size=16): 
#     indx_ = 0
    
#     while indx_ < len(lst):
#         yield lst[indx_: indx_ + batch_size]
#         indx_ += batch_size

In [14]:
# batch_size = 16
# encoder_seq_length = 128

# batches = list(batchify(corpus, batch_size=batch_size))

# input_type_ids = np.zeros((len(corpus), encoder_seq_length), dtype=np.int32)
# input_mask = np.zeros((len(corpus), encoder_seq_length), dtype=np.int32)
# input_word_ids =  np.zeros((len(corpus), encoder_seq_length), dtype=np.int32)

# indx_ = 0
# for batch in tqdm(batches):
#     batch_len = len(batch)
#     x = encoding_model(tf.convert_to_tensor(batch))
#     input_type_ids[indx_: indx_ + batch_len, :] = x['input_type_ids'].numpy()
#     input_mask[indx_: indx_ + batch_len, :] = x['input_mask'].numpy()
#     input_word_ids[indx_: indx_ + batch_len, :] = x['input_word_ids'].numpy()
#     indx_ += batch_len

In [15]:
def get_model(tfhub_handle_encoder, tfhub_handle_preprocess=None, seq_length=512):
    """Build an end-to-end binary text classifier on top of a TF Hub BERT encoder.

    The returned Keras model takes raw strings (input named 'text'), tokenizes
    and packs them with the TF Hub preprocessing model, runs the trainable
    encoder, and feeds the encoder's 'pooled_output' through
    Dropout(0.5) -> Dense(16, relu) -> Dropout(0.1) -> Dense(1, sigmoid).

    Args:
        tfhub_handle_encoder: TF Hub handle of the BERT encoder.
        tfhub_handle_preprocess: TF Hub handle of the matching preprocessing
            model. Required; the ``None`` default is kept only so existing
            call sites keep working.
        seq_length: Length the tokenized input is packed to. Defaults to 512.
            NOTE(review): BERT encoders usually accept at most 512 positions --
            confirm against the chosen encoder before raising this.

    Returns:
        An uncompiled ``tf.keras.Model`` mapping a batch of strings to a
        ``(batch, 1)`` tensor of sigmoid outputs.

    Raises:
        ValueError: If ``tfhub_handle_preprocess`` is missing or ``seq_length``
            is not positive.
    """
    # Fail early with a clear message instead of an obscure error from hub.load(None).
    if not tfhub_handle_preprocess:
        raise ValueError(
            "tfhub_handle_preprocess is required to build the text preprocessing layers"
        )
    if seq_length < 1:
        raise ValueError(f"seq_length must be a positive integer, got {seq_length!r}")

    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')

    # Load the preprocessing model as a plain object so that tokenization and
    # packing can be wired separately.
    bert_preprocess = hub.load(tfhub_handle_preprocess)

    # without this by default the sequence
    # gets truncated to 128 which is too short.
    tokenizer = hub.KerasLayer(bert_preprocess.tokenize, name='tokenizer')
    tokenized = tokenizer(text_input)

    packer = hub.KerasLayer(
        bert_preprocess.bert_pack_inputs,
        arguments=dict(seq_length=seq_length),
        name='packer',
    )
    # bert_pack_inputs takes a list of tokenized segments; there is only one here.
    encoder_inputs = packer([tokenized])

    # trainable=True: the encoder weights are fine-tuned together with the head.
    encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
    outputs = encoder(encoder_inputs)

    # Classification head on top of the pooled sequence representation.
    x = outputs['pooled_output']
    x = tf.keras.layers.Dropout(0.5)(x)
    x = tf.keras.layers.Dense(16, activation="relu")(x)
    x = tf.keras.layers.Dropout(0.1)(x)
    x = tf.keras.layers.Dense(1, activation="sigmoid", name='classifier')(x)

    return tf.keras.Model(text_input, x)

#     bert_output = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')(input_encoded)
    
#     x = bert_output['pooled_output']
    
#     x = tf.keras.layers.Dropout(0.1)(x)
    
#     x = tf.keras.layers.Dense(1, activation="sigmoid", name='classifier')(x)
    
#     return tf.keras.Model([input_type_ids, input_mask, input_word_ids], x)

In [16]:
# Assemble the classifier from the encoder / preprocessing handles configured above.
model = get_model(
    tfhub_handle_encoder=tfhub_handle_encoder,
    tfhub_handle_preprocess=tfhub_handle_preprocess,
)

In [17]:
# Print the layer-by-layer overview (output shapes and parameter counts).
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 text (InputLayer)           [(None,)]                    0         []                            
                                                                                                  
 tokenizer (KerasLayer)      (None, None, None)           0         ['text[0][0]']                
                                                                                                  
 packer (KerasLayer)         {'input_type_ids': (None,    0         ['tokenizer[0][0]']           
                             512),                                                                
                              'input_word_ids': (None,                                            
                             512),                                                            

In [18]:
# Training schedule hyper-parameters.
epochs = 10
batch_size = 32

# The learning-rate schedule must span the optimizer steps that are actually
# executed: fit() only sees the training split (not the whole of `df`) and
# runs ceil(n_train / batch_size) steps per epoch, counting the final partial batch.
steps_per_epoch = (len(text_train) + batch_size - 1) // batch_size
num_train_steps = steps_per_epoch * epochs
# Warm up over the first 10% of all training steps.
num_warmup_steps = int(0.1*num_train_steps)

# Peak learning rate passed to the optimizer factory.
init_lr = 3e-5

In [19]:
# for some reason this optimizer is significantly better than Adam!
# Adam doesn't even converge for train error!
# AdamW from the TF Model Garden, driven by the warmup/step counts computed above.
# Observation from experiments: this optimizer is significantly better than plain
# Adam here -- with Adam not even the training error converged.
optimizer = optimization.create_optimizer(
    init_lr=init_lr,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    optimizer_type='adamw',
)

In [20]:
# Binary cross-entropy matches the single sigmoid output; accuracy is reported
# at a 0.5 decision threshold.
model.compile(
    optimizer=optimizer,
    loss=losses.BinaryCrossentropy(),
    metrics=tf.metrics.BinaryAccuracy(threshold=0.5),
)

In [21]:
# Fine-tune on the training split, evaluating on the held-out split after each epoch.
history = model.fit(
    x=text_train,
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(text_test, y_test),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [41]:
# Reload the fine-tuned classifier as a plain SavedModel for inference.
# NOTE: from here on `model` is the restored SavedModel object, not the Keras
# model that was built and trained above.
saved_model_path = 'bert_trained_on_imdb.saved_model'
if not tf.io.gfile.exists(saved_model_path):
    # No export step exists elsewhere in this notebook; write the trained model
    # once so the notebook also runs on a clean checkout. An existing export is
    # never overwritten.
    model.save(saved_model_path, include_optimizer=False, save_format='tf')
model = tf.saved_model.load(saved_model_path)

In [43]:
# Sanity check: score the first three training reviews in inference mode.
model(text_train[:3], training=False)

<tf.Tensor: shape=(3, 1), dtype=float32, numpy=
array([[9.9978596e-01],
       [2.3196169e-05],
       [3.5396297e-05]], dtype=float32)>

In [44]:
# Score a hand-written example in inference mode.
# NOTE(review): this is financial-report prose, not a movie review, so it is
# out-of-domain for a classifier loaded from 'bert_trained_on_imdb.saved_model'.
model(["""We achieved revenue records across more than two dozen countries and regions,
including all-time records in Europe and rest of Asia-Pacific. We also continue to see
strong double-digit growth in many emerging markets with all-time records in Malaysia,
Mexico, The Philippines, Poland, and Turkey, as well as December quarter records in
India, Indonesia, Saudi Arabia, and Chile."""], training=False)

<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[0.9998093]], dtype=float32)>