# Setup

In [4]:
import os
import shutil
import csv

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

import matplotlib.pyplot as plt

tf.get_logger().setLevel('ERROR')

# Dataset

In [29]:
def load_dataset(filename):
    texts = []
    sentiments = []
    with open(filename, 'r') as csvfile:
        spamreader = csv.reader(csvfile, delimiter=';')
        next(spamreader)
        for row in spamreader:
            texts.append(row[1])
            sentiments.append(int(row[3]))
    return texts, sentiments

In [30]:
train_text, train_sentiment = load_dataset('archive/TrainingDatasets/Train100.csv')
test_text, test_sentiment = load_dataset('archive/TestDatasets/Test.csv')

# Model

## Preprocessamento

In [None]:
bert_preprocess_model = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3")

In [17]:
text_test = ['Esse filme foi muito bom!']
text_preprocessed = bert_preprocess_model(text_test)

print(f'Keys       : {list(text_preprocessed.keys())}')
print(f'Shape      : {text_preprocessed["input_word_ids"].shape}')
print(f'Word Ids   : {text_preprocessed["input_word_ids"][0, :12]}')
print(f'Input Mask : {text_preprocessed["input_mask"][0, :12]}')
print(f'Type Ids   : {text_preprocessed["input_type_ids"][0, :12]}')

Keys       : ['input_word_ids', 'input_type_ids', 'input_mask']
Shape      : (1, 128)
Word Ids   : [  101 56967 14949 10448 16159 32965   106   102     0     0     0     0]
Input Mask : [1 1 1 1 1 1 1 1 0 0 0 0]
Type Ids   : [0 0 0 0 0 0 0 0 0 0 0 0]


## BERT

In [18]:
bert_model = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/4")

In [20]:
bert_results = bert_model(text_preprocessed)

print(f'Pooled Outputs Shape:{bert_results["pooled_output"].shape}')
print(f'Pooled Outputs Values:{bert_results["pooled_output"][0, :12]}')
print(f'Sequence Outputs Shape:{bert_results["sequence_output"].shape}')
print(f'Sequence Outputs Values:{bert_results["sequence_output"][0, :12]}')

Pooled Outputs Shape:(1, 768)
Pooled Outputs Values:[ 0.41222095 -0.0834593   0.22603445 -0.19547874 -0.06469175  0.3030224
  0.1690162   0.27314597 -0.43111426  0.3715175   0.11120066 -0.21761796]
Sequence Outputs Shape:(1, 128, 768)
Sequence Outputs Values:[[-0.01321952  0.05062676 -0.256091   ...  0.27634612  0.1281554
   0.38889942]
 [ 0.17092174 -0.3653136  -0.6135957  ...  0.37439245 -0.09712169
   0.90014803]
 [ 0.00431772 -0.11753033 -0.22828746 ...  0.42023346 -0.15920073
   1.02552   ]
 ...
 [-0.23814008 -0.03345276 -0.10709733 ...  0.16313751  0.13683175
   0.4828204 ]
 [-0.1844944  -0.28810385 -0.2606587  ...  0.20075007 -0.02328371
   0.5593509 ]
 [-0.23556802 -0.25838915 -0.20264709 ...  0.2003926   0.02817362
   0.4927881 ]]


## Custom Model Definition

In [23]:
def build_classifier_model():
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    preprocessing_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3", name='preprocessing')
    encoder_inputs = preprocessing_layer(text_input)
    encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/4", trainable=True, name='BERT_encoder')
    outputs = encoder(encoder_inputs)
    net = outputs['pooled_output']
    net = tf.keras.layers.Dropout(0.1)(net)
    net = tf.keras.layers.Dense(1, activation=None, name='classifier')(net)
    return tf.keras.Model(text_input, net)

In [24]:
classifier_model = build_classifier_model()
bert_raw_result = classifier_model(tf.constant(text_test))
print(tf.sigmoid(bert_raw_result))

tf.Tensor([[0.2239469]], shape=(1, 1), dtype=float32)


# Training

In [25]:
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
metrics = tf.metrics.BinaryAccuracy()

In [26]:
classifier_model.compile(optimizer='Adam',
                         loss=loss,
                         metrics=metrics)

In [None]:
history = classifier_model.fit(x=train_text, y=train_sentiment, epochs=5)

Epoch 1/5
   2/3125 [..............................] - ETA: 144:24:16 - loss: 1.2498 - binary_accuracy: 0.5312