In [2]:
!pip install -q -U tensorflow-text --quiet

[K     |████████████████████████████████| 4.9 MB 3.9 MB/s 
[K     |████████████████████████████████| 462 kB 26.2 MB/s 
[?25h

In [3]:
!pip install -q tf-models-official

[K     |████████████████████████████████| 2.2 MB 4.1 MB/s 
[K     |████████████████████████████████| 1.2 MB 39.9 MB/s 
[K     |████████████████████████████████| 1.1 MB 37.3 MB/s 
[K     |████████████████████████████████| 99 kB 8.2 MB/s 
[K     |████████████████████████████████| 90 kB 6.5 MB/s 
[K     |████████████████████████████████| 636 kB 47.5 MB/s 
[K     |████████████████████████████████| 234 kB 48.5 MB/s 
[K     |████████████████████████████████| 43 kB 1.7 MB/s 
[K     |████████████████████████████████| 47.8 MB 45 kB/s 
[K     |████████████████████████████████| 352 kB 39.7 MB/s 
[?25h  Building wheel for py-cpuinfo (setup.py) ... [?25l[?25hdone
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [4]:
import os
import shutil

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer

import matplotlib.pyplot as plt

tf.get_logger().setLevel('ERROR')

In [5]:
url = 'https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'

# keras uitlity to download a file
dataset = tf.keras.utils.get_file('aclImdb_v1.tar.gz', url, untar=True, cache_dir='.', cache_subdir='')

Downloading data from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


In [6]:
dataset_dir = '/content/aclImdb'

train_dir = os.path.join(dataset_dir,'train')

In [7]:
# remove unused folders to make it easier to load the data
remove_dir = os.path.join(train_dir, 'unsup')
shutil.rmtree(remove_dir)

In [8]:
AUTOTUNE = tf.data.AUTOTUNE
batch_size = 32
seed = 123

raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/train',
    batch_size=batch_size,
    validation_split=0.2,
    subset='training',
    seed=seed)

class_names = raw_train_ds.class_names
train_ds = raw_train_ds.cache().prefetch(buffer_size=AUTOTUNE)

val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/train',
    batch_size=batch_size,
    validation_split=0.2,
    subset='validation',
    seed=seed)

val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)

test_ds = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/test',
    batch_size=batch_size)

test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.
Found 25000 files belonging to 2 classes.


In [9]:
for text_batch, label_batch in train_ds.take(1):
  for i in range(3):
    print(f'Review: {text_batch.numpy()[i]}')
    label = label_batch.numpy()[i]
    print(f'Label : {label} ({class_names[label]})')

Review: b'After, I watched the films... I thought, "Why the heck was this film such a high success in the Korean Box Office?" Even thought the movie had a clever/unusal scenario, the acting wasn\'t that good and the characters weren\'t very interesting. For a Korean movie... I liked the fighting scenes. If you want to watch a film without thinking, this is the film for you. But I got to admit... the film was kind of childish... 6/10'
Label : 1 (pos)
Review: b'Or released on DVD or screened on a cable channel like Amer. Life TV network. I have been watching another favorite, "Voyage to the Bottom of the Sea", as well as "Lost in Space" and Land of Giants". They\'ve been showing them forever but aren\'t receptive to suggestions for other shows. My father and I were big fans as I was already a big science/electronics nut, (still am) and my father was an old school chum of Nader. They both attended Oxy together. I still have memories of several of the episodes even though I was only 9. Mor

In [10]:
#@title Choose a BERT model to fine-tune

bert_model_name = 'bert_en_uncased_L-12_H-768_A-12'  #@param ["bert_en_uncased_L-12_H-768_A-12", "bert_en_cased_L-12_H-768_A-12", "bert_multi_cased_L-12_H-768_A-12", "small_bert/bert_en_uncased_L-2_H-128_A-2", "small_bert/bert_en_uncased_L-2_H-256_A-4", "small_bert/bert_en_uncased_L-2_H-512_A-8", "small_bert/bert_en_uncased_L-2_H-768_A-12", "small_bert/bert_en_uncased_L-4_H-128_A-2", "small_bert/bert_en_uncased_L-4_H-256_A-4", "small_bert/bert_en_uncased_L-4_H-512_A-8", "small_bert/bert_en_uncased_L-4_H-768_A-12", "small_bert/bert_en_uncased_L-6_H-128_A-2", "small_bert/bert_en_uncased_L-6_H-256_A-4", "small_bert/bert_en_uncased_L-6_H-512_A-8", "small_bert/bert_en_uncased_L-6_H-768_A-12", "small_bert/bert_en_uncased_L-8_H-128_A-2", "small_bert/bert_en_uncased_L-8_H-256_A-4", "small_bert/bert_en_uncased_L-8_H-512_A-8", "small_bert/bert_en_uncased_L-8_H-768_A-12", "small_bert/bert_en_uncased_L-10_H-128_A-2", "small_bert/bert_en_uncased_L-10_H-256_A-4", "small_bert/bert_en_uncased_L-10_H-512_A-8", "small_bert/bert_en_uncased_L-10_H-768_A-12", "small_bert/bert_en_uncased_L-12_H-128_A-2", "small_bert/bert_en_uncased_L-12_H-256_A-4", "small_bert/bert_en_uncased_L-12_H-512_A-8", "small_bert/bert_en_uncased_L-12_H-768_A-12", "albert_en_base", "electra_small", "electra_base", "experts_pubmed", "experts_wiki_books", "talking-heads_base"]

map_name_to_handle = {
    'bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4'
}

map_model_to_preprocess = {
    'bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'
}

tfhub_handle_encoder = map_name_to_handle[bert_model_name]
tfhub_handle_preprocess = map_model_to_preprocess[bert_model_name]

print(f'BERT model selected           : {tfhub_handle_encoder}')
print(f'Preprocess model auto-selected: {tfhub_handle_preprocess}')

BERT model selected           : https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4
Preprocess model auto-selected: https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3


In [11]:
bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)

In [12]:
text_test = ['this is such an amazing movie!. I hate the movie', 'hello world']
text_preprocessed = bert_preprocess_model(text_test)

print(f'Keys       : {list(text_preprocessed.keys())}')
print(f'Shape      : {text_preprocessed["input_word_ids"].shape}')
print(f'Word Ids   : {text_preprocessed["input_word_ids"][0, :12]}')
print(f'Input Mask : {text_preprocessed["input_mask"]}')
print(f'Type Ids   : {text_preprocessed["input_type_ids"]}')

Keys       : ['input_word_ids', 'input_type_ids', 'input_mask']
Shape      : (2, 128)
Word Ids   : [ 101 2023 2003 2107 2019 6429 3185  999 1012 1045 5223 1996]
Input Mask : [[1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
Type Ids   : [[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0

In [13]:
bert_model = hub.KerasLayer(tfhub_handle_encoder)

In [14]:
bert_results = bert_model(text_preprocessed)

print(f'Loaded BERT: {tfhub_handle_encoder}')
print(f'Pooled Outputs Shape:{bert_results["pooled_output"].shape}')
print(f'Pooled Outputs Values:{bert_results["pooled_output"][0, :12]}')
print(f'Sequence Outputs Shape:{bert_results["sequence_output"].shape}')
print(f'Sequence Outputs Values:{bert_results["sequence_output"][0, :12]}')

Loaded BERT: https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4
Pooled Outputs Shape:(2, 768)
Pooled Outputs Values:[-0.85526216 -0.33712423 -0.34576908  0.57520497  0.38547078 -0.08428411
  0.44962656  0.20554389 -0.5116271  -0.9999461  -0.00613103  0.83263266]
Sequence Outputs Shape:(2, 128, 768)
Sequence Outputs Values:[[ 0.38575327 -0.11281332  0.2981122  ... -0.1316025   0.8360076
   0.28022325]
 [-0.50844646 -0.32244614  0.19049025 ... -0.6035149   1.2848651
   0.24631244]
 [-0.08842065 -0.00249235  0.41535258 ... -0.39508688  0.7541475
   0.4246072 ]
 ...
 [ 0.4423539   0.48887545  0.11532195 ... -0.28149486  0.48708117
   0.48195764]
 [ 0.39349335  0.22193849  0.32810032 ...  0.5328984   0.90055084
   0.42265135]
 [-0.02805402 -0.54921985  0.04732138 ... -0.12337839  1.4534825
  -0.01174106]]


In [15]:
def build_classifier_model():
  text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
  preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
  encoder_inputs = preprocessing_layer(text_input)
  encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
  outputs = encoder(encoder_inputs)
  net = outputs['pooled_output']
  net = tf.keras.layers.Dropout(0.1)(net)
  net = tf.keras.layers.Dense(1, activation=None, name='classifier')(net)
  return tf.keras.Model(text_input, net)

In [16]:
classifier_model = build_classifier_model()
bert_raw_result = classifier_model(tf.constant(text_test))
print(tf.sigmoid(bert_raw_result))

tf.Tensor(
[[0.19168532]
 [0.2257794 ]], shape=(2, 1), dtype=float32)


In [17]:
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
metrics = tf.metrics.BinaryAccuracy()

In [18]:
epochs = 3
steps_per_epoch = tf.data.experimental.cardinality(train_ds).numpy()
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = int(0.1*num_train_steps)

init_lr = 3e-5
optimizer = optimization.create_optimizer(init_lr=init_lr,
                                          num_train_steps=num_train_steps,
                                          num_warmup_steps=num_warmup_steps,
                                          optimizer_type='adamw')

In [19]:
classifier_model.compile(optimizer=optimizer,
                         loss=loss,
                         metrics=metrics)

In [20]:
print(f'Training model with {tfhub_handle_encoder}')
history = classifier_model.fit(x=train_ds,
                               validation_data=val_ds,
                               epochs=epochs)

Training model with https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [21]:
loss, accuracy = classifier_model.evaluate(test_ds)

print(f'Loss: {loss}')
print(f'Accuracy: {accuracy}')

Loss: 0.40803593397140503
Accuracy: 0.8866000175476074


In [22]:
# dataset_name = 'imdb'
# saved_model_path = './{}_bert'.format(dataset_name.replace('/', '_'))

# classifier_model.save(saved_model_path, include_optimizer=False)

In [23]:
# reloaded_model = tf.saved_model.load(saved_model_path)

In [24]:
import tensorflow_datasets as tfds

In [25]:
text = []
for text_batch, label_batch in test_ds.take(782):
    text.append(text_batch.numpy())

In [26]:
review = []
for i in range(782):
  for x in text[i].tolist():
    review.append(x)

In [33]:
# classifier_model.predict(stringlist[:100])

In [27]:
stringlist = [x.decode('utf-8') for x in review]

In [None]:
bert_raw_result = classifier_model(tf.constant(stringlist))

In [38]:
pred_test = tf.sigmoid(bert_raw_result)

In [39]:
pred_test

<tf.Tensor: shape=(100, 1), dtype=float32, numpy=
array([[1.14429346e-03],
       [6.56694756e-04],
       [9.64505017e-01],
       [6.16917200e-02],
       [7.83895492e-04],
       [9.76338685e-01],
       [2.57240189e-03],
       [7.67604681e-04],
       [9.70268309e-01],
       [1.17672002e-03],
       [1.51271722e-03],
       [9.97238398e-01],
       [8.13097417e-01],
       [8.09938821e-04],
       [5.88641107e-01],
       [9.97752726e-01],
       [4.03841678e-03],
       [2.53060320e-03],
       [2.42060772e-03],
       [9.90192755e-04],
       [9.01901687e-04],
       [9.96664464e-01],
       [2.10419875e-02],
       [9.97775018e-01],
       [1.68832198e-01],
       [9.97576296e-01],
       [9.84956324e-01],
       [9.97829378e-01],
       [1.20407995e-03],
       [9.97815490e-01],
       [9.57358420e-01],
       [9.71617294e-04],
       [8.81352127e-01],
       [9.97318566e-01],
       [9.96941864e-01],
       [4.82522309e-01],
       [9.95937943e-01],
       [5.76440915e-02],


In [None]:
label = []
for text_batch, label_batch in test_ds.take(782):
    label.append(label_batch.numpy())

In [None]:
real = []
for i in range(782):
  for x in label[i].tolist():
    real.append(x)

In [None]:
import pandas as pd
df = pd.DataFrame({"review": review, "pred": pred_test})

In [None]:
from google.colab import files

df.to_csv('output.csv', encoding = 'utf-8-sig') 
files.download('output.csv')