<a href="https://colab.research.google.com/github/poffertje/TextMining/blob/master/code/sentiment_analysis/FTBERT_evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Code to Evaluate Our Fine-tuned BERT Models
The code is heavily based on: https://www.coursera.org/learn/fine-tune-bert-tensorflow/ungradedLti/ack5t/fine-tune-bert-for-text-classification-with-tensorflow

In [None]:
# Mount Google Drive at /content/gdrive; the next cell changes into a folder
# below this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Make the shared-drive dataset folder the working directory, so the relative
# CSV and weight-file names used in later cells resolve against it.
import os
os.chdir('/content/gdrive/Shareddrives/Minecraft/Datasets')

Imports and Installations

In [None]:
%tensorflow_version 2.x
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [None]:
!git clone --depth 1 -b v2.3.0 https://github.com/tensorflow/models.git
!pip install -q tf-models-official==2.4.0
!pip install -q transformers

fatal: destination path 'models' already exists and is not an empty directory.
[K     |████████████████████████████████| 1.1 MB 14.7 MB/s 
[K     |████████████████████████████████| 596 kB 72.6 MB/s 
[K     |████████████████████████████████| 1.2 MB 73.9 MB/s 
[K     |████████████████████████████████| 1.1 MB 67.9 MB/s 
[K     |████████████████████████████████| 99 kB 12.5 MB/s 
[K     |████████████████████████████████| 47.8 MB 105 kB/s 
[K     |████████████████████████████████| 237 kB 104.6 MB/s 
[K     |████████████████████████████████| 43 kB 2.3 MB/s 
[K     |████████████████████████████████| 352 kB 85.4 MB/s 
[K     |████████████████████████████████| 462 kB 67.0 MB/s 
[?25h  Building wheel for py-cpuinfo (setup.py) ... [?25l[?25hdone
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 4.0 MB 14.2 MB/s 
[K     |████████████████████████████████| 895 kB 70.0 MB/s 
[K     |████████████████████████████████| 77 kB 8.9 MB/s 


In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import tensorflow_hub as hub
from official.nlp.data import classifier_data_lib
from official.nlp.bert import tokenization
from official.nlp import optimization
from sklearn.metrics import classification_report
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

In [None]:
# Report the TensorFlow / TF Hub versions, whether eager execution is on, and
# whether TensorFlow can see at least one physical GPU.
print("TF Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
print("Hub version: ", hub.__version__)
print("GPU is", "available" if tf.config.experimental.list_physical_devices("GPU") else "NOT AVAILABLE")

TF Version:  2.8.0
Eager mode:  True
Hub version:  0.12.0
GPU is available


Importing the Datasets

In [None]:
# Load the two evaluation sets (paths are relative to the working directory).
# Later cells read the 'review' and 'sentiment label' columns from both frames.
test_set = pd.read_csv('test_sample24.csv')
full_prod_test_set = pd.read_csv('8April_production_set.csv')

Defining Constant Parameters

In [None]:
# Decision threshold: convert_from_threshold() assigns label 1 only to
# predictions strictly greater than this value.
threshold = 0.6
label_list = [0, 1] # Label categories
max_seq_length = 128 # maximum length of (token) input sequences
# Reference labels of the development test set.
true_labels = test_set['sentiment label']
# BERT encoder fetched from TF Hub; create_model() reads this global when it is called.
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2",
                            trainable=True)
# Take the vocabulary file and the lower-casing flag from the hub module itself,
# so the tokenizer is configured from the same artefact as the encoder.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
# NOTE: to_feature() captures this tokenizer object as a default argument at
# the moment that function is defined.
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)



In [None]:
# Reference labels of the production test set.
full_prod_true_labels = full_prod_test_set['sentiment label']

Functions to Preprocess Text Input

In [None]:
def to_feature(text, label, label_list=label_list, max_seq_length=max_seq_length, tokenizer=tokenizer):
  """Convert one (text, label) pair into BERT input features.

  The function is invoked through ``tf.py_function`` (see ``to_feature_map``),
  so ``text`` and ``label`` are tensors whose values are read with ``.numpy()``.

  Args:
    text: tensor holding a single input text.
    label: tensor holding the label that belongs to ``text``.
    label_list: the admissible label values.
    max_seq_length: sequence length handed to the feature conversion.
    tokenizer: tokenizer handed to the feature conversion.

  Returns:
    The tuple ``(input_ids, input_mask, segment_ids, label_id)`` taken from the
    converted feature object.
  """
  single_example = classifier_data_lib.InputExample(
      guid=None,
      text_a=text.numpy(),
      text_b=None,
      label=label.numpy())
  converted = classifier_data_lib.convert_single_example(
      0, single_example, label_list, max_seq_length, tokenizer)
  return (converted.input_ids,
          converted.input_mask,
          converted.segment_ids,
          converted.label_id)

In [None]:
# Dataset-friendly wrapper around to_feature
def to_feature_map(text, label):
  """Map a (text, label) dataset element to ``(model_inputs, label_id)``.

  ``to_feature`` is run through ``tf.py_function``; afterwards the static
  shapes of its four int32 outputs are declared explicitly.

  Returns:
    A pair of (dict with the keys 'input_word_ids', 'input_mask' and
    'input_type_ids', scalar label id).
  """
  features = tf.py_function(to_feature, inp=[text, label],
                            Tout=[tf.int32] * 4)
  input_ids, input_mask, segment_ids, label_id = features

  # The three sequence tensors share the fixed length; the label is a scalar.
  for sequence_tensor in (input_ids, input_mask, segment_ids):
    sequence_tensor.set_shape([max_seq_length])
  label_id.set_shape([])

  model_inputs = {
      'input_word_ids': input_ids,
      'input_mask': input_mask,
      'input_type_ids': segment_ids,
  }
  return (model_inputs, label_id)

Creation of Custom Layer

In [None]:
dropout = 0.3
def create_model():
  """Build the binary classifier: BERT encoder -> dropout -> sigmoid unit.

  Reads the module-level ``bert_layer``, ``max_seq_length`` and ``dropout`` at
  call time.

  Returns:
    An uncompiled ``tf.keras.Model`` that takes a dict with the keys
    'input_word_ids', 'input_mask' and 'input_type_ids' and outputs one
    sigmoid activation per example.
  """
  def int_sequence_input(name):
    # All three BERT inputs are int32 sequences of the same fixed length.
    return tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                 name=name)

  input_word_ids = int_sequence_input("input_word_ids")
  input_mask = int_sequence_input("input_mask")
  input_type_ids = int_sequence_input("input_type_ids")

  # Only the pooled output feeds the classification head.
  pooled_output, _ = bert_layer([input_word_ids, input_mask, input_type_ids])

  regularised = tf.keras.layers.Dropout(dropout)(pooled_output)
  probability = tf.keras.layers.Dense(1, activation="sigmoid", name="output")(regularised)

  return tf.keras.Model(
    inputs={
        'input_word_ids': input_word_ids,
        'input_mask': input_mask,
        'input_type_ids': input_type_ids
    },
    outputs=probability)

Prediction Function

In [None]:
def predict_labels(sample_data, model, batch_size=1):
  """Run ``model`` over raw texts and return its raw predictions.

  Args:
    sample_data: sized collection of input texts (the notebook passes the
      'review' column of a DataFrame).
    model: Keras model that accepts the feature dict built by ``to_feature_map``.
    batch_size: number of examples per prediction batch. Defaults to 1, the
      value that used to be hard-coded, so existing callers behave as before;
      pass a larger value to predict several examples per step.

  Returns:
    The result of ``model.predict`` on the preprocessed dataset.
  """
  # to_feature_map expects a label next to every text; only the features are
  # needed for prediction, so a placeholder label of 0 is supplied.
  placeholder_labels = [0] * len(sample_data)
  test_data = tf.data.Dataset.from_tensor_slices((sample_data, placeholder_labels))
  test_data = test_data.map(to_feature_map).batch(batch_size)
  return model.predict(test_data)

In [None]:
def convert_from_threshold(sample_data, model, threshold):
  """Predict on ``sample_data`` and binarise the predictions.

  Args:
    sample_data: input texts, forwarded to ``predict_labels``.
    model: model used for prediction, forwarded to ``predict_labels``.
    threshold: a prediction becomes 1 only if it is strictly greater than
      this value, otherwise 0.

  Returns:
    A list of 0/1 integers, one per prediction.
  """
  predicted = []
  for probability in predict_labels(sample_data, model):
    predicted.append(1 if probability > threshold else 0)
  return predicted

# Evaluation of Model trained on Imbalanced Training Set

In [None]:
# Create a new hub layer for the model built in the next cell: create_model()
# reads the global `bert_layer` when it is called, so the model gets this
# instance.
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2",
                            trainable=True)
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
# NOTE(review): to_feature() bound its `tokenizer` default when it was defined,
# so rebinding the name here does not change the tokenizer it uses unless the
# cell defining to_feature() is executed again.
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

In [None]:
# Rebuild the classifier architecture, then restore the fine-tuned weights of
# the model trained on the imbalanced training set (path relative to the
# working directory).
imbalanced_model = create_model()
# The later cells of this notebook only call model.predict (through
# convert_from_threshold); optimizer, loss and metric are configured here but
# no training or evaluate call follows in this notebook.
imbalanced_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5, epsilon=1e-08, clipnorm=1.0),
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=[tf.keras.metrics.BinaryAccuracy()])
# Weights must be loaded after the model has been built by create_model().
imbalanced_model.load_weights('100k-2e5-16-02-8April_sentiment_sample_25_75_mixed_weights')
imbalanced_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_word_ids (InputLayer)    [(None, 128)]        0           []                               
                                                                                                  
 input_mask (InputLayer)        [(None, 128)]        0           []                               
                                                                                                  
 input_type_ids (InputLayer)    [(None, 128)]        0           []                               
                                                                                                  
 keras_layer_1 (KerasLayer)     [(None, 768),        109482241   ['input_word_ids[0][0]',         
                                 (None, 128, 768)]                'input_mask[0][0]',         

1. Evaluation of Model on Development Test Set

In [None]:
# Thresholded 0/1 predictions of the imbalanced-data model on the development test set.
imbalance_pred_labels = convert_from_threshold(test_set['review'],imbalanced_model,threshold)

In [None]:
# Per-class report comparing the development labels with the predictions above.
print(classification_report(true_labels,imbalance_pred_labels))

2. Evaluation of Model on Production Test Set

In [None]:
# Thresholded 0/1 predictions of the imbalanced-data model on the production test set.
imbal_full_prod_pred_labels = convert_from_threshold(full_prod_test_set['review'],imbalanced_model,threshold)

In [None]:
# Per-class report comparing the production labels with the predictions above.
print(classification_report(full_prod_true_labels,imbal_full_prod_pred_labels))

# Evaluation of Model trained on Balanced Training Set

In [None]:
# Get BERT layer and tokenizer:
# More details here: https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2
# A new hub layer is created for the model built in the next cell:
# create_model() reads the global `bert_layer` when it is called.
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2",
                            trainable=True)
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
# NOTE(review): to_feature() bound its `tokenizer` default when it was defined,
# so rebinding the name here does not change the tokenizer it uses unless the
# cell defining to_feature() is executed again.
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

In [None]:
# Rebuild the classifier architecture, then restore the fine-tuned weights of
# the model trained on the balanced training set (path relative to the working
# directory).
balanced_model = create_model()
# The later cells of this notebook only call model.predict (through
# convert_from_threshold); optimizer, loss and metric are configured here but
# no training or evaluate call follows in this notebook.
balanced_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5, epsilon=1e-08, clipnorm=1.0),
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=[tf.keras.metrics.BinaryAccuracy()])
# Alternative checkpoint, currently disabled:
#balanced_model.load_weights('100k-b16-2e5-dropout03-apr8-mixed-50-50_bert_weights')
balanced_model.load_weights('b16-2e5-dropout03-apr8-mixed-50-50_bert_weights')
balanced_model.summary()

Model: "model_9"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_word_ids (InputLayer)    [(None, 128)]        0           []                               
                                                                                                  
 input_mask (InputLayer)        [(None, 128)]        0           []                               
                                                                                                  
 input_type_ids (InputLayer)    [(None, 128)]        0           []                               
                                                                                                  
 keras_layer_12 (KerasLayer)    [(None, 768),        109482241   ['input_word_ids[0][0]',         
                                 (None, 128, 768)]                'input_mask[0][0]',       

1. Evaluation of Model on Development Test Set

In [None]:
# Thresholded 0/1 predictions of the balanced-data model on the development test set.
balance_pred_labels = convert_from_threshold(test_set['review'],balanced_model,threshold)

In [None]:
# Per-class report comparing the development labels with the predictions above.
print(classification_report(true_labels,balance_pred_labels))

2. Evaluation of Model on Production Test Set

In [None]:
# Predict on the production reviews (full_prod_test_set) so the predictions
# line up row by row with full_prod_true_labels, which comes from that same
# frame. Predicting on test_set['review'] here would score development-set
# predictions against production labels.
bal_full_prod_pred_labels = convert_from_threshold(full_prod_test_set['review'],balanced_model,threshold)
# Per-class report comparing the production labels with the predictions above.
print(classification_report(full_prod_true_labels,bal_full_prod_pred_labels))