## Introduction to NLP at Karakun – Part 4

# Sentiment analysis with Transformers
Reference: https://medium.com/atheros/text-classification-with-transformers-in-tensorflow-2-bert-2f4f16eff5ad

In [1]:
#!pip install transformers

In [2]:
## install ipywidgets for displaying progress bar: https://ipywidgets.readthedocs.io/en/stable/user_install.html
#!pip install ipywidgets
#!pip install transformers
#!jupyter nbextension enable --py widgetsnbextension

In [16]:
import random

import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping

# Pick ONE tokenizer/classifier pair; it must match `model_name` in the next cell.
# Both classes are aliased so the rest of the notebook is model-agnostic.
from transformers import BertTokenizer as Tokenizer, TFBertForSequenceClassification as Classification
# from transformers import DistilBertTokenizer as Tokenizer, TFDistilBertForSequenceClassification as Classification
# from transformers import RobertaTokenizer as Tokenizer, TFRobertaForSequenceClassification as Classification

from sklearn.model_selection import train_test_split

# Seed every source of randomness used below: Python's `random` drives the
# list shuffles, TensorFlow drives dataset shuffling and weight initialisation.
random.seed(0)
tf.random.set_seed(0)

In [17]:
# Name of the pretrained checkpoint to load for both the tokenizer and the classifier.
# It has to correspond to the tokenizer/classifier classes selected in the import cell
# (Bert / DistilBert / Roberta).
# see https://huggingface.co/transformers/pretrained_models.html
model_name = 'bert-base-uncased'
# model_name = 'distilbert-base-uncased'
# model_name = 'roberta-base'

# Load data

In [18]:
def _read_labeled_reviews(path, label):
    """Read a file with one review per line and pair every line with `label`.

    Lines are returned exactly as read (including the trailing newline).
    The encoding is pinned to UTF-8 so the files decode identically on every
    platform instead of depending on the locale default.
    NOTE(review): assumes the data files are UTF-8 encoded — confirm.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return [(line, label) for line in f]


DATA_DIR = './Sentiment-Analysis-Data/IMDb'

# Label convention: 1 = positive review, 0 = negative review.
# Positive examples are loaded before negative ones, as before, so that the
# seeded shuffle below yields the same ordering.
train_input = (
    _read_labeled_reviews(f'{DATA_DIR}/train-pos.txt', 1)
    + _read_labeled_reviews(f'{DATA_DIR}/train-neg.txt', 0)
)
test_input = (
    _read_labeled_reviews(f'{DATA_DIR}/test-pos.txt', 1)
    + _read_labeled_reviews(f'{DATA_DIR}/test-neg.txt', 0)
)

# Mix the two classes; later cells slice these lists positionally.
random.shuffle(train_input)
random.shuffle(test_input)

In [19]:
# Split the shuffled test pool in two: the first half is used for validation
# during training, the second half is kept as the final test set.
# NOTE: this cell rebinds `test_input`, so running it twice halves the test set
# again; re-run the data loading cell first.
N_test = len(test_input) // 2

eval_input, test_input = test_input[:N_test], test_input[N_test:]

print(len(train_input), len(test_input), len(eval_input))

25000 12500 12500


# Prepare data

In [20]:
# Fixed sequence length (in tokens) of every encoded review: the maximum length
# of text that can go to BERT.
max_length = 512

# Tokenizer belonging to the selected pretrained checkpoint.
tokenizer = Tokenizer.from_pretrained(
    model_name,
    do_lower_case=True,
)

def convert_example_to_feature(review):
    """Tokenize a single review into fixed-length model inputs.

    Parameters
    ----------
    review : str
        Raw review text.

    Returns
    -------
    The result of ``tokenizer.encode_plus``; callers read its ``'input_ids'``
    and ``'attention_mask'`` entries. Every sequence is truncated or padded
    to exactly ``max_length`` tokens.
    """
    return tokenizer.encode_plus(
        review,
        add_special_tokens=True,     # add [CLS], [SEP]
        truncation=True,             # cut reviews longer than max_length
        max_length=max_length,       # max length of the text that can go to BERT
        # `pad_to_max_length=True` is deprecated; this is its documented replacement.
        padding='max_length',        # add [PAD] tokens up to max_length
        return_attention_mask=True,  # add attention mask to not focus on pad tokens
    )

# Reshape one (input_ids, attention_mask, label) triple into the
# (features, label) pair expected as input by TFBertForSequenceClassification.
def map_example_to_dict(input_ids, attention_masks, label):
    """Return ``({'input_ids': ..., 'attention_mask': ...}, label)``."""
    features = dict(input_ids=input_ids, attention_mask=attention_masks)
    return features, label

def encode_examples(ds):
    """Encode (review, label) pairs into a TensorFlow dataset.

    Each review is tokenized with ``convert_example_to_feature``; the token
    ids, attention masks and labels are collected into three parallel lists,
    sliced into a ``tf.data.Dataset`` and mapped to the
    ``(features_dict, label)`` layout produced by ``map_example_to_dict``.
    Each label is wrapped in a one-element list.
    """
    # Tokenize once per example and keep the label next to its encoding.
    encoded = [
        (convert_example_to_feature(text), [sentiment])
        for text, sentiment in ds
    ]

    all_input_ids = [features['input_ids'] for features, _ in encoded]
    all_attention_masks = [features['attention_mask'] for features, _ in encoded]
    all_labels = [target for _, target in encoded]

    slices = (all_input_ids, all_attention_masks, all_labels)
    return tf.data.Dataset.from_tensor_slices(slices).map(map_example_to_dict)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [21]:
batch_size = 8

# Training data: shuffle with a buffer as large as the training set, then batch.
ds_train_encoded = (
    encode_examples(train_input)
    .shuffle(buffer_size=len(train_input))
    .batch(batch_size)
)

# Validation data: order does not matter, so it is only batched.
ds_eval_encoded = (
    encode_examples(eval_input)
    .batch(batch_size)
)

# Train model

In [22]:
# Model initialization: load the pretrained checkpoint named by `model_name`.
# The log recorded below reports that 'classifier' is newly initialized,
# so the model must be fine-tuned before it can be used for predictions.

model = Classification.from_pretrained(model_name)
# Alternative: reload a model previously written with save_pretrained():
# model = Classification.from_pretrained("/Users/christianr/Temp/Bert.h5")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=536063208.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing TFBertForSequenceClassification: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['dropout_37', 'classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
# Labels are plain integer class ids rather than one-hot vectors, so the
# sparse categorical cross entropy loss and accuracy metric are used;
# from_logits=True because the loss is fed unnormalised scores.
loss   = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')

# Adam optimizer with a small learning rate for fine-tuning.
# Variant with explicit epsilon:
# optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5, epsilon=1e-08)
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)

model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=[metric],
)

In [None]:
# Fine-tune for 2 epochs on the training batches, validating on the eval split.
# NOTE(review): the recorded output below shows an ETA of more than 13 hours
# within the first epoch for that run, so this cell is very expensive.
history = model.fit(ds_train_encoded, epochs=2, validation_data=ds_eval_encoded)

Epoch 1/2
 553/3125 [====>.........................] - ETA: 13:24:59 - loss: 0.3412 - accuracy: 0.8483

In [10]:
# Persist the fine-tuned model. Despite the ".h5" suffix the target is used as a
# directory: the recorded log shows config.json and tf_model.h5 written inside it.
# NOTE(review): this is a hard-coded absolute user path and will not exist on other
# machines; the recorded log below also stems from a different path (DistilBert on
# Windows), i.e. the saved output is stale.
model.save_pretrained("/Users/christianr/Temp/Bert.h5")

I0811 09:10:15.989963  3416 configuration_utils.py:142] Configuration saved in C:\AAA_i4Ds\karakun\models\DistilBert.h5\config.json
I0811 09:10:16.594266  3416 modeling_tf_utils.py:330] Model weights saved in C:\AAA_i4Ds\karakun\models\DistilBert.h5\tf_model.h5


# Predict Test set

In [11]:
from sklearn.metrics import accuracy_score
import numpy as np

In [12]:
# Gold labels, in the same order in which the test examples are encoded below.
true_labels = [label for _, label in test_input]

# The test set is batched without shuffling so predictions line up with true_labels.
ds_test_encoded = (
    encode_examples(test_input)
    .batch(batch_size)
)

In [13]:
# Run the fine-tuned model over the batched test set.
# In the recorded run the result is a float32 array with one row per review and
# two columns (one raw score per class).
# NOTE(review): other transformers versions may return a different container — confirm.
predictions = model.predict(ds_test_encoded)

In [14]:
# Peek at the first five rows of raw class scores.
predictions[:5]

array([[ 2.3694243 , -2.523152  ],
       [ 0.99449015, -1.1562936 ],
       [-1.8344041 ,  1.4256612 ],
       [ 1.9844244 , -2.1433427 ],
       [ 0.82688046, -0.9694176 ]], dtype=float32)

In [15]:
# Predicted class = index of the highest score in each row (0 = negative, 1 = positive,
# following the labels assigned when the data was loaded).
pred_labels = np.argmax(predictions, axis=1)

In [16]:
# Peek at the first five predicted class ids.
pred_labels[:5]

array([0, 0, 1, 0, 0], dtype=int64)

In [17]:
# Fraction of test reviews whose predicted label equals the true label.
# NOTE(review): the value recorded below (0.92256) is close to the DistilBERT figure
# quoted in the comment, not the BERT one — confirm which model produced the saved output.
accuracy_score(true_labels, pred_labels) # for 2 epochs and batch_size = 8 :
                                         # 0.92804 for BERT /  0.9225 for DistilBERT / 0.9268 for RoBERTa

0.92256

In [18]:
from sklearn.metrics import confusion_matrix

# Per scikit-learn's convention, rows are true classes and columns are predicted
# classes, both in sorted label order (0 = negative, 1 = positive).
confusion_matrix(true_labels, pred_labels)

array([[5789,  458],
       [ 510, 5743]], dtype=int64)