<a href="https://colab.research.google.com/github/poffertje/TextMining/blob/master/code/sentiment_analysis/FT_BERT_MODEL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sentiment Analysis Classifier 
This code is heavily based on the tutorial from https://www.coursera.org/learn/fine-tune-bert-tensorflow/ungradedLti/ack5t/fine-tune-bert-for-text-classification-with-tensorflow



In [None]:
# Mount Google Drive at /content/gdrive (Colab-only helper) so files stored on
# the drive can be read through the local filesystem.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


**Import Requirements**

In [None]:
# Colab line magic requesting the TensorFlow 2.x runtime, followed by a
# fail-fast check: stop the notebook unless TensorFlow reports a GPU named
# '/device:GPU:0'.
%tensorflow_version 2.x
import tensorflow as tf
device_name = tf.test.gpu_device_name()
# Any other device name is treated as "no usable GPU".
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [None]:
# Shallow-clone the TensorFlow Model Garden at tag v2.3.0 and install the pinned
# tf-models-official package (presumably the source of the `official.*` imports).
# NOTE(review): the cloned tag (v2.3.0) and the pip pin (2.4.0) differ; confirm
# which one is actually used and whether the clone is needed at all.
!git clone --depth 1 -b v2.3.0 https://github.com/tensorflow/models.git
!pip install -q tf-models-official==2.4.0

In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import tensorflow_hub as hub
from official.nlp.data import classifier_data_lib
from official.nlp.bert import tokenization
from official.nlp import optimization

In [None]:
# Report the library versions and execution mode of the current runtime.
print("TF Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
print("Hub version: ", hub.__version__)

# An empty device list means TensorFlow sees no GPU.
gpu_devices = tf.config.experimental.list_physical_devices("GPU")
gpu_status = "available" if gpu_devices else "NOT AVAILABLE"
print("GPU is", gpu_status)

TF Version:  2.8.0
Eager mode:  True
Hub version:  0.12.0
GPU is available


In [None]:
# Switch the working directory to the shared-drive dataset folder so that the
# relative file names used later in the notebook resolve there.
# NOTE(review): relative output paths (e.g. saved model files) will also land
# in this folder; confirm that is intended.
import os
os.chdir('/content/gdrive/Shareddrives/Minecraft/Datasets')

**Import Dataset**

In [None]:
# Load the review sample; the relative path is resolved against the current working directory.
yelp_dataset = pd.read_csv('8April_100k_sentiment_sample_25_75_mixed.csv')

In [None]:
# Render the loaded frame to check its columns and a few rows.
display(yelp_dataset)

Unnamed: 0,sentiment label,userID,productID,rating,label,date,review
0,1,27044,5022,5.0,1,2014-02-08,We sort of just stumbled upon this place by ac...
1,1,38418,3745,4.0,1,2009-11-17,"As a ramen-lover, I was so excited when this p..."
2,0,37316,4340,1.0,1,2011-08-01,this is the place that you swore off from your...
3,1,27225,3544,5.0,1,2013-06-24,WOW! What an amazing restaurant! My girlfriend...
4,0,23707,3987,2.0,1,2011-12-30,While my friends thought the food was excellen...
...,...,...,...,...,...,...,...
99995,1,32153,256,5.0,1,2014-05-05,"Fresh delicious food!!! 9 dumplings, pineapple..."
99996,1,22606,136,5.0,1,2014-08-07,"A tasty café, with an array of vegan sandwiche..."
99997,0,12838,139,2.0,1,2011-02-28,Went here for drinks on a Wednesday night. Wh...
99998,1,73270,886,4.0,1,2010-05-06,Just ate lunch at The Spot and *yum*! We both ...


**Splitting the Dataset into Training and Validation**

In [None]:
# Draw a reproducible 90% random sample for training (fixed random_state);
# every row that was not drawn becomes the validation set.
training_data = yelp_dataset.sample(frac=0.9, random_state=25)
val_data = yelp_dataset.drop(index=training_data.index)

# Show both splits, training first.
display(training_data, val_data)

Unnamed: 0,sentiment label,userID,productID,rating,label,date,review
43469,1,83587,983,5.0,1,2013-08-26,Authentic Japanese cuisine. Had to try their s...
20973,0,108187,4267,3.0,1,2010-01-05,I'm only giving this place 3 stars because the...
42833,0,7015,56,3.0,1,2013-07-04,"Many of the vendors are good, but there are ab..."
1568,1,23277,2017,5.0,1,2012-09-29,"If someone ever demands me to take him ""somewh..."
83899,1,166202,2657,5.0,1,2011-02-06,"Went here I think on a Sunday afternoon, the p..."
...,...,...,...,...,...,...,...
82033,1,16255,2440,4.0,1,2014-11-20,"Huge fan of Blend in the neighborhood, so I wa..."
2849,1,36490,2642,5.0,1,2014-06-25,Never though I would like vegan sushi but I di...
37187,1,41634,395,4.0,1,2010-01-28,I went to Cookshop for Sunday brunch with a gr...
89359,0,58103,4752,3.0,1,2013-06-30,We came for Saturday brunch as part of my brot...


Unnamed: 0,sentiment label,userID,productID,rating,label,date,review
20,1,16675,4616,5.0,1,2013-06-05,Holy mackerel. Best. Meal. Ever. And I don't s...
22,1,13463,4995,4.0,1,2012-04-29,Banh mi oh my! Definitely had one of the best...
49,1,74813,3176,5.0,1,2009-03-16,Every dish I had here was the best of its kind...
50,1,71252,4745,5.0,1,2011-03-20,"Solid, spicy Cambodian food. #14 is delicious."
62,0,84204,2409,2.0,1,2012-11-09,It pains me to write this. I have been going t...
...,...,...,...,...,...,...,...
99920,1,11048,1364,4.0,1,2009-02-20,"In these troubled times, we need a place where..."
99925,1,45088,458,5.0,1,2013-07-14,One of my favorite restaurants in fort greene....
99927,0,15843,3767,3.0,1,2010-12-23,Came here originally for Soba-ya's handmade so...
99953,0,27207,4552,2.0,1,2014-04-30,This is an okay place overall and if it were n...


In [None]:
# Print the number of rows per sentiment label, first for the training split
# and then for the validation split.
for split_frame in (training_data, val_data):
  print(split_frame.value_counts('sentiment label'))

sentiment label
1    66926
0    23074
dtype: int64
sentiment label
1    7387
0    2613
dtype: int64


Converting the training and validation DataFrames into `tf.data.Dataset` objects of (review text, sentiment label) pairs

In [None]:
def _to_text_label_dataset(frame):
  """Return a tf.data.Dataset of (review, sentiment label) pairs taken from `frame`."""
  reviews = frame.review.values
  sentiments = frame['sentiment label'].values
  return tf.data.Dataset.from_tensor_slices((reviews, sentiments))

train_data = _to_text_label_dataset(training_data)
valid_data = _to_text_label_dataset(val_data)

# Print one (text, label) pair from the training dataset as a sanity check.
for text, label in train_data.take(1):
  print(text, label, sep="\n")

Loading pre-trained BERT model, initialising the tokenizer and defining some parameters

In [None]:
# Parameters shared by the preprocessing and model cells.
label_list = [0, 1]  # Label categories
max_seq_length = 128  # maximum length of (token) input sequences
train_batch_size = 16

# Load the BERT encoder from TF-Hub as a trainable Keras layer.
bert_hub_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2"
bert_layer = hub.KerasLayer(bert_hub_url, trainable=True)

# Build the tokenizer from the vocabulary file and casing flag exposed by the
# loaded module.
bert_assets = bert_layer.resolved_object
vocab_file = bert_assets.vocab_file.asset_path.numpy()
do_lower_case = bert_assets.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

Functions to Create Suitable Input Representation for the BERT model

In [None]:
def to_feature(text, label, label_list=label_list, max_seq_length=max_seq_length, tokenizer=tokenizer):
  """Convert one (text, label) pair into BERT input features.

  Args:
    text: object exposing `.numpy()` that yields the review text.
    label: object exposing `.numpy()` that yields the sentiment label.
    label_list: label categories passed to the converter; the default is the
      module-level `label_list` as it was when this function was defined.
    max_seq_length: sequence length passed to the converter; default bound
      at definition time in the same way.
    tokenizer: tokenizer passed to the converter; default bound at
      definition time in the same way.

  Returns:
    Tuple `(input_ids, input_mask, segment_ids, label_id)` read from the
    converted feature object.
  """
  review_example = classifier_data_lib.InputExample(
      guid=None,
      text_a=text.numpy(),
      text_b=None,
      label=label.numpy())
  converted = classifier_data_lib.convert_single_example(
      0, review_example, label_list, max_seq_length, tokenizer)

  return (converted.input_ids,
          converted.input_mask,
          converted.segment_ids,
          converted.label_id)

In [None]:
def to_feature_map(text, label):
  """Wrap `to_feature` for use in `Dataset.map`.

  Runs `to_feature` through `tf.py_function`, restores the static shapes of
  the results, and packs them into the model's input dictionary.

  Args:
    text: review text tensor.
    label: sentiment label tensor.

  Returns:
    Tuple `(features, label_id)` where `features` maps 'input_word_ids',
    'input_mask' and 'input_type_ids' to int32 tensors of shape
    `[max_seq_length]`, and `label_id` is a scalar int32 tensor.
  """
  encoded = tf.py_function(to_feature,
                           inp=[text, label],
                           Tout=[tf.int32] * 4)
  *sequence_tensors, label_id = encoded

  # py_function returns tensors without static shapes, so set them explicitly.
  for sequence_tensor in sequence_tensors:
    sequence_tensor.set_shape([max_seq_length])
  label_id.set_shape([])

  feature_names = ('input_word_ids', 'input_mask', 'input_type_ids')
  features = dict(zip(feature_names, sequence_tensors))
  return (features, label_id)

In [None]:

 # train
# Training pipeline: convert each (text, label) pair to BERT features, shuffle
# within a 100-element buffer, and group into fixed-size batches. The incomplete
# final batch is dropped so every training step sees a full batch.
train_data = (train_data.map(to_feature_map,
                              num_parallel_calls=tf.data.experimental.AUTOTUNE)
                          .shuffle(100)
                          .batch(train_batch_size, drop_remainder=True)
                          .prefetch(tf.data.experimental.AUTOTUNE))

# Validation pipeline: same conversion, no shuffling. The final partial batch is
# kept (drop_remainder=False) so every validation example counts towards the
# reported metrics even when the set size is not a multiple of the batch size.
valid_data = (valid_data.map(to_feature_map,
                            num_parallel_calls=tf.data.experimental.AUTOTUNE)
                          .batch(train_batch_size, drop_remainder=False)
                          .prefetch(tf.data.experimental.AUTOTUNE))

In [None]:
# Inspect the element structure of the batched training dataset.
train_data.element_spec

In [None]:
# Inspect the element structure of the batched validation dataset.
valid_data.element_spec

Create Classification Head Layer 

In [None]:
def create_model():
  """Build the binary sentiment classifier on top of the shared BERT layer.

  Three int32 inputs of length `max_seq_length` are fed to the module-level
  `bert_layer`. Its pooled output goes through dropout (rate 0.3) and a single
  sigmoid unit named "output".

  Returns:
    An uncompiled `tf.keras.Model` whose inputs are a dict keyed by
    'input_word_ids', 'input_mask' and 'input_type_ids'.
  """
  def int_sequence_input(name):
    # One int32 input of length `max_seq_length`.
    return tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                 name=name)

  input_word_ids = int_sequence_input("input_word_ids")
  input_mask = int_sequence_input("input_mask")
  input_type_ids = int_sequence_input("input_type_ids")

  # The layer returns (pooled_output, sequence_output); only the pooled
  # output is used by the classification head.
  pooled_output, _ = bert_layer([input_word_ids, input_mask, input_type_ids])

  regularized = tf.keras.layers.Dropout(0.3)(pooled_output)
  probability = tf.keras.layers.Dense(1, activation="sigmoid",
                                      name="output")(regularized)

  model_inputs = {
      'input_word_ids': input_word_ids,
      'input_mask': input_mask,
      'input_type_ids': input_type_ids,
  }
  return tf.keras.Model(inputs=model_inputs, outputs=probability)

Training and Saving the Weights of the Model

In [None]:
# Path prefix under which both the weights checkpoint and the full model are written.
model_name = '100k-2e5-16-02-8April_sentiment_sample_25_75_mixed_weights'

with tf.device('/device:GPU:0'):
  model = create_model()

  # Binary classification set-up: sigmoid output scored with binary
  # cross-entropy and binary accuracy.
  adam = tf.keras.optimizers.Adam(learning_rate=2e-5, epsilon=1e-08, clipnorm=1.0)
  model.compile(
      optimizer=adam,
      loss=tf.keras.losses.BinaryCrossentropy(),
      metrics=[tf.keras.metrics.BinaryAccuracy()],
  )
  model.summary()

  history = model.fit(train_data, epochs=3, validation_data=valid_data)

  # Persist the result twice: the weights only, then the complete model.
  model.save_weights(model_name)
  model.save(model_name)

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_word_ids (InputLayer)    [(None, 128)]        0           []                               
                                                                                                  
 input_mask (InputLayer)        [(None, 128)]        0           []                               
                                                                                                  
 input_type_ids (InputLayer)    [(None, 128)]        0           []                               
                                                                                                  
 keras_layer (KerasLayer)       [(None, 768),        109482241   ['input_word_ids[0][0]',         
                                 (None, 128, 768)]                'input_mask[0][0]',         



INFO:tensorflow:Assets written to: 100k-2e5-16-02-8April_sentiment_sample_25_75_mixed/assets


INFO:tensorflow:Assets written to: 100k-2e5-16-02-8April_sentiment_sample_25_75_mixed/assets


Loading the trained model

In [None]:
# Restore the fine-tuned classifier from the weights written by the training cell.
#
# The checkpoint was saved from the Keras model built by `create_model()`
# (TF-Hub BERT layer + dropout + one sigmoid unit), so the same architecture
# must be rebuilt before loading. The previous code instantiated
# `TFBertForSequenceClassification`, which is never imported in this notebook
# and has a different layer structure and output, so it could not load these
# weights.
model_name = '100k-2e5-16-02-8April_sentiment_sample_25_75_mixed_weights'
trained_model = create_model()
# Compile with the same optimizer, loss and metric as training so that
# evaluate() reports comparable numbers for the sigmoid output.
trained_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5, epsilon=1e-08, clipnorm=1.0),
                 loss=tf.keras.losses.BinaryCrossentropy(),
                 metrics=[tf.keras.metrics.BinaryAccuracy()])
trained_model.load_weights(model_name)
trained_model.summary()