<a href="https://colab.research.google.com/github/sahug/ds-bert/blob/main/BERT%20NLP%20-%20Session%205%20-%20Multi%20Class%20Classification%20Using%20BERT%20and%20Tensorflow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**BERT NLP - Session 5 - Multi Class Classification Using BERT and Tensorflow.ipynb**

In [2]:
#Import Data
import pandas as pd

df = pd.read_csv("/content/sample_data/train.tsv", sep="\t")
df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [3]:
#Data Preprocessing. We have to pare the data in the BERT input format. For that we will need TokenIDs and Attention Mask.

import numpy as np

seq_len = 512 #Limiting to 512 seq/tokens
num_samples = len(df) #Sample size is equal to dataframe size

Xids = np.zeros((num_samples, seq_len)) #Token IDS
Xmask = np.zeros((num_samples, seq_len)) #Attention Mask

Xids.shape

(156060, 512)

In [None]:
!pip install transformers

In [5]:
#Populating Xids and Xmask with actual values.

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

#Encoding Phrase
#Max Len = Seq Len
#Truncating any word after 512
#Padding any sentence less than 512 to 512 to keep the size same
#Special Tokens like [CLS], [SEP], [PAD]
#Return Tensorflow Tensors

for i, phrase in enumerate(df["Phrase"]):
  tokens = tokenizer.encode_plus(phrase, max_length=seq_len, truncation=True, padding="max_length", add_special_tokens=True, return_tensors="tf")

  Xids[i, :] = tokens["input_ids"]
  Xmask[i, :] = tokens["attention_mask"] 


In [6]:
#[101] - CLS Token
#[0] - PAD Tokens

Xids

array([[  101.,   138.,  1326., ...,     0.,     0.,     0.],
       [  101.,   138.,  1326., ...,     0.,     0.,     0.],
       [  101.,   138.,  1326., ...,     0.,     0.,     0.],
       ...,
       [  101.,   170., 25247., ...,     0.,     0.,     0.],
       [  101.,   170., 25247., ...,     0.,     0.,     0.],
       [  101., 22572., 12148., ...,     0.,     0.,     0.]])

In [7]:
#1 - Attention Token for word
#0 - No Attention Token
Xmask

array([[1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       ...,
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.]])

In [8]:
#Create Labels
arr = df["Sentiment"].values
labels = np.zeros((num_samples, arr.max()+1)) #+1 because arr starts with 0
labels.shape, labels

((156060, 5), array([[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        ...,
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]]))

In [9]:
#Apply OHE on Labels
labels[np.arange(num_samples), arr] = 1
labels

array([[0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       ...,
       [0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.]])

In [10]:
#Tensorflow

import tensorflow as tf

dataset = tf.data.Dataset.from_tensor_slices((Xids, Xmask, labels))

dataset.take(1)

<TakeDataset element_spec=(TensorSpec(shape=(512,), dtype=tf.float64, name=None), TensorSpec(shape=(512,), dtype=tf.float64, name=None), TensorSpec(shape=(5,), dtype=tf.float64, name=None))>

In [11]:
#Merge Input Tensors into a single dictonary. The reason we do is so that we have input at index 0 and output at index 1 when feeding the data to model

def map_func(input_ids, masks, labels):
  return {"input_ids": input_ids, "attention_mask": masks}, labels


In [12]:
#Apply Map

dataset = dataset.map(map_func)

dataset.take(1)

#Now we can see that the input and mask is on 1 tensor, input, and the label in the other tensor, output.

<TakeDataset element_spec=({'input_ids': TensorSpec(shape=(512,), dtype=tf.float64, name=None), 'attention_mask': TensorSpec(shape=(512,), dtype=tf.float64, name=None)}, TensorSpec(shape=(5,), dtype=tf.float64, name=None))>

In [13]:
batch_size = 16

dataset = dataset.shuffle(10000).batch(batch_size, drop_remainder=True)

dataset.take(1)

#Now we can see we have 16 samples every tensor.

<TakeDataset element_spec=({'input_ids': TensorSpec(shape=(16, 512), dtype=tf.float64, name=None), 'attention_mask': TensorSpec(shape=(16, 512), dtype=tf.float64, name=None)}, TensorSpec(shape=(16, 5), dtype=tf.float64, name=None))>

In [14]:
#Creating Training and Test Data

split = 0.9

size = int((num_samples/batch_size) * split)

In [15]:
train_ds = dataset.take(size)
val_ds = dataset.skip(size)

del dataset

In [16]:
#Load pretrained BERT Model

from transformers import TFAutoModel

bert = TFAutoModel.from_pretrained("bert-base-uncased")

bert.summary()

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "tf_bert_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
Total params: 109,482,240
Trainable params: 109,482,240
Non-trainable params: 0
_________________________________________________________________


In [17]:
#Input Layer
input_ids = tf.keras.layers.Input(shape=(seq_len), name="input_ids", dtype="int32")
mask = tf.keras.layers.Input(shape=(seq_len), name="attention_mask", dtype="int32")

#Embeddings from BERT. Connecting input_ids and mask to bertmodel
embeddings = bert.bert(input_ids, attention_mask=mask)[1]

#Convert Embedding in to label prediction
x = tf.keras.layers.Dense(1024, activation="relu")(embeddings)
y = tf.keras.layers.Dense(arr.max()+1, activation="softmax", name="outputs")(x)

In [18]:
model = tf.keras.Model(inputs=[input_ids, mask], outputs=y)

model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 512)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 512)]        0           []                               
                                                                                                  
 bert (TFBertMainLayer)         TFBaseModelOutputWi  109482240   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 512,                                           

In [19]:
#Define optimizer, loss and metric

optimizer = tf.keras.optimizers.Adam(lr=1e-5, decay=1e-6)

loss = tf.keras.losses.CategoricalCrossentropy()

acc = tf.keras.metrics.CategoricalAccuracy("accuracy")

  super(Adam, self).__init__(name, **kwargs)


In [20]:
#Compile Model
model.compile(optimizer=optimizer, loss=loss, metrics=[acc])

In [None]:
#Train Model

history = model.fit(
    train_ds,
    validation_data = val_ds,
    epochs=3
)

In [None]:
model.save("sentiment_model")

In [None]:
#Prep Data for Testing
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

def prep_data(text):
  tokens = tokenizer.encode_plus(text, max_length=512, truncation=True, padding="max_length", add_special_tokens=True, return_token_type_id=False, return_tensor="tf")

  return{
      "input_ids": tf.cast(tokens["input_ids"], tf.float64),
       "attention_mask": tf.cast(tokens["attention_mask"], tf.float64)
  }

In [None]:
test = prep_data("hellow world")

In [None]:
probs = model.predict(test)

probs[0] #Returns the probability of all 5 labels

In [None]:
np.argmax(probs[0]) #Returns the label with highest probability among the labels