# Keras BERT Text Classification

In [None]:
!pip install sentencepiece

In [None]:
#Bert tokenization class
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py

In [None]:
#importing libraries
import numpy as np
import pandas as pd
import tensorflow as tf
import keras
import tokenization
import tensorflow_hub as hub
import logging
logging.basicConfig(level=logging.INFO)

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns

In [None]:
#Optimizer presets (Adam, SGD, Adadelta). build_model() below constructs its own Adam instance,
#so these objects are only needed if a cell outside this view refers to them.
#`learning_rate` is used for all three; `lr` is a deprecated alias.
adam = keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
sgd = keras.optimizers.SGD(learning_rate=0.001, decay=1e-6, momentum=0.9, nesterov=True)
adadelta = keras.optimizers.Adadelta(learning_rate=1.0, rho=0.9, epsilon=None, decay=0.0)

In [None]:
#Building BERT layer
module_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2'
bert_layer = hub.KerasLayer(module_url, trainable=True)

In [None]:
#Reading train.jsonl
train_df = pd.read_json('train.jsonl', lines=True)

In [None]:
#Reading dev_seen.jsonl
dev_seen_df = pd.read_json('dev_seen.jsonl', lines=True)

#Concatenating train_df and dev_seen_df
training_data = pd.concat([train_df, dev_seen_df])

In [None]:
#Validation Data
dev_df = pd.read_json('dev_unseen.jsonl', lines=True)

In [None]:
#Splitting the data into training and testing
df_train, df_test = train_test_split(
    training_data,
    test_size=0.05,
    random_state=0
)

In [None]:
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

In [None]:
#Encoding the text(preprocessing)
def bert_encode(texts, tokenizer, max_len=512):
    """Turn raw texts into fixed-length BERT input arrays.

    Each text is tokenized, cut to ``max_len - 2`` tokens, wrapped in
    ``[CLS]`` / ``[SEP]`` and right-padded with zeros up to ``max_len``.

    Args:
        texts: iterable of strings to encode.
        tokenizer: object providing ``tokenize`` and ``convert_tokens_to_ids``.
        max_len: length of every encoded row.

    Returns:
        Tuple ``(token_ids, attention_masks, segment_ids)`` of NumPy arrays,
        one row per input text. Segment ids are all zeros.
    """
    token_rows, mask_rows, segment_rows = [], [], []
    body_budget = max_len - 2  # room left once [CLS] and [SEP] are added

    for raw_text in texts:
        pieces = tokenizer.tokenize(raw_text)[:body_budget]
        sequence = ["[CLS]"] + pieces + ["[SEP]"]
        used = len(sequence)
        padding = [0] * (max_len - used)

        token_rows.append(tokenizer.convert_tokens_to_ids(sequence) + padding)
        mask_rows.append([1] * used + padding)  # 1 = real token, 0 = padding
        segment_rows.append([0] * max_len)

    return np.array(token_rows), np.array(mask_rows), np.array(segment_rows)

In [None]:
#defining the model
def build_model(bert_layer, max_len=512):
    """Build and compile a two-class text classifier on top of a BERT Keras layer.

    Args:
        bert_layer: callable Keras layer that takes
            ``[input_word_ids, input_mask, segment_ids]`` and returns
            ``(pooled_output, sequence_output)``.
        max_len: sequence length of the three integer inputs.

    Returns:
        A compiled ``tf.keras`` model whose output has shape ``(batch, 2)`` and
        holds class probabilities that sum to 1 per row.
    """
    input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    # Only the per-token output is used; the pooled output is discarded.
    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    # Representation at position 0 of every sequence (where bert_encode puts [CLS]).
    clf_output = sequence_output[:, 0, :]
    net = tf.keras.layers.Dropout(0.2)(clf_output)
    net = tf.keras.layers.Dense(32, activation='relu')(net)
    # The labels are one-hot vectors over two mutually exclusive classes, so the
    # head is a softmax trained with categorical cross-entropy. With two sigmoid
    # units and binary cross-entropy, Keras resolves 'accuracy' to an element-wise
    # binary accuracy instead of comparing the arg-max class.
    out = tf.keras.layers.Dense(2, activation='softmax')(net)

    model = tf.keras.models.Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    model.compile(
        tf.keras.optimizers.Adam(learning_rate=1e-5),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )

    return model

In [None]:
# Encode the train and test texts with bert_encode; every row is padded or truncated to 100 tokens.
train_input = bert_encode(df_train.text.values, tokenizer, max_len=100)
test_input = bert_encode(df_test.text.values, tokenizer, max_len=100)
# One-hot encode the binary labels.
# NOTE(review): `traiin_labels` is a misspelling of `train_labels`; the name is kept
# because the fit cell passes `traiin_labels`.
traiin_labels = tf.keras.utils.to_categorical(df_train.label.values, num_classes=2)
test_labels =  tf.keras.utils.to_categorical(df_test.label.values, num_classes=2)


# Same encoding for the dev_unseen split held in dev_df.
dev_input = bert_encode(dev_df.text.values, tokenizer, max_len=100)
dev_labels = tf.keras.utils.to_categorical(dev_df.label.values, num_classes=2)

In [None]:
text_model = build_model(bert_layer, max_len=100)
text_model.summary()

In [None]:
#run model
# Keep only the weights with the best validation accuracy and stop once it has
# not improved for 5 consecutive epochs.
checkpoint = tf.keras.callbacks.ModelCheckpoint('model.h5', monitor='val_accuracy', save_best_only=True, verbose=1)
earlystopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=5, verbose=1)

# Validate on the dev_unseen split (dev_input / dev_labels), which is encoded
# for this purpose, instead of carving 20% off the training rows.
train_history = text_model.fit(
    train_input, traiin_labels,
    validation_data=(list(dev_input), dev_labels),
    epochs=50,
    callbacks=[checkpoint, earlystopping],
    batch_size=32,
    verbose=1
    )

In [None]:
#Predict
# Restore the best weights written by the ModelCheckpoint callback.
text_model.load_weights('model.h5')
# test_input is a tuple of in-memory arrays, not a generator, so use predict(),
# which batches the arrays itself; predict_generator is deprecated and its
# steps=450 does not match the number of batches.
test_pred = text_model.predict(list(test_input))

# Collapse the per-class scores to the predicted class index.
test_pred = np.argmax(test_pred, axis=1)

In [None]:
import matplotlib.pyplot as plt

In [None]:
#training validation accuracy graph
plt.plot(train_history.history['accuracy'], label='training acc')
plt.plot(train_history.history['val_accuracy'], label='validation acc')

plt.title('Training and validation accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epochs')
plt.legend()
plt.show()

In [None]:
print(classification_report(df_test.label, test_pred, target_names=['Non-Offensive(0)','Offensive(1)']))

In [None]:
#Confusion Matrix

def show_confusion_matrix(confusion_matrix):
  """Draw an annotated heatmap of a confusion matrix.

  Args:
    confusion_matrix: 2-D table of integer counts handed to ``sns.heatmap``
      (the caller below passes a labelled DataFrame). Inside this function the
      parameter shadows the imported ``confusion_matrix`` function.
  """
  heat_ax = sns.heatmap(confusion_matrix, annot=True, fmt="d", cmap="Blues")
  y_axis, x_axis = heat_ax.yaxis, heat_ax.xaxis
  # Horizontal row labels, slanted column labels.
  y_axis.set_ticklabels(y_axis.get_ticklabels(), rotation=0, ha='right')
  x_axis.set_ticklabels(x_axis.get_ticklabels(), rotation=30, ha='right')
  plt.ylabel('True sentiment')
  plt.xlabel('Predicted sentiment')
cm = confusion_matrix(df_test.label, test_pred)
df_cm = pd.DataFrame(cm, index=['Non-Offensive(0)','Offensive(1)'], columns=['Non-Offensive(0)','Offensive(1)'])
show_confusion_matrix(df_cm)

# Text classification using BERT and Hugging Face Transformers


In [None]:
!pip install -qq transformers

In [None]:
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch

import numpy as np
import pandas as pd

import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt

from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
%matplotlib inline
%config InlineBackend.figure_format='retina'
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
rcParams['figure.figsize'] = 12, 8
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
from google.colab import files
uploaded = files.upload()

In [None]:
#reading train.jsonl
train_df = pd.read_json("train.jsonl", lines=True)
train_df.head()

In [None]:
#reading dev_seen.jsonl
dev_seen_df = pd.read_json("dev_seen.jsonl", lines=True)

#concatenating train and dev_seen
training_data = pd.concat([train_df, dev_seen_df])

In [None]:
dev_unseen_df = pd.read_json("dev_unseen.jsonl", lines=True)

In [None]:
training_data.shape

train_df has 8500 rows and dev_seen_df has 500 rows.

Total training data = 8500 + 500 = 9000 rows.

In [None]:
training_data.info()

In [None]:
# Display names for label 0 and label 1, reused by the reports and plots below.
class_name = ['Non-Offensive(0)', 'Offensive(1)']

# Pass the column through x=: recent seaborn releases interpret the first
# positional argument as `data`, not as the variable to count.
ax = sns.countplot(x=training_data['label'])
plt.xlabel('labels')
ax.set_xticklabels(class_name);

In [None]:
PRE_TRAINED_MODEL_NAME = 'bert-base-cased'

In [None]:
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [None]:
#sample text for understanding
sample_txt = 'When was I last outside? I am stuck at home for 2 weeks.'

In [None]:
tokens = tokenizer.tokenize(sample_txt)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

print(f' Sentence: {sample_txt}')
print(f' Tokens:{tokens}')
print(f' Token ids:{token_ids}')

In [None]:
tokenizer.sep_token, tokenizer.sep_token_id

In [None]:
tokenizer.cls_token, tokenizer.cls_token_id

In [None]:
tokenizer.pad_token, tokenizer.pad_token_id

In [None]:
tokenizer.unk_token, tokenizer.unk_token_id

In [None]:
# Encode the sample sentence to a fixed length of 32 tokens.
# padding='max_length' + truncation=True replace the deprecated pad_to_max_length flag.
encoding = tokenizer.encode_plus(
  sample_txt,
  max_length=32,
  add_special_tokens=True, # Add '[CLS]' and '[SEP]'
  return_token_type_ids=False,
  padding='max_length',
  truncation=True,
  return_attention_mask=True,
  return_tensors='pt',  # Return PyTorch tensors
)
encoding.keys()

In [None]:
print(len(encoding['input_ids'][0]))
encoding['input_ids'][0]

In [None]:
print(len(encoding['attention_mask'][0]))
encoding['attention_mask']

In [None]:
tokenizer.convert_ids_to_tokens(encoding['input_ids'][0])

In [None]:
# Encoded length of every training text, capped at 512 tokens.
token_lens = []
for txt in training_data['text']:
  # truncation=True makes the 512-token cap explicit instead of relying on the
  # tokenizer's implicit fallback when only max_length is given.
  tokens = tokenizer.encode(txt, max_length=512, truncation=True)
  token_lens.append(len(tokens))

In [None]:
sns.distplot(token_lens)
plt.xlim([0, 256]);
plt.xlabel('Token count');

In [None]:
MAX_LEN = 100

In [None]:
#Creating a Pytorch Dataset
class ClassificationDataset(Dataset):
  """Map-style dataset that tokenizes one text per item for a BERT classifier.

  Args:
    text: indexable sequence of raw texts (each item is passed through ``str``).
    targets: indexable sequence of integer class labels aligned with ``text``.
    tokenizer: object exposing ``encode_plus`` (a Hugging Face tokenizer in this notebook).
    max_len: length every encoded sequence is padded or truncated to.
  """
  def __init__(self, text, targets, tokenizer, max_len):
    self.text = text
    self.targets = targets
    self.tokenizer = tokenizer
    self.max_len = max_len
  def __len__(self):
    """Number of examples."""
    return len(self.text)
  def __getitem__(self, item):
    """Return the raw text, its flattened input ids / attention mask and the label tensor."""
    text = str(self.text[item])
    target = self.targets[item]
    encoding = self.tokenizer.encode_plus(
      text,
      add_special_tokens=True,
      max_length=self.max_len,
      return_token_type_ids=False,
      # Explicit padding and truncation replace the deprecated pad_to_max_length
      # flag and guarantee every item has exactly max_len tokens, which the
      # DataLoader needs in order to stack items into a batch.
      padding='max_length',
      truncation=True,
      return_attention_mask=True,
      return_tensors='pt',
    )
    return {
      'text': text,
      # encode_plus returns tensors with a leading batch dimension; flatten drops it.
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
      'targets': torch.tensor(target, dtype=torch.long)
    }

In [None]:
#Splitting the data into train and test
df_train, df_test = train_test_split(
    training_data,
    test_size=0.05,
    random_state=RANDOM_SEED
)

In [None]:
df_test.shape

In [None]:
#Defining Helper function for creating data loader
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False, num_workers=4):
  """Wrap a DataFrame's ``text`` / ``label`` columns in a batched DataLoader.

  Args:
    df: DataFrame with a ``text`` column and a ``label`` column.
    tokenizer: tokenizer forwarded to ClassificationDataset.
    max_len: token length forwarded to ClassificationDataset.
    batch_size: number of examples per batch.
    shuffle: reshuffle the examples every epoch; defaults to False, which keeps
      the previous fixed-order behaviour.
    num_workers: worker processes used for loading; defaults to the previously
      hard-coded value of 4.

  Returns:
    A ``DataLoader`` over a ClassificationDataset built from ``df``.
  """
  ds = ClassificationDataset(
    text=df.text.to_numpy(),
    targets=df.label.to_numpy(),
    tokenizer=tokenizer,
    max_len=max_len
  )
  return DataLoader(
    ds,
    batch_size=batch_size,
    shuffle=shuffle,
    num_workers=num_workers
  )

In [None]:
BATCH_SIZE = 16

train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(dev_unseen_df, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)

In [None]:
data = next(iter(train_data_loader))
data.keys()

In [None]:
print(data['input_ids'].shape)
print(data['attention_mask'].shape)
print(data['targets'].shape)

In [None]:
bert_model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME, return_dict=False)

In [None]:
last_hidden_state, pooled_output = bert_model(input_ids=encoding['input_ids'],attention_mask=encoding['attention_mask'])

In [None]:
last_hidden_state.shape

In [None]:
pooled_output.shape

In [None]:
bert_model.config.hidden_size

In [None]:
#Classifier that uses Bert Model
class SentimentClassifier(nn.Module):
  """Pre-trained BERT encoder followed by dropout and a linear classification head.

  Args:
    n_classes: number of output logits.
  """
  def __init__(self, n_classes):
    super().__init__()
    self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME, return_dict=False)
    self.drop = nn.Dropout(p=0.3)
    self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
  def forward(self, input_ids, attention_mask):
    """Return unnormalised class scores (logits), one row per input sequence."""
    # With return_dict=False the encoder yields a tuple; its second element is
    # the pooled representation that feeds the head.
    _sequence_output, pooled = self.bert(
      input_ids=input_ids,
      attention_mask=attention_mask
    )
    logits = self.out(self.drop(pooled))
    return logits

In [None]:
model = SentimentClassifier(len(class_name))
model = model.to(device)

In [None]:
input_ids = data['input_ids'].to(device)
attention_mask = data['attention_mask'].to(device)
print(input_ids.shape) # batch size x seq length
print(attention_mask.shape) # batch size x seq length

In [None]:
import torch.nn.functional as F

#predicted probabilities for the first training batch (`data`), not for sample_txt
F.softmax(model(input_ids, attention_mask), dim=1)

In [None]:
EPOCHS = 100
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)
loss_fn = nn.CrossEntropyLoss().to(device)

In [None]:
#Helper function for training our model
def train_epoch(
  model,
  data_loader,
  loss_fn,
  optimizer,
  device,
  scheduler,
  n_examples
):
  """Run one optimisation pass over ``data_loader``.

  For every batch: forward pass, loss, backward pass, gradient-norm clipping
  at 1.0, optimizer step, scheduler step, gradient reset.

  Args:
    model: module called as ``model(input_ids=..., attention_mask=...)``.
    data_loader: iterable of dicts with ``input_ids``, ``attention_mask``, ``targets``.
    loss_fn: callable ``(outputs, targets) -> loss``.
    optimizer: optimizer stepped once per batch.
    device: device the batch tensors are moved to.
    scheduler: learning-rate scheduler stepped once per batch.
    n_examples: denominator used for the accuracy.

  Returns:
    ``(accuracy, mean_loss)`` where accuracy is a 0-d double tensor and
    mean_loss is the NumPy mean of the per-batch losses.
  """
  model = model.train()
  batch_losses = []
  n_correct = 0
  for batch in data_loader:
    ids = batch["input_ids"].to(device)
    mask = batch["attention_mask"].to(device)
    labels = batch["targets"].to(device)
    logits = model(input_ids=ids, attention_mask=mask)
    predicted = torch.max(logits, dim=1).indices
    batch_loss = loss_fn(logits, labels)
    n_correct = n_correct + (predicted == labels).sum()
    batch_losses.append(batch_loss.item())
    batch_loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()
  accuracy = n_correct.double() / n_examples
  return accuracy, np.mean(batch_losses)

In [None]:
#Helper function to evaluate the model for the given data loaders
def eval_model(model, data_loader, loss_fn, device, n_examples):
  """Score ``model`` on ``data_loader`` without updating any weights.

  Args:
    model: module called as ``model(input_ids=..., attention_mask=...)``.
    data_loader: iterable of dicts with ``input_ids``, ``attention_mask``, ``targets``.
    loss_fn: callable ``(outputs, targets) -> loss``.
    device: device the batch tensors are moved to.
    n_examples: denominator used for the accuracy.

  Returns:
    ``(accuracy, mean_loss)`` where accuracy is a 0-d double tensor and
    mean_loss is the NumPy mean of the per-batch losses.
  """
  model = model.eval()
  n_correct = 0
  batch_losses = []
  with torch.no_grad():
    for batch in data_loader:
      ids = batch["input_ids"].to(device)
      mask = batch["attention_mask"].to(device)
      labels = batch["targets"].to(device)
      logits = model(input_ids=ids, attention_mask=mask)
      predicted = torch.max(logits, dim=1).indices
      batch_losses.append(loss_fn(logits, labels).item())
      n_correct = n_correct + (predicted == labels).sum()
  accuracy = n_correct.double() / n_examples
  return accuracy, np.mean(batch_losses)

In [None]:
%%time
# Per-epoch metrics, stored as plain Python floats so they can be plotted
# directly: train_epoch / eval_model return the accuracy as a 0-d tensor that
# lives on `device`, and matplotlib cannot convert CUDA tensors.
history = defaultdict(list)
best_accuracy = 0
for epoch in range(EPOCHS):
  print(f'Epoch {epoch + 1}/{EPOCHS}')
  print('-' * 10)
  train_acc, train_loss = train_epoch(
    model,
    train_data_loader,
    loss_fn,
    optimizer,
    device,
    scheduler,
    len(df_train)
  )
  print(f'Train loss {train_loss} accuracy {train_acc}')
  val_acc, val_loss = eval_model(
    model,
    val_data_loader,
    loss_fn,
    device,
    len(dev_unseen_df)
  )
  print(f'Val   loss {val_loss} accuracy {val_acc}')
  print()
  history['accuracy'].append(float(train_acc))
  history['loss'].append(float(train_loss))
  history['val_acc'].append(float(val_acc))
  history['val_loss'].append(float(val_loss))

  # Keep a checkpoint of the weights with the best validation accuracy so far.
  if val_acc > best_accuracy:
    torch.save(model.state_dict(), 'best_model_state.bin')
    best_accuracy = val_acc

In [None]:
#Training v/s Validation Accuracy Graph
plt.plot(history['accuracy'], label='train accuracy')
plt.plot(history['val_acc'], label='validation accuracy')
plt.title('Training history')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend()
plt.ylim([0, 1]);

In [None]:
# Evaluate the checkpoint with the best validation accuracy (written by the
# training loop to 'best_model_state.bin') rather than whatever weights the
# final epoch left in `model`.
model.load_state_dict(torch.load('best_model_state.bin', map_location=device))
test_acc, _ = eval_model(
  model,
  test_data_loader,
  loss_fn,
  device,
  len(df_test)
)
test_acc.item()

In [None]:
#Helper function for getting predictions from the model
def get_predictions(model, data_loader):
  """Collect texts, predicted classes, class probabilities and true labels.

  Args:
    model: module called as ``model(input_ids=..., attention_mask=...)`` that
      returns one row of class logits per example.
    data_loader: iterable of dicts with ``text``, ``input_ids``,
      ``attention_mask`` and ``targets``.

  Returns:
    ``(text, predictions, prediction_probs, real_values)``: a list of the raw
    texts and three CPU tensors — predicted class indices, softmax
    probabilities (one row per example, rows sum to 1) and true labels.
  """
  model = model.eval()
  # Run on the device that holds the model's weights; fall back to CPU for a
  # parameter-free model.
  first_param = next(model.parameters(), None)
  run_device = first_param.device if first_param is not None else torch.device("cpu")
  text = []
  pred_batches = []
  prob_batches = []
  target_batches = []
  with torch.no_grad():
    for d in data_loader:
      input_ids = d["input_ids"].to(run_device)
      attention_mask = d["attention_mask"].to(run_device)
      outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask
      )
      _, preds = torch.max(outputs, dim=1)
      # The model emits logits; normalise them so the returned values really
      # are probabilities (they are later plotted on a [0, 1] axis).
      probs = torch.softmax(outputs, dim=1)
      text.extend(d["text"])
      # Accumulate whole batches on the CPU and concatenate once at the end.
      pred_batches.append(preds.cpu())
      prob_batches.append(probs.cpu())
      target_batches.append(d["targets"].cpu())
  predictions = torch.cat(pred_batches)
  prediction_probs = torch.cat(prob_batches)
  real_values = torch.cat(target_batches)
  return text, predictions, prediction_probs, real_values

In [None]:
y_text, y_pred, y_pre_probs, y_test = get_predictions(
    model,
    test_data_loader
)

In [None]:
#Classification Report
print(classification_report(y_test, y_pred, target_names=class_name))

In [None]:
#Confusion Matrix
def show_confusion_matrix(confusion_matrix):
  """Render a confusion matrix as an annotated blue heatmap.

  Args:
    confusion_matrix: 2-D table of integer counts handed to ``sns.heatmap``
      (the caller below passes a DataFrame labelled with ``class_name``).
      Inside this function the parameter shadows the imported
      ``confusion_matrix`` function.
  """
  heat_ax = sns.heatmap(confusion_matrix, annot=True, fmt="d", cmap="Blues")
  # Keep row labels horizontal and slant the column labels by 30 degrees.
  for axis, angle in ((heat_ax.yaxis, 0), (heat_ax.xaxis, 30)):
    axis.set_ticklabels(axis.get_ticklabels(), rotation=angle, ha='right')
  plt.ylabel('True sentiment')
  plt.xlabel('Predicted sentiment')
cm = confusion_matrix(y_test, y_pred)
df_cm = pd.DataFrame(cm, index=class_name, columns=class_name)
show_confusion_matrix(df_cm)

In [None]:
idx = 4
text= y_text[idx]
true_sentiment = y_test[idx]
pred_df = pd.DataFrame({
  'class_names': class_name,
  'values': y_pre_probs[idx]
})

In [None]:
print("\n".join(wrap(text)))
print()
print(f'True sentiment: {class_name[true_sentiment]}')

In [None]:
sns.barplot(x='values', y='class_names', data=pred_df, orient='h')
plt.ylabel('label')
plt.xlabel('probability')
plt.xlim([0, 1]);

## Some example predictions

In [None]:
sample_df = pd.read_json("dev_unseen.jsonl", lines=True)

In [None]:
sample_df

In [None]:
text = []
for i in range(10):
  text.append(sample_df['text'][i])

In [None]:
text

In [None]:
# Encode the ten sample texts to MAX_LEN tokens each.
# padding='max_length' + truncation=True replace the deprecated pad_to_max_length flag.
encoded_text_list = []
for i in range(10):
  encoded_text = tokenizer.encode_plus(
    text[i],
    max_length=MAX_LEN,
    add_special_tokens=True,
    return_token_type_ids=False,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
  )
  encoded_text_list.append(encoded_text)

In [None]:
# Inference only: switch the model to eval mode (disables dropout) and skip
# gradient tracking, so the printed labels do not depend on which cell last
# touched the model's mode.
model.eval()
with torch.no_grad():
  for i in range(10):
    input_ids = encoded_text_list[i]['input_ids'].to(device)
    attention_mask = encoded_text_list[i]['attention_mask'].to(device)
    output = model(input_ids, attention_mask)
    _, prediction = torch.max(output, dim=1)
    print(f'Text: {text[i]}')
    print(f'Label  : {class_name[prediction]}')

In [None]:
sample_text = "i'm gonna be like phelps one day"


In [None]:
# Encode sample_text to MAX_LEN tokens.
# padding='max_length' + truncation=True replace the deprecated pad_to_max_length flag.
encoded_text = tokenizer.encode_plus(
    sample_text,
    max_length=MAX_LEN,
    add_special_tokens=True,
    return_token_type_ids=False,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
  )

In [None]:
# Score the encoding of sample_text (`encoded_text`). Indexing
# encoded_text_list with the leftover loop variable `i` would score one of the
# earlier dev examples while printing sample_text next to its label.
model.eval()
with torch.no_grad():
  input_ids = encoded_text['input_ids'].to(device)
  attention_mask = encoded_text['attention_mask'].to(device)
  output = model(input_ids, attention_mask)
_, prediction = torch.max(output, dim=1)
print(f'Text: {sample_text}')
print(f'Label  : {class_name[prediction]}')