# **FAQ Classification using BERT**

Classifying UGC as FAQ or Not FAQ using Google's BERT Classifier

Author: Shreyash Gupta

Organization: IndiaMART InterMESH Pvt. Ltd.

_Executed on Google Colab with GPU environment_

# **Loading Data**

Importing necessary modules

In [None]:
import pandas as pd
from google.colab import drive

Mounting to Google Drive

In [None]:
# Mount Google Drive under /content/gdrive; every data, checkpoint and output
# path used later in this notebook lives below that mount point.
drive.mount('/content/gdrive')

Reading training and testing data

In [None]:
# Load the labelled training and test spreadsheets from Drive.
# Both frames are later indexed by DATA_COLUMN and LABEL_COLUMN, so each sheet
# must contain those two columns.
train = pd.read_excel("/content/gdrive/My Drive/Colab Notebooks/faq_bert_text.xlsx")
test = pd.read_excel("/content/gdrive/My Drive/Colab Notebooks/faq_bert_text_test.xlsx")

Defining the data column and the label column

In [None]:
# Column holding the question text that is fed to BERT as `text_a`.
DATA_COLUMN = 'Question Title'
# Column holding the class label of each question.
LABEL_COLUMN = 'Remarks'
# The label values passed to feature conversion; the prediction cells render
# 0 as "Not FAQ" and 1 as "FAQ".
label_list = [0, 1]

# Data preprocessing

Importing necessary modules

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization

Creating the tokenizer from the BERT hub module (lowercasing follows the module's `do_lower_case` setting)

In [None]:
# TF-Hub handle of the BERT module; it is loaded both to build the tokenizer
# and to build the classification model.
BERT_MODEL_HUB = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"

def create_tokenizer_from_hub_module():
  """Build a `FullTokenizer` configured from the BERT hub module.

  The module's "tokenization_info" signature is evaluated in a throw-away
  graph and session to obtain the vocabulary file and the lowercasing flag,
  which are then handed to `bert.tokenization.FullTokenizer`.

  Returns:
    A `bert.tokenization.FullTokenizer` instance.
  """
  with tf.Graph().as_default():
    module = hub.Module(BERT_MODEL_HUB)
    info = module(signature="tokenization_info", as_dict=True)
    fetches = [info["vocab_file"], info["do_lower_case"]]
    with tf.Session() as session:
      vocab_path, lowercase = session.run(fetches)

  # The fetched values are plain Python/numpy values, so they remain usable
  # after the temporary graph and session have been closed.
  return bert.tokenization.FullTokenizer(vocab_file=vocab_path, do_lower_case=lowercase)

# Build the tokenizer once; the same instance is reused for train/test feature
# extraction and inside the prediction helper.
tokenizer = create_tokenizer_from_hub_module()

### **Extracting train and test features**

Creating examples of BERT input

In [None]:
def _row_to_input_example(row):
  """Wrap one DataFrame row as a single-sentence BERT `InputExample`."""
  return bert.run_classifier.InputExample(
      guid=None,                 # no example id is tracked
      text_a=row[DATA_COLUMN],   # the question text
      text_b=None,               # single-sentence task: no second segment
      label=row[LABEL_COLUMN])


# Row-wise conversion (axis=1) of both frames into Series of InputExample.
train_InputExamples = train.apply(_row_to_input_example, axis=1)
test_InputExamples = test.apply(_row_to_input_example, axis=1)

Extracting train and test features

In [None]:
# Maximum sequence length handed to feature conversion and to the input
# function builders.
MAX_SEQ_LENGTH = 128

# Convert the train examples first, then the test examples, with identical
# settings so both splits share one label list, length and tokenizer.
train_features, test_features = (
    bert.run_classifier.convert_examples_to_features(
        examples, label_list, MAX_SEQ_LENGTH, tokenizer)
    for examples in (train_InputExamples, test_InputExamples)
)

# **Creating the model**

Defining a function to create a classification model

In [None]:
def create_model(is_predicting, input_ids, input_mask, segment_ids, labels, num_labels):
  """Build the classification head on top of the BERT hub module.

  The module's pooled output is passed through a single dense layer
  (`output_weights`, `output_bias`) followed by a log-softmax.

  Args:
    is_predicting: True when the graph is built for prediction; no loss is
      computed and dropout is skipped in that case.
    input_ids: Token-id tensor fed to the module's "tokens" signature.
    input_mask: Attention-mask tensor fed to the module.
    segment_ids: Segment-id tensor fed to the module.
    labels: Integer class ids; only used when `is_predicting` is False.
    num_labels: Number of output classes.

  Returns:
    `(predicted_labels, log_probs)` when `is_predicting` is True, otherwise
    `(loss, predicted_labels, log_probs)`. `log_probs` are log-softmax values,
    i.e. log-probabilities, and `predicted_labels` always keeps its batch
    dimension.
  """
  bert_module = hub.Module(BERT_MODEL_HUB, trainable=True)
  bert_inputs = dict(input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids)
  bert_outputs = bert_module(inputs=bert_inputs, signature="tokens", as_dict=True)

  # "pooled_output" is the per-example sentence representation.
  output_layer = bert_outputs["pooled_output"]
  hidden_size = output_layer.shape[-1].value

  output_weights = tf.get_variable(
      "output_weights", [num_labels, hidden_size],
      initializer=tf.truncated_normal_initializer(stddev=0.02))
  output_bias = tf.get_variable(
      "output_bias", [num_labels], initializer=tf.zeros_initializer())

  with tf.variable_scope("loss"):
    # Dropout is a training-time regulariser. Applying it while predicting
    # would randomly zero features and make predictions non-deterministic.
    # NOTE(review): `is_predicting` is the only mode signal available here, so
    # graphs built for evaluation still include dropout.
    if not is_predicting:
      output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

    logits = tf.matmul(output_layer, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    # argmax over the class axis already yields one label per example. No
    # squeeze is applied, because squeezing would turn a batch of one example
    # into a scalar and drop the batch dimension the Estimator iterates over.
    predicted_labels = tf.argmax(log_probs, axis=-1, output_type=tf.int32)
    if is_predicting:
      return (predicted_labels, log_probs)

    # Cross-entropy between the one-hot labels and the predicted distribution.
    one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
    loss = tf.reduce_mean(per_example_loss)
    return (loss, predicted_labels, log_probs)


Wrapping the model in a builder that returns the `model_fn` expected by the Estimator

In [None]:
def model_fn_builder(num_labels, learning_rate, num_train_steps, num_warmup_steps):
  """Return a `model_fn` closure for `tf.estimator.Estimator`.

  Args:
    num_labels: Number of output classes, forwarded to `create_model`.
    learning_rate: Learning rate passed to the BERT optimizer.
    num_train_steps: Total number of training steps passed to the optimizer.
    num_warmup_steps: Number of warm-up steps passed to the optimizer.

  Returns:
    A function `model_fn(features, labels, mode, params)` that builds the
    graph for the requested mode and returns a `tf.estimator.EstimatorSpec`.
  """

  def metric_fn(label_ids, predicted_labels):
    """Build the evaluation metrics comparing true and predicted labels."""
    accuracy = tf.metrics.accuracy(label_ids, predicted_labels)
    f1_score = tf.contrib.metrics.f1_score(label_ids, predicted_labels)
    auc = tf.metrics.auc(label_ids, predicted_labels)
    recall = tf.metrics.recall(label_ids, predicted_labels)
    precision = tf.metrics.precision(label_ids, predicted_labels)
    true_pos = tf.metrics.true_positives(label_ids, predicted_labels)
    true_neg = tf.metrics.true_negatives(label_ids, predicted_labels)
    false_pos = tf.metrics.false_positives(label_ids, predicted_labels)
    false_neg = tf.metrics.false_negatives(label_ids, predicted_labels)

    return {
        "eval_accuracy": accuracy,
        "f1_score": f1_score,
        "auc": auc,
        "precision": precision,
        "recall": recall,
        "true_positives": true_pos,
        "true_negatives": true_neg,
        "false_positives": false_pos,
        "false_negatives": false_neg
    }

  def model_fn(features, labels, mode, params):
    """Build the graph for `mode`; labels are read from `features["label_ids"]`.

    The `labels` and `params` arguments are part of the Estimator calling
    convention and are not used here.
    """
    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    label_ids = features["label_ids"]

    if mode == tf.estimator.ModeKeys.PREDICT:
      (predicted_labels, log_probs) = create_model(
          True, input_ids, input_mask, segment_ids, label_ids, num_labels)

      predictions = {
          # NOTE: despite the key name these are log-probabilities (the output
          # of log_softmax); the key is kept because callers read it.
          'probabilities': log_probs,
          'labels': predicted_labels
      }
      return tf.estimator.EstimatorSpec(mode, predictions=predictions)

    (loss, predicted_labels, log_probs) = create_model(
        False, input_ids, input_mask, segment_ids, label_ids, num_labels)

    if mode == tf.estimator.ModeKeys.TRAIN:
      # The optimizer (and its slot variables) is only built when training, so
      # evaluation graphs do not carry an unused training op.
      train_op = bert.optimization.create_optimizer(
          loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu=False)
      return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

    # Evaluation: metric ops are only built in the mode that reports them.
    eval_metrics = metric_fn(label_ids, predicted_labels)
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=eval_metrics)

  return model_fn

Defining variables and parameters for training

In [None]:
# Batch size; handed to the Estimator as params["batch_size"] and used to
# derive the number of training steps.
BATCH_SIZE = 16
# Learning rate forwarded to the BERT optimizer through model_fn_builder.
LEARNING_RATE = 2e-5
# Number of passes over the training features (float, used in step arithmetic).
NUM_TRAIN_EPOCHS = 3.0
# Fraction of the training steps used as warm-up steps.
WARMUP_PROPORTION = 0.1
# Checkpoint / summary frequencies passed to RunConfig.
SAVE_CHECKPOINTS_STEPS = 500
SAVE_SUMMARY_STEPS = 100
# Used as the Estimator's model_dir.
OUTPUT_DIR = "/content/gdrive/My Drive/Colab Notebooks/"

Computing number of training and warmup steps

In [None]:
# Total optimisation steps: batches per epoch times epochs, truncated to int.
num_train_steps = int(len(train_features) / BATCH_SIZE * NUM_TRAIN_EPOCHS)
# Warm-up steps: the configured proportion of the total steps, truncated to int.
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

Defining the run configuration for the estimator

In [None]:
# Estimator run configuration: where to write the model and how often to save
# summaries and checkpoints.
run_config = tf.estimator.RunConfig(model_dir = OUTPUT_DIR, save_summary_steps = SAVE_SUMMARY_STEPS, save_checkpoints_steps = SAVE_CHECKPOINTS_STEPS)

Creating the model function and estimator

In [None]:
# Build the model function for a len(label_list)-way classifier with the
# training schedule computed above.
model_fn = model_fn_builder(num_labels = len(label_list), learning_rate = LEARNING_RATE, num_train_steps = num_train_steps, num_warmup_steps = num_warmup_steps)
# The Estimator is created with the batch size in `params`.
estimator = tf.estimator.Estimator(model_fn = model_fn, config = run_config, params = {"batch_size": BATCH_SIZE})

Creating the training input function

In [None]:
# Input function over the training features, built in training mode
# (is_training=True) and keeping the final partial batch (drop_remainder=False).
train_input_fn = bert.run_classifier.input_fn_builder(features = train_features, seq_length = MAX_SEQ_LENGTH, is_training = True, drop_remainder = False)

# **Training the model**

Importing necessary modules

In [None]:
from datetime import datetime

Model training

In [None]:
# Train up to num_train_steps and report the elapsed wall-clock time.
start_time = datetime.now()
estimator.train(input_fn = train_input_fn, max_steps = num_train_steps)
print("Training time: ",datetime.now() - start_time)

# **Testing the model**

Creating the test input function

In [None]:
# Input function over the test features, built in non-training mode
# (is_training=False) and keeping the final partial batch.
test_input_fn = run_classifier.input_fn_builder(features = test_features, seq_length = MAX_SEQ_LENGTH, is_training = False, drop_remainder = False)

Evaluating the model

In [None]:
# Evaluate on the test set with no step limit (steps=None); the returned
# metrics dictionary is shown as the cell output.
estimator.evaluate(input_fn = test_input_fn, steps = None)

# **Predicting labels**

Defining prediction function

In [None]:
def getPrediction(in_sentences):
  """Classify sentences as "FAQ" / "Not FAQ" with the trained estimator.

  Args:
    in_sentences: Iterable of sentence strings. It is materialised once, so
      one-shot iterables (generators) are supported.

  Returns:
    A list with one `(sentence, scores, label_name)` tuple per input sentence,
    in input order. `scores` is the value the model function stores under the
    'probabilities' key (log-softmax outputs), and `label_name` is "Not FAQ"
    for class 0 and "FAQ" for class 1. An empty input yields an empty list.
  """
  # Materialise first: the sentences are traversed twice (to build the
  # examples and again to pair them with predictions).
  sentences = list(in_sentences)
  if not sentences:
    # Nothing to classify; avoid building an input pipeline with no features.
    return []

  labels = ["Not FAQ", "FAQ"]
  # label=0 is a placeholder: the true label is unknown at prediction time.
  input_examples = [run_classifier.InputExample(guid="", text_a=sentence, text_b=None, label=0) for sentence in sentences]
  input_features = run_classifier.convert_examples_to_features(input_examples, label_list, MAX_SEQ_LENGTH, tokenizer)
  predict_input_fn = run_classifier.input_fn_builder(features=input_features, seq_length=MAX_SEQ_LENGTH, is_training=False, drop_remainder=False)
  predictions = estimator.predict(predict_input_fn)
  return [(sentence, prediction['probabilities'], labels[prediction['labels']]) for sentence, prediction in zip(sentences, predictions)]

Creating a list of input test sentences

In [None]:
# Question texts of the test set, as a plain Python list, in row order.
pred_sentences = test["Question Title"].tolist()

Getting predictions

In [None]:
# One (sentence, scores, label name) tuple per test question.
predictions = getPrediction(pred_sentences)

Creating Data Frame for predictions


In [None]:
# Assemble the comparison table: one row per test question, with its true and
# predicted class names. `predictions` holds (sentence, scores, label name)
# tuples in the same order as the rows of `test`.
pred = pd.DataFrame({
    "Question Title": [sentence for sentence, _, _ in predictions],
    # Map the numeric labels to names on a copy. The earlier in-place replace
    # on test["Remarks"] rewrote the shared test labels (so re-running the
    # evaluation would see strings) and relied on chained in-place mutation,
    # which pandas deprecates. Values are taken positionally to match the
    # prediction order.
    "Actual": test["Remarks"].replace({0: "Not FAQ", 1: "FAQ"}).tolist(),
    "Predicted": [label for _, _, label in predictions],
})

Exporting result to excel sheet

In [None]:
# Write the actual-vs-predicted table to an Excel workbook on Drive.
pred.to_excel("/content/gdrive/My Drive/Colab Notebooks/faq_bert_predictions.xlsx")