In [72]:
# Install dependencies with the %pip magic so they land in the environment of
# the running kernel (a bare `!pip` runs in a subshell and may target another one).
%pip install tensorflow
%pip install -q tf-models-official==2.4.0

In [73]:
import os

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm

import tensorflow as tf

import tensorflow_hub as hub
import tensorflow_datasets as tfds
tfds.disable_progress_bar()

from official.modeling import tf_utils
from official import nlp
from official.nlp import bert

# Load the required submodules
import official.nlp.optimization
import official.nlp.bert.bert_models
import official.nlp.bert.configs
import official.nlp.bert.run_classifier
import official.nlp.bert.tokenization
import official.nlp.data.classifier_data_lib
import official.nlp.modeling.losses
import official.nlp.modeling.models
import official.nlp.modeling.networks

In [74]:
# Cloud Storage folder of the pretrained BERT model; the vocabulary, config and
# checkpoint files are read from it later in the notebook.
gs_folder_bert = "gs://cloud-tpu-checkpoints/bert/v3/uncased_L-12_H-768_A-12"
# List the folder contents to confirm the files are reachable.
tf.io.gfile.listdir(gs_folder_bert)

In [75]:
# TF Hub handle for the same BERT architecture.
# NOTE(review): this name is not referenced by any other visible cell.
hub_url_bert = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3"

In [76]:
# Load the question/answer records from JSON (presumably pre-processed SQuAD
# data, judging by the file name -- confirm against the dataset source).
df = pd.read_json("../input/squad-process-2/squad_process_2.json")

In [77]:
# Peek at the first rows of the loaded frame.
df.head()

In [78]:
# Number of rows loaded.
len(df)

In [79]:
# Row count per distinct value of the "mask" column.
df["mask"].value_counts()

In [80]:
# Start df_new as an empty DataFrame.
df_new = pd.DataFrame()

In [81]:
# Creating new dataframe with labels that have adequate sampling for training:
# for every label listed in `keep`, retain at most MAX_PER_LABEL rows, taken in
# their original order.

keep = [["what"], ["who"], ["when"], ["which"], ["how", "many"], ["where"], ["how"], ["why"], ["how", "much"]]
MAX_PER_LABEL = 975  # upper bound on the number of rows kept per label

# Read the masks once, by position, so the comparison and the row selection
# below always refer to the same row.
masks = df["mask"].tolist()

selected_rows = []
for w in keep:
    matching = [pos for pos, mask in enumerate(masks) if mask == w]
    selected_rows.extend(matching[:MAX_PER_LABEL])

# Build the subset in a single step instead of growing a frame row by row:
# DataFrame.append was removed in pandas 2.0 and is quadratic in a loop, and
# rebuilding from `df` makes this cell safe to re-run. `.copy()` detaches the
# result from `df` so later edits to df_new cannot affect the source frame.
df_new = df.iloc[selected_rows].copy()

In [82]:
# Display df_new.
df_new

In [83]:
# Row count per distinct "mask" value in df_new.
df_new["mask"].value_counts()

In [84]:
# Number of rows in df_new.
len(df_new)

In [85]:
# Renumber the rows 0..n-1 and discard the old index labels, so the cells below
# can address rows by consecutive integers.
df_new = df_new.reset_index(drop=True)

PREPROCESSING DATA

In [86]:
# Peek at the first rows before preprocessing.
df_new.head()

In [87]:
# Formatting data so that the mask in question = "<qw>"
def _mask_question_words(question, mask):
    """Split `question` on whitespace and hide its question word(s).

    Scanning left to right, the first token found in `mask` is replaced by
    "<qw>". The next token found in `mask` (once "<qw>" is present) is removed
    and the scan stops, so a two-word mask collapses into a single "<qw>".

    Args:
        question: the question text (a string).
        mask: container of question-word tokens to hide.

    Returns:
        The list of tokens after masking.
    """
    tokens = question.split()
    # The range is fixed up front; this is safe because the only removal is
    # immediately followed by `break`.
    for pos in range(len(tokens)):
        if tokens[pos] in mask:
            if "<qw>" not in tokens:
                tokens[pos] = "<qw>"
            else:
                tokens.pop(pos)
                break
    return tokens


# Assign the whole column at once. Element-wise chained assignment
# (df_new["question"][i] = ...) may write to a temporary copy and also requires
# a 0..n-1 index; this form does neither.
df_new["question"] = pd.Series(
    [_mask_question_words(question, mask)
     for question, mask in zip(df_new["question"], df_new["mask"])],
    index=df_new.index,
    dtype=object,
)

In [88]:
# Placeholder column for the [question, answer] pairs. It is created with
# object dtype because Python lists are stored in it afterwards; a float64
# NaN column cannot reliably hold list values.
df_new["inputs"] = pd.Series(np.nan, index=df_new.index, dtype=object)

In [89]:
# Join the token lists back into space-separated strings, then store each
# row's [question, answer] pair in "inputs". Whole columns are assigned at once
# rather than through chained element-wise assignment
# (df_new[col][i] = ...), which may silently write to a temporary copy.
df_new["question"] = [" ".join(tokens) for tokens in df_new["question"]]
df_new["answers"] = [" ".join(tokens) for tokens in df_new["answers"]]
df_new["inputs"] = pd.Series(
    [[question, answer]
     for question, answer in zip(df_new["question"], df_new["answers"])],
    index=df_new.index,
    dtype=object,
)

In [90]:
# Collect the distinct masks in order of first appearance. A list with
# `not in` is used (rather than a set) so unhashable mask values still work.
labels_unique = []
for mask in df_new["mask"]:
    if mask not in labels_unique:
        labels_unique.append(mask)

# Class ids are simply the positions in labels_unique.
idx = list(range(len(labels_unique)))
num = len(labels_unique)

# Lookup table of shape (n_labels, 2): column 0 is the class id, column 1 the
# mask itself. It is filled cell by cell as an object array because np.stack
# cannot combine the id list with masks of differing lengths (ragged input is
# rejected by current NumPy), and equally sized masks would be expanded into
# extra dimensions instead of being kept as single cells.
labels_idx = np.empty((len(labels_unique), 2), dtype=object)
for class_id, mask in zip(idx, labels_unique):
    labels_idx[class_id, 0] = class_id
    labels_idx[class_id, 1] = mask

In [91]:
# Show the (class id, mask) lookup table.
labels_idx

In [92]:
# Turning masks into numerical categories
def _mask_to_class_id(mask):
    """Return the class id paired with `mask` in labels_idx.

    The first matching row wins. A value with no match (for example a mask
    that was already converted on an earlier run) is returned unchanged.
    """
    for class_id, label in labels_idx:
        if mask == label:
            return class_id
    return mask


# Replace the whole column in one assignment; chained element-wise assignment
# (df_new["mask"][i] = ...) may silently write to a temporary copy.
df_new["mask"] = [_mask_to_class_id(mask) for mask in df_new["mask"]]

In [93]:
# Peek at the first rows after preprocessing.
df_new.head()

In [94]:
# Hold out 10% of the rows for testing; the fixed random_state keeps the split
# reproducible between runs.
X_train_og, X_test_og, y_train_og, y_test_og = train_test_split(
    df_new["inputs"],
    df_new["mask"],
    test_size=0.1,
    random_state=42,
)

In [95]:
# Report the size of each split: train/test inputs, then train/test labels.
for split in (X_train_og, X_test_og, y_train_og, y_test_og):
    print(split.shape)

In [96]:
# Unpack the training [question, answer] pairs into parallel lists and attach
# the matching labels.
X_train = {
    "questions": [pair[0] for pair in X_train_og],
    "answers": [pair[1] for pair in X_train_og],
    "labels": [label for label in y_train_og],
}

In [97]:
# Unpack the test [question, answer] pairs into parallel lists and attach the
# matching labels.
X_test = {
    "questions": [pair[0] for pair in X_test_og],
    "answers": [pair[1] for pair in X_test_og],
    "labels": [label for label in y_test_og],
}

In [98]:
# Tabular view of the training split (questions, answers, labels).
df_train = pd.DataFrame(X_train)
df_train

In [99]:
# Tabular view of the test split (questions, answers, labels).
df_test = pd.DataFrame(X_test)
df_test

In [100]:
# Print the first training example, one field per line.
for field in X_train:
    first_value = X_train[field][0]
    print(f"{field:9s}: {first_value}")

In [101]:
# Print the first test example, one field per line.
for field in X_test:
    first_value = X_test[field][0]
    print(f"{field:9s}: {first_value}")

BERT tokenizer

In [102]:
# Set up tokenizer to generate Tensorflow dataset.
# The vocabulary file lives next to the pretrained checkpoint; input text is
# lower-cased by the tokenizer (do_lower_case=True).
vocab_path = os.path.join(gs_folder_bert, "vocab.txt")
tokenizer = bert.tokenization.FullTokenizer(
    vocab_file=vocab_path,
    do_lower_case=True,
)

print("Vocab size:", len(tokenizer.vocab))

In [103]:
# Quick check of the tokenizer: split a sample sentence into tokens, then map
# those tokens to their vocabulary ids.
sample_text = "Hello TensorFlow!"
tokens = tokenizer.tokenize(sample_text)
print(tokens)
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

Encode the sentences

In [104]:
# Look up the vocabulary ids of the special [CLS] and [SEP] tokens.
tokenizer.convert_tokens_to_ids(['[CLS]', '[SEP]'])

In [105]:
# Convert the question and answer lists of both splits to tensors, in place.
for split in (X_train, X_test):
    for field in ("questions", "answers"):
        split[field] = tf.convert_to_tensor(split[field])

In [106]:
# Inspect the training questions after conversion to a tensor.
X_train["questions"]

In [107]:
def encode_sentence(s):
   """Tokenize one tensor element and return its ids, ending with [SEP].

   `s` must expose `.numpy()` (e.g. an element of a tensor); the module-level
   `tokenizer` is used for both tokenization and the id lookup.

   NOTE(review): a later cell defines `encode_sentence(s, tokenizer)`, which
   replaces this one-argument version once that cell has run.
   """
   pieces = [*tokenizer.tokenize(s.numpy()), '[SEP]']
   return tokenizer.convert_tokens_to_ids(pieces)

# Encode every training question and answer. Rows have different lengths, so
# the id lists are stored as ragged tensors.
question_ids = [encode_sentence(s) for s in X_train["questions"]]
questions = tf.ragged.constant(question_ids)

answer_ids = [encode_sentence(s) for s in X_train["answers"]]
answers = tf.ragged.constant(answer_ids)

In [108]:
# Shapes of the ragged encodings. The first line reports the questions tensor,
# so it is labelled accordingly (it was previously mislabelled "answers").
print("questions shape:", questions.shape.as_list())
print("answers shape:", answers.shape.as_list())

In [109]:
# Token ids of the first encoded question.
questions[0]

In [110]:
# One [CLS] id per example, to be prepended to every row.
cls = [tokenizer.convert_tokens_to_ids(['[CLS]'])]*questions.shape[0]
# Row layout: [CLS] + question + answer. `questions` must be part of the
# concatenation so the word ids line up with the token-type ids, which are
# built from the same three pieces (cls, questions, answers).
input_word_ids = tf.concat([cls, questions, answers], axis=-1)
# Visualise the padded id matrix.
_ = plt.pcolormesh(input_word_ids.to_tensor())

In [111]:
# Mask with 1 at every real token position; converting the ragged mask to a
# dense tensor pads the remaining positions.
ragged_mask = tf.ones_like(input_word_ids)
input_mask = ragged_mask.to_tensor()

plt.pcolormesh(input_mask)

In [112]:
# Segment ids: 0 for [CLS] and the question tokens, 1 for the answer tokens.
type_cls, type_s1, type_s2 = (
    tf.zeros_like(cls),
    tf.zeros_like(questions),
    tf.ones_like(answers),
)
ragged_type_ids = tf.concat([type_cls, type_s1, type_s2], axis=-1)
input_type_ids = ragged_type_ids.to_tensor()

plt.pcolormesh(input_type_ids)

In [113]:
def encode_sentence(s, tokenizer):
   """Tokenize `s` with `tokenizer` and return the token ids, ending with [SEP].

   Args:
     s: the text to encode, passed unchanged to `tokenizer.tokenize`.
     tokenizer: object providing `tokenize` and `convert_tokens_to_ids`.

   Returns:
     Whatever `tokenizer.convert_tokens_to_ids` returns for the tokens of `s`
     followed by the '[SEP]' token.
   """
   pieces = [*tokenizer.tokenize(s), '[SEP]']
   return tokenizer.convert_tokens_to_ids(pieces)

def bert_encode(data, tokenizer):
  """Encode question/answer pairs into the three dense BERT input tensors.

  Every row is laid out as [CLS] + question + [SEP] + answer + [SEP]; the
  [SEP] tokens are appended by `encode_sentence`.

  Args:
    data: mapping with "questions" and "answers" entries. Each entry is
      converted with `np.array` and encoded element by element.
    tokenizer: tokenizer passed to `encode_sentence`; also used to look up
      the id of '[CLS]'.

  Returns:
    A dict of dense tensors (ragged rows are padded by `to_tensor()`):
      'input_word_ids': token ids of each row.
      'input_mask': 1 at real token positions.
      'input_type_ids': 0 for [CLS] and the question, 1 for the answer.

  NOTE(review): rows are not truncated here, so very long inputs may exceed
  the maximum sequence length the model accepts -- confirm against the data.
  """
  questions = tf.ragged.constant([
      encode_sentence(s, tokenizer)
      for s in np.array(data["questions"])])
  answers = tf.ragged.constant([
      encode_sentence(s, tokenizer)
       for s in np.array(data["answers"])])

  # One [CLS] id per example, prepended to every row.
  cls = [tokenizer.convert_tokens_to_ids(['[CLS]'])]*questions.shape[0]
  input_word_ids = tf.concat([cls, questions, answers], axis=-1)

  input_mask = tf.ones_like(input_word_ids).to_tensor()

  type_cls = tf.zeros_like(cls)
  type_s1 = tf.zeros_like(questions)
  type_s2 = tf.ones_like(answers)
  input_type_ids = tf.concat(
      [type_cls, type_s1, type_s2], axis=-1).to_tensor()

  inputs = {
      'input_word_ids': input_word_ids.to_tensor(),
      'input_mask': input_mask,
      'input_type_ids': input_type_ids}

  return inputs

In [114]:
# Inspect the training answers tensor.
X_train["answers"]

In [115]:
# Encode both splits into BERT inputs and turn their labels into int64 tensors.
train_data = bert_encode(X_train, tokenizer)
train_labels = tf.convert_to_tensor(
    np.asarray(X_train["labels"]).astype('int64'))

test_data = bert_encode(X_test, tokenizer)
test_labels = tf.convert_to_tensor(
    np.asarray(X_test["labels"]).astype('int64'))

In [116]:
# Print the shapes of the three input tensors and of the labels, first for the
# training split and then for the test split.
for encoded, labels in ((train_data, train_labels), (test_data, test_labels)):
    for field in ("input_word_ids", "input_mask", "input_type_ids"):
        print(encoded[field].shape)
    print(labels.shape)

The Model

In [117]:
import json

# Read the hyper-parameters of the pretrained model from its config file.
bert_config_file = os.path.join(gs_folder_bert, "bert_config.json")
# Open the file in a `with` block so the handle is closed once it has been read.
with tf.io.gfile.GFile(bert_config_file) as config_file:
    config_dict = json.loads(config_file.read())

bert_config = bert.configs.BertConfig.from_dict(config_dict)

config_dict

In [118]:
# Number of distinct labels, i.e. the number of classes to predict.
print(len(labels_unique))

In [119]:
# Build the classifier together with its encoder; the classifier has one
# output per distinct label.
num_classes = len(labels_unique)
bert_classifier, bert_encoder = bert.bert_models.classifier_model(
    bert_config,
    num_labels=num_classes,
)

In [120]:
# Draw the classifier's layer graph with tensor shapes.
tf.keras.utils.plot_model(bert_classifier, show_shapes=True, dpi=48)

In [121]:
# Smoke test: run the first 10 training examples through the classifier and
# show the raw outputs as a NumPy array.
data_batch = {}
for key, val in train_data.items():
    data_batch[key] = val[:10]

batch_output = bert_classifier(data_batch, training=True)
batch_output.numpy()

In [122]:
# Draw the encoder's layer graph with tensor shapes.
tf.keras.utils.plot_model(bert_encoder, show_shapes=True, dpi=48)

In [123]:
# Restore the pretrained weights into the encoder; assert_consumed() raises if
# any value stored in the checkpoint was not matched to the encoder.
checkpoint_path = os.path.join(gs_folder_bert, 'bert_model.ckpt')
checkpoint = tf.train.Checkpoint(encoder=bert_encoder)
checkpoint.read(checkpoint_path).assert_consumed()

Set up the optimizer

In [124]:
# Set up epochs and steps
epochs = 10
batch_size = 32
eval_batch_size = 32

# Derive the schedule lengths from the data size; the first 10% of the
# optimisation steps are warm-up steps.
# NOTE(review): steps_per_epoch rounds down, whereas fit() presumably also runs
# the final partial batch -- confirm whether num_train_steps should round up.
train_data_size = len(train_labels)
steps_per_epoch = train_data_size // batch_size
num_train_steps = epochs * steps_per_epoch
warmup_steps = int(epochs * train_data_size * 0.1 / batch_size)

# creates an optimizer with learning rate schedule
initial_learning_rate = 2e-5
optimizer = nlp.optimization.create_optimizer(
    initial_learning_rate,
    num_train_steps=num_train_steps,
    num_warmup_steps=warmup_steps,
)

In [125]:
# Show which optimizer class was created.
type(optimizer)

Train the model

In [126]:
# Labels are integer class ids and the model outputs logits, hence the sparse
# accuracy metric and sparse cross-entropy with from_logits=True.
metrics = [tf.keras.metrics.SparseCategoricalAccuracy('accuracy', dtype=tf.float32)]
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

bert_classifier.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=metrics)

# Train with the same `batch_size` that was used to derive the optimizer's
# step counts, so the two cannot drift apart if the value is changed.
bert_classifier.fit(
      train_data, train_labels,
      validation_data=(test_data, test_labels),
      batch_size=batch_size, epochs=epochs)

In [129]:
# Classify one hand-written example and show the mask predicted for "<qw>".
example = {
        'questions':[
            '<qw> is your plane now?.'],
        'answers':[
            'in London.']
    }

my_examples = bert_encode(example, tokenizer)
result = bert_classifier(my_examples, training=False)
result = result.numpy()[0]
# argmax returns a single index (the first one on ties). The previous
# int(np.where(result == max)[0]) failed when two classes tied and relied on
# converting an array to a scalar.
pred_label = int(np.argmax(result))

labels_idx[pred_label][1]

## Save model

In [131]:
# Export the trained classifier in SavedModel format to a local directory.
export_dir='./saved_model'
tf.saved_model.save(bert_classifier, export_dir=export_dir)