# Text Extraction with BERT
Author: Apoorv Nandan
Date created: 2020/05/23
Last modified: 2020/05/23

https://keras.io/examples/nlp/text_extraction_with_bert/

In [1]:
import os
import re
import json
import string
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer, TFBertModel, BertConfig

max_len = 384
configuration = BertConfig()  # default parameters and configuration for BERT


In [2]:
from pandas.io.json import json_normalize

In [3]:
# Save the slow pretrained tokenizer
slow_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
save_path = "bert_base_uncased/"
if not os.path.exists(save_path):
    os.makedirs(save_path)
slow_tokenizer.save_pretrained(save_path)

# Load the fast tokenizer from saved file
tokenizer = BertWordPieceTokenizer("bert_base_uncased/vocab.txt", lowercase=True)


In [4]:
# train_data_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json"
# train_path = keras.utils.get_file("train.json", train_data_url)
eval_data_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json"
eval_path = keras.utils.get_file("eval.json", eval_data_url)


Downloading data from https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json


In [5]:
# def load_data(file):

#     lines = []
#     with open(file, 'rb') as json_file:
#         for json_line in json_file:
#             lines.append(json.loads(json_line))
#         data = json_normalize(lines)
#         data.columns = data.columns.map(lambda x: x.split(".")[-1])
#     return data

# mytrain = load_data('../dataset/train_rand_split.jsonl')
# mydev = load_data('../dataset/dev_rand_split.jsonl')


In [6]:

class SquadExample:
    def __init__(self, question, context, start_char_idx, answer_text, all_answers):
        self.question = question
        self.context = context
        self.start_char_idx = start_char_idx
        self.answer_text = answer_text
        self.all_answers = all_answers
        self.skip = False

    def preprocess(self):
        context = self.context
        question = self.question
        answer_text = self.answer_text
        start_char_idx = self.start_char_idx

        # Clean context, answer and question
        context = " ".join(str(context).split())
        question = " ".join(str(question).split())
        answer = " ".join(str(answer_text).split())

        # Find end character index of answer in context
        end_char_idx = start_char_idx + len(answer)
        if end_char_idx >= len(context):
            self.skip = True
            return

        # Mark the character indexes in context that are in answer
        is_char_in_ans = [0] * len(context)
        for idx in range(start_char_idx, end_char_idx):
            is_char_in_ans[idx] = 1

        # Tokenize context
        tokenized_context = tokenizer.encode(context)

        # Find tokens that were created from answer characters
        ans_token_idx = []
        for idx, (start, end) in enumerate(tokenized_context.offsets):
            if sum(is_char_in_ans[start:end]) > 0:
                ans_token_idx.append(idx)

        if len(ans_token_idx) == 0:
            self.skip = True
            return

        # Find start and end token index for tokens from answer
        start_token_idx = ans_token_idx[0]
        end_token_idx = ans_token_idx[-1]

        # Tokenize question
        tokenized_question = tokenizer.encode(question)

        # Create inputs
        input_ids = tokenized_context.ids + tokenized_question.ids[1:]
        token_type_ids = [0] * len(tokenized_context.ids) + [1] * len(
            tokenized_question.ids[1:]
        )
        attention_mask = [1] * len(input_ids)

        # Pad and create attention masks.
        # Skip if truncation is needed
        padding_length = max_len - len(input_ids)
        if padding_length > 0:  # pad
            input_ids = input_ids + ([0] * padding_length)
            attention_mask = attention_mask + ([0] * padding_length)
            token_type_ids = token_type_ids + ([0] * padding_length)
        elif padding_length < 0:  # skip
            self.skip = True
            return

        self.input_ids = input_ids
        self.token_type_ids = token_type_ids
        self.attention_mask = attention_mask
        self.start_token_idx = start_token_idx
        self.end_token_idx = end_token_idx
        self.context_token_to_char = tokenized_context.offsets



def create_squad_examples(raw_data):
    squad_examples = []
    for item in raw_data["data"]:
        for para in item["paragraphs"]:
            context = para["context"]
            for qa in para["qas"]:
                question = qa["question"]
                answer_text = qa["answers"][0]["text"]
                all_answers = [_["text"] for _ in qa["answers"]]
                start_char_idx = qa["answers"][0]["answer_start"]
                squad_eg = SquadExample(
                    question, context, start_char_idx, answer_text, all_answers
                )
                squad_eg.preprocess()
                squad_examples.append(squad_eg)
    return squad_examples


def create_inputs_targets(squad_examples):
    dataset_dict = {
        "input_ids": [],
        "token_type_ids": [],
        "attention_mask": [],
        "start_token_idx": [],
        "end_token_idx": [],
    }
    for item in squad_examples:
        if item.skip == False:
            for key in dataset_dict:
                dataset_dict[key].append(getattr(item, key))
    for key in dataset_dict:
        dataset_dict[key] = np.array(dataset_dict[key])

    x = [
        dataset_dict["input_ids"],
        dataset_dict["token_type_ids"],
        dataset_dict["attention_mask"],
    ]
    y = [dataset_dict["start_token_idx"], dataset_dict["end_token_idx"]]
    return x, y



In [7]:

# with open(train_path) as f:
#     raw_train_data = json.load(f)

with open(eval_path) as f:
    raw_eval_data = json.load(f)



In [8]:
raw_eval_data.keys()

dict_keys(['data', 'version'])

In [9]:
len(raw_eval_data["data"])

48

In [10]:
# raw_eval_data["data"][2]

In [11]:
data_copy = {}
data_copy["version"] = raw_eval_data["version"]
data_copy["data"] = raw_eval_data["data"][0:1]

In [12]:

# train_squad_examples = create_squad_examples(raw_train_data)
# x_train, y_train = create_inputs_targets(train_squad_examples)
# print(f"{len(train_squad_examples)} training points created.")

# eval_squad_examples = create_squad_examples(raw_eval_data)
# x_eval, y_eval = create_inputs_targets(eval_squad_examples)
# print(f"{len(eval_squad_examples)} evaluation points created.")


train_squad_examples = create_squad_examples(data_copy)
x_train, y_train = create_inputs_targets(train_squad_examples[0:10])
print(f"{len(train_squad_examples)} training points created.")


810 training points created.


In [13]:
print(len(x_train))
print(len(x_train[0]))
print(len(x_train[1]))
print(len(x_train[2]))

3
10
10
10


In [14]:
print(len(y_train))
print(len(y_train[0]))
print(len(y_train[1]))

2
10
10


In [15]:
print(type(x_train))
print(type(x_train[0]))
print(type(x_train[1]))
print(type(x_train[2]))

<class 'list'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [16]:
print(type(x_train))
print((x_train[0].shape))
print((x_train[1].shape))
print((x_train[2].shape))

<class 'list'>
(10, 384)
(10, 384)
(10, 384)


In [21]:
# Copy paste from https://keras.io/examples/nlp/text_extraction_with_bert/
def create_model():
    ## BERT encoder
    encoder = TFBertModel.from_pretrained("distilbert-base-uncased") # To cope with OOM error, changed it to distilbert

    ## QA Model
    input_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
    token_type_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
    attention_mask = layers.Input(shape=(max_len,), dtype=tf.int32)
    embedding = encoder(
        input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask
    )[0]

    start_logits = layers.Dense(1, name="start_logit", use_bias=False)(embedding)
    start_logits = layers.Flatten()(start_logits)

    end_logits = layers.Dense(1, name="end_logit", use_bias=False)(embedding)
    end_logits = layers.Flatten()(end_logits)

    start_probs = layers.Activation(keras.activations.softmax)(start_logits)
    end_probs = layers.Activation(keras.activations.softmax)(end_logits)

    model = keras.Model(
        inputs=[input_ids, token_type_ids, attention_mask],
        outputs=[start_probs, end_probs],
    )
    loss = keras.losses.SparseCategoricalCrossentropy(from_logits=False)
    optimizer = keras.optimizers.Adam(lr=5e-5)
    model.compile(optimizer=optimizer, loss=[loss, loss])
    return model

# def create_model():
#     ## BERT encoder
#     encoder = TFBertModel.from_pretrained("bert-base-uncased")

#     ## QA Model
#     input_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
#     token_type_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
#     attention_mask = layers.Input(shape=(max_len,), dtype=tf.int32)
#     embedding = encoder(
#         input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask
#     )[0]

# #     start_logits = layers.Dense(1, name="start_logit", use_bias=False)(embedding)
# #     start_logits = layers.Flatten()(start_logits)

# #     end_logits = layers.Dense(1, name="end_logit", use_bias=False)(embedding)
# #     end_logits = layers.Flatten()(end_logits)

# #     start_probs = layers.Activation(keras.activations.softmax)(start_logits)
# #     end_probs = layers.Activation(keras.activations.softmax)(end_logits)
    
#     # Feed inputs through the bert model, 
#     # then take just the vector associated with first token [CLS]
#     bert_cls_output = embedding[0]
    
#     # These are the layers that come after Bert.
#     dense = tf.keras.layers.Dense(256, activation='relu', name='dense')(bert_cls_output)
        
#     # Output layer to predict correct answer. 
#     # For the future, we may modify it to choose the max candidate answer of each question
#     # for now, just predict from 0 to 1 whether this looks like a correct answer. 
#     pred = tf.keras.layers.Dense(1, activation='sigmoid', name='correct')(dense)
    

#     model = keras.Model(
#         inputs=[input_ids, token_type_ids, attention_mask],
#         outputs=pred,
#     )
#     loss = "binary_crossentropy"
#     optimizer = keras.optimizers.Adam(lr=5e-5)
#     model.compile(optimizer=optimizer, loss=loss)
#     return model



In [22]:
model = create_model()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=363423424.0, style=ProgressStyle(descri…




ResourceExhaustedError: OOM when allocating tensor with shape[30522,768] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:TruncatedNormal]

In [23]:
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))


Num GPUs Available:  1


In [None]:
model.fit(x_train, y_train)



In [1]:
print()


