# **Importing the DistilBERT tokenizer**

In [None]:
from transformers import AutoTokenizer
# Hub identifier of the pretrained checkpoint whose tokenizer is loaded.
tokenizer_checkpoint = "distilbert/distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

# **Import the dataset and reframe it as per the goal of the project**

In [None]:
import pandas as pd

# Load the nested JSON file; each entry of df['data'] is one article holding
# 'title' and 'paragraphs' -> 'qas' -> 'answers'.
df = pd.read_json('data.json')

# Flatten the nesting into parallel columns: one row per
# (article, paragraph, question, answer) combination.
dataset = {'title': [], 'paragraph': [], 'context': [], 'question': [], 'answer': [], 'answer_start': []}
for article in df['data']:
  # 'paragraph' stores the paragraph's position inside its own article.
  for paragraph_count, paragraph in enumerate(article['paragraphs']):
    for qa in paragraph['qas']:
      # Questions with an empty 'answers' list contribute no rows.
      for answer in qa['answers']:
        dataset['title'].append(article['title'])
        dataset['paragraph'].append(paragraph_count)
        dataset['context'].append(paragraph['context'])
        dataset['question'].append(qa['question'])
        dataset['answer'].append(answer['text'])
        dataset['answer_start'].append(answer['answer_start'])

# Tabular view of the flattened records.
data = pd.DataFrame(dataset)

# Per-example dictionaries in the shape expected by the pre-processing step.
train = [
    {
        'context': context_text,
        'question': question_text,
        'answer': {
            'text': answer_text,
            'answer_start': answer_start},
    }
    for context_text, question_text, answer_text, answer_start in zip(
        dataset['context'], dataset['question'], dataset['answer'], dataset['answer_start'])
]

train[0]

# **Function to pre-process the training data**

In [None]:
# We need the input as follows ==> [<START> "Question here..." <SEP> "Context here..." <END>]
# And the output should be like ==> [{start-of-answer: <INT>, end-of-answer: <INT>}]

def preprocess(list_items):
  """Tokenize one question/context pair and attach the answer's token span.

  Args:
    list_items: dict with keys 'question' (str), 'context' (str) and 'answer',
      where 'answer' is a dict holding 'text' (str) and 'answer_start' (the
      character offset of the answer inside the context).

  Returns:
    The tokenizer output with 'offset_mapping' removed and two integer entries
    added: 'start_positions' and 'end_positions', the indices of the first and
    last answer tokens. Both are 0 when the answer is not completely contained
    in the (possibly truncated) context.
  """
  # Tokenizing the question and context
  # Pointers:
  # 1. max_length: To restrict the "question + context" length to 384
  # 2. truncation: To truncate from the "context" part off the token if length exceeds 384
  # 3. return_offsets_mapping: offset_mapping is ==> String: "This is a sentence" --> Token: [<START>, 0, 1, 2, 3, <END>] --> offset_list: [(0, 0), (0, 4), (5, 7), (8, 9), (10, 17), (18, 18)]
  inputs = tokenizer(
        list_items['question'],
        list_items["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

  # The code below is to store the start and end positions of the answers
  ##############################################################################
  sequence_ids = inputs.sequence_ids(0)

  # The offsets are only needed to build the labels, so drop them from the output.
  offset_mapping = inputs.pop("offset_mapping")
  answer = list_items["answer"]

  # Character span of the answer inside the context: [start_char, end_char)
  start_char = answer["answer_start"]
  end_char = start_char + len(answer["text"])

  # Find the start and end of the context (the tokens whose sequence id is 1)
  idx = 0
  while sequence_ids[idx] != 1:
    idx += 1
  context_start = idx
  while sequence_ids[idx] == 1:
    idx += 1
  context_end = idx - 1

  # If the answer is not fully inside the context, label it (0, 0).
  # The answer must begin at or after the first context character and finish at
  # or before the last one; an answer cut in half by truncation is rejected too,
  # otherwise its end label would land on the special token after the context.
  if offset_mapping[context_start][0] > start_char or offset_mapping[context_end][1] < end_char:
    start_positions = 0
    end_positions = 0
  else:
    # Otherwise it's the start and end token positions
    # Last context token that begins at or before start_char.
    idx = context_start
    while idx <= context_end and offset_mapping[idx][0] <= start_char:
        idx += 1
    start_positions = (idx - 1)

    # First context token (scanning backwards) that ends at or after end_char.
    idx = context_end
    while idx >= context_start and offset_mapping[idx][1] >= end_char:
      idx -= 1
    end_positions = (idx + 1)

  inputs["start_positions"] = start_positions
  inputs["end_positions"] = end_positions
  ##############################################################################

  return inputs

# **Using 'map' to apply the function defined above to every training example**

In [None]:
# Lazily run preprocess over every training example, then materialise the
# results so they can be iterated more than once.
encoded_examples = map(preprocess, train)
tokenized_items = list(encoded_examples)

# **Importing the model and preparing the training data**

In [None]:
from transformers import TFAutoModelForQuestionAnswering
# Hub identifier of the pretrained checkpoint the question-answering model starts from.
model_checkpoint = "distilbert/distilbert-base-uncased"
model = TFAutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

# **Preparing a dictionary of the training data**

In [None]:
import numpy as np

# Model inputs: one array per feature, with one row per tokenized example.
X = {
    'input_ids': np.array([np.array(item['input_ids']) for item in tokenized_items]),
    'attention_mask': np.array([np.array(item['attention_mask']) for item in tokenized_items]),
}

# Training labels: the answer's first and last token index for each example.
Y = {
    'start_positions': np.array([item['start_positions'] for item in tokenized_items]),
    'end_positions': np.array([item['end_positions'] for item in tokenized_items]),
}

# **Creating the optimizer before training**

In [None]:
from transformers import create_optimizer

batch_size = 16
num_epochs = 1
# Ceiling division: the final, partially filled batch is an optimizer step too.
# Floor division would undercount the schedule length and give 0 steps for a
# dataset smaller than one batch.
steps_per_epoch = (len(tokenized_items) + batch_size - 1) // batch_size
total_train_steps = steps_per_epoch * num_epochs
# The learning-rate schedule is sized from total_train_steps, so that count
# should match the number of steps training actually performs.
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=total_train_steps,
)

# **Compiling the model created**

In [None]:
import tensorflow as tf
# Attach the optimizer to the model. No `loss` argument is passed here —
# presumably the model falls back to its own built-in loss; TODO confirm.
model.compile(optimizer=optimizer)
# Print the model's layer / parameter summary.
model.summary()

# **Training**

In [None]:
# Train with the same batch size and epoch count that were used to size the
# learning-rate schedule (total_train_steps); leaving batch_size unset would make
# Keras use its own default and the schedule would no longer match the run.
model.fit(x=X, y=Y, batch_size=batch_size, epochs=num_epochs)

# **Validating the output**

In [None]:
# Sample passage the model is queried against in the loop below.
context = """
The hall housed multiple classrooms and science labs needed for early research at the university. In 1919 Father James Burns became president of
Notre Dame, and in three years he produced an academic revolution that brought the school up to national standards by adopting the elective system
and moving away from the university's traditional scholastic and classical emphasis. By contrast, the Jesuit colleges, bastions of academic
conservatism, were reluctant to move to a system of electives. Their graduates were shut out of Harvard Law School for that reason. Notre Dame
continued to grow over the years, adding more colleges, programs, and sports teams. By 1921, with the addition of the College of Commerce, Notre Dame
had grown from a small college to a university with five colleges and a professional law school. The university continued to expand and add new
residence halls and buildings with each subsequent president. One of the main driving forces in the growth of the University was its football team,
the Notre Dame Fighting Irish. Knute Rockne became head coach in 1918. Under Rockne, the Irish would post a record of 105 wins, 12 losses, and five
ties. During his 13 years the Irish won three national championships, had five undefeated seasons, won the Rose Bowl in 1925, and produced players
such as George Gipp and the "Four Horsemen". Knute Rockne has the highest winning percentage (.881) in NCAA Division I/FBS football history. Rockne's
offenses employed the Notre Dame Box and his defenses ran a 7–2–2 scheme. The last game Rockne coached was on December 14, 1930 when he led a group of
Notre Dame all-stars against the New York Giants in New York City.
"""
# Questions asked about the passage above, one model query (or window sweep) each.
questions = ["What was the amount of wins Knute Rockne attained at Notre Dame while head coach?",
            "Over how many years did the change to national standards undertaken at Notre Dame in the early 20th century take place?"]

# Each sliding window covers WINDOW_SPAN + 1 consecutive words of the context.
WINDOW_SPAN = 100

# Split once; every window below is a slice of this list.
context_words = context.split()


def _score_passage(question, passage):
  """Query the model once and return (score, answer_text) for `passage`.

  `score` is the sum of the highest start logit and the highest end logit,
  kept as a float so that windows are compared without rounding.
  `answer_text` is the decoded token span between the two argmax positions
  (an empty string when the predicted end comes before the predicted start).
  """
  inputs = tokenizer(question, passage, return_tensors="tf")
  outputs = model(**inputs)
  answer_start_index = int(tf.math.argmax(outputs.start_logits, axis=-1)[0])
  answer_end_index = int(tf.math.argmax(outputs.end_logits, axis=-1)[0])
  # The logit at the argmax position is the maximum logit.
  score = (float(outputs.start_logits[0, answer_start_index])
           + float(outputs.end_logits[0, answer_end_index]))
  predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
  return score, tokenizer.decode(predict_answer_tokens)


for question in questions:
  if len(context_words) <= WINDOW_SPAN:
    # Short context (measured in words, like the windows): a single pass suffices.
    passages = [context]
  else:
    # Slide a (WINDOW_SPAN + 1)-word window over the context, one word at a time.
    passages = (
        " ".join(context_words[end - WINDOW_SPAN : end + 1])
        for end in range(WINDOW_SPAN, len(context_words))
    )

  # Keep the answer from the highest-scoring passage.
  ans = "Nothing to print!!"
  best_score = float('-inf')
  for passage in passages:
    score, candidate = _score_passage(question, passage)
    if best_score < score:
      best_score = score
      ans = candidate

  # Print once per question, after every passage has been scored.
  print(ans)