In [None]:
# Downloading the hugginface library and dataset module

!pip install transformers --quiet
!pip install datasets --quiet

# Importing libraries

import tensorflow as tf
import json
import operator
from keras.mixed_precision import set_global_policy
from keras.optimizers import Adam
import pandas as pd
from datasets import load_dataset
from transformers import AutoTokenizer, pipeline, TFAutoModelForQuestionAnswering

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Loading the dataset and print the first row
data = load_dataset('squad')
demo_df = pd.DataFrame(data['train'][0],
             columns = ['context', 'question', 'answers'])
print(f"The Context: \n{demo_df['context'][0]}")
print(f"The Question: \n{demo_df['question'][0]}")
print(f"The position of the answer: \n{demo_df['answers'][0][0]}")
print(f"The actual answer: \n{demo_df['answers'][1][0]}")

# Calling the model and tokenizer
model_name = 'distilroberta-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Hyperparameters

max_length = 400
stride = 100
batch_size = 32
learning_rate = 3e-5
epochs = 2

# Creating a function to get the index of last value of a sequence

def last_occurance(seq, val):
    last_element_id = len(seq) - operator.indexOf(reversed(seq), val) - 1
    return last_element_id

# Creating dataset funtion to vectorize dataset and create target variables

def create_dataset(data):
    data['question'] = [question.lstrip() for question in data['question']]
    data['context'] = [context.lstrip() for context in data['context']]

    # tokenizing the whole dataset based on context and questions
    tokenized_dataset = tokenizer(data['question'],
                                  data['context'],
                                  truncation = 'only_second',
                                  max_length = max_length,
                                  stride = stride,
                                  return_overflowing_tokens = True,
                                  return_offsets_mapping = True,
                                  padding = 'max_length')

    # Creating empty list for storing the starting and ending position of answers

    tokenized_dataset['start_positions'] = []
    tokenized_dataset['end_positions'] = []

    # Looping through all the sequences to find the starting and ending position

    for seq_id in range(len(tokenized_dataset['input_ids'])):

        seq_ids = tokenized_dataset.sequence_ids(seq_id)
        offset_maps = tokenized_dataset['offset_mapping'][seq_id]
        overflow_maps = tokenized_dataset['overflow_to_sample_mapping'][seq_id]

        # Getting the starting and ending index of the answer

        answer = data['answers'][overflow_maps]
        answer_text = answer['text'][0]
        answer_start = answer['answer_start'][0]
        answer_end = answer_start + len(answer_text)

        # Getting the current start and end index of string

        start_pos = seq_ids.index(1)
        end_pos = last_occurance(seq_ids, 1)

        # Searching the starting and ending position with loop

        starting_i = 0
        ending_j = 0

        if (offset_maps[start_pos][0] <= answer_start and
            offset_maps[end_pos][1] >= answer_end):

            i = start_pos

            while offset_maps[i][0] < answer_start:
                i += 1
            if offset_maps[i][0] == answer_start:
                starting_i = i
            else:
                starting_i = i - 1

            j = end_pos
            while offset_maps[j][1] > answer_end:
                j -= 1
            if offset_maps[j][1] == answer_end:
                ending_j = j
            else:
                ending_j = j + 1


        # Storing the start and end position of answer in the list

        tokenized_dataset['start_positions'].append(starting_i)
        tokenized_dataset['end_positions'].append(ending_j)

    return tokenized_dataset

# Creating the final tokenized dataset after removing tge train context part

tokenized_dataset = data.map(create_dataset,
                             batched = True,
                             remove_columns = data['train'].column_names)

# Removing the offset_mapping and overflow_to_sample_mapping columns as well

dataset = tokenized_dataset.remove_columns(['offset_mapping',
                                            'overflow_to_sample_mapping'])

# Creating train and validation dataset

train_dataset = dataset['train'].to_tf_dataset(batch_size = batch_size)
valid_dataset = dataset['validation'].to_tf_dataset(batch_size = batch_size)

# Finally downloading a pre-trained model

model = TFAutoModelForQuestionAnswering.from_pretrained(model_name) # the head is not trained yet for specific tasks

Downloading builder script:   0%|          | 0.00/5.27k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.36k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.67k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/8.12M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

The Context: 
Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.
The Question: 
To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?
The position of the answer: 
515
The actual answer: 
Saint Bernadette Soubirous


config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFRobertaForQuestionAnswering.

Some weights or buffers of the TF 2.0 model TFRobertaForQuestionAnswering were not initialized from the PyTorch model and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Getting global policy to run both 16 and 32 bit float tensors in same model

set_global_policy('mixed_float16')

# Compiling model with custom learning rate

model.compile(optimizer = Adam(learning_rate = learning_rate))

# Fitting model with the tokenized dataset

model.fit(train_dataset,
          validation_data = valid_dataset,
          epochs = epochs)

# Saving the model and its all weights

model.save('Question_Answer_system_transformer')
print('Model Saved!')

Epoch 1/2
Epoch 2/2
Model Saved!


In [None]:
# Creating a function for getting the answer from the context

def answer_generator(question,
                     context,
                     tokenizer = tokenizer,
                     model = model):

    input_seq = tokenizer([question], [context], return_tensors = 'np')

    output_seq = model(input_seq)

    # Getting the highest logits for greater probability
    starting_index = tf.argmax(output_seq.start_logits, axis = 1)
    ending_index = tf.argmax(output_seq.end_logits, axis = 1)
    answer_seq = input_seq['input_ids'][0, int(starting_index): int(ending_index) + 1]
    sentence = tokenizer.decode(answer_seq).strip()

    return sentence

context = '''
Hugging Face was founded in 2016 by Clément Delangue, Julien Chaumond, and
Thomas Wolf originally as a company that developed a chatbot app targeted at
teenagers. After open-sourcing the model behind the chatbot, the company
pivoted to focus on being a platform for democratizing machine learning. In March
2021, Hugging Face raised $40 million in a Series B funding round.
'''
question = "Who are the Hugging Face founders?"
answer = answer_generator(question = question,
                          context = context)
print(f'Answer: {answer}')

Answer: Clément Delangue, Julien Chaumond, and
Thomas Wolf


In [None]:
!wget /content/Question_Answer_system_transformer

/content/Question_Answer_system_transformer: Scheme missing.
