In [2]:
!pip install transformers

/bin/bash: /root/miniconda3/envs/qa_transformer/lib/libtinfo.so.6: no version information available (required by /bin/bash)
[0m

In [3]:
!pip install datasets
!pip install huggingface-hub

/bin/bash: /root/miniconda3/envs/qa_transformer/lib/libtinfo.so.6: no version information available (required by /bin/bash)
[0m/bin/bash: /root/miniconda3/envs/qa_transformer/lib/libtinfo.so.6: no version information available (required by /bin/bash)
[0m

In [4]:
!pip install cchardet

/bin/bash: /root/miniconda3/envs/qa_transformer/lib/libtinfo.so.6: no version information available (required by /bin/bash)
[0m

In [5]:
from datasets import load_dataset

datasets = load_dataset("squad")

In [6]:
print(datasets["train"][0])

{'id': '5733be284776f41900661182', 'title': 'University_of_Notre_Dame', 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.', 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?', 'answers': {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}}


In [7]:
from transformers import AutoTokenizer

model_checkpoint = 'distilbert-base-cased'

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [8]:
max_length = 384 # max length of a feature - context or question
doc_stride = (
    128 # the overlap between 2 consecutive truncated responses
)

In [9]:
def prepare_train_features(examples):
    # remove the white spaces from the exaple's question and context
    
    examples["question"] = [q.lstrip() for q in examples["question"]]
    examples["context"] = [c.lstrip() for c in examples["context"]]
    
    # tokenize the examples using the tokenizer with the overflowing tokens creating a new sample with an overlap with the previous sample
    
    tokenized_examples = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length"
    )
    
    # stores the mapping from a feature to the corresponding example that it was part of after truncating
    
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    
    # stores the starting and ending char position mapping for each token.
    
    offset_mapping = tokenized_examples.pop("offset_mapping")
    
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []
    
    for i, offsets in enumerate(offset_mapping):
        # we will label impossible answers with the CLS token index
        # impossible answer happens only when we have broken the context to multiple values (because of max length) and 
        # the answer does not lie in this portion of the brokem context 
        
        # all the input ids in the tokenized examples
        
        input_ids = tokenized_examples["input_ids"][i]
        
        # the index of the starting token in the tokenized input
        
        cls_index = input_ids.index(tokenizer.cls_token_id)
        
        # grab the sequence corresponding to that example (to know the context and the question)
        
        sequence_ids = tokenized_examples.sequence_ids(i)
        
        # one span of text from the context can be part of any example. the corresponding example is in the sample_mapping list
        # [0, 0, 0, 1, 2, 2, 2, 2, 2, 3, 4, 4, 5, 6, 7, 7, 7]
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]
        
        # if there are no answers to the question in the data file only then set the CLS as the answer
        
        if len(answers["answer_start"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
            
        else:
            # check the starting and the end character index of the answer
            
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])
            
            # start and end token index of this part of span
            
            token_start_index = 0
            while sequence_ids[token_start_index] != 1:
                token_start_index += 1
                
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != 1:
                token_end_index -= 1
                
            
            # detecting if the answer is out of this context, if yes then labeling the feature with CLS index
            
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
                
            else:
                # move the token_start_index and the token_end_index to the starting and the ending of the answer in the context
                
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                
                tokenized_examples["start_positions"].append(token_start_index-1)
                
                while token_end_index >= 0 and offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                    
                tokenized_examples["end_positions"].append(token_end_index+1)
            
            

    return tokenized_examples

In [10]:
tokenized_datasets = datasets.map(
    prepare_train_features,
    batched = True,
    remove_columns=datasets['train'].column_names,
    num_proc = 3
)

In [11]:
train_set = tokenized_datasets['train'].with_format("numpy")[:]
validation_set = tokenized_datasets['validation'].with_format("numpy")[:]

In [12]:
from transformers import TFAutoModelForQuestionAnswering

model = TFAutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

2023-07-31 15:26:51.827796: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-07-31 15:26:52.385686: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/root/miniconda3/envs/qa_transformer/lib/
2023-07-31 15:26:52.387646: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/root/miniconda3/envs/qa_transformer/lib/
2023-07-31 15:26:53.006841: I tensorflow/com

In [13]:
import tensorflow as tf
from tensorflow import keras

optimizer = keras.optimizers.Adam(learning_rate=5e-5)

In [14]:
keras.mixed_precision.set_global_policy("mixed_float16")

model.compile(optimizer=optimizer)

INFO:tensorflow:Mixed precision compatibility check (mixed_float16): OK
Your GPU will likely run quickly with dtype policy mixed_float16 as it has compute capability of at least 7.0. Your GPU: NVIDIA GeForce RTX 3080, compute capability 8.6


2023-07-31 15:31:05.662532: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:967] could not open file to read NUMA node: /sys/bus/pci/devices/0000:0a:00.0/numa_node
Your kernel may have been built without NUMA support.


In [17]:
model.fit(train_set, validation_data = validation_set, batch_size = 16, epochs=1)

2023-07-31 15:39:07.995155: I tensorflow/compiler/xla/service/service.cc:173] XLA service 0x6d7bcc10 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-07-31 15:39:07.995182: I tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (0): NVIDIA GeForce RTX 3080, Compute Capability 8.6
2023-07-31 15:39:07.998191: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-07-31 15:39:08.094824: I tensorflow/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2023-07-31 15:39:08.168629: I tensorflow/compiler/jit/xla_compilation_cache.cc:477] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.




<keras.callbacks.History at 0x7f29202cea00>

In [104]:
context = "The discovery of the United States is a complex topic, largely because the land was already inhabited by diverse groups of indigenous peoples for thousands of years prior to the arrival of European explorers. However, in the context of European history, the discovery of America is typically attributed to the Italian explorer Christopher Columbus. Commissioned by the Spanish monarchy, Columbus set sail in 1492 hoping to find a new trade route to Asia. Instead, he landed in the Bahamas in the Caribbean, marking the first sustained encounter between the peoples of the Eastern and Western hemispheres and beginning a period of extensive European exploration and eventual colonization of the Americas."
question = "Who discovered america?"

inputs = tokenizer([context], [question], return_tensors="np")
outputs = model(inputs)
start_position = tf.argmax(outputs.start_logits, axis=1)
end_position = tf.argmax(outputs.end_logits, axis=1)
print(int(start_position), int(end_position[0]))

57 58


In [105]:
answer = inputs["input_ids"][0, int(start_position) : int(end_position) + 1]
print(answer)

[4978 8555]


In [None]:
print(tokenizer.decode(answer))

In [113]:
model.push_to_hub("transformers-question-answer", organization="sashakttripathi")
tokenizer.push_to_hub("transformers-question-answer", organization="sashakttripathi")

tf_model.h5:   0%|          | 0.00/261M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/sashakttripathi/transformers-question-answer/commit/f0d3665cfb4f10adc5b9003a3ac0d41aa751127b', commit_message='Upload tokenizer', commit_description='', oid='f0d3665cfb4f10adc5b9003a3ac0d41aa751127b', pr_url=None, pr_revision=None, pr_num=None)