In [3]:
! pip install datasets transformers

Collecting datasets
[?25l  Downloading https://files.pythonhosted.org/packages/94/f8/ff7cd6e3b400b33dcbbfd31c6c1481678a2b2f669f521ad20053009a9aa3/datasets-1.7.0-py3-none-any.whl (234kB)
[K     |████████████████████████████████| 235kB 8.0MB/s 
[?25hCollecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d5/43/cfe4ee779bbd6a678ac6a97c5a5cdeb03c35f9eaebbb9720b036680f9a2d/transformers-4.6.1-py3-none-any.whl (2.2MB)
[K     |████████████████████████████████| 2.3MB 11.3MB/s 
Collecting xxhash
[?25l  Downloading https://files.pythonhosted.org/packages/7d/4f/0a862cad26aa2ed7a7cd87178cbbfa824fc1383e472d63596a0d018374e7/xxhash-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl (243kB)
[K     |████████████████████████████████| 245kB 50.6MB/s 
Collecting huggingface-hub<0.1.0
  Downloading https://files.pythonhosted.org/packages/32/a1/7c5261396da23ec364e296a4fb8a1cd6a5a2ff457215c6447038f18c0309/huggingface_hub-0.0.9-py3-none-any.whl
Collecting fsspec
[?25l  Downloading htt

**Using Squad 2 Dataset**

**Using Model: distilbert-base-uncased-distilled-squad and fine tuning**

In [2]:
squad_v2 = True
model_checkpoint = "distilbert-base-uncased-distilled-squad"
batch_size = 32

In [3]:
from datasets import load_dataset
datasets = load_dataset("squad_v2" if squad_v2 else "squad")


Reusing dataset squad_v2 (/root/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/ba48bc29b974701e9ba8d80ac94f3e3df924aba41b764dcf9851debea7c672e4)


In [5]:
datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 130319
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 11873
    })
})

## Due to Time Constrant Traning model on fewer row
**Meanwhile will be training on whole dataset**

In [6]:
datasets_train = datasets['train'].select(range(1,50000))
datasets_valid = datasets['validation'].select(range(1,4000))

In [7]:
datasets_valid

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 3999
})

In [8]:
datasets_train

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 49999
})

# **Pre Processing Dataset**

In [10]:
from transformers import AutoTokenizer
    
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [10]:
import transformers
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

In [14]:
pad_on_right = tokenizer.padding_side == "right"
max_length = 384 # The maximum length of a feature (question and context)
doc_stride = 128 # The authorized overlap between two part of the context when splitting it is needed.

In [13]:
def prepare_train_features(examples):
    # Tokenize our examples with truncation and padding, but keep the overflows using a stride. This results
    # in one example possible giving several features when a context is long, each of those features having a
    # context that overlaps a bit the context of the previous feature.
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # The offset mappings will give us a map from token to character position in the original context. This will
    # help us compute the start_positions and end_positions.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # Let's label those examples!
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        # We will label impossible answers with the index of the CLS token.
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]
        # If no answers are given, set the cls_index as answer.
        if len(answers["answer_start"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Start/end character index of the answer in the text.
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # Start token index of the current span in the text.
            token_start_index = 0
            while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
                token_start_index += 1

            # End token index of the current span in the text.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
                token_end_index -= 1

            # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                # Note: we could go after the last offset if the answer is the last word (edge case).
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

### Tokenizing Train and Validiation dataset for model training 

In [15]:
tokenized_datasets_train = datasets_train.map(prepare_train_features, batched=True, remove_columns=datasets_train.column_names)
tokenized_datasets_valid= datasets_valid.map(prepare_train_features, batched=True, remove_columns=datasets_valid.column_names)

HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




## **Model Tranining**

In [11]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

In [17]:
from transformers import default_data_collator

data_collator = default_data_collator

In [15]:
args = TrainingArguments(
    f"test-squad",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=4,
    weight_decay=0.01,
)

In [18]:
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets_train,
    eval_dataset=tokenized_datasets_valid,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [19]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.8729,1.267792
2,0.6729,1.397688
3,0.5173,1.660408
4,0.4305,1.900364


TrainOutput(global_step=6312, training_loss=0.6355679007990732, metrics={'train_runtime': 8021.0089, 'train_samples_per_second': 0.787, 'total_flos': 1525519407481344.0, 'epoch': 4.0, 'init_mem_cpu_alloc_delta': 2496548864, 'init_mem_gpu_alloc_delta': 266590720, 'init_mem_cpu_peaked_delta': 0, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': -67239936, 'train_mem_gpu_alloc_delta': 798102528, 'train_mem_cpu_peaked_delta': 175030272, 'train_mem_gpu_peaked_delta': 8645762048})

## **Model Evaluation**

In [20]:
import torch

for batch in trainer.get_eval_dataloader():
    break
batch = {k: v.to(trainer.args.device) for k, v in batch.items()}
with torch.no_grad():
    output = trainer.model(**batch)
output.keys()

odict_keys(['loss', 'start_logits', 'end_logits'])

In [21]:
output.start_logits.shape, output.end_logits.shape

(torch.Size([32, 384]), torch.Size([32, 384]))

In [22]:
output.start_logits.argmax(dim=-1), output.end_logits.argmax(dim=-1)

(tensor([ 37,  72,  80, 157,  17,  44,  13, 152, 202, 119,  52,  22,  31,  32,
         120, 252,  93,  90,   0,  90,  56,  88, 152,  23,   0,   0, 138,   0,
           0, 143,  23,  72], device='cuda:0'),
 tensor([ 40,  76,  81, 157,  19,  44, 101, 153, 204, 120,  52,  26,  33,  34,
         121, 252,  98,  91,   0,  91,  56,  94, 153,  24,   0,   0, 142,  16,
           0, 140,  23,  72], device='cuda:0'))

In [23]:
n_best_size = 20

**Sorting the valid_answers according to their score and only keep the best one.**

In [24]:
import numpy as np

start_logits = output.start_logits[0].cpu().numpy()
end_logits = output.end_logits[0].cpu().numpy()
# Gather the indices the best start/end logits:
start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
valid_answers = []
for start_index in start_indexes:
    for end_index in end_indexes:
        if start_index <= end_index: # We need to refine that test to check the answer is inside the context
            valid_answers.append(
                {
                    "score": start_logits[start_index] + end_logits[end_index],
                    "text": "" # We need to find a way to get back the original substring corresponding to the answer in the context
                }
            )

In [25]:
def prepare_validation_features(examples):
    # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
    # in one example possible giving several features when a context is long, each of those features having a
    # context that overlaps a bit the context of the previous feature.
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

    # We keep the example_id that gave us this feature and we will store the offset mappings.
    tokenized_examples["example_id"] = []

    for i in range(len(tokenized_examples["input_ids"])):
        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)
        context_index = 1 if pad_on_right else 0

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        tokenized_examples["example_id"].append(examples["id"][sample_index])

        # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token
        # position is part of the context or not.
        tokenized_examples["offset_mapping"][i] = [
            (o if sequence_ids[k] == context_index else None)
            for k, o in enumerate(tokenized_examples["offset_mapping"][i])
        ]

    return tokenized_examples

In [26]:
validation_features = datasets_valid.map(
    prepare_validation_features,
    batched=True,
    remove_columns=datasets_valid.column_names
)

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




In [27]:
raw_predictions = trainer.predict(validation_features)

In [28]:
max_answer_length = 30

In [29]:
validation_features.set_format(type=validation_features.format["type"], columns=list(validation_features.features.keys()))

In [30]:
start_logits = output.start_logits[0].cpu().numpy()
end_logits = output.end_logits[0].cpu().numpy()
offset_mapping = validation_features[0]["offset_mapping"]
# The first feature comes from the first example. For the more general case, we will need to be match the example_id to
# an example index
context = datasets_valid[0]["context"]

# Gather the indices the best start/end logits:
start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
valid_answers = []
for start_index in start_indexes:
    for end_index in end_indexes:
        # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond
        # to part of the input_ids that are not in the context.
        if (
            start_index >= len(offset_mapping)
            or end_index >= len(offset_mapping)
            or offset_mapping[start_index] is None
            or offset_mapping[end_index] is None
        ):
            continue
        # Don't consider answers with a length that is either < 0 or > max_answer_length.
        if end_index < start_index or end_index - start_index + 1 > max_answer_length:
            continue
        if start_index <= end_index: # We need to refine that test to check the answer is inside the context
            start_char = offset_mapping[start_index][0]
            end_char = offset_mapping[end_index][1]
            valid_answers.append(
                {
                    "score": start_logits[start_index] + end_logits[end_index],
                    "text": context[start_char: end_char]
                }
            )

valid_answers = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[:n_best_size]
valid_answers

[{'score': 17.294083, 'text': '10th and 11th centuries'},
 {'score': 14.917404, 'text': 'the 10th and 11th centuries'},
 {'score': 14.2580185, 'text': 'in the 10th and 11th centuries'},
 {'score': 13.998373, 'text': '10th and 11th'},
 {'score': 11.621695, 'text': 'the 10th and 11th'},
 {'score': 10.962309, 'text': 'in the 10th and 11th'},
 {'score': 10.142585, 'text': '10th'},
 {'score': 9.887374,
  'text': '10th and 11th centuries gave their name to Normandy'},
 {'score': 9.787367, 'text': '11th centuries'},
 {'score': 8.535857,
  'text': '10th and 11th centuries gave their name to Normandy, a region in France.'},
 {'score': 8.094827, 'text': 'centuries'},
 {'score': 7.7659073, 'text': 'the 10th'},
 {'score': 7.5106964,
  'text': 'the 10th and 11th centuries gave their name to Normandy'},
 {'score': 7.424391,
  'text': '10th and 11th centuries gave their name to Normandy,'},
 {'score': 7.1065216, 'text': 'in the 10th'},
 {'score': 7.016807, 'text': 'and 11th centuries'},
 {'score': 6.

In [31]:
datasets_valid[0]["answers"]

{'answer_start': [94, 87, 94, 94],
 'text': ['10th and 11th centuries',
  'in the 10th and 11th centuries',
  '10th and 11th centuries',
  '10th and 11th centuries']}

**Prediction Answer is matchning with orignal Answer**

In [32]:
import collections

examples = datasets_valid
features = validation_features

example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
features_per_example = collections.defaultdict(list)
for i, feature in enumerate(features):
    features_per_example[example_id_to_index[feature["example_id"]]].append(i)

### **Post Processing the for Validiation datase**

In [33]:
from tqdm.auto import tqdm

def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size = 20, max_answer_length = 30):
    all_start_logits, all_end_logits = raw_predictions
    # Build a map example to its corresponding features.
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[example_id_to_index[feature["example_id"]]].append(i)

    # The dictionaries we have to fill.
    predictions = collections.OrderedDict()

    # Logging.
    print(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

    # Let's loop over all the examples!
    for example_index, example in enumerate(tqdm(examples)):
        # Those are the indices of the features associated to the current example.
        feature_indices = features_per_example[example_index]

        min_null_score = None # Only used if squad_v2 is True.
        valid_answers = []
        
        context = example["context"]
        # Looping through all the features associated to the current example.
        for feature_index in feature_indices:
            # We grab the predictions of the model for this feature.
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            # This is what will allow us to map some the positions in our logits to span of texts in the original
            # context.
            offset_mapping = features[feature_index]["offset_mapping"]

            # Update minimum null prediction.
            cls_index = features[feature_index]["input_ids"].index(tokenizer.cls_token_id)
            feature_null_score = start_logits[cls_index] + end_logits[cls_index]
            if min_null_score is None or min_null_score < feature_null_score:
                min_null_score = feature_null_score

            # Go through all possibilities for the `n_best_size` greater start and end logits.
            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond
                    # to part of the input_ids that are not in the context.
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue
                    # Don't consider answers with a length that is either < 0 or > max_answer_length.
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append(
                        {
                            "score": start_logits[start_index] + end_logits[end_index],
                            "text": context[start_char: end_char]
                        }
                    )
        
        if len(valid_answers) > 0:
            best_answer = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[0]
        else:
            # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid
            # failure.
            best_answer = {"text": "", "score": 0.0}
        
        # Let's pick our final answer: the best one or the null answer (only for squad_v2)
        if not squad_v2:
            predictions[example["id"]] = best_answer["text"]
        else:
            answer = best_answer["text"] if best_answer["score"] > min_null_score else ""
            predictions[example["id"]] = answer

    return predictions

In [34]:
final_predictions = postprocess_qa_predictions(datasets_valid, validation_features, raw_predictions.predictions)

Post-processing 3999 example predictions split into 4186 features.


HBox(children=(FloatProgress(value=0.0, max=3999.0), HTML(value='')))




In [35]:
from datasets import load_metric

metric = load_metric("squad_v2")


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2264.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=3182.0, style=ProgressStyle(description…




In [36]:
formatted_predictions = [{"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in final_predictions.items()]
references = [{"id": ex["id"], "answers": ex["answers"]} for ex in datasets_valid]
metric.compute(predictions=formatted_predictions, references=references)

{'HasAns_exact': 72.19020172910663,
 'HasAns_f1': 79.34943683668398,
 'HasAns_total': 2082,
 'NoAns_exact': 45.74856546687533,
 'NoAns_f1': 45.74856546687533,
 'NoAns_total': 1917,
 'best_exact': 59.53988497124281,
 'best_exact_thresh': 0.0,
 'best_f1': 63.24219242159929,
 'best_f1_thresh': 0.0,
 'exact': 59.51487871967992,
 'f1': 63.24219242159934,
 'total': 3999}

## **Results**
**F1 Score:** 79.2<br>
**Exact Match** 72.38

# **Test on Real Data**
Loading Model previously trained and uploaded

In [13]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
a = AutoModelForQuestionAnswering.from_pretrained("Sarmad/projectmodel-bert")

In [20]:
from transformers import pipeline
device = "cpu"
model = model.to(device)
nlp = pipeline('question-answering', model=a, tokenizer=tokenizer)

In [21]:
context = "Pakistan, officially the Islamic Republic of Pakistan, is a country in South Asia. It is the world's fifth-most populous country with a population exceeding 212.2 million, and has the world's second-largest Muslim population. Pakistan is the 33rd-largest country by area, spanning 881,913 square kilometres (340,509 square miles). It has a 1,046-kilometre (650-mile) coastline along the Arabian Sea and Gulf of Oman in the south and is bordered by India to the east, Afghanistan to the west, Iran to the southwest, and China to the northeast. It is separated narrowly from Tajikistan by Afghanistan's Wakhan Corridor in the northwest, and also shares a maritime border with Oman."
nlp({'question': 'What is population of pakistan?',
    'context': context
})

{'answer': '212.2 million',
 'end': 170,
 'score': 0.9826532602310181,
 'start': 157}

# **Testing passed as our trained model is able to answer the question**

# **Model Interpretation**
In this section we will explain the model intretaibility by showing the impact of different words in the context to predict the answer 

In [1]:
!pip install transformers-interpret



In [14]:
## Using Transformer Interpratability feature
from transformers import AutoModelForQuestionAnswering, AutoTokenizer
from transformers_interpret import QuestionAnsweringExplainer

qa_explainer = QuestionAnsweringExplainer(
    a,
    tokenizer,
)

context = "Pakistan, officially the Islamic Republic of Pakistan, is a country in South Asia. It is the world's fifth-most populous country with a population exceeding 212.2 million, and has the world's second-largest Muslim population. Pakistan is the 33rd-largest country by area, spanning 881,913 square kilometres (340,509 square miles). It has a 1,046-kilometre (650-mile) coastline along the Arabian Sea and Gulf of Oman in the south and is bordered by India to the east, Afghanistan to the west, Iran to the southwest, and China to the northeast. It is separated narrowly from Tajikistan by Afghanistan's Wakhan Corridor in the northwest, and also shares a maritime border with Oman."

word_attributions = qa_explainer(
    "What is population of pakistan?",
    context,
)

In [15]:
word_attributions

{'end': [('[CLS]', 0.0),
  ('what', 0.15424868226968882),
  ('is', 0.5061326649096628),
  ('population', 0.18439870854600854),
  ('of', 0.19496920947670474),
  ('pakistan', 0.007321796425564715),
  ('?', 0.3450449398674029),
  ('[SEP]', 0.0),
  ('pakistan', 0.2874934320675912),
  (',', 0.1413407325841827),
  ('officially', -0.04538514541834827),
  ('the', 0.02856252332424582),
  ('islamic', -0.023388571856510776),
  ('republic', 0.03091660614841762),
  ('of', 0.052065849481055054),
  ('pakistan', -0.09739403871995603),
  (',', 0.023945703813691138),
  ('is', 0.03336336440549612),
  ('a', 0.019829004287706377),
  ('country', -0.019078734028972193),
  ('in', 0.0037605614327462372),
  ('south', 0.018160418474117333),
  ('asia', 0.04673556953124185),
  ('.', 0.07136187789647533),
  ('it', 0.015950537285192665),
  ('is', 0.12399277208318554),
  ('the', -0.01094992894864839),
  ('world', 0.043794189884760436),
  ("'", -0.10306753707343677),
  ('s', -0.120894970451987),
  ('fifth', -0.0409804

In [16]:
qa_explainer.predicted_answer

'212 . 2 million'

In [17]:
qa_explainer.visualize("bert_qa_viz.html")

True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
212 (39),212 . 2 million (7.20),212 (39),1.41,"[CLS] what is population of pakistan ? [SEP] pakistan , officially the islamic republic of pakistan , is a country in south asia . it is the world ' s fifth - most populous country with a population exceeding 212 . 2 million , and has the world ' s second - largest muslim population . pakistan is the 33rd - largest country by area , spanning 88 ##1 , 91 ##3 square kilometres ( 340 , 50 ##9 square miles ) . it has a 1 , 04 ##6 - kilometre ( 650 - mile ) coastline along the arabian sea and gulf of oman in the south and is bordered by india to the east , afghanistan to the west , iran to the southwest , and china to the northeast . it is separated narrowly from tajikistan by afghanistan ' s wa ##khan corridor in the northwest , and also shares a maritime border with oman . [SEP]"
,,,,
million (42),212 . 2 million (7.04),million (42),1.34,"[CLS] what is population of pakistan ? [SEP] pakistan , officially the islamic republic of pakistan , is a country in south asia . it is the world ' s fifth - most populous country with a population exceeding 212 . 2 million , and has the world ' s second - largest muslim population . pakistan is the 33rd - largest country by area , spanning 88 ##1 , 91 ##3 square kilometres ( 340 , 50 ##9 square miles ) . it has a 1 , 04 ##6 - kilometre ( 650 - mile ) coastline along the arabian sea and gulf of oman in the south and is bordered by india to the east , afghanistan to the west , iran to the southwest , and china to the northeast . it is separated narrowly from tajikistan by afghanistan ' s wa ##khan corridor in the northwest , and also shares a maritime border with oman . [SEP]"
,,,,


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
212 (39),212 . 2 million (7.20),212 (39),1.41,"[CLS] what is population of pakistan ? [SEP] pakistan , officially the islamic republic of pakistan , is a country in south asia . it is the world ' s fifth - most populous country with a population exceeding 212 . 2 million , and has the world ' s second - largest muslim population . pakistan is the 33rd - largest country by area , spanning 88 ##1 , 91 ##3 square kilometres ( 340 , 50 ##9 square miles ) . it has a 1 , 04 ##6 - kilometre ( 650 - mile ) coastline along the arabian sea and gulf of oman in the south and is bordered by india to the east , afghanistan to the west , iran to the southwest , and china to the northeast . it is separated narrowly from tajikistan by afghanistan ' s wa ##khan corridor in the northwest , and also shares a maritime border with oman . [SEP]"
,,,,
million (42),212 . 2 million (7.04),million (42),1.34,"[CLS] what is population of pakistan ? [SEP] pakistan , officially the islamic republic of pakistan , is a country in south asia . it is the world ' s fifth - most populous country with a population exceeding 212 . 2 million , and has the world ' s second - largest muslim population . pakistan is the 33rd - largest country by area , spanning 88 ##1 , 91 ##3 square kilometres ( 340 , 50 ##9 square miles ) . it has a 1 , 04 ##6 - kilometre ( 650 - mile ) coastline along the arabian sea and gulf of oman in the south and is bordered by india to the east , afghanistan to the west , iran to the southwest , and china to the northeast . it is separated narrowly from tajikistan by afghanistan ' s wa ##khan corridor in the northwest , and also shares a maritime border with oman . [SEP]"
,,,,


## On Another example from wiki on IBA 

In [18]:
qa_explainer = QuestionAnsweringExplainer(
    a,
    tokenizer,
)

context = "The Institute of Business Administration (IBA), Karachi; is a public university in Karachi, Sindh, Pakistan. IBA was established as a business school in 1955 by Pakistani government with the technical support from the Wharton School and the University of Southern California. Several prominent American professors were assigned to the IBA to develop its curriculum. It remained a constituent school within the University of Karachi until, in 1994, when its status was elevated to an independent chartered university by the Sindh Government. Since 2003, IBA has expanded from a purely graduate business school to an interdisciplinary university with undergraduate, graduate and post-graduate programs."

word_attributions = qa_explainer(
    "When was IBA established?",
    context,
)

In [19]:
word_attributions

{'end': [('[CLS]', 0.0),
  ('when', 0.5755061106809859),
  ('was', 0.461872440562799),
  ('ib', 0.04418517186428221),
  ('##a', -0.009440976565591376),
  ('established', 0.4134521634044602),
  ('?', 0.3353685938221023),
  ('[SEP]', 0.0),
  ('the', 0.029821614166505594),
  ('institute', 0.0026275386604036805),
  ('of', -0.00887673648733349),
  ('business', -0.050588966773128775),
  ('administration', 0.047670504954172695),
  ('(', 0.02171808775986434),
  ('ib', -0.012066656613211981),
  ('##a', 0.02842056996127019),
  (')', -0.002048316574914383),
  (',', -0.045706611631879134),
  ('karachi', -0.007514875547070057),
  (';', 0.05123231092088069),
  ('is', 0.022052518278180746),
  ('a', 0.009207007752038842),
  ('public', -0.02855116651616495),
  ('university', -0.02626590143169026),
  ('in', 0.001906975041161732),
  ('karachi', -0.030754133645374482),
  (',', -0.03164294752246369),
  ('sindh', 0.04794643390478328),
  (',', -0.06403633251792507),
  ('pakistan', 0.008510554473679873),
  ('

In [20]:
qa_explainer.predicted_answer

'1955'

In [21]:
qa_explainer.visualize("bert_qa_viz.html")

True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
1955 (40),1955 (8.49),1955 (40),2.01,"[CLS] when was ib ##a established ? [SEP] the institute of business administration ( ib ##a ) , karachi ; is a public university in karachi , sindh , pakistan . ib ##a was established as a business school in 1955 by pakistani government with the technical support from the wharton school and the university of southern california . several prominent american professors were assigned to the ib ##a to develop its curriculum . it remained a constituent school within the university of karachi until , in 1994 , when its status was elevated to an independent chartered university by the sindh government . since 2003 , ib ##a has expanded from a purely graduate business school to an interdisciplinary university with undergraduate , graduate and post - graduate programs . [SEP]"
,,,,
1955 (40),1955 (8.13),1955 (40),1.42,"[CLS] when was ib ##a established ? [SEP] the institute of business administration ( ib ##a ) , karachi ; is a public university in karachi , sindh , pakistan . ib ##a was established as a business school in 1955 by pakistani government with the technical support from the wharton school and the university of southern california . several prominent american professors were assigned to the ib ##a to develop its curriculum . it remained a constituent school within the university of karachi until , in 1994 , when its status was elevated to an independent chartered university by the sindh government . since 2003 , ib ##a has expanded from a purely graduate business school to an interdisciplinary university with undergraduate , graduate and post - graduate programs . [SEP]"
,,,,


True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
1955 (40),1955 (8.49),1955 (40),2.01,"[CLS] when was ib ##a established ? [SEP] the institute of business administration ( ib ##a ) , karachi ; is a public university in karachi , sindh , pakistan . ib ##a was established as a business school in 1955 by pakistani government with the technical support from the wharton school and the university of southern california . several prominent american professors were assigned to the ib ##a to develop its curriculum . it remained a constituent school within the university of karachi until , in 1994 , when its status was elevated to an independent chartered university by the sindh government . since 2003 , ib ##a has expanded from a purely graduate business school to an interdisciplinary university with undergraduate , graduate and post - graduate programs . [SEP]"
,,,,
1955 (40),1955 (8.13),1955 (40),1.42,"[CLS] when was ib ##a established ? [SEP] the institute of business administration ( ib ##a ) , karachi ; is a public university in karachi , sindh , pakistan . ib ##a was established as a business school in 1955 by pakistani government with the technical support from the wharton school and the university of southern california . several prominent american professors were assigned to the ib ##a to develop its curriculum . it remained a constituent school within the university of karachi until , in 1994 , when its status was elevated to an independent chartered university by the sindh government . since 2003 , ib ##a has expanded from a purely graduate business school to an interdisciplinary university with undergraduate , graduate and post - graduate programs . [SEP]"
,,,,


# **Uploading Model**
In this section we upload the fine tuned model on Hugging face directly calling it and using for question answering

In [40]:
! transformers-cli login

2021-05-30 10:44:11.405599: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0

        _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
        _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
        _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
        _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
        _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

        
Username: sarmadzafar10@gmail.com
Password: 
Login successful
Your token: eZPYbLmNaDgSPuQpnlNrNgaqJDKkzRqFrkKPMLtBoWUMhthCbMQHqCbcWZlpwnUNZUROKeCujyvhNfTsIFnJDBPugWPXqDivkWxZMNjLeosggrwLNAWOAaxiTKwWvPEw 

Your token has been saved to /root/.huggingface

In [41]:
! sudo apt-get install git-lfs

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'sudo apt autoremove' to remove it.
The following NEW packages will be installed:
  git-lfs
0 upgraded, 1 newly installed, 0 to remove and 34 not upgraded.
Need to get 2,129 kB of archives.
After this operation, 7,662 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 git-lfs amd64 2.3.4-1 [2,129 kB]
Fetched 2,129 kB in 1s (2,935 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 76, <> line 1.)
debconf: falling back to frontend: Readline
debconf: unable to initialize frontend: Readline
debconf: (This frontend requires a controlling tty.)
debconf: falling back to frontend: Teletype
dpkg-p

In [63]:
!git init

Initialized empty Git repository in /content/.git/


In [42]:
! transformers-cli repo create projectmodel-bert

2021-05-30 10:45:09.446914: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
[90mgit version 2.17.1[0m
Error: unknown flag: --version

[90mSorry, no usage text found for "git-lfs"[0m

You are about to create [1mSarmad/projectmodel-bert[0m
Proceed? [Y/n] Y

Your repo now lives at:
  [1mhttps://huggingface.co/Sarmad/projectmodel-bert[0m

You can clone it locally with the command below, and commit/push as usual.

  git clone https://huggingface.co/Sarmad/projectmodel-bert



In [43]:
!git clone https://huggingface.co/Sarmad/projectmodel-bert

Cloning into 'projectmodel-bert'...
remote: Enumerating objects: 3, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (2/2), done.[K
remote: Total 3 (delta 0), reused 0 (delta 0)[K
Unpacking objects: 100% (3/3), done.


In [64]:
!cd projectmodel-bert
!echo "hello" >> README.md
!git add . && git commit -m "Update from $USER"

hint: You've added another git repository inside your current repository.
hint: Clones of the outer repository will not contain the contents of
hint: the embedded repository and will not know how to obtain it.
hint: If you meant to add a submodule, use:
hint: 
hint: 	git submodule add <url> projectmodel-bert
hint: 
hint: If you added this path by mistake, you can remove it from the
hint: index with:
hint: 
hint: 	git rm --cached projectmodel-bert
hint: 
hint: See "git help submodule" for more information.

a
^C


In [53]:
model.save_pretrained("https://huggingface.co/Sarmad/projectmodel-bert")

In [54]:
tokenizer.save_pretrained("https://huggingface.co/Sarmad/projectmodel-bert")

('https://huggingface.co/Sarmad/projectmodel-bert/tokenizer_config.json',
 'https://huggingface.co/Sarmad/projectmodel-bert/special_tokens_map.json',
 'https://huggingface.co/Sarmad/projectmodel-bert/vocab.txt',
 'https://huggingface.co/Sarmad/projectmodel-bert/added_tokens.json',
 'https://huggingface.co/Sarmad/projectmodel-bert/tokenizer.json')

In [71]:
tokenizer.push_to_hub("projectmodel-bert")

In [72]:
model.push_to_hub("projectmodel-bert")

In [68]:
!git config --global user.email "sarmadzafar10@gmail.com"

In [70]:
!git config --global user.name "Sarmad"

In [86]:

! git add --all -n
! git commit -m 

add '.config/.last_opt_in_prompt.yaml'
add '.config/.last_survey_prompt.yaml'
add '.config/.last_update_check.json'
add '.config/active_config'
add '.config/config_sentinel'
add '.config/configurations/config_default'
add '.config/gce'
add '.config/logs/2021.05.06/13.43.04.692209.log'
add '.config/logs/2021.05.06/13.43.23.909017.log'
add '.config/logs/2021.05.06/13.43.39.026092.log'
add '.config/logs/2021.05.06/13.43.44.620859.log'
add '.config/logs/2021.05.06/13.44.00.991142.log'
add '.config/logs/2021.05.06/13.44.01.543195.log'
add 'README.md'
add 'https:/huggingface.co/Sarmad/projectmodel-bert/config.json'
add 'https:/huggingface.co/Sarmad/projectmodel-bert/pytorch_model.bin'
add 'https:/huggingface.co/Sarmad/projectmodel-bert/special_tokens_map.json'
add 'https:/huggingface.co/Sarmad/projectmodel-bert/tokenizer.json'
add 'https:/huggingface.co/Sarmad/projectmodel-bert/tokenizer_config.json'
add 'https:/huggingface.co/Sarmad/projectmodel-bert/vocab.txt'
hint: You've added another gi

In [81]:
! git add .
! git commit -m 

hint: You've added another git repository inside your current repository.
hint: Clones of the outer repository will not contain the contents of
hint: the embedded repository and will not know how to obtain it.
hint: If you meant to add a submodule, use:
hint: 
hint: 	git submodule add <url> projectmodel-bert
hint: 
hint: If you added this path by mistake, you can remove it from the
hint: index with:
hint: 
hint: 	git rm --cached projectmodel-bert
hint: 
hint: See "git help submodule" for more information.
--all
^C
error: switch `m' requires a value
usage: git commit [<options>] [--] <pathspec>...

    -q, --quiet           suppress summary after successful commit
    -v, --verbose         show diff in commit message template

Commit message options
    -F, --file <file>     read message from file
    --author <author>     override author for commit
    --date <date>         override date for commit
    -m, --message <message>
                          commit message
    -c, --reedit-me