In [1]:
from google.colab import drive
drive.mount('/content/drive') 

Mounted at /content/drive


# **INSTALLATION AND IMPORTS**

In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.22.2-py3-none-any.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 32.8 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 46.9 MB/s 
Collecting huggingface-hub<1.0,>=0.9.0
  Downloading huggingface_hub-0.10.0-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 60.2 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.0 tokenizers-0.12.1 transformers-4.22.2


In [3]:
!pip install anvil-uplink

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting anvil-uplink
  Downloading anvil_uplink-0.4.0-py2.py3-none-any.whl (88 kB)
[K     |████████████████████████████████| 88 kB 7.4 MB/s 
Collecting argparse
  Downloading argparse-1.4.0-py2.py3-none-any.whl (23 kB)
Collecting ws4py
  Downloading ws4py-0.5.1.tar.gz (51 kB)
[K     |████████████████████████████████| 51 kB 258 kB/s 
Building wheels for collected packages: ws4py
  Building wheel for ws4py (setup.py) ... [?25l[?25hdone
  Created wheel for ws4py: filename=ws4py-0.5.1-py3-none-any.whl size=45229 sha256=4763e6da1585f7fbc3ff87258bec8cb186a37b9a27905a18951cc92ba5b607fb
  Stored in directory: /root/.cache/pip/wheels/29/ea/7d/3410aa0aa0e4402ead9a7a97ab2214804887e0f5c2b76f0c96
Successfully built ws4py
Installing collected packages: ws4py, argparse, anvil-uplink
Successfully installed anvil-uplink-0.4.0 argparse-1.4.0 ws4py-0.5.1


In [4]:
import pandas as pd
import numpy as np
import tensorflow as tf
import time
import re
import pickle
import transformers
import collections
import warnings
import anvil.server
warnings.filterwarnings("ignore")
def warn(*args, **kwargs):
    pass
warnings.warn = warn

anvil.server.connect('36HRIW6HZ3N257K6IP6QUECM-URT6IC7IPZJ3RXCO')

Connecting to wss://anvil.works/uplink
Anvil websocket open
Connected to "Default environment" as SERVER


# **POSITIONAL ENCODING**

In [5]:
def get_angles(position, i, d_model):
    angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
    return position * angle_rates

def positional_encoding(position, d_model):
    angle_rads = get_angles(
        np.arange(position)[:, np.newaxis],
        np.arange(d_model)[np.newaxis, :],
        d_model
    )

    # apply sin to even indices in the array; 2i
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])

    # apply cos to odd indices in the array; 2i+1
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])

    pos_encoding = angle_rads[np.newaxis, ...]

    return tf.cast(pos_encoding, dtype=tf.float32)


# **MASKING**

In [6]:
def create_padding_mask(seq):
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
    # add extra dimensions to add the padding
    # to the attention logits.
    return seq[:, tf.newaxis, tf.newaxis, :] # (batch_size, 1, 1, seq_len)

def create_look_ahead_mask(size):
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return mask # (seq_len, seq_len)

# **SELF ATTENTION**

In [7]:
def scaled_dot_product_attention(q, k, v, mask):

    matmul_qk = tf.matmul(q, k, transpose_b=True) # (..., seq_len_q, seq_len_k)
    
    # scale matmul_qk
    dk = tf.cast(tf.shape(k)[-1], tf.float32)
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)
    
    # add the mask to the scaled tensor.
    if mask is not None:
        scaled_attention_logits += (mask * -1e9)  
    # softmax is normalized on the last axis (seq_len_k) so that the scores
    # add up to 1.
    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1) # (..., seq_len_q, seq_len_k)

    output = tf.matmul(attention_weights, v) # (..., seq_len_q, depth_v)
    return output, attention_weights

# **MULTI HEAD ATTENTION**

In [8]:
class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model

        assert d_model % self.num_heads == 0

        self.depth = d_model // self.num_heads

        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)

        self.dense = tf.keras.layers.Dense(d_model)
        
    def split_heads(self, x, batch_size):
        """Split the last dimension into (num_heads, depth).
    Transpose the result such that the shape is (batch_size, num_heads, seq_len, depth)
    """
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])
    
    def call(self, v, k, q, mask):
        batch_size = tf.shape(q)[0]

        q = self.wq(q)   # (batch_size, seq_len, d_model)
        k = self.wk(k)   # (batch_size, seq_len, d_model)
        v = self.wv(v)   # (batch_size, seq_len, d_model) 

        q = self.split_heads(q, batch_size) # (batch_size, num_heads, seq_len_q, depth)
        k = self.split_heads(k, batch_size) # (batch_size, num_heads, seq_len_q, depth)
        v = self.split_heads(v, batch_size) # (batch_size, num_heads, seq_len_q, depth)

        scaled_attention, attention_weights = scaled_dot_product_attention(
            q, k, v, mask)

        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])

        concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model))
        output = self.dense(concat_attention)
            
        return output, attention_weights

# **FEED FORWARD NEURAL NETWORK**

In [9]:
def point_wise_feed_forward_network(d_model, dff):
    return tf.keras.Sequential([
        tf.keras.layers.Dense(dff, activation='relu'),
        tf.keras.layers.Dense(d_model)
    ])

# **ENCODER LAYER**

In [10]:
class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(EncoderLayer, self).__init__()

        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)
    
    def call(self, x, training, mask):
        attn_output, _ = self.mha(x, x, x, mask)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(x + attn_output)

        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.layernorm2(out1 + ffn_output)

        return out2


# **DECODER LAYER**

In [11]:
class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(DecoderLayer, self).__init__()

        self.mha1 = MultiHeadAttention(d_model, num_heads)
        self.mha2 = MultiHeadAttention(d_model, num_heads)

        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)
        self.dropout3 = tf.keras.layers.Dropout(rate)
    
    
    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        attn1, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)
        attn1 = self.dropout1(attn1, training=training)
        out1 = self.layernorm1(attn1 + x)

        attn2, attn_weights_block2 = self.mha2(enc_output, enc_output, out1, padding_mask)
        attn2 = self.dropout2(attn2, training=training)
        out2 = self.layernorm2(attn2 + out1)

        ffn_output = self.ffn(out2)
        ffn_output = self.dropout3(ffn_output, training=training)
        out3 = self.layernorm3(ffn_output + out2)

        return out3, attn_weights_block1, attn_weights_block2


# **ENCODER**


In [12]:
class Encoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, maximum_position_encoding, rate=0.1):
        super(Encoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
        self.pos_encoding = positional_encoding(maximum_position_encoding, self.d_model)

        self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate) for _ in range(num_layers)]

        self.dropout = tf.keras.layers.Dropout(rate)
        
    def call(self, x, training, mask):
        seq_len = tf.shape(x)[1]

        x = self.embedding(x)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)
    
        for i in range(self.num_layers):
            x = self.enc_layers[i](x, training, mask)
    
        return x


# **DECODER**

In [13]:
class Decoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size, maximum_position_encoding, rate=0.1):
        super(Decoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
        self.pos_encoding = positional_encoding(maximum_position_encoding, d_model)

        self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate) for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(rate)
    
    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        seq_len = tf.shape(x)[1]
        attention_weights = {}

        x = self.embedding(x)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)

        for i in range(self.num_layers):
            x, block1, block2 = self.dec_layers[i](x, enc_output, training, look_ahead_mask, padding_mask)

            attention_weights['decoder_layer{}_block1'.format(i+1)] = block1
            attention_weights['decoder_layer{}_block2'.format(i+1)] = block2
    
        return x, attention_weights


# **TRANSFORMER**


In [14]:
class Transformer(tf.keras.Model):
    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, target_vocab_size, pe_input, pe_target, rate=0.1):
        super(Transformer, self).__init__()

        self.encoder = Encoder(num_layers, d_model, num_heads, dff, input_vocab_size, pe_input, rate)

        self.decoder = Decoder(num_layers, d_model, num_heads, dff, target_vocab_size, pe_target, rate)
        
        self.lstm = tf.keras.layers.LSTM(num_layers)

        self.max_pool_2d = tf.keras.layers.MaxPooling2D(pool_size=(2, 2), strides=(1, 1), padding='same')
        self.final_layer = tf.keras.layers.Dense(target_vocab_size)
    
    def call(self, inp, tar, training, enc_padding_mask, look_ahead_mask, dec_padding_mask):
        enc_output = self.encoder(inp, training, enc_padding_mask)

        dec_output, attention_weights = self.decoder(tar, enc_output, training, look_ahead_mask, dec_padding_mask)
      
        op=self.lstm(dec_output)

        attention_weights['decoder_layer1_block1']=self.max_pool_2d(attention_weights['decoder_layer1_block1'])

        final_output = self.final_layer(dec_output)
        
        return final_output, attention_weights,op



#  **SET HYPERPARAMETERS**

In [15]:
# hyper-params
num_layers = 4
d_model = 128  
dff = 512
num_heads = 8
EPOCHS = 20 

# **OPTIMIZER**

In [16]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()

        self.d_model = d_model
        self.d_model = tf.cast(self.d_model, tf.float32)

        self.warmup_steps = warmup_steps
    
    def __call__(self, step):
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

learning_rate = CustomSchedule(d_model)
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')



# **LOSS AND METRIC**

In [17]:
def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_sum(loss_)/tf.reduce_sum(mask)

train_loss = tf.keras.metrics.Mean(name='train_loss')


In [18]:
def create_masks(inp, tar):
    enc_padding_mask = create_padding_mask(inp)
    dec_padding_mask = create_padding_mask(inp)

    look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
    dec_target_padding_mask = create_padding_mask(tar)
    combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)
  
    return enc_padding_mask, combined_mask, dec_padding_mask


# **LOADING DATASET**

In [19]:
%env DATA_DIR=./data/squad 

# downloading the SQuAD dataset
def download_squad(version=1):
    if version == 1:
        !wget -P $DATA_DIR https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json
        !wget -P $DATA_DIR https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json
    else:
        !wget -P $DATA_DIR https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json
        !wget -P $DATA_DIR https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json
            
download_squad(version=2)


env: DATA_DIR=./data/squad
--2022-10-05 08:36:09--  https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json
Resolving rajpurkar.github.io (rajpurkar.github.io)... 185.199.108.153, 185.199.109.153, 185.199.110.153, ...
Connecting to rajpurkar.github.io (rajpurkar.github.io)|185.199.108.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 42123633 (40M) [application/json]
Saving to: ‘./data/squad/train-v2.0.json’


2022-10-05 08:36:09 (280 MB/s) - ‘./data/squad/train-v2.0.json’ saved [42123633/42123633]

--2022-10-05 08:36:09--  https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json
Resolving rajpurkar.github.io (rajpurkar.github.io)... 185.199.108.153, 185.199.109.153, 185.199.110.153, ...
Connecting to rajpurkar.github.io (rajpurkar.github.io)|185.199.108.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4370528 (4.2M) [application/json]
Saving to: ‘./data/squad/dev-v2.0.json’


2022-10-05 08:36:09 (279 MB/s) - ‘

# **PROCESSOR**

In [20]:
from transformers.data.processors.squad import SquadV2Processor

In [21]:
# this processor loads the SQuAD2.0 dev set examples
# The processors can be used for loading datasets and converting their examples to features for direct use in the model.
processor = SquadV2Processor()
examples = processor.get_dev_examples("./data/squad/", filename="dev-v2.0.json")
print("validation "+str(len(examples)))
#/content/data/squad/train-v2.0.json
examplestrain = processor.get_train_examples("./data/squad/", filename="train-v2.0.json")
print("train "+str(len(examplestrain)))

100%|██████████| 35/35 [00:06<00:00,  5.12it/s]


validation 11873


100%|██████████| 442/442 [01:10<00:00,  6.27it/s]

train 130319





In [22]:
# generate some maps to help us identify examples of interest
qid_to_example_index = {example.qas_id: i for i, example in enumerate(examples)}
qid_to_has_answer = {example.qas_id: bool(example.answers) for example in examples}
answer_qids = [qas_id for qas_id, has_answer in qid_to_has_answer.items() if has_answer]
no_answer_qids = [qas_id for qas_id, has_answer in qid_to_has_answer.items() if not has_answer]

qid_to_example_index_train = {example.qas_id: i for i, example in enumerate(examplestrain)}
qid_to_has_answer_train = {example.qas_id: bool(example.answers) for example in examplestrain}
answer_qids_train = [qas_id for qas_id, has_answer in qid_to_has_answer_train.items() if has_answer]   #changed by me 
no_answer_qids_train = [qas_id for qas_id, has_answer in qid_to_has_answer_train.items() if not has_answer]

In [23]:
#It dsplays the Question and Context of a sample from dataset
def extract(idx, train = True):    
    from pprint import pprint
    a = ""
    if train:
      # idx = qid_to_example_index_train[qid]
      q = examplestrain[idx].question_text
      c = examplestrain[idx].context_text
      for answer in examplestrain[idx].answers:
        a = answer['text']
    else:
      # idx = qid_to_example_index[qid]
      q = examples[idx].question_text
      c = examples[idx].context_text
      for answer in examples[idx].answers:
        a = answer['text']
    qna = "<CLS>" + q + "<SEP>" + a
    return c[:100], qna

In [24]:
def generate_dataset():
  context_train = []
  qna_train = []
  context_test = []
  qna_test = []
  for i in range(len(examplestrain)):
    c,qna = extract(i,train=True)
    context_train.append(c)
    qna_train.append(qna)
  for i in range(len(examples)):
    c,qna = extract(i,train=False)
    context_test.append(c)
    qna_test.append(qna)
  return context_train, qna_train, context_test, qna_test

In [25]:
context_train, qna_train, context_test, qna_test = generate_dataset()

# **TOKENIZATION**

In [26]:
# since < and > from default tokens cannot be removed
filters = '!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n'
oov_token = '<unk>'

In [27]:
context_tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token=oov_token)
qna_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters=filters, oov_token=oov_token)

In [28]:
context_tokenizer.fit_on_texts(context_train)
qna_tokenizer.fit_on_texts(qna_train)

In [29]:
samples = 10000
inputs = context_tokenizer.texts_to_sequences(context_train[:samples])
targets = qna_tokenizer.texts_to_sequences(qna_train[:samples])

In [30]:
encoder_vocab_size = len(context_tokenizer.word_index) + 1
decoder_vocab_size = len(qna_tokenizer.word_index) + 1

# vocab_size
encoder_vocab_size, decoder_vocab_size

(31525, 48975)

In [31]:
# maxlen
# taking values > and round figured to 75th percentile
# at the same time not leaving high variance
encoder_maxlen = 100
decoder_maxlen = 75

In [32]:
transformer = Transformer(
    num_layers, 
    d_model, 
    num_heads, 
    dff,
    encoder_vocab_size, 
    decoder_vocab_size, 
    pe_input=encoder_vocab_size, 
    pe_target=decoder_vocab_size,
)

# **CHECKPOINTS**

In [33]:
path = "/content/drive/MyDrive/QnA/"
checkpoint_path = path + "model/checkpoints/"

ckpt = tf.train.Checkpoint(transformer=transformer, optimizer=optimizer)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print ('Latest checkpoint restored!!')

# **PADDING/TRUNCATING the sequence for identical sequence lengths**

In [34]:
inputs = tf.keras.preprocessing.sequence.pad_sequences(inputs, maxlen=encoder_maxlen, padding='post', truncating='post')
targets = tf.keras.preprocessing.sequence.pad_sequences(targets, maxlen=decoder_maxlen, padding='post', truncating='post')

# **CREATE DATASET PIPELINE**

In [35]:
inputs = tf.cast(inputs, dtype=tf.int32)
targets = tf.cast(targets, dtype=tf.int32)

In [36]:
BUFFER_SIZE = 4000 #20000 the maximum number elements that will be buffered when prefetching.
BATCH_SIZE = 64

In [37]:
dataset = tf.data.Dataset.from_tensor_slices((inputs, targets)).shuffle(BUFFER_SIZE).batch(BATCH_SIZE)

# **TRAINING STEPS**

In [38]:
@tf.function
def train_step(inp, tar):
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]

    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)

    with tf.GradientTape() as tape:
        predictions,_,_ = transformer(
            inp, tar_inp, 
            True, 
            enc_padding_mask, 
            combined_mask, 
            dec_padding_mask
        )
        loss = loss_function(tar_real, predictions)

    gradients = tape.gradient(loss, transformer.trainable_variables)    
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))

    train_loss(loss)

In [41]:
import warnings
warnings.filterwarnings("ignore")
def warn(*args, **kwargs):
    pass
warnings.warn = warn

EPOCHS = 2
for epoch in range(EPOCHS):
  start = time.time()
  train_loss.reset_states()
  for (batch, (inp, tar)) in enumerate(dataset):
      train_step(inp, tar)
  if (epoch + 1) % 10 == 0:
      ckpt_save_path = ckpt_manager.save()
      print ('Saving checkpoint for epoch {} at {}'.format(epoch+1, ckpt_save_path))
  print ('Epoch {} Loss {:.4f}'.format(epoch + 1, train_loss.result()))
  print ('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))



KeyboardInterrupt: ignored

# **PREDICTION**

In [None]:
model_dir = "/content/drive/MyDrive/QnA-20210324T144643Z-001/QnA/model/"

In [None]:
# Loading our trained model
with open(model_dir + 'model.pkl', 'rb') as fp:
    model = pickle.load(fp)
with open(model_dir + 'tokenizer.pkl', 'rb') as fp:
    tokenizer = pickle.load(fp)

In [None]:
# ----------------- Helper functions for get_robust_prediction ----------------- #
def to_list(tensor):
  return tensor.detach().cpu().tolist()

def get_qa_inputs(example, tokenizer):
    # load the example, convert to inputs, get model outputs
    question = example.question_text
    context = example.context_text
    return tokenizer.encode_plus(question, context, return_tensors='pt')

def get_clean_text(tokens, tokenizer):
    text = tokenizer.convert_tokens_to_string(
        tokenizer.convert_ids_to_tokens(tokens)
        )
    # Clean whitespace
    text = text.strip()
    text = " ".join(text.split())
    return text

# compute the probability of each prediction - nice but not necessary
def prediction_probabilities(predictions):

    def softmax(x):
        """Compute softmax values for each sets of scores in x."""
        e_x = np.exp(x - np.max(x))
        return e_x / e_x.sum()

    all_scores = [pred.start_logit+pred.end_logit for pred in predictions] 
    return softmax(np.array(all_scores))


# get sensible preliminary predictions, sorted by score
def preliminary_predictions(start_logits, end_logits, input_ids, nbest):
    # convert tensors to lists
    start_logits = to_list(start_logits)[0]
    end_logits = to_list(end_logits)[0]
    tokens = to_list(input_ids)[0]

    # sort our start and end logits from largest to smallest, keeping track of the index
    start_idx_and_logit = sorted(enumerate(start_logits), key=lambda x: x[1], reverse=True)
    end_idx_and_logit = sorted(enumerate(end_logits), key=lambda x: x[1], reverse=True)
    
    start_indexes = [idx for idx, logit in start_idx_and_logit[:nbest]]
    end_indexes = [idx for idx, logit in end_idx_and_logit[:nbest]]

    # question tokens are between the CLS token (101, at position 0) and first SEP (102) token 
    question_indexes = [i+1 for i, token in enumerate(tokens[1:tokens.index(102)])]

    # keep track of all preliminary predictions
    PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
        "PrelimPrediction", ["start_index", "end_index", "start_logit", "end_logit"]
    )
    prelim_preds = []
    for start_index in start_indexes:
        for end_index in end_indexes:
            # throw out invalid predictions
            if start_index in question_indexes:
                continue
            if end_index in question_indexes:
                continue
            if end_index < start_index:
                continue
            prelim_preds.append(
                PrelimPrediction(
                    start_index = start_index,
                    end_index = end_index,
                    start_logit = start_logits[start_index],
                    end_logit = end_logits[end_index]
                )
            )
    # sort prelim_preds in descending score order
    prelim_preds = sorted(prelim_preds, key=lambda x: (x.start_logit + x.end_logit), reverse=True)
    return prelim_preds

# narrow that down to the top nbest predictions
def best_predictions(prelim_preds, nbest, tokenizer, tokens, start_logits, end_logits):
    # keep track of all best predictions

    # This will be the pool from which answer probabilities are computed 
    BestPrediction = collections.namedtuple(
        "BestPrediction", ["text", "start_logit", "end_logit"]
    )
    nbest_predictions = []
    seen_predictions = []
    for pred in prelim_preds:
        if len(nbest_predictions) >= nbest: 
            break
        if pred.start_index > 0: # non-null answers have start_index > 0

            toks = tokens[pred.start_index : pred.end_index+1]
            text = get_clean_text(toks, tokenizer)
            # print(f'Text is :{text}')

            # if this text has been seen already - skip it
            if text in seen_predictions:
                continue

            # flag text as being seen
            seen_predictions.append(text) 

            # add this text to a pruned list of the top nbest predictions
            nbest_predictions.append(
                BestPrediction(
                    text=text, 
                    start_logit=pred.start_logit,
                    end_logit=pred.end_logit
                    )
                )
        
    # Add the null prediction
    nbest_predictions.append(
        BestPrediction(
            text="", 
            start_logit=start_logits[0], 
            end_logit=end_logits[0]
            )
        )
    return nbest_predictions

# compute score difference
def compute_score_difference(predictions):
    """ Assumes that the null answer is always the last prediction """
    score_null = predictions[-1].start_logit + predictions[-1].end_logit
    score_non_null = predictions[0].start_logit + predictions[0].end_logit
    return score_null - score_non_null

In [None]:
# Inference on given quesion and context
@anvil.server.callable
def get_robust_prediction_qna(question, context,  nbest=10, null_threshold=1.0, verbose=False):
    
    inputs = tokenizer.encode_plus(question, context, return_tensors='pt')
    # pprint(inputs)
    outputs = model(**inputs)
    # pprint(outputs)
    start_logits, end_logits = outputs.start_logits, outputs.end_logits

    # pprint(tokens)
    # get sensible preliminary predictions, sorted by score
    prelim_preds = preliminary_predictions(start_logits, 
                                           end_logits, 
                                           inputs['input_ids'],
                                           nbest)
    # pprint(prelim_preds)
    # narrow that down to the top nbest predictions
    tokens = to_list(inputs['input_ids'])[0]
    start_logits = to_list(start_logits)[0]
    end_logits = to_list(end_logits)[0]
    nbest_preds = best_predictions(prelim_preds, nbest, tokenizer, tokens, start_logits, end_logits)
    # pprint(nbest_preds)
    # compute the probability of each prediction - nice but not necessary
    probabilities = prediction_probabilities(nbest_preds)
        
    # compute score difference
    score_difference = compute_score_difference(nbest_preds)

    # return nbest_preds, probabilities
    # if score difference > threshold, return the null answer
    if score_difference > null_threshold:
        return "", probabilities[-1]
    else:
      if verbose:
        prob_answer = {}
        for i,p in enumerate(probabilities):
          prob_answer[p] = nbest_preds[i].text
        return prob_answer
      else:
        return nbest_preds[0].text, probabilities[0]

In [None]:
anvil.server.wait_forever()

In [None]:
q='what is your favourite place?'
c='i hate raspberry'
get_robust_prediction_qna(q,c)

In [None]:
q = "When did lincoln born?"
c = "Abraham Lincoln was born on February 12, 1809, in Hardin County, Kentucky, to Thomas and Nancy Lincoln in their one room log cabin on their farm for a living known as Sinking Spring (near modern-day Hodgenville, Kentucky). Although Thomas lacked formal education, he was an excellent farmer and carpenter, and often times served as a member of the jury. Thomas and Nancy joined a small Baptist church in the area that had broken away from the larger church over the issue of slavery."
#View only the top prediction when verbose = False by default
get_robust_prediction_qna(q,c)

In [None]:
#View all the nbest outputs with thier probability when verbose = True
get_robust_prediction_qna(q,c, verbose=True)

In [None]:
q = "where is the church?"
c = "The Candelária Church is a famous historic Roman Catholic church in central Rio de Janeiro, Brazil. The church itself and the buildings around it in Pius X Square became known as a popular location for possibly hundreds of Rio de Janeiro's street children to form a makeshift home at night. The church's personnel provides food, shelter, education and religious advice to as many of these children as possible. Many of the homeless children are involved with the illegal drug trade and prostitution, and because many of these children also live around the church during the day, police keep a constant watch on the church's surroundings. "
get_robust_prediction_qna(q,c)

In [None]:
q = "who won the match?"
c = "With the wicket of Pat Cummins, Joe Root (Eng) took his 12th catch of the tournament, surpassing Ricky Ponting's record of 11 he set in 2003. This was Australia's first World Cup semi-final defeat in eight appearances, england won the match."
get_robust_prediction_qna(q,c)

In [None]:
q = "what is his nationality?"
c = " his mother tongue is german "
get_robust_prediction_qna(q,c)

# METRICS

In [None]:
import torch
# given a question id (qas_id or qid), load the example, get the model outputs and generate an answer
def get_prediction(qid):
    
    question = examples[qid_to_example_index[qid]].question_text
    context = examples[qid_to_example_index[qid]].context_text

    inputs = tokenizer.encode_plus(question, context, return_tensors='pt')
    outputs = model(**inputs)
    
    answer_start = torch.argmax(outputs[0])  # get the most likely beginning of answer with the argmax of the score
    answer_end = torch.argmax(outputs[1]) + 1 

    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][answer_start:answer_end]))

    return answer

#Removing articles and punctuation, and standardizing whitespace are all typical text processing steps.
def normalize_text(s):    
    import string, re

    def remove_articles(text):
        regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
        return re.sub(regex, " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

# check if the prediction and truth are extacly matching
def compute_exact_match(prediction, truth):
    return int(normalize_text(prediction) == normalize_text(truth))

# compute f1 score using precision and recal based on number of common tokens in prediction and truth,
# and also number of predicte and truth tokens
def F1(prediction, truth):
    pred_tokens = normalize_text(prediction).split()
    truth_tokens = normalize_text(truth).split()
    
    # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        return int(pred_tokens == truth_tokens)
    
    common_tokens = set(pred_tokens) & set(truth_tokens)
    
    # if there are no common tokens then f1 = 0
    if len(common_tokens) == 0:
        return 0
    
    a = len(common_tokens) / len(pred_tokens) #precision
    b = len(common_tokens) / len(truth_tokens) #recall
    
    return  2*(a * b) / (a + b)

# helper function that retrieves all possible true answers from a squad2.0 example
def get_gold_answers(example):
    gold_answers = [answer["text"] for answer in example.answers if answer["text"]]

    # if gold_answers doesn't exist it's because this is a negative example - 
    # the only correct answer is an empty string
    if not gold_answers:
        gold_answers = [""]
        
    return gold_answers

#POSITIVE EXAMPLE-----
prediction = get_prediction(answer_qids[100])
example = examples[qid_to_example_index[answer_qids[100]]]

gold_answers = get_gold_answers(example)

em_score = max((compute_exact_match(prediction, answer)) for answer in gold_answers)
f1_score = max((F1(prediction, answer)) for answer in gold_answers)

print(f"Question: {example.question_text}")
print(f"Prediction: {prediction}")
print(f"True Answers: {gold_answers}")
print(f"EM: {em_score} \t F1: {f1_score}")




In [None]:
print(len(answer_qids))
print(len(answer_qids_train))

In [None]:
print(len(answer_qids))

In [None]:
total_em=0
total_f1=0
em_list=[]
f1_list=[]
val=1500
for i in range(1,val+1):
  total_em+=max((compute_exact_match(get_prediction(answer_qids[i]), answer )) for answer in get_gold_answers(examples[qid_to_example_index[answer_qids[i]]]) )
  total_f1+=max((F1(get_prediction(answer_qids[i]), answer)) for answer in get_gold_answers(examples[qid_to_example_index[answer_qids[i]]]) )
  if(i%50==0):
    print(str(i)+' values: '+str(total_em/i)+' '+str(total_f1/i))
    print()




In [None]:
itr=1000
a=0
#itr = len(answer_qids)
for i in range(1000):  
    prediction = get_prediction(answer_qids[i][:512])
    example = examples[qid_to_example_index[answer_qids[i]]]
    gold_answers = get_gold_answers(example)
    em_score = max((compute_exact_match(prediction, answer)) for answer in gold_answers)
    a+=em_score
print(a/itr)