<a href="https://colab.research.google.com/github/pokjay/heb-squad/blob/main/XLM_R_zero_shot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Mon Aug 30 20:14:49 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.57.02    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## Connect to Google Drive

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
!git clone https://github.com/pokjay/heb-squad

fatal: destination path 'heb-squad' already exists and is not an empty directory.


In [None]:
!gzip -d /content/heb-squad/data/final/heb-dev-v2.0.csv.gz

gzip: /content/heb-squad/data/final/heb-dev-v2.0.csv.gz: No such file or directory


In [None]:
pip install transformers



In [None]:
!pip install sentencepiece



In [None]:
import collections
import pandas as pd
from tqdm.notebook import trange, tqdm

import torch
from torch.utils.data import DataLoader

# Run on the GPU when PyTorch can see one, otherwise on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
device

device(type='cuda')

In [None]:
BATCH_SIZE = 64

In [None]:
val_df = pd.read_csv('/content/heb-squad/data/final/heb-dev-v2.0.csv')

In [None]:
# Keep only rows whose Hebrew start and end offsets are both positive.
# .copy() makes the result an independent frame, so the column assignments in
# the following cells do not write into a slice of the original frame
# (which raises pandas' SettingWithCopyWarning).
val_df = val_df[(val_df.answer_start_heb > 0) & (val_df.answer_end_heb > 0)].copy()

Remove the answer for impossible questions (for those questions the stored answer is only the plausible answer). The original values are first kept in the `plausible_*` columns.

In [None]:
# Keep a copy of the answer text and its Hebrew start/end offsets under
# plausible_* columns.
val_df['plausible_answer'] = val_df['answer']
val_df['plausible_answer_start_heb'] = val_df['answer_start_heb']
val_df['plausible_answer_end_heb'] = val_df['answer_end_heb']

In [None]:
# For impossible questions: empty answer text and a (0, 0) character span
val_df.loc[val_df.is_impossible == 1, 'answer'] = ''
val_df.loc[val_df.is_impossible == 1, 'answer_start_heb'] = 0
val_df.loc[val_df.is_impossible == 1, 'answer_end_heb'] = 0

In [None]:
val_df.sample(1)

Unnamed: 0,id,context,question,answer,answer_start,answer_end,is_impossible,article,context_marked,answer_start_heb,answer_end_heb,plausible_answer,plausible_answer_start_heb,plausible_answer_end_heb
18454,572f65e9b2c2fd14005680cf,הריין הוא הנהר הארוך ביותר בגרמניה. כאן נתקל ה...,מהו הרוחב הממוצע של הריין?,400 מ ',587,592,0,Rhine,הריין הוא הנהר הארוך ביותר בגרמניה. כאן נתקל ה...,429,436,400 מ ',429,436


## Transform the texts to encodings

In [None]:
# Pull the columns needed for tokenization and scoring out as plain lists
val_ids, val_is_impossible, val_contexts, val_questions = (
    val_df[column].to_list()
    for column in ('id', 'is_impossible', 'context', 'question')
)

In [None]:
# One dict per row holding the Hebrew character offsets of the answer span
val_answers = val_df.apply(lambda x: {'answer_start' : x.answer_start_heb, 'answer_end': x.answer_end_heb}, axis=1).to_list()

In [None]:
from transformers import XLMRobertaTokenizerFast
# Tokenizer that belongs to the SQuAD2-fine-tuned XLM-R checkpoint
tokenizer = XLMRobertaTokenizerFast.from_pretrained('deepset/xlm-roberta-base-squad2')

# Encode each (context, question) pair; over-long pairs are truncated and all
# pairs are padded to a common length.
# NOTE(review): the context is passed as the FIRST sequence and the question as
# the second. Extractive-QA checkpoints are commonly fine-tuned with the
# question first - confirm that this order matches how
# 'deepset/xlm-roberta-base-squad2' was trained, since a mismatch would
# degrade zero-shot scores.
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)

In [None]:
def add_token_positions(encodings, answers):
    """
    Convert character-level answer spans into token-level start/end labels.

    The labels are written to `encodings` under 'start_positions' and
    'end_positions' (the object is updated in place, nothing is returned).

    @param encodings: tokenizer output; must provide
                      char_to_token(batch_index, char_index) and update(dict)
    @param answers: one dict per example with the character offsets
                    'answer_start' and 'answer_end'. A (0, 0) span marks a
                    question without an answer.
    """
    # Label for "no answer": the first token of the encoded sequence (<s>/CLS).
    # Looking up character 0 instead would give the token that holds the first
    # character of the text, not the <s>/CLS token.
    no_answer_index = 0

    start_positions = []
    end_positions = []
    for i, answer in enumerate(answers):
        char_start = answer['answer_start']
        char_end = answer['answer_end']

        if char_start == 0 and char_end == 0:
            # Question without an answer: point both labels at <s>/CLS
            start_positions.append(no_answer_index)
            end_positions.append(no_answer_index)
            continue

        start_token = encodings.char_to_token(i, char_start)
        # A positive end offset is looked up at answer_end - 1, i.e. the end
        # offset is treated as one past the last answer character
        end_token = encodings.char_to_token(i, char_end - 1 if char_end > 0 else char_end)

        # A position of None means the answer passage has been truncated
        if start_token is None:
            start_token = tokenizer.model_max_length
        if end_token is None:
            end_token = tokenizer.model_max_length

        start_positions.append(start_token)
        end_positions.append(end_token)

    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

add_token_positions(val_encodings, val_answers)

In [None]:
# Attach the question ids and the is_impossible flags to the encodings so they
# travel with each example.
val_encodings.update({'id': val_ids, 'is_impossible': val_is_impossible})

## Create PyTorch dataset

In [None]:
class SquadDataset(torch.utils.data.Dataset):
    """Dataset that serves one encoded example per index."""

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        # 'id' (a string) and 'is_impossible' are handed through unchanged;
        # the id is used to map predictions to ids later. Every other field
        # is converted to a tensor.
        passthrough = ('id', 'is_impossible')
        tensors, raw = {}, {}
        for name, column in self.encodings.items():
            if name in passthrough:
                raw[name] = column[idx]
            else:
                tensors[name] = torch.tensor(column[idx])
        # Tensor fields first, then the pass-through fields
        return {**tensors, **raw}

    def __len__(self):
        return len(self.encodings.input_ids)

val_dataset = SquadDataset(val_encodings)

In [None]:
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

## Load the pretrained XLM-RoBERTa Q&A model (zero-shot, no fine-tuning)

In [None]:
from transformers import XLMRobertaForQuestionAnswering
model = XLMRobertaForQuestionAnswering.from_pretrained("deepset/xlm-roberta-base-squad2")

Downloading:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

In [None]:
def load_checkpoint(path, model):
  """
  Restore model weights from a saved checkpoint file.

  @param path: location of a checkpoint written with torch.save; it must hold
               the keys 'state_dict' and 'epoch'
  @param model: model whose parameters are overwritten in place
  @returns the value stored under 'epoch' in the checkpoint
  """

  # map_location='cpu' lets a checkpoint that was saved on a GPU load on a
  # CPU-only runtime as well. load_state_dict copies the values into the
  # model's existing parameters, so the model stays on its current device.
  # NOTE: torch.load unpickles the file - only load checkpoints from a
  # trusted source.
  checkpoint = torch.load(path, map_location='cpu')

  model.load_state_dict(checkpoint['state_dict'])

  return checkpoint['epoch']

## Code to evaluate model

In [None]:
def calc_metrics(model_inf, dataloader):
  """
  Evaluate a question-answering model on a dataset.

  The predicted span of an example is the argmax of its start logits and of
  its end logits. Exact Match and token-level F1 are computed against the gold
  span, separately for the examples with is_impossible == 0 and
  is_impossible == 1 and over all examples together.

  @param model_inf: model to evaluate; it is moved to the global `device` and
                    switched to eval mode
  @param dataloader: yields batches with the keys 'input_ids',
                     'attention_mask', 'start_positions', 'end_positions',
                     'id' and 'is_impossible'
  @returns dict with
           'f1':   {0: ..., 1: ..., 'all': ...} mean F1, rounded to 4 digits
           'em':   same layout for Exact Match
           'loss': model loss averaged over the batches
  """

  def mean_score(scores):
    # Mean over the per-id scores; 0.0 for an empty group instead of a
    # ZeroDivisionError
    return sum(scores.values()) / len(scores) if scores else 0.0

  # Evaluate the model that was passed in, not whatever global `model` exists
  model_inf.to(device)
  model_inf.eval()

  # Scores per question id, split by the is_impossible flag
  f1_scores = {0: {}, 1: {}}
  exact_scores = {0: {}, 1: {}}

  loss = 0

  for batch in tqdm(dataloader, total=len(dataloader)):
    with torch.no_grad():

      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      start_positions = batch['start_positions'].to(device)
      end_positions = batch['end_positions'].to(device)
      ids = batch['id']
      is_impossibles = batch['is_impossible']
      outputs = model_inf(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
      # With gold positions supplied, the first output is the loss
      loss += outputs[0]

      pred_start_positions = torch.argmax(outputs['start_logits'], dim=1)
      pred_end_positions = torch.argmax(outputs['end_logits'], dim=1)

      # Move the whole batch to Python lists once instead of reading single
      # elements from the device for every example
      gold_starts = start_positions.tolist()
      gold_ends = end_positions.tolist()
      pred_starts = pred_start_positions.tolist()
      pred_ends = pred_end_positions.tolist()
      token_ids = input_ids.tolist()

      # Calc scores
      for i, example_id in enumerate(ids):

        is_impossible = int(is_impossibles[i])
        group_em = exact_scores[is_impossible]
        group_f1 = f1_scores[is_impossible]

        # Exact match: both span boundaries agree. An id that matched before
        # keeps its 1.
        if gold_starts[i] == pred_starts[i] and gold_ends[i] == pred_ends[i]:
          group_em[example_id] = 1
        else:
          group_em[example_id] = max(0, group_em.get(example_id, 0))

        # Token ids inside the predicted and the gold span (both inclusive).
        # A span that lies outside the sequence or has end < start is empty.
        pred_tokens = token_ids[i][pred_starts[i]:pred_ends[i] + 1]
        answer_tokens = token_ids[i][gold_starts[i]:gold_ends[i] + 1]

        # Check how many predicted tokens correspond to gold answer tokens
        common = collections.Counter(answer_tokens) & collections.Counter(pred_tokens)
        num_same = sum(common.values())

        # If none same, then F1=0 , else use the F1 formula
        if num_same == 0:
          score = 0
        else:
          precision = 1.0 * num_same / len(pred_tokens)
          recall = 1.0 * num_same / len(answer_tokens)
          score = (2 * precision * recall) / (precision + recall)

        # We take the max F1 score of the gold answers
        group_f1[example_id] = max(score, group_f1.get(example_id, 0))

  f1_joined = {**f1_scores[0], **f1_scores[1]}
  em_joined = {**exact_scores[0], **exact_scores[1]}

  f1_split = {0: round(mean_score(f1_scores[0]), 4),
              1: round(mean_score(f1_scores[1]), 4),
              'all': round(mean_score(f1_joined), 4)}

  em_split = {0: round(mean_score(exact_scores[0]), 4),
              1: round(mean_score(exact_scores[1]), 4),
              'all': round(mean_score(em_joined), 4)}

  # max(..., 1) avoids a division by zero for an empty dataloader
  epoch_avg_loss = loss / max(len(dataloader), 1)

  metrics = {'f1': f1_split,
             'em': em_split,
             'loss': epoch_avg_loss}

  return metrics

## Evaluate the pretrained model (zero-shot)

In [None]:
model.to(device)

metrics = calc_metrics(model, val_loader)

print(metrics)

  0%|          | 0/354 [00:00<?, ?it/s]

{'f1': {0: 0.0266, 1: 0.0008, 'all': 0.0142}, 'em': {0: 0.0195, 1: 0.0, 'all': 0.0101}, 'loss': tensor(6.4845, device='cuda:0')}
