In [2]:
from transformers import AutoTokenizer
import pandas as pd
# Name of the pretrained checkpoint whose tokenizer is loaded below.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [4]:
# Load the CSV (presumably the emrQA validation split, going by the file name).
df = pd.read_csv('../data/emrqa_val.csv')
# Draw five rows to experiment with. The fixed seed makes the draw repeatable
# across "Restart Kernel -> Run All"; reset_index(drop=True) renumbers the
# sampled rows 0..4 so they can be looked up by position in later cells.
samples = df.sample(5, random_state=42).reset_index(drop=True)

In [11]:
# Encode every (question, evidence) pair of the sample in one batch call.
questions = samples['question'].tolist()
contexts = samples['evidence'].tolist()
inputs = tokenizer(
    questions,
    contexts,
    truncation='only_second',        # only the evidence may be truncated
    stride=50,                       # overlap between overflowing chunks
    return_overflowing_tokens=True,  # keep the truncated remainder as extra features
    return_offsets_mapping=True,     # character span of every token
)
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping'])

In [26]:
# Compute, for every tokenized feature, the token indices of the first and last
# token of the answer span inside the evidence.
#
# One sample can produce several features (return_overflowing_tokens=True), so
# the feature index `i` is NOT a row index of `samples`; the source row is
# looked up through `overflow_to_sample_mapping`.
# Features that do not contain the whole answer (answer string not found, or it
# falls outside this feature's window of the evidence) are labelled (0, 0).
start_positions = []
end_positions = []
sample_map = inputs['overflow_to_sample_mapping']
for i, offset in enumerate(inputs['offset_mapping']):
    sample_idx = sample_map[i]
    # `question` is not needed for the labels; it is kept bound for inspection.
    question = samples['question'][sample_idx]
    answer = samples['answer'][sample_idx]
    evidence = samples['evidence'][sample_idx].lower()
    # Case-insensitive search: both sides are lowercased, otherwise an answer
    # containing capitals is never found in the lowercased evidence.
    calc_answer_start = evidence.find(answer.lower())
    end_char = calc_answer_start + len(answer)
    sequence_ids = inputs.sequence_ids(i)

    # Locate the first and last token of the context (sequence id 1).
    idx = 0
    while sequence_ids[idx] != 1:
        idx += 1
    context_start = idx
    # Bounded scan so a context that runs to the very last token cannot index
    # past the end of the list.
    while idx < len(sequence_ids) and sequence_ids[idx] == 1:
        idx += 1
    context_end = idx - 1

    answer_missing = calc_answer_start == -1
    if (answer_missing
            or offset[context_start][0] > calc_answer_start
            or offset[context_end][1] < end_char):
        # Answer is absent from this feature: point both labels at token 0.
        start_positions.append(0)
        end_positions.append(0)
        continue

    # Walk forward to the last token that starts at or before the answer start.
    idx = context_start
    while idx <= context_end and offset[idx][0] <= calc_answer_start:
        idx += 1
    start_positions.append(idx - 1)

    # Walk backward to the first token that ends at or after the answer end.
    idx = context_end
    while idx >= context_start and offset[idx][1] >= end_char:
        idx -= 1
    end_positions.append(idx + 1)

start_positions, end_positions

([20, 20, 12, 33, 43], [24, 25, 14, 36, 43])

In [33]:
# Sanity check: decode the labelled token span of every feature and print it
# next to the reference answer of the sample that feature came from.
# Iterates over features (not over samples) because one sample may overflow
# into several features; `sample_idx` maps each feature back to its row.
for i, sample_idx in enumerate(inputs["overflow_to_sample_mapping"]):
    answer = samples['answer'][sample_idx]

    start = start_positions[i]
    end = end_positions[i]
    # Slice is inclusive of the end token, hence `end + 1`.
    labeled_answer = tokenizer.decode(inputs["input_ids"][i][start : end + 1])
    print(f"Theoretical answer: {answer}, labels give: {labeled_answer}")

Theoretical answer: her electrocardiogram, labels give: her electrocardiogram
Theoretical answer: s/p lad ptca, labels give: s / p lad ptca
Theoretical answer: 160/80, labels give: 160 / 80
Theoretical answer: colonoscopy, labels give: colonoscopy
Theoretical answer: vt, labels give: vt


In [34]:
# Re-tokenize a single (question, evidence) pair so its sequence ids and
# offsets can be inspected in isolation. The pair is selected explicitly here
# (last row of `samples`, evidence lowercased) instead of relying on whatever
# `question` / `evidence` happened to be left bound by an earlier cell.
# NOTE: this rebinds `inputs`, replacing the batch encoding created earlier.
debug_idx = len(samples) - 1
question = samples['question'][debug_idx]
evidence = samples['evidence'][debug_idx].lower()
inputs = tokenizer(question, evidence, truncation='only_second',
                   stride=50, return_overflowing_tokens=True, return_offsets_mapping=True)

In [36]:
# Sequence id of every token of the first (index 0) feature:
# None for special tokens, 0 for question tokens, 1 for evidence tokens.
inputs.sequence_ids(0)

[None,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 None,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 None]

In [37]:
# Character (start, end) span of every token, one inner list per feature.
# Special tokens show up as (0, 0); the spans restart at 0 where the second
# sequence begins, as the output below shows.
offset = inputs['offset_mapping']
# Bare expression so the notebook displays the mapping.
offset

[[(0, 0),
  (0, 4),
  (5, 8),
  (9, 12),
  (13, 23),
  (24, 27),
  (28, 30),
  (31, 38),
  (38, 39),
  (39, 40),
  (41, 43),
  (43, 49),
  (0, 0),
  (0, 1),
  (1, 2),
  (3, 5),
  (6, 7),
  (8, 9),
  (10, 11),
  (12, 20),
  (21, 24),
  (24, 25),
  (26, 28),
  (29, 33),
  (34, 38),
  (39, 46),
  (47, 49),
  (50, 52),
  (53, 54),
  (55, 57),
  (57, 61),
  (61, 64),
  (65, 66),
  (66, 69),
  (69, 70),
  (71, 72),
  (73, 81),
  (82, 95),
  (96, 102),
  (103, 104),
  (104, 107),
  (108, 116),
  (117, 119),
  (120, 122),
  (123, 124),
  (125, 127),
  (128, 129),
  (130, 131),
  (132, 133),
  (134, 135),
  (136, 138),
  (139, 146),
  (147, 150),
  (151, 158),
  (159, 161),
  (162, 169),
  (170, 172),
  (173, 175),
  (175, 181),
  (182, 183),
  (184, 185),
  (186, 187),
  (188, 195),
  (196, 202),
  (203, 211),
  (212, 215),
  (216, 219),
  (220, 228),
  (229, 231),
  (231, 234),
  (234, 237),
  (238, 239),
  (240, 243),
  (243, 246),
  (247, 248),
  (249, 252),
  (253, 254),
  (255, 259),
  (2