In [1]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, Trainer, BertTokenizerFast
import torch
from datasets import Dataset
from tqdm import tqdm
import pandas as pd
import numpy as np
import warnings
import pickle
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_name = "Seznam/small-e-czech"
max_length = 384  # 384 for BERT-like 382 for T5 models
doc_stride = 128
n_best_answers = 20

#### Load tokenizer and model 

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer

In [None]:
model = AutoModelForQuestionAnswering.from_pretrained("ELECTRA_czech_finetuned_sqad\\electra-czech-finetuned-sqad", local_files_only=True)
model

In [5]:
device = torch.device("cpu")
model.to(device)

ElectraForQuestionAnswering(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (embeddings_project): Linear(in_features=128, out_features=256, bias=True)
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=256, out_features=256, bias=True)
              (key): Linear(in_features=256, out_features=256, bias=True)
              (value): Linear(in_features=256, out_features=256, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_featu

#### Add answer sentence to sqad dataset for Answer Selection evaluation

In [2]:
sqad_csv = pd.read_csv("sqad_dataframe.csv")

In [3]:
sqad_csv = sqad_csv.rename(columns={'Unnamed: 0': 'id'})

In [74]:
test_df = pd.read_json("czech_test.json")

In [10]:
merged = pd.merge(test_df, sqad_csv, on="id")

In [11]:
merged = merged.rename(columns={"question_y":"question"})

In [12]:
merged = merged.drop(columns=["question_x","text","answer"])

In [14]:
def get_answer_sentence_span(row):
    start_index = row.context.index(row.answer_sentence)
    end_index = start_index + len(row.answer_sentence)
    return start_index, end_index

In [15]:
row = merged.iloc[2]
answer_span = get_answer_sentence_span(row)
answer_span
print(row.context[answer_span[0]:answer_span[1]])
print(row.answer_sentence)

První buzolu sestrojil český vynálezce Josef Ressel.
První buzolu sestrojil český vynálezce Josef Ressel.


In [56]:
merged["answer_sentence_span"] = merged.apply(get_answer_sentence_span, axis=1)

In [80]:
merged

Unnamed: 0,answers,context,id,title,question,answer_sentence,answer_sentence_span
0,"{'answer_start': [12], 'text': ['levostranný']}",Dřevnice je levostranný přítok řeky Moravy ve ...,12242,Jaký přítok je Dřevnice?,Jaký přítok je Dřevnice?,Dřevnice je levostranný přítok řeky Moravy ve ...,"(0, 61)"
1,"{'answer_start': [0], 'text': 've Foxrocku'}",Samuel Barclay Beckett [Bekit] (13. dubna 1906...,2490,Kde se narodil Samuel Beckett?,Kde se narodil Samuel Beckett?,Samuel Barclay Beckett [Bekit] (13. dubna 1906...,"(0, 130)"
2,"{'answer_start': [216], 'text': ['Josef Ressel']}",Buzola (také busola) je jednoduchý přístroj pr...,5571,Který český vynálezce sestrojil první buzolu?,Který český vynálezce sestrojil první buzolu?,První buzolu sestrojil český vynálezce Josef R...,"(177, 229)"
3,"{'answer_start': [2084], 'text': ['1869']}","Deoxyribonukleová kyselina, běžně označovaná D...",7325,Ve kterém roce byla popsána deoxyribonukleová ...,Ve kterém roce byla popsána deoxyribonukleová ...,Deoxyribonukleová kyselina byla popsána roku 1...,"(2039, 2173)"
4,"{'answer_start': [168], 'text': ['strojové ins...","Centrální procesorová jednotka (zkratka CPU, a...",3770,Co vykonává procesor?,Co vykonává procesor?,"Centrální procesorová jednotka (zkratka CPU, a...","(0, 263)"
...,...,...,...,...,...,...,...
2484,"{'answer_start': [1137], 'text': ['na počátku ...","Gainax (japonsky ガ, Gainakkusu) je japonské an...",4817,Kdy bylo založeno studio Gainax?,Kdy bylo založeno studio Gainax?,Studio Gainax bylo založeno na počátku 80. let...,"(1109, 1278)"
2485,"{'answer_start': [830], 'text': ['Titan je šed...","Titan (chemická značka Ti, latinsky Titanium) ...",11707,Je titan lehký kov?,Je titan lehký kov?,"Titan je šedý až stříbřitě bílý, lehký a tvrdý...","(830, 881)"
2486,"{'answer_start': [17], 'text': ['10. března 19...",Eva Herzigová (* 10. března 1973 Litvínov) je ...,5771,Kdy se narodila Eva Herzigová?,Kdy se narodila Eva Herzigová?,Eva Herzigová (* 10. března 1973 Litvínov) je ...,"(0, 75)"
2487,"{'answer_start': [58], 'text': ['Vladimír Štan...","Michal David (* 14. července 1960 Praha), vlas...",10738,Jak se vlastním jménem jmenuje Michal David?,Jak se vlastním jménem jmenuje Michal David?,"Michal David (* 14. července 1960 Praha), vlas...","(0, 129)"


In [58]:
merged.to_json("czech_test_answer_sentence.json")

#### Tokenize row in a similar way to training tokenization

In [2]:
def tokenize_row(row):
    tokenized_row = tokenizer(
        row["question"],
        row["context"],
        truncation="only_second",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )
    sample_map = tokenized_row.pop("overflow_to_sample_mapping")
    tokenized_row["example_id"] = []
    for i in range(len(tokenized_row["input_ids"])):
        sample_idx = sample_map[i]
        tokenized_row["example_id"].append(row["id"][sample_idx])
        offset = tokenized_row["offset_mapping"][i]
        for k, v in enumerate(tokenized_row["offset_mapping"][i]):
            if tokenized_row.sequence_ids(i)[k] == 1:
                tokenized_row["offset_mapping"][i][k] = v
            else:
                tokenized_row["offset_mapping"][i][k] = None
    return tokenized_row

In [76]:
dataset_test = Dataset.from_pandas(test_df)
dataset_test = dataset_test.remove_columns(["__index_level_0__"])
dataset_test

Dataset({
    features: ['answers', 'context', 'id', 'question', 'title'],
    num_rows: 2489
})

In [None]:
tokenized_test =  dataset_test.map(tokenize_row, remove_columns=dataset_test.column_names)

In [11]:
tokenized_test

Dataset({
    features: ['input_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping', 'example_id'],
    num_rows: 2489
})

In [13]:
tokenized_test.features

{'input_ids': Sequence(feature=Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), length=-1, id=None),
 'offset_mapping': Sequence(feature=Sequence(feature=Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), length=-1, id=None), length=-1, id=None),
 'overflow_to_sample_mapping': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None),
 'example_id': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}

In [87]:
tokenized_test[0].keys()

dict_keys(['input_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping', 'example_id'])

In [95]:
from transformers import default_data_collator
data_collator = default_data_collator

#### Using Huggingface's TrainerAPI get raw predictions for each row in dataset

In [35]:
trainer = Trainer(
    model,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [36]:
preds = trainer.predict(tokenized_test)

The following columns in the test set don't have a corresponding argument in `XLMRobertaForQuestionAnswering.forward` and have been ignored: offset_mapping, example_id. If offset_mapping, example_id are not expected by `XLMRobertaForQuestionAnswering.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 35661
  Batch size = 8


In [43]:
preds

PredictionOutput(predictions=(array([[  0.44915432, -12.860064  , -13.58446   , ..., -12.721458  ,
        -13.542861  ,  -8.7459    ],
       [  6.869243  , -12.151521  , -12.601073  , ..., -11.126399  ,
        -12.660603  ,  -9.913016  ],
       [  6.720505  , -12.19447   , -12.7076025 , ..., -12.286344  ,
        -12.286344  , -12.286344  ],
       ...,
       [  6.298307  , -11.800813  , -12.75388   , ..., -13.617656  ,
        -13.32124   ,  -9.910314  ],
       [  6.5893917 , -11.885697  , -12.509168  , ..., -13.526317  ,
        -13.197158  , -10.013032  ],
       [  6.6067247 , -11.875133  , -12.425851  , ..., -13.412604  ,
        -13.412604  , -13.412604  ]], dtype=float32), array([[ -0.8379871, -13.17233  , -12.33597  , ..., -14.212399 ,
        -13.609648 ,  -8.457344 ],
       [  6.450125 , -13.804589 , -13.512745 , ..., -14.266769 ,
        -14.2044115,  -9.191486 ],
       [  6.32526  , -13.813894 , -13.527146 , ..., -13.66621  ,
        -13.66621  , -13.66621  ],
     

In [33]:
len(preds[0][0])

29713

In [80]:
from collections import OrderedDict, defaultdict
from tqdm import tqdm

#### Preprocess raw predictions from models and create top 20 answer predictions

In [40]:
def preprocess_predictions(dataset, tokenized_dataset, predictions):
    start_logits, end_logits = predictions
    predictions = OrderedDict()
    example_id_to_index = {k: i for i, k in enumerate(dataset["id"])}

    features_per_example = defaultdict(list)
    for i, feature in enumerate(tqdm(tokenized_dataset)):
        features_per_example[example_id_to_index[feature["example_id"]]].append(i)
    #all of the questions   
    for example_index, example in enumerate(tqdm(dataset)):
        valid_answers = []
        #current question
        for input_index in features_per_example[example_index]:
            current_start_logits = start_logits[input_index]
            current_end_logits = end_logits[input_index]
            offset_mapping = tokenized_dataset[input_index]["offset_mapping"]
            start_indexes = np.argsort(current_start_logits)[-1 : -n_best_answers - 1 : -1].tolist()
            end_indexes = np.argsort(current_end_logits)[-1 : -n_best_answers - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    if  start_index >= len(offset_mapping) \
                        or end_index >= len(offset_mapping) \
                        or offset_mapping[start_index] is None \
                        or offset_mapping[end_index] is None:
                        continue
                    if end_index < start_index:
                        continue
                    if start_index <= end_index:
                        start_char = offset_mapping[start_index][0]
                        end_char = offset_mapping[end_index][1]
                        valid_answers.append({"score": current_start_logits[start_index] + current_end_logits[end_index],
                                              "text": example["context"][start_char: end_char],
                                              "text_start": start_char})
                                            
        if len(valid_answers) > 0:
            best_answers = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[:n_best_answers]
        else:
            best_answer = {"text": "", "score": 0.0}
        predictions[example["id"]] = best_answers
        
    return predictions

In [41]:
prepd_predictions = preprocess_predictions(dataset_test, tokenized_test, preds.predictions)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 35661/35661 [00:49<00:00, 715.78it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2489/2489 [01:06<00:00, 37.50it/s]


In [None]:
prepd_predictions = dict(prepd_predictions)

In [None]:
with open('model_predictions\\electra_czech_predictions.json', 'wb') as fp:
    pickle.dump(prepd_predictions, fp)