In [1]:
import json
import gc
import random
from pathlib import Path
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, Subset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, AdamW
from tqdm import tqdm

In [2]:
# Notebook-wide pandas settings, applied in one call (set_option accepts
# alternating option-name / value pairs). The 9999 limits effectively remove
# pandas' truncation of columns, rows and cell width in displayed output.
pd.set_option(
    "use_inf_as_na", True,
    "max_info_columns", 9999,
    "display.max_columns", 9999,
    "display.max_rows", 9999,
    'max_colwidth', 9999,
)
# Hook tqdm into pandas.
tqdm.pandas()

In [3]:
# Local checkpoint directory and the maximum sequence length given to the tokenizer.
pretrained_dir = "pretrained/google/electra-small-discriminator"
model_max_length = 512
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_dir,
    model_max_length=model_max_length,
)
# Show the tokenizer's repr, then the input names it reports, on separate lines.
print(repr(tokenizer), tokenizer.model_input_names, sep="\n")

PreTrainedTokenizerFast(name_or_path='pretrained/google/electra-small-discriminator', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})
['input_ids', 'token_type_ids', 'attention_mask']


In [4]:
# Build the question-answering model from the same checkpoint directory as the
# tokenizer and show its configuration.
model = AutoModelForQuestionAnswering.from_pretrained(pretrained_dir)
print(f"{model.config!r}")

Some weights of the model checkpoint at pretrained/google/electra-small-discriminator were not used when initializing ElectraForQuestionAnswering: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForQuestionAnswering were not initialized from the model checkpoint at pretrained/google/electra-small-discriminator and are newly initialized: ['qa_outpu

ElectraConfig {
  "_name_or_path": "pretrained/google/electra-small-discriminator",
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "embedding_size": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 4,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj": true,
  "transformers_version": "4.5.1",
  "type_vocab_size": 2,
  "vocab_size": 30522
}



In [5]:
# Load the training table and keep a 6% random subset of its rows.
# random_state pins the subset so that Restart & Run All trains on the same
# rows every time (an unseeded sample changes on each run).
train = pd.read_parquet("input/train.parquet")
train = train.sample(frac=0.06, random_state=42)
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21600 entries, 191264 to 147282
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Id                    21600 non-null  object
 1   is_multi              21600 non-null  int8  
 2   ground_truth          21600 non-null  object
 3   dataset_labels        21600 non-null  object
 4   is_impossible         21600 non-null  int8  
 5   answer_start          21600 non-null  int16 
 6   answer_end            21600 non-null  int16 
 7   context               21600 non-null  object
 8   context_token_length  21600 non-null  int16 
dtypes: int16(3), int8(2), object(4)
memory usage: 1012.5+ KB


In [6]:
%%time
question = "what dataset"
questions = [question] * len(train)
enc = tokenizer(list(train["context"]), questions, padding="max_length")
print(f"{repr(enc.keys())}\nlen={len(enc['input_ids'])}")

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
len=21600
CPU times: user 20.5 s, sys: 2.56 s, total: 23.1 s
Wall time: 5.67 s


In [7]:
def add_token_positions(encodings, answer_start, answer_end, ids, is_impossible):
    """Attach per-row answer token positions to ``encodings`` in place.

    For each row ``i`` with ``is_impossible[i] == 0`` the start position is
    ``encodings.char_to_token(i, answer_start[i])`` and the end position is
    ``encodings.char_to_token(i, answer_end[i] - 1)`` (``answer_end`` is
    treated as exclusive). Every other row gets position 0 for both.

    Args:
        encodings: object providing ``char_to_token(row, char_index)`` and
            ``update(dict)``; it receives the new ``start_positions`` and
            ``end_positions`` lists.
        answer_start: per-row character index where the answer starts.
        answer_end: per-row character index just past the answer.
        ids: per-row identifiers, used only in the error message.
        is_impossible: per-row flag; 0 means the row has an answer.

    Raises:
        ValueError: if a start or end character maps to no token, or if the
            start token comes after the end token.
    """
    start_positions, end_positions = [], []
    for i, impossible in enumerate(is_impossible):
        if impossible != 0:
            # Row without an answer: both positions stay at 0.
            start_positions.append(0)
            end_positions.append(0)
            continue

        start_token = encodings.char_to_token(i, answer_start[i])
        if start_token is None:
            _id = ids[i]
            raise ValueError(f"start pos must not be None. i={i}, id={_id}, answer_start={answer_start[i]}")

        # answer_end is exclusive, so the last answer character is one before it.
        end_token = encodings.char_to_token(i, answer_end[i] - 1)
        if end_token is None:
            raise ValueError("end pos must not be None")

        if start_token > end_token:
            raise ValueError("start pos must be less than or equals end pos")

        start_positions.append(start_token)
        end_positions.append(end_token)

    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

In [8]:
class MyDataset(torch.utils.data.Dataset):
    """Dataset view over an encodings mapping of per-example sequences.

    ``encodings`` must support ``.items()`` and expose ``input_ids`` as an
    attribute; each value is indexed by example position.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        # The number of examples is the number of input_ids rows.
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        # Build one example: every field's idx-th entry, converted to a tensor.
        sample = {}
        for key, val in self.encodings.items():
            sample[key] = torch.tensor(val[idx])
        return sample

In [9]:
%%time
add_token_positions(
    enc, 
    answer_start=list(train["answer_start"]), 
    answer_end=list(train["answer_end"]),
    ids=list(train["Id"]),
    is_impossible=list(train["is_impossible"]),
)
train_ds = MyDataset(enc)

CPU times: user 31.2 ms, sys: 0 ns, total: 31.2 ms
Wall time: 36.1 ms


In [10]:
# Drop the notebook-level names that are no longer needed and run a collection.
# Note: train_ds still references the encodings through its `encodings`
# attribute, so only the name `enc` goes away here.
del enc
del questions
gc.collect()

80

In [None]:
# Fine-tune the QA model on train_ds, on the GPU when one is available.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
model.train()
train_loader = DataLoader(train_ds, batch_size=128, shuffle=True)
optim = AdamW(model.parameters(), lr=5e-4)
epochs = 1
for epoch in range(epochs):
    # Running mean of the per-batch loss, kept as a plain Python float.
    loss_mean = 0.0
    steps = len(train_loader)
    for batch in tqdm(train_loader):
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch["token_type_ids"].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        # Passing start/end positions makes the model return the loss first.
        outputs = model(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask,
                        start_positions=start_positions, end_positions=end_positions)
        loss = outputs[0]
        loss.backward()
        optim.step()
        # .item() yields a detached Python number. Accumulating the tensor
        # itself would keep every batch's autograd graph alive until the
        # epoch ends, steadily growing memory use.
        loss_mean += loss.item() / steps
    print(f"epoch={epoch}, loss={loss_mean:.4f}")

  1%|          | 1/169 [03:16<9:11:21, 196.91s/it]

In [None]:
# Switch the model to evaluation mode now that the training cell has run.
model.eval()

In [None]:
# Save the fine-tuned model under the "output" directory.
model.save_pretrained("output")

In [None]:
# Reload the model from the "output" directory (replacing the in-memory one)
# and show its configuration.
model = AutoModelForQuestionAnswering.from_pretrained("output")
print(f"{model.config!r}")

In [None]:
# Draw a small random sample of training rows for a qualitative spot check.
df = train.sample(30)
# Size the question list from the sample instead of repeating the literal 30.
questions = [question] * len(df)
contexts = list(df["context"])
is_impossible = list(df["is_impossible"])
# `train` has no "answer_text" column (see the train.info() listing), so
# indexing it raised KeyError. Rebuild the gold answer from the character
# offsets instead, treating answer_end as exclusive, the same way
# add_token_positions does. Rows flagged impossible get an empty gold string.
# NOTE(review): assumes the offsets index into `context`; confirm against the
# preprocessing that produced the parquet file.
golds = [
    "" if impossible else context[start:end]
    for context, start, end, impossible in zip(
        contexts, df["answer_start"], df["answer_end"], is_impossible
    )
]

In [None]:
# Run the reloaded model on the sampled contexts and print each predicted
# span next to its gold answer.
inputs = tokenizer(contexts, questions, truncation="only_first", padding="max_length", return_tensors="pt")
input_ids = inputs["input_ids"]
# Inference only: disable autograd so no graph is built for the forward pass.
with torch.no_grad():
    outputs = model(**inputs)
# Read the logits by name rather than unpacking .values(), which depends on
# the output having exactly two fields in this order.
start_logits, end_logits = outputs.start_logits, outputs.end_logits
for i in range(len(start_logits)):
    j = torch.argmax(start_logits[i])
    # +1 turns the predicted end token index into an exclusive slice bound.
    k = torch.argmax(end_logits[i]) + 1
    a = "IMPOSSIBLE"
    # Position 0 is what training uses for rows without an answer, so a start
    # of 0 (or an empty/inverted span) is reported as IMPOSSIBLE.
    if 0 < j < k:
        tokens = tokenizer.convert_ids_to_tokens(input_ids[i][j:k])
        a = tokenizer.convert_tokens_to_string(tokens)
    print(f"\n\nis_impossible={is_impossible[i]}\nq={questions[i]}\nc={contexts[i]}\ni={i}, j={j}, k={k}\na={a}\ng={golds[i]}")