In [14]:
import json
import random
from pathlib import Path
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, Subset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, AdamW
from tqdm import tqdm

In [15]:
# Notebook-wide pandas configuration, applied in a single call that takes
# alternating (option pattern, value) pairs:
#   - treat +/-inf as missing values,
#   - effectively remove the limits on columns/rows/cell width shown by
#     DataFrame.info() and the DataFrame repr.
pd.set_option(
    "use_inf_as_na", True,
    "max_info_columns", 9999,
    "display.max_columns", 9999,
    "display.max_rows", 9999,
    'max_colwidth', 9999,
)
# Register tqdm's progress_apply / progress_map helpers on pandas objects.
tqdm.pandas()

In [16]:
%%time
# Local directory holding the pretrained checkpoint; it is reused later in the
# notebook to load the question-answering model.
pretrained_dir = "pretrained/google/electra-small-discriminator"
# Longest token sequence kept for training; later cells drop examples whose
# encoded length exceeds this value.
model_max_length = 512
tokenizer = AutoTokenizer.from_pretrained(pretrained_dir, model_max_length=model_max_length)
# Show the tokenizer configuration and the input names it produces.
print(f"{repr(tokenizer)}\n{tokenizer.model_input_names}")

PreTrainedTokenizerFast(name_or_path='pretrained/google/electra-small-discriminator', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})
['input_ids', 'token_type_ids', 'attention_mask']
CPU times: user 15.6 ms, sys: 0 ns, total: 15.6 ms
Wall time: 24.6 ms


In [17]:
# Read the SQuAD training split and keep every row except the single example
# with the id below (the original index labels are preserved).
# NOTE(review): the reason this example is excluded is not recorded in the
# notebook — confirm and document it.
train = pd.read_parquet("input/squad/train.parquet")
train = train[train["id"] != "5acd29f507355d001abf3774"]
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 130318 entries, 0 to 130318
Data columns (total 7 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             130318 non-null  object
 1   is_impossible  130318 non-null  int8  
 2   question       130318 non-null  object
 3   answer_start   130318 non-null  int16 
 4   answer_end     130318 non-null  int16 
 5   answer_text    130318 non-null  object
 6   context        130318 non-null  object
dtypes: int16(2), int8(1), object(4)
memory usage: 5.6+ MB


In [18]:
%%time
# First tokenization pass over (context, question) pairs, with no padding or
# truncation requested. The next cell uses the resulting sequence lengths to
# drop examples longer than model_max_length.
enc = tokenizer(list(train["context"]), list(train["question"]))
print(f"{repr(enc.keys())}\nlen={len(enc['input_ids'])}")

Token indices sequence length is longer than the specified maximum sequence length for this model (518 > 512). Running this sequence through the model will result in indexing errors


dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
len=130318
CPU times: user 1min 1s, sys: 125 ms, total: 1min 1s
Wall time: 19.7 s


In [19]:
%%time
# Positions of the examples whose encoded length fits within model_max_length.
indices = [
    i
    for i, v in enumerate(enc["input_ids"])
    if len(v) <= model_max_length
]
# Keep only those rows, then tokenize again: this time every sequence is padded
# to the maximum length and character offsets are returned for each token.
train = train.iloc[indices]
enc = tokenizer(
    list(train["context"]),
    list(train["question"]),
    padding="max_length",
    return_offsets_mapping=True,
)
print(f"{repr(enc.keys())}\nlen={len(enc['input_ids'])}")

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping'])
len=130134
CPU times: user 1min 16s, sys: 39.7 s, total: 1min 55s
Wall time: 1min


In [20]:
def add_token_positions(encodings, answer_start, answer_end, ids, is_impossible):
    """Attach token-level answer spans to ``encodings`` in place.

    For each example ``i`` whose ``is_impossible[i]`` equals 0, the character
    offsets ``answer_start[i]`` and ``answer_end[i] - 1`` are converted to token
    indices with ``encodings.char_to_token``. Every other example gets the span
    ``(0, 0)``. The results are stored on ``encodings`` under the keys
    ``'start_positions'`` and ``'end_positions'``.

    Args:
        encodings: object exposing ``char_to_token(example_index, char_index)``,
            item access for ``"offset_mapping"`` and ``update``.
        answer_start: per-example character offset where the answer begins.
        answer_end: per-example character offset one past the answer's end.
        ids: per-example identifiers, used only in the error message.
        is_impossible: per-example flags; 0 means the question has an answer.

    Raises:
        ValueError: if either offset maps to no token, or the start token comes
            after the end token. ``encodings`` is not updated in that case.
    """
    span_starts, span_ends = [], []
    for i, impossible in enumerate(is_impossible):
        if impossible != 0:
            # No answer: both positions stay at token index 0.
            span_starts.append(0)
            span_ends.append(0)
            continue

        first_token = encodings.char_to_token(i, answer_start[i])
        if first_token is None:
            offsets = encodings["offset_mapping"][i]
            _id = ids[i]
            raise ValueError(f"start pos must not be None. i={i}, id={_id}, answer_start={answer_start[i]}\noffsets={offsets}")

        # answer_end is exclusive, so look up the last character of the answer.
        last_token = encodings.char_to_token(i, answer_end[i] - 1)
        if last_token is None:
            raise ValueError("end pos must not be None")
        if first_token > last_token:
            raise ValueError("start pos must be less than or equals end pos")

        span_starts.append(first_token)
        span_ends.append(last_token)

    encodings.update({'start_positions': span_starts, 'end_positions': span_ends})

In [21]:
class SquadDataset(torch.utils.data.Dataset):
    """Map-style dataset over a column-oriented mapping of encoded examples.

    ``encodings`` maps each field name (e.g. ``"input_ids"``) to a sequence with
    one entry per example. Both a tokenizer's batch encoding and a plain
    ``dict`` are accepted, because only item access and ``.items()`` are used.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, one per stored field."""
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        """Return the number of examples, taken from the ``"input_ids"`` field."""
        # Item access (rather than attribute access) works for plain dicts as
        # well as for tokenizer batch encodings.
        return len(self.encodings["input_ids"])


In [22]:
%%time
# Add 'start_positions' / 'end_positions' to the encodings (in place), using
# the answer character offsets and impossibility flags from the training frame.
add_token_positions(
    enc, 
    answer_start=list(train["answer_start"]), 
    answer_end=list(train["answer_end"]),
    ids=list(train["id"]),
    is_impossible=list(train["is_impossible"]),
)
# Wrap the labelled encodings in a torch Dataset. The dataset keeps its own
# reference, so the notebook-level name can be removed.
train_ds = SquadDataset(enc)
del enc

CPU times: user 219 ms, sys: 0 ns, total: 219 ms
Wall time: 211 ms


In [23]:
%%time
# Load a question-answering model from the same local checkpoint directory the
# tokenizer was loaded from, then show its configuration.
model = AutoModelForQuestionAnswering.from_pretrained(pretrained_dir)
print(repr(model.config))

Some weights of the model checkpoint at pretrained/google/electra-small-discriminator were not used when initializing ElectraForQuestionAnswering: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForQuestionAnswering were not initialized from the model checkpoint at pretrained/google/electra-small-discriminator and are newly initialized: ['qa_outpu

ElectraConfig {
  "_name_or_path": "pretrained/google/electra-small-discriminator",
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "embedding_size": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 4,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj": true,
  "transformers_version": "4.5.1",
  "type_vocab_size": 2,
  "vocab_size": 30522
}

CPU times: user 391 ms, sys: 141 ms, total: 531 ms
Wall time: 459 ms


In [24]:
# Fine-tune the QA model on a random 10% sample of the training dataset.
# NOTE(review): no random seed is set, so the sampled subset and the shuffling
# order differ between runs.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
model.train()

# Draw the subset of example indices without replacement.
max_examples = int(0.1 * len(train_ds))
indices = random.sample(range(len(train_ds)), max_examples)
sample_ds = Subset(train_ds, indices)
# To train on every example instead, build the loader from train_ds.
train_loader = DataLoader(sample_ds, batch_size=128, shuffle=True)

optim = AdamW(model.parameters(), lr=5e-4)
epochs = 2
steps = len(train_loader)  # batches per epoch; constant across epochs
for epoch in range(epochs):
    loss_mean = 0.0
    for batch in tqdm(train_loader):
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch["token_type_ids"].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        outputs = model(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask,
                        start_positions=start_positions, end_positions=end_positions)
        # When label positions are supplied, the first output is the loss.
        loss = outputs[0]
        loss.backward()
        optim.step()
        # Accumulate a plain Python float: adding the loss tensor itself would
        # keep autograd history alive for every step of the epoch.
        loss_mean += loss.item() / steps
    print(f"epoch={epoch}, loss={loss_mean:.4f}")

100%|██████████| 102/102 [5:33:00<00:00, 195.89s/it] 
  0%|          | 0/102 [00:00<?, ?it/s]

epoch=0, loss=2.7604


100%|██████████| 102/102 [5:31:52<00:00, 195.22s/it] 

epoch=1, loss=1.6722





In [25]:
# Switch the model to evaluation mode (e.g. disables dropout). As the cell's
# last expression, the returned module is also displayed.
model.eval()

ElectraForQuestionAnswering(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (embeddings_project): Linear(in_features=128, out_features=256, bias=True)
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=256, out_features=256, bias=True)
              (key): Linear(in_features=256, out_features=256, bias=True)
              (value): Linear(in_features=256, out_features=256, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_featu

In [26]:
# Persist the fine-tuned model to the local "tmp" directory; a later cell
# reloads it from there.
model.save_pretrained("tmp")

In [27]:
%%time
# Reload the fine-tuned model from the "tmp" directory written by the previous
# cell (replacing the in-memory model) and show its configuration.
model = AutoModelForQuestionAnswering.from_pretrained("tmp")
print(repr(model.config))

ElectraConfig {
  "_name_or_path": "tmp",
  "architectures": [
    "ElectraForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "embedding_size": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 4,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj": true,
  "transformers_version": "4.5.1",
  "type_vocab_size": 2,
  "vocab_size": 30522
}

CPU times: user 344 ms, sys: 15.6 ms, total: 359 ms
Wall time: 294 ms


In [28]:
# Draw 30 random rows from the (filtered) training frame for a qualitative
# check, and pull the columns needed for inference and display into lists.
# NOTE(review): sampling is unseeded, so the rows differ between runs.
df = train.sample(30)
questions, contexts, golds, is_impossible = (
    list(df[column])
    for column in ("question", "context", "answer_text", "is_impossible")
)

In [30]:
# Run the fine-tuned model on the sampled examples and print, per example, the
# predicted answer next to the gold answer.
inputs = tokenizer(contexts, questions, truncation="only_first", padding="max_length", return_tensors="pt")
input_ids = inputs["input_ids"]
# Inference only: skip autograd bookkeeping to save memory and time.
with torch.no_grad():
    outputs = model(**inputs)
# Read the logits by name instead of relying on the order of the output fields.
start_logits, end_logits = outputs.start_logits, outputs.end_logits
for i in range(len(start_logits)):
    j = int(torch.argmax(start_logits[i]))    # predicted first answer token
    k = int(torch.argmax(end_logits[i])) + 1  # one past the predicted last token
    a = "IMPOSSIBLE"
    # Token index 0 is the label add_token_positions assigns to unanswerable
    # questions, so a predicted start of 0 means "no answer" — as does an empty
    # or reversed span.
    if 0 < j < k:
        tokens = tokenizer.convert_ids_to_tokens(input_ids[i][j:k])
        a = tokenizer.convert_tokens_to_string(tokens)
    print(f"\n\nis_impossible={is_impossible[i]}\nq={questions[i]}\nc={contexts[i]}\ni={i}, j={j}, k={k}\na={a}\ng={golds[i]}")



is_impossible=0
q=In what year were all Tibetan Muslims declared Indiana citizens?
c=Muslims have been living in Tibet since as early as the 8th or 9th century. In Tibetan cities, there are small communities of Muslims, known as Kachee (Kache), who trace their origin to immigrants from three main regions: Kashmir (Kachee Yul in ancient Tibetan), Ladakh and the Central Asian Turkic countries. Islamic influence in Tibet also came from Persia. After 1959 a group of Tibetan Muslims made a case for Indian nationality based on their historic roots to Kashmir and the Indian government declared all Tibetan Muslims Indian citizens later on that year. Other Muslim ethnic groups who have long inhabited Tibet include Hui, Salar, Dongxiang and Bonan. There is also a well established Chinese Muslim community (gya kachee), which traces its ancestry back to the Hui ethnic group of China.
i=0, j=78, k=79
a=1959
g=1959


is_impossible=0
q=What did Apple's creation of too many similar models do to pote