In [1]:
import json
import gc
import random
from pathlib import Path
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, Subset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, AdamW
from tqdm import tqdm
import scml

In [2]:
# pandas options. Full option keys are used instead of shorthand patterns:
# pandas resolves shorthand by regex match, so a shorthand breaks as soon as
# another option with a similar name exists.
# NOTE(review): newer pandas releases deprecate `use_inf_as_na` — confirm
# against the pinned pandas version.
pd.set_option("mode.use_inf_as_na", True)
pd.set_option("display.max_info_columns", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option("display.max_colwidth", 9999)

# Register `progress_apply` and friends on pandas objects.
tqdm.pandas()

# Seeding. `scml.seed_everything()` is a project helper called with its own
# default seed; torch is then re-seeded explicitly with `seed` below.
# NOTE(review): `random` and `numpy` are not seeded with `seed` here —
# presumably the helper covers them; verify.
scml.seed_everything()
seed = 31
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

# Trade cuDNN autotuning for run-to-run determinism.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [3]:
# Local checkpoint directory shared by the tokenizer (here) and the model (next cell).
pretrained_dir = "pretrained/google/electra-small-discriminator"
# Upper bound on tokens per encoded example, handed to the tokenizer as its max length.
model_max_length = 512
tokenizer = AutoTokenizer.from_pretrained(pretrained_dir, model_max_length=model_max_length)
# Show the tokenizer configuration and the names of the model inputs it reports.
print(f"{repr(tokenizer)}\n{tokenizer.model_input_names}")

PreTrainedTokenizerFast(name_or_path='pretrained/google/electra-small-discriminator', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})
['input_ids', 'token_type_ids', 'attention_mask']


In [4]:
# Load the checkpoint as a question-answering model. The loader output recorded
# below reports checkpoint weights that were not used and weights that were
# newly initialized, so the model needs fine-tuning before its answers mean anything.
model = AutoModelForQuestionAnswering.from_pretrained(pretrained_dir)
print(repr(model.config))

Some weights of the model checkpoint at pretrained/google/electra-small-discriminator were not used when initializing ElectraForQuestionAnswering: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForQuestionAnswering were not initialized from the model checkpoint at pretrained/google/electra-small-discriminator and are newly initialized: ['qa_outpu

ElectraConfig {
  "_name_or_path": "pretrained/google/electra-small-discriminator",
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "embedding_size": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 4,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj": true,
  "transformers_version": "4.5.1",
  "type_vocab_size": 2,
  "vocab_size": 30522
}



In [5]:
# Load the training frame from parquet; `info()` records its columns and dtypes.
train = pd.read_parquet("input/train.parquet")
# Debug shortcut: uncomment to iterate on a ~7% random subsample of the rows.
#train = train.sample(frac=0.07)
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 360000 entries, 0 to 359999
Data columns (total 9 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   Id                    360000 non-null  object
 1   is_multi              360000 non-null  int8  
 2   ground_truth          360000 non-null  object
 3   dataset_labels        360000 non-null  object
 4   is_impossible         360000 non-null  int8  
 5   answer_start          360000 non-null  int16 
 6   answer_end            360000 non-null  int16 
 7   context               360000 non-null  object
 8   context_token_length  360000 non-null  int16 
dtypes: int16(3), int8(2), object(4)
memory usage: 13.7+ MB


In [6]:
%%time
# One fixed question is paired with every context.
question = "what dataset"
questions = [question] * len(train)

# The context is passed as the FIRST sequence, so the character offsets in
# `answer_start` / `answer_end` can be mapped to tokens of that sequence later.
# `truncation="only_first"` matches the inference cell further down: every
# example is guaranteed to fit the tokenizer's max length, and only the context
# (never the question) is shortened. If an answer were cut off by truncation,
# `add_token_positions` raises during preprocessing rather than the run failing
# later in batching or in the model.
enc = tokenizer(
    list(train["context"]),
    questions,
    truncation="only_first",
    padding="max_length",
)
print(f"{repr(enc.keys())}\nlen={len(enc['input_ids'])}")

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
len=360000
Wall time: 2min 38s


In [7]:
def add_token_positions(encodings, answer_start, answer_end, ids, is_impossible):
    """Attach token-level answer labels to ``encodings`` in place.

    For each example the character span ``[answer_start, answer_end)`` is
    mapped to token indices through ``encodings.char_to_token``. Examples
    flagged as impossible get ``(0, 0)`` as their span.

    Args:
        encodings: object exposing ``char_to_token(batch_index, char_index)``
            and ``update(dict)``; it receives the ``start_positions`` and
            ``end_positions`` lists.
        answer_start: per-example character offset of the first answer char.
        answer_end: per-example character offset one past the last answer char.
        ids: per-example identifiers, used only in the error message.
        is_impossible: per-example flags; ``0`` means the example has an answer.

    Raises:
        ValueError: if either end of an answer span does not map to a token,
            or the mapped start token lies after the mapped end token.
    """
    start_positions, end_positions = [], []
    for i, flag in enumerate(is_impossible):
        if flag != 0:
            # No answer in this example: label both ends with token 0.
            start_positions.append(0)
            end_positions.append(0)
            continue

        j = encodings.char_to_token(i, answer_start[i])
        if j is None:
            _id = ids[i]
            raise ValueError(f"start pos must not be None. i={i}, id={_id}, answer_start={answer_start[i]}")

        # `answer_end` is exclusive, so the last answer character is one before it.
        k = encodings.char_to_token(i, answer_end[i] - 1)
        if k is None:
            raise ValueError("end pos must not be None")

        if k < j:
            raise ValueError("start pos must be less than or equals end pos")

        start_positions.append(j)
        end_positions.append(k)

    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

In [8]:
class MyDataset(torch.utils.data.Dataset):
    """Map-style dataset over a mapping of column name -> per-example sequences.

    ``encodings`` may be any mapping that supports ``items()`` and key lookup
    and has an ``"input_ids"`` entry, for example a tokenizer's batch encoding
    or a plain ``dict`` of lists.
    """

    def __init__(self, encodings):
        # The mapping is stored by reference, not copied.
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict with one tensor per column."""
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        """Number of examples, taken from the ``"input_ids"`` column."""
        # Key lookup (not attribute access) so plain dicts work too, matching
        # the mapping-style access already used in __getitem__.
        return len(self.encodings["input_ids"])

In [9]:
%%time
# Add start/end token labels to the encodings in place, then wrap the same
# encodings object in a dataset for the DataLoader.
add_token_positions(
    enc,
    is_impossible=train["is_impossible"].tolist(),
    ids=train["Id"].tolist(),
    answer_start=train["answer_start"].tolist(),
    answer_end=train["answer_end"].tolist(),
)
train_ds = MyDataset(enc)

Wall time: 650 ms


In [10]:
# Drop the notebook-level names. `train_ds` was built from `enc` and stores the
# object it was given, so the encodings themselves stay reachable through it.
del enc, questions
gc.collect()

80

In [11]:
# Use the GPU when one is visible to torch, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(repr(device))

device(type='cuda')


In [12]:
# Fine-tune the QA model on `train_ds`.
model.to(device)
model.train()
train_loader = DataLoader(train_ds, batch_size=16, shuffle=True)
# Previously tried: lr=5e-4
# NOTE(review): 1e-3 is aggressive for fine-tuning a pretrained encoder; the
# recorded run ended the epoch with a mean loss of 5.3871 — consider a lower rate.
optim = AdamW(model.parameters(), lr=1e-3)
epochs = 1
for epoch in range(epochs):
    # Running mean of the per-batch loss, kept as a plain Python float.
    loss_mean = 0.0
    steps = len(train_loader)
    for batch in tqdm(train_loader):
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch["token_type_ids"].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        # Passing the label positions makes the model return the loss as the
        # first element of its output.
        outputs = model(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask,
                        start_positions=start_positions, end_positions=end_positions)
        loss = outputs[0]
        # `.item()` detaches the value from autograd. Accumulating the tensor
        # itself (`loss / steps`) would chain every batch's graph history onto
        # `loss_mean` and keep it alive for the whole epoch.
        loss_mean += loss.item() / steps
        loss.backward()
        optim.step()
    print(f"epoch={epoch}, loss={loss_mean:.4f}")

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 22500/22500 [2:39:09<00:00,  2.36it/s]

epoch=0, loss=5.3871





In [13]:
# Leave the training mode set by `model.train()` in the training cell before
# the model is saved and used for prediction.
model.eval()

ElectraForQuestionAnswering(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (embeddings_project): Linear(in_features=128, out_features=256, bias=True)
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=256, out_features=256, bias=True)
              (key): Linear(in_features=256, out_features=256, bias=True)
              (value): Linear(in_features=256, out_features=256, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_featu

In [14]:
# Write the fine-tuned model to the "output" directory; a later cell reloads it from there.
model.save_pretrained("output")

In [15]:
# Save the tokenizer next to the model so "output" can be loaded on its own;
# the cell output lists the files that were written.
tokenizer.save_pretrained("output")

('output\\tokenizer_config.json',
 'output\\special_tokens_map.json',
 'output\\vocab.txt',
 'output\\added_tokens.json')

In [16]:
# Round-trip check: rebind `model` to a fresh instance loaded from "output".
# The instance that was moved to `device` for training is no longer reachable
# under this name.
# NOTE(review): nothing in this cell moves the reloaded model to `device` or
# calls `.eval()` on it — confirm the loader's defaults are what the prediction
# cell expects.
model = AutoModelForQuestionAnswering.from_pretrained("output")
print(repr(model.config))

ElectraConfig {
  "_name_or_path": "output",
  "architectures": [
    "ElectraForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "embedding_size": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 4,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj": true,
  "transformers_version": "4.5.1",
  "type_vocab_size": 2,
  "vocab_size": 30522
}



In [17]:
# Draw a small random sample of training rows to inspect predictions by eye.
df = train.sample(30)
contexts = df["context"].tolist()
golds = df["dataset_labels"].tolist()
is_impossible = df["is_impossible"].tolist()
# One copy of the fixed question per sampled row.
questions = [question] * len(df)

In [18]:
# Encode the sampled (context, question) pairs exactly as in training:
# context first, truncating only the context if the pair is too long.
inputs = tokenizer(contexts, questions, truncation="only_first", padding="max_length", return_tensors="pt")
input_ids = inputs["input_ids"]

# Prediction only: skip autograd bookkeeping so no graph is built for the batch.
with torch.no_grad():
    outputs = model(**inputs)
# Read the logits by name rather than by position in `.values()`, which would
# break if the model were configured to return extra fields.
start_logits, end_logits = outputs.start_logits, outputs.end_logits

for i in range(len(start_logits)):
    j = torch.argmax(start_logits[i])
    # +1 turns the inclusive end token index into an exclusive slice bound.
    k = torch.argmax(end_logits[i]) + 1
    a = "IMPOSSIBLE"
    # A start at token 0 is the label used for unanswerable examples in
    # training; an empty or inverted span is treated the same way.
    if 0 < j < k:
        tokens = tokenizer.convert_ids_to_tokens(input_ids[i][j:k])
        a = tokenizer.convert_tokens_to_string(tokens)
    print(f"\n\nis_impossible={is_impossible[i]}\nq={questions[i]}\nc={contexts[i]}\ni={i}, j={j}, k={k}\na={a}\ng={golds[i]}")



is_impossible=1
q=what dataset
c=mats could be found on eelgrass beds and bare substrate without distinguishable spectral variation, creating a broader range of spectral signatures found within the bay. Furthermore, there are two species of eelgrass that thrive in Willapa Bay, each with a unique signature. North American eelgrass (Zostera Marina) is larger leafed than Asian eelgrass (Zostera Japonica) and tends to occupy the inter-tidal zone (Backman, 1991;Phillips, 1984). The lighter hue of Asian eelgrass could easily be detected on the tidal flats, but could not be clearly distinguished from North American eelgrass in the inter-tidal zone. The combination of the two eelgrass species in the inter-tidal zone provided a unique spectral signature of its own, leading to three different signatures of eelgrass. There were other unidentifiable spectral signatures for which no source could be located. Several regions in the northern end of the study area had received sediment deposition fol