In [14]:
import json
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch
import spacy
from tqdm import tqdm
from typing import Deque
import scml
from scml import nlp as snlp
import mylib

In [15]:
# Notebook-wide setup: pandas progress bars, pandas options, RNG seeding.
tqdm.pandas()  # adds DataFrame.progress_apply, used by the prediction cell
for _option, _value in (
    ("use_inf_as_na", True),
    ("max_info_columns", 9999),
    ("display.max_columns", 9999),
    ("display.max_rows", 9999),
    ("max_colwidth", 9999),
):
    pd.set_option(_option, _value)
scml.seed_everything()

In [16]:
#%%time
#nlp = spacy.load("pretrained/spacy/en_core_web_lg/en_core_web_lg-2.3.1")
#nlp.max_length = 1_000_000
#nlp.path

In [17]:
%%time
# Load the tokenizer and the question-answering model from a local checkpoint directory.
#pretrained_dir = "pretrained/bert-large-uncased-whole-word-masking-finetuned-squad"
pretrained_dir = "pretrained/distilbert-base-cased-distilled-squad"
tokenizer = AutoTokenizer.from_pretrained(pretrained_dir)
# from_tf=True: the weights in this directory are loaded from a TensorFlow checkpoint.
model = AutoModelForQuestionAnswering.from_pretrained(pretrained_dir, from_tf=True)
# Print the resolved model configuration for the record.
print(repr(model.config))

All TF 2.0 model weights were used when initializing DistilBertForQuestionAnswering.

All the weights of DistilBertForQuestionAnswering were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use DistilBertForQuestionAnswering for predictions without further training.


DistilBertConfig {
  "_name_or_path": "pretrained/distilbert-base-cased-distilled-squad",
  "activation": "gelu",
  "architectures": [
    "DistilBertForQuestionAnswering"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": true,
  "tie_weights_": true,
  "transformers_version": "4.5.1",
  "vocab_size": 28996
}

CPU times: user 3.91 s, sys: 1.83 s, total: 5.73 s
Wall time: 3.55 s


In [18]:
# Read the labelled training table and keep a random sample of 1000 rows.
train = pd.read_parquet("input/train.parquet").sample(1000)
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 11503 to 7243
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Id            1000 non-null   object
 1   ground_truth  1000 non-null   object
 2   is_multi      1000 non-null   int8  
dtypes: int8(1), object(2)
memory usage: 24.4+ KB


In [19]:
def _inputs(tokenizer, sentences, question, max_tokens):
    inputs = None
    prev = None
    _len = 0
    tmp = []
    while len(sentences) != 0 and _len < max_tokens:
        prev = inputs
        tmp.append(sentences[0])
        passage = " ".join(tmp)
        inputs = tokenizer.encode_plus(
            question, passage,
            truncation="only_second",
            max_length=max_tokens,
            add_special_tokens=True, 
            return_tensors="pt",
        )
        _len = len(inputs["input_ids"][0])
        if _len < max_tokens:
            sentences.popleft()
        #print(f"inputs={inputs}")
        #print(f"_len={_len}")
    if _len >= max_tokens and prev is not None:
        inputs = prev
    return inputs
    

def qa_predict(data_dir, model, tokenizer, question, n_window: int,
               max_length: int = 1_000_000, max_tokens: int = 512):
    """Build a row-wise predictor that extracts answers to ``question`` from a document.

    The returned callable is meant for ``DataFrame.apply(..., axis=1)``.

    Args:
        data_dir: Directory holding one ``<Id>.json`` file per row. Each file is
            a JSON list of objects with a ``"text"`` field.
        model: Question-answering model called as ``model(**inputs)``; its output
            must expose ``start_logits`` and ``end_logits``.
        tokenizer: Tokenizer passed to ``_inputs``; also used to map token ids
            back to text.
        question: Question asked against every window of the document.
        n_window: Maximum number of consecutive windows to query per document.
        max_length: Documents longer than this many characters are cut off.
        max_tokens: Maximum encoded length of one window (question + passage).

    Returns:
        A function ``fn(row) -> str`` returning the distinct accepted answers
        joined by ``"|"`` in sorted order, or ``""`` when there are none.
    """
    def fn(row) -> str:
        rid = row["Id"]
        # JSON text is UTF-8; do not depend on the platform default encoding.
        with open(f"{data_dir}/{rid}.json", encoding="utf-8") as in_file:
            sections = json.load(in_file)
        text = " ".join(section["text"] for section in sections).strip()
        if len(text) == 0:
            print(f"len(text)=0, Id={rid}")
            return ""
        text = text[:max_length]
        # snlp.sentences presumably splits the text into sentences - confirm in scml.
        sentences: Deque[str] = Deque(snlp.sentences(text))
        if len(sentences) == 0:
            print(f"len(sentences)=0, Id={rid}")
            return ""
        res = set()
        for _ in range(n_window):
            if len(sentences) == 0:
                break
            inputs = _inputs(tokenizer=tokenizer, sentences=sentences, question=question, max_tokens=max_tokens)
            if inputs is None:
                # Nothing could be encoded (e.g. non-positive max_tokens).
                break
            input_ids = inputs["input_ids"].tolist()[0]
            # First separator token: everything up to it is the question segment.
            sep_index = input_ids.index(tokenizer.sep_token_id)
            # Inference only: skip autograd bookkeeping.
            with torch.no_grad():
                outputs = model(**inputs)
            ai = int(torch.argmax(outputs.start_logits))
            aj = int(torch.argmax(outputs.end_logits)) + 1
            # An answer starting inside the question segment is not a passage span.
            if ai <= sep_index:
                continue
            a = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[ai:aj]))
            a = mylib.clean_text(a)
            # Drop spans that are too short or too long to be kept as an answer.
            if len(a) < 4 or len(a) > 150:
                continue
            # Drop spans dominated by digits (count_digit presumably counts digit characters).
            n_digits = snlp.count_digit(a)
            if n_digits > 4 or n_digits / len(a) > 0.2:
                continue
            res.add(a)
        # Sorted so the joined string does not depend on set iteration order.
        return "|".join(sorted(res))

    return fn

In [20]:
# Run the QA predictor over every sampled row; each result is a "|"-joined string.
train["PredictionString"] = train.progress_apply(
    qa_predict(
        data_dir="input/train",
        model=model,
        tokenizer=tokenizer,
        question="name of study",
        # Alternative question prompts (disabled):
        #question="name dataset study survey program initiative",
        #question="name of dataset database study survey program initiative model assessment archive collection catalog registry",
        n_window=6,
    ),
    axis=1,
)

100%|██████████| 1000/1000 [24:43<00:00,  1.48s/it]


In [21]:
# Summarize the frame again now that it carries the PredictionString column.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 11503 to 7243
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Id                1000 non-null   object
 1   ground_truth      1000 non-null   object
 2   is_multi          1000 non-null   int8  
 3   PredictionString  1000 non-null   object
dtypes: int8(1), object(3)
memory usage: 32.2+ KB


In [22]:
# Preview the first rows, including the new PredictionString column.
train.head()

Unnamed: 0,Id,ground_truth,is_multi,PredictionString
11503,72897d3a-7abb-486f-94da-139bfadbe40d,agricultural resource management survey,0,
2776,bf81c91c-7ef7-4e17-86d5-128baccdf0bc,early childhood longitudinal study,0,
1508,f3c6ab46-7e14-4f07-b316-ee2b708eb2e3,common core of data|nces common core of data,1,
1399,a3c6594b-854d-4ec2-b30f-7ff5ffe4ab09,ibtracs|international best track archive for climate stewardship,1,
3968,000efc17-13d8-433d-8f62-a3932fe4f3b8,adni|alzheimer s disease neuroimaging initiative adni,1,


In [23]:
# Persist labels and predictions; index=False means the DataFrame index is not written.
train.to_parquet("output/validation.parquet", index=False)

In [24]:
%%time
# Score predictions against the labels with the project's fbeta helper
# (metric details live in mylib and are not visible here).
score = mylib.fbeta(y_true=train["ground_truth"], y_pred=train["PredictionString"])
print(f"score={score:.4f}")

score=0.0467
CPU times: user 0 ns, sys: 15.6 ms, total: 15.6 ms
Wall time: 14.8 ms


In [25]:
# 0.1746
# question="name dataset study survey program initiative",
# window_length=4,
# stride=3,
# n_window=6,

In [26]:
# Sanity check: joining an empty collection yields "", which is what the
# predictor returns when no answer passes its filters.
assert "|".join([]) == ""