In [146]:
import json
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch
import spacy
from tqdm import tqdm
import scml
from scml import nlp as snlp
import mylib

In [147]:
tqdm.pandas()
pd.set_option("use_inf_as_na", True)
pd.set_option("max_info_columns", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option('max_colwidth', 9999)
scml.seed_everything()

In [148]:
%%time
nlp = spacy.load("pretrained/spacy/en_core_web_lg/en_core_web_lg-2.3.1")
nlp.max_length = 1_000_000
nlp.path

CPU times: user 4.14 s, sys: 500 ms, total: 4.64 s
Wall time: 4.66 s


PosixPath('pretrained/spacy/en_core_web_lg/en_core_web_lg-2.3.1')

In [149]:
%%time
tokenizer = AutoTokenizer.from_pretrained("pretrained/bert-large-uncased-whole-word-masking-finetuned-squad")
model = AutoModelForQuestionAnswering.from_pretrained(
    "pretrained/bert-large-uncased-whole-word-masking-finetuned-squad",
    from_tf=True,
)
print(repr(model.config))

All TF 2.0 model weights were used when initializing BertForQuestionAnswering.

Some weights of BertForQuestionAnswering were not initialized from the TF 2.0 model and are newly initialized: ['bert.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertConfig {
  "_name_or_path": "pretrained/bert-large-uncased-whole-word-masking-finetuned-squad",
  "architectures": [
    "BertForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.5.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

CPU times: user 16 s, sys: 797 ms, total: 16.8 s
Wall time: 10.2 s


In [150]:
train = pd.read_parquet("input/train.parquet")
train = train.sample(1000)
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 6004 to 6979
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Id            1000 non-null   object
 1   ground_truth  1000 non-null   object
 2   is_multi      1000 non-null   int8  
dtypes: int8(1), object(2)
memory usage: 24.4+ KB


In [151]:
def qa_predict(data_dir, nlp, model, tokenizer, question, stride, window_length, n_window=None):
    def fn(row) -> str:
        rid = row["Id"]
        tmp = []
        with open(f"{data_dir}/{rid}.json") as in_file:
            sections = json.load(in_file)
            for section in sections:
                tmp.append(section["text"])
        text = " ".join(tmp).strip()
        if len(text) > nlp.max_length:
            text = text[:nlp.max_length]
        doc = nlp(text)
        sentences = [sent.string.strip() for sent in doc.sents]
        if n_window is not None:
            sentences = sentences[:window_length * n_window]
        res = set()
        for i in range(0, len(sentences), stride):
            tmp = sentences[i:i + window_length]
            #print(f"i={i}, len(tmp)={len(tmp)}")
            passage = " ".join(tmp)
            inputs = tokenizer.encode_plus(
                question, passage,
                truncation="only_second",
                max_length=512,
                add_special_tokens=True, 
                return_tensors="pt"
            )
            input_ids = inputs["input_ids"].tolist()[0]
            sep_index = input_ids.index(tokenizer.sep_token_id)
            answer_start_scores, answer_end_scores = model(**inputs).values()
            #print(f"answer_start_scores.shape={answer_start_scores.shape}, answer_end_scores.shape={answer_end_scores.shape}")
            ai = torch.argmax(answer_start_scores)
            aj = torch.argmax(answer_end_scores) + 1
            #print(f"ai={ai}, aj={aj}")
            if ai <= sep_index:
                continue
            a = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[ai:aj]))
            a = mylib.clean_text(a)
            if len(a) < 4 or len(a) > 150:
                continue
            n_digits = snlp.count_digit(a)
            if n_digits > 4 or n_digits / len(a) > 0.2:
                continue
            res.add(a)
        return "|".join(res)
    
    return fn

In [152]:
train["PredictionString"] = train.progress_apply(
    qa_predict(
        data_dir=f"input/train",
        nlp=nlp,
        model=model,
        tokenizer=tokenizer,
        question="name of dataset database study survey program initiative model assessment archive collection catalog registry",
        window_length=4,
        stride=3,
        n_window=6,
    ),
    axis=1,
)

100%|██████████| 1000/1000 [1:06:43<00:00,  4.00s/it]


In [153]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 6004 to 6979
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Id                1000 non-null   object
 1   ground_truth      1000 non-null   object
 2   is_multi          1000 non-null   int8  
 3   PredictionString  1000 non-null   object
dtypes: int8(1), object(3)
memory usage: 32.2+ KB


In [154]:
train.head()

Unnamed: 0,Id,ground_truth,is_multi,PredictionString
6004,97701c3c-520a-4608-bd8c-8aa0b520b2b4,adni,0,
11427,3d0a9255-ed82-463e-9502-0721db008ccf,rural urban continuum codes,0,
8064,d26c90cd-a108-4613-bd30-6b22aeba1dc9,coastal change analysis program,0,
4292,64488cc2-7515-4385-a3b7-64c50767d526,adni|alzheimer s disease neuroimaging initiative adni,1,
4670,50e5ce99-d967-417f-b343-b30c2f8529c5,adni|alzheimer s disease neuroimaging initiative adni,1,


In [155]:
train.to_parquet("output/validation.parquet", index=False)

In [156]:
%%time
score = mylib.fbeta(y_true=train["ground_truth"], y_pred=train["PredictionString"])
print(f"score={score:.4f}")

score=0.0946
CPU times: user 15.6 ms, sys: 15.6 ms, total: 31.2 ms
Wall time: 14.7 ms


In [157]:
# 0.1746
# question="name dataset study survey program initiative",
# window_length=4,
# stride=3,
# n_window=6,

In [158]:
assert "|".join([]) == ""