In [1]:
import json
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch
import spacy
from tqdm import tqdm
import scml
from scml import nlp as snlp
import mylib

In [2]:
# Register tqdm's pandas integration so DataFrame.progress_apply is available below.
tqdm.pandas()

# Notebook-wide pandas options: treat +/-inf as missing and lift the display
# limits so wide frames and long text cells are shown without truncation.
for option_name, option_value in (
    ("use_inf_as_na", True),
    ("max_info_columns", 9999),
    ("display.max_columns", 9999),
    ("display.max_rows", 9999),
    ("max_colwidth", 9999),
):
    pd.set_option(option_name, option_value)

In [3]:
%%time
# Load the spaCy English pipeline from a local directory (no download at run time).
nlp = spacy.load("pretrained/spacy/en_core_web_lg/en_core_web_lg-2.3.1")
# Maximum document length, in characters, that will be passed to the pipeline;
# qa_predict truncates longer texts to this value before calling nlp().
nlp.max_length = 1_000_000
# Last expression of the cell: displays where the pipeline was loaded from.
nlp.path

CPU times: user 3.36 s, sys: 2.44 s, total: 5.8 s
Wall time: 5.83 s


PosixPath('pretrained/spacy/en_core_web_lg/en_core_web_lg-2.3.1')

In [4]:
%%time
tokenizer = AutoTokenizer.from_pretrained("pretrained/bert-large-uncased-whole-word-masking-finetuned-squad")
model = AutoModelForQuestionAnswering.from_pretrained(
    "pretrained/bert-large-uncased-whole-word-masking-finetuned-squad",
    from_tf=True,
)
print(repr(model.config))

All TF 2.0 model weights were used when initializing BertForQuestionAnswering.

Some weights of BertForQuestionAnswering were not initialized from the TF 2.0 model and are newly initialized: ['bert.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertConfig {
  "_name_or_path": "pretrained/bert-large-uncased-whole-word-masking-finetuned-squad",
  "architectures": [
    "BertForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.5.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

CPU times: user 15.2 s, sys: 9.14 s, total: 24.4 s
Wall time: 15.7 s


In [5]:
# Load the training table and print its schema (columns, non-null counts, dtypes).
train = pd.read_parquet("input/train.parquet")
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14316 entries, 0 to 14315
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Id            14316 non-null  object
 1   ground_truth  14316 non-null  object
 2   is_multi      14316 non-null  int8  
dtypes: int8(1), object(2)
memory usage: 237.8+ KB


In [6]:
def qa_predict(data_dir, nlp, model, tokenizer, question, stride, window_length, n_window=None):
    """Build a row-wise predictor that extracts answers with an extractive QA model.

    The returned callable is intended for ``DataFrame.apply(..., axis=1)``. For
    each row it reads ``{data_dir}/{row["Id"]}.json`` (a JSON list of objects
    with a ``"text"`` key), joins the section texts, splits them into sentences
    with ``nlp`` and slides a window of ``window_length`` sentences over them in
    steps of ``stride``. ``question`` is asked of every window and the answers
    that pass the filters below are collected.

    Args:
        data_dir: Directory holding one ``<Id>.json`` file per document.
        nlp: spaCy pipeline; must expose ``max_length`` and sentence boundaries.
        model: Question-answering model whose output exposes ``start_logits``
            and ``end_logits``.
        tokenizer: Tokenizer matching ``model``.
        question: Question text paired with every window.
        stride: Number of sentences between consecutive window starts.
        window_length: Number of sentences per window.
        n_window: Maximum number of windows to evaluate per document, or
            ``None`` to evaluate all of them.

    Returns:
        A function ``fn(row) -> str`` returning the unique accepted answers,
        sorted and joined with ``"|"`` (an empty string when there are none).
    """
    # Answer filters: accepted answers have 4..150 characters, at most 4 digits,
    # and digits make up at most 20% of the characters.
    min_chars, max_chars = 4, 150
    max_digits, max_digit_ratio = 4, 0.2

    def fn(row) -> str:
        rid = row["Id"]
        with open(f"{data_dir}/{rid}.json", encoding="utf-8") as in_file:
            sections = json.load(in_file)
        text = " ".join(section["text"] for section in sections).strip()
        # Keep the document within the pipeline's length limit (no-op when shorter).
        text = text[:nlp.max_length]
        doc = nlp(text)
        # Span.text replaces the deprecated Span.string; identical once stripped.
        sentences = [sent.text.strip() for sent in doc.sents]

        # Cap the number of window *starts* so that exactly n_window windows are
        # evaluated for any stride. Truncating the sentence list to
        # window_length * n_window only does that when stride == window_length.
        window_starts = range(0, len(sentences), stride)
        if n_window is not None:
            window_starts = window_starts[:n_window]

        answers = set()
        for i in window_starts:
            passage = " ".join(sentences[i:i + window_length])
            inputs = tokenizer.encode_plus(
                question, passage,
                truncation="only_second",  # never truncate the question
                max_length=512,
                add_special_tokens=True,
                return_tensors="pt",
            )
            input_ids = inputs["input_ids"][0].tolist()
            # First separator token marks the end of the question segment.
            sep_index = input_ids.index(tokenizer.sep_token_id)
            # Inference only: skip building the autograd graph.
            with torch.no_grad():
                outputs = model(**inputs)
            # Read the logits by name rather than by position in the output.
            ai = int(torch.argmax(outputs.start_logits))
            aj = int(torch.argmax(outputs.end_logits)) + 1
            # Reject spans that start inside the question or that are empty
            # (predicted end before predicted start).
            if ai <= sep_index or aj <= ai:
                continue
            a = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[ai:aj]))
            # NOTE(review): mylib.clean_text is defined outside this notebook;
            # presumably it normalises the answer text. Confirm.
            a = mylib.clean_text(a)
            if len(a) < min_chars or len(a) > max_chars:
                continue
            n_digits = snlp.count_digit(a)
            if n_digits > max_digits or n_digits / len(a) > max_digit_ratio:
                continue
            answers.add(a)
        # Sort so the joined string is reproducible; set iteration order is not.
        return "|".join(sorted(answers))

    return fn

In [7]:
# Build the per-row predictor once, then apply it to every training document.
# With window_length == stride == 4 and n_window == 1, only the first four
# sentences of each document are shown to the model.
predict_row = qa_predict(
    data_dir="input/train",
    nlp=nlp,
    model=model,
    tokenizer=tokenizer,
    question="name dataset",
    window_length=4,
    stride=4,
    n_window=1,
)
train["PredictionString"] = train.progress_apply(predict_row, axis=1)

100%|██████████| 14316/14316 [3:55:37<00:00,  1.01it/s]   


In [11]:
# Re-print the schema to confirm the PredictionString column was added with no nulls.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14316 entries, 0 to 14315
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Id                14316 non-null  object
 1   ground_truth      14316 non-null  object
 2   is_multi          14316 non-null  int8  
 3   PredictionString  14316 non-null  object
dtypes: int8(1), object(3)
memory usage: 349.6+ KB


In [10]:
# Preview the first rows to compare ground_truth against PredictionString by eye.
train.head()

Unnamed: 0,Id,ground_truth,is_multi,PredictionString
0,d0fa7568-7d8e-4db9-870f-f9c6f668c17b,education longitudinal study|national education longitudinal study,1,national education longitudinal study
1,2f26f645-3dec-485d-b68d-f013c9e05e60,education longitudinal study|national education longitudinal study,1,national education longitudinal study of 1988
2,c5d5cd2c-59de-4f29-bbb1-6a88c7b52f29,education longitudinal study|national education longitudinal study,1,
3,5c9a3bc9-41ba-4574-ad71-e25c1442c8af,education longitudinal study|national education longitudinal study,1,federal reserve bank of richmond s1
4,c754dec7-c5a3-4337-9892-c02158475064,education longitudinal study|national education longitudinal study,1,national education longitudinal study nels


In [13]:
# Persist the frame, including the predictions, without writing the index.
train.to_parquet("output/validation.parquet", index=False)

In [9]:
%%time
# Score the predictions against the labels.
# NOTE(review): mylib.fbeta is defined outside this notebook; presumably it
# computes an F-beta score over the "|"-delimited strings. Confirm beta and the
# matching rule there.
score = mylib.fbeta(y_true=train["ground_truth"], y_pred=train["PredictionString"])
print(f"score={score:.4f}")

score=0.0572
CPU times: user 125 ms, sys: 156 ms, total: 281 ms
Wall time: 183 ms


In [15]:
# Sanity check: joining an empty collection yields "", which is what qa_predict
# returns for a document when no answer passes its filters.
assert "|".join([]) == ""