In [16]:
import json
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch
import spacy
from tqdm import tqdm
import scml
from scml import nlp as snlp
import mylib

In [17]:
# Register tqdm's `progress_apply` on pandas objects (used by the prediction cell below).
tqdm.pandas()
# Option keys are written out in full: pandas resolves abbreviated keys by
# pattern match, which can break if a similarly named option is added later.
# NOTE(review): `mode.use_inf_as_na` is deprecated in recent pandas releases —
# confirm the pinned pandas version still supports it.
pd.set_option("mode.use_inf_as_na", True)
# Effectively remove the display limits so wide frames and long text cells are shown in full.
pd.set_option("display.max_info_columns", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option("display.max_colwidth", 9999)

In [18]:
%%time
# Load the spaCy English pipeline from a local directory (no download);
# `nlp` is later passed to `qa_predict`, which uses it to split text into sentences.
nlp = spacy.load("pretrained/spacy/en_core_web_lg/en_core_web_lg-2.3.1")
# Last expression of the cell: echo the path the pipeline was loaded from.
nlp.path

CPU times: user 3.25 s, sys: 516 ms, total: 3.77 s
Wall time: 3.76 s


PosixPath('pretrained/spacy/en_core_web_lg/en_core_web_lg-2.3.1')

In [19]:
%%time
tokenizer = AutoTokenizer.from_pretrained("pretrained/bert-large-uncased-whole-word-masking-finetuned-squad")
model = AutoModelForQuestionAnswering.from_pretrained(
    "pretrained/bert-large-uncased-whole-word-masking-finetuned-squad",
    from_tf=True,
)
print(repr(model.config))

All TF 2.0 model weights were used when initializing BertForQuestionAnswering.

Some weights of BertForQuestionAnswering were not initialized from the TF 2.0 model and are newly initialized: ['bert.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertConfig {
  "_name_or_path": "pretrained/bert-large-uncased-whole-word-masking-finetuned-squad",
  "architectures": [
    "BertForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.5.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

CPU times: user 15.7 s, sys: 7.8 s, total: 23.5 s
Wall time: 14.9 s


In [20]:
# Read the training labels with the C parser; `low_memory=False` makes pandas
# parse the file in one pass instead of in chunks.
train = pd.read_csv("input/train.csv", engine="c", low_memory=False)
# Summarize columns, non-null counts and dtypes of the loaded frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19661 entries, 0 to 19660
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Id             19661 non-null  object
 1   pub_title      19661 non-null  object
 2   dataset_title  19661 non-null  object
 3   dataset_label  19661 non-null  object
 4   cleaned_label  19661 non-null  object
dtypes: object(5)
memory usage: 768.1+ KB


In [21]:
def qa_predict(data_dir, nlp, model, tokenizer, question, stride):
    """Build a row-wise extractive-QA predictor for ``DataFrame.apply(axis=1)``.

    Args:
        data_dir: Directory holding one ``<Id>.json`` file per row; each file
            is a JSON list of sections, each with a ``"text"`` entry.
        nlp: Callable returning a document whose ``.sents`` yields sentence
            spans exposing ``.text`` (a spaCy pipeline in this notebook).
        model: Question-answering model called as ``model(**inputs)``; its
            output must expose ``start_logits`` and ``end_logits``. When it
            has ``config.max_position_embeddings``, passages are truncated to
            that many tokens.
        tokenizer: Tokenizer providing ``encode_plus``, ``sep_token_id``,
            ``convert_ids_to_tokens`` and ``convert_tokens_to_string``.
        question: Question paired with every passage.
        stride: Number of consecutive sentences per passage; the windows
            do not overlap.

    Returns:
        A function ``fn(row) -> str`` returning the distinct accepted answers
        for ``row["Id"]``, sorted and joined with ``"|"`` (``""`` when none).
    """
    # Longest sequence the model accepts; without truncation, a passage that
    # tokenizes beyond this limit crashes the forward pass.
    max_length = getattr(getattr(model, "config", None), "max_position_embeddings", None)
    truncation = "only_second" if max_length else False

    def fn(row) -> str:
        rid = row["Id"]
        with open(f"{data_dir}/{rid}.json", encoding="utf-8") as in_file:
            sections = json.load(in_file)
        # NOTE(review): only the first section of each document is used
        # (`sections[:1]`) — confirm this is intentional and not a debugging leftover.
        text = " ".join(section["text"] for section in sections[:1]).strip()
        # `.text` instead of the deprecated `.string`; surrounding whitespace is stripped either way.
        sentences = [sent.text.strip() for sent in nlp(text).sents]
        answers = set()
        for i in range(0, len(sentences), stride):
            passage = " ".join(sentences[i:i + stride])
            inputs = tokenizer.encode_plus(
                question,
                passage,
                add_special_tokens=True,
                truncation=truncation,  # only ever shorten the passage, never the question
                max_length=max_length,
                return_tensors="pt",
            )
            input_ids = inputs["input_ids"].tolist()[0]
            # First separator marks the end of the question segment.
            sep_index = input_ids.index(tokenizer.sep_token_id)
            # Pure inference: skip autograd bookkeeping.
            with torch.no_grad():
                outputs = model(**inputs)
            # Read the logits by name so extra output fields cannot break unpacking.
            start = int(torch.argmax(outputs.start_logits))
            end = int(torch.argmax(outputs.end_logits)) + 1
            # A span starting inside the question (or on a special token before
            # the passage) means the model found no answer in this passage.
            if start <= sep_index:
                continue
            answer = tokenizer.convert_tokens_to_string(
                tokenizer.convert_ids_to_tokens(input_ids[start:end])
            )
            answer = mylib.clean_text(answer)
            # Discard answers that are too short or too long to keep.
            if len(answer) < 4 or len(answer) > 150:
                continue
            # Discard digit-heavy answers (more than 4 digits, or over 20% digits).
            n_digits = snlp.count_digit(answer)
            if n_digits > 4 or n_digits / len(answer) > 0.2:
                continue
            answers.add(answer)
        return "|".join(sorted(answers))

    return fn

In [22]:
%%time
train["PredictionString"] = train.progress_apply(
    qa_predict(
        data_dir=f"input/train",
        nlp=nlp,
        model=model,
        tokenizer=tokenizer,
        question="name dataset",
        stride=4,
    ),
    axis=1,
)

  5%|▍         | 900/19661 [20:20<7:04:05,  1.36s/it] 


KeyboardInterrupt: 

In [23]:
# Preview ids, reference labels and model predictions side by side.
# Requires the `PredictionString` column assigned by the prediction cell;
# if that cell did not finish, this lookup raises KeyError.
cols = ["Id", "cleaned_label", "PredictionString"]
train[cols].head()

KeyError: "['PredictionString'] not in index"

In [None]:
def matched(row):
    """Return 1 when ``mylib.jaccard`` of the row's ``dataset_title`` and
    ``cleaned_label`` is truthy, otherwise 0."""
    score = mylib.jaccard(row["dataset_title"], row["cleaned_label"])
    return 1 if score else 0

In [None]:
# Flag each row via `matched` (defined in the cell above): 1 when the title and
# the cleaned label overlap according to `mylib.jaccard`, else 0.
# The original call referenced `jaccard_score`, which is not defined or
# imported in this notebook and would raise NameError.
train["jaccard"] = train.apply(matched, axis=1)