In [1]:
import random
import glob
import os
import sys
import json
import math
import configparser
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch
from typing import Callable

In [2]:
# Notebook-wide configuration: pandas display options and RNG seeding.
# Options are spelled with their full dotted names; pandas also accepts unambiguous
# abbreviations, but those can stop resolving if a similarly named option is added.
try:
    pd.set_option("mode.use_inf_as_na", True)
except KeyError:
    # pandas' OptionError derives from KeyError. This option is deprecated in newer
    # pandas releases; report its absence instead of aborting the whole cell.
    print("pandas option 'mode.use_inf_as_na' is unavailable; inf values will not be treated as NA")
pd.set_option("display.max_info_columns", 9999)
pd.set_option("display.max_columns", 9999)
pd.set_option("display.max_rows", 9999)
pd.set_option("display.max_colwidth", 9999)

# Seed every RNG this notebook imports so that a Restart & Run All is repeatable.
seed = 31
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)  # all GPUs, not only the current device; no-op without CUDA
# Trade cuDNN autotuning speed for deterministic kernels.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [3]:
# Kaggle preserves up to 20GB written to the current directory (/kaggle/working/) as
# notebook output when a version is created with "Save & Run All".
# Files under /kaggle/temp/ are scratch space and are not kept beyond the current session.
INPUT = '/kaggle/input'
DATA = f'{INPUT}/coleridgeinitiative-show-us-the-data'
TEMP = '/kaggle/temp'
OUTPUT = '/kaggle/working'
RESOURCE_DIR = f'{INPUT}/coleridge-initiative-lib/kaggle-coleridge-initiative-1.0'
# Alternative tokenizer location, currently unused (the tokenizer is loaded from MODEL_DIR).
#TOK_DIR = f"{RESOURCE_DIR}/pretrained/google/electra-small-discriminator"
MODEL_DIR = f"{RESOURCE_DIR}/models/electra_small/20210621_1800"

# Make the bundled source trees importable. Entries already on sys.path are skipped so
# that re-running this cell does not keep appending duplicates.
for _src_dir in (f'{INPUT}/sgcharts-ml/src', f'{RESOURCE_DIR}/src'):
    if _src_dir not in sys.path:
        sys.path.append(_src_dir)

# These imports must come after the sys.path changes above, which is why they are not
# part of the notebook's first import cell.
import mylib
import scml
from scml import nlp as snlp

# NOTE(review): called without arguments, so it presumably applies the library's own
# default seed rather than the notebook-level `seed` -- confirm this is intended.
scml.seed_everything()

In [4]:
# Upper bound on tokens per encoded sequence; passed to the tokenizer below.
model_max_length = 512
# Load the tokenizer files stored under MODEL_DIR.
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, model_max_length=model_max_length)
# Print the tokenizer's repr followed by the names of the inputs it produces.
print(f"{repr(tokenizer)}\n{tokenizer.model_input_names}")

PreTrainedTokenizerFast(name_or_path='/kaggle/input/coleridge-initiative-lib/kaggle-coleridge-initiative-1.0/models/electra_small/20210621_1800', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})
['input_ids', 'token_type_ids', 'attention_mask']


In [5]:
# Load the question-answering model stored under MODEL_DIR.
model = AutoModelForQuestionAnswering.from_pretrained(MODEL_DIR)
# Print the model configuration so the architecture and library version are recorded in the output.
print(repr(model.config))

ElectraConfig {
  "_name_or_path": "/kaggle/input/coleridge-initiative-lib/kaggle-coleridge-initiative-1.0/models/electra_small/20210621_1800",
  "architectures": [
    "ElectraForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "embedding_size": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 4,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj": true,
  "transformers_version": "4.5.1",
  "type_vocab_size": 2,
  "vocab_size": 30522
}



In [6]:
# Read the sample submission; it supplies the document ids ("Id") to predict for.
sub = pd.read_csv(f"{DATA}/sample_submission.csv", engine="c", low_memory=False)
# Force the prediction column to strings. It is overwritten by the inference cell later on.
# NOTE(review): astype(str) would turn any missing value into the literal text "nan" -- confirm
# that is acceptable if this column is ever used before being overwritten.
sub["PredictionString"] = sub["PredictionString"].astype(str)
# Summarise columns, dtypes and non-null counts.
sub.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Id                4 non-null      object
 1   PredictionString  4 non-null      object
dtypes: object(2)
memory usage: 192.0+ bytes


In [7]:
def qa_predict(
    data_dir: str,
    model: "AutoModelForQuestionAnswering",
    tokenizer: "AutoTokenizer",
    question: str,
    window_length: int,
    window_stride: int,
    max_windows: int,
    verbose: bool = False,
) -> Callable:
    """Build a row-wise predictor for use with ``DataFrame.apply(..., axis=1)``.

    The returned function reads ``{data_dir}/{row["Id"]}.json`` (a list of sections, each
    with a ``"text"`` entry), joins the section texts, cuts the result into overlapping
    character windows and asks ``question`` of every window in a single batch. For each
    window the highest-scoring start/end token pair is decoded into an answer.

    Args:
        data_dir: Directory holding one ``<Id>.json`` file per document.
        model: Question-answering model whose output exposes ``start_logits`` and
            ``end_logits``.
        tokenizer: Tokenizer matching ``model``.
        question: Question paired with every window.
        window_length: Window size in characters.
        window_stride: Distance in characters between consecutive window starts.
        max_windows: Maximum number of windows evaluated per document.
        verbose: Print window boundaries, logit shapes and decoded answers.

    Returns:
        A function mapping a row to the distinct, non-empty answers joined with ``"|"``
        in sorted order, or ``""`` when the document has no text or yields no answer.

    Raises:
        ValueError: If ``window_length`` or ``window_stride`` is not positive.
    """
    # The model/tokenizer annotations above are quoted: they are documentation only and
    # need no runtime name lookup.
    if window_length <= 0 or window_stride <= 0:
        raise ValueError("window_length and window_stride must be positive")

    def fn(row) -> str:
        doc_id = row["Id"]
        with open(f"{data_dir}/{doc_id}.json", encoding="utf-8") as in_file:
            sections = json.load(in_file)
        text = " ".join(section["text"] for section in sections).strip()
        text = snlp.to_ascii_str(text)

        # Slide a character window over the text until its end is reached. A document
        # shorter than one stride still gets one window, and the final partial window is
        # kept so the tail of the document is examined as well.
        contexts = []
        start = 0
        while len(contexts) < max_windows and start < len(text):
            end = start + window_length
            if verbose:
                print(f"i={start}, j={end}, k={len(contexts)}")
            contexts.append(text[start:end])
            if end >= len(text):
                # This window already covers the rest of the document; any further
                # window would only repeat text that has been seen.
                break
            start += window_stride
        if not contexts:
            # Nothing to ask about; do not hand empty batches to the tokenizer.
            return ""

        questions = [question] * len(contexts)
        inputs = tokenizer(contexts, questions, truncation="only_first", padding="max_length", return_tensors="pt")
        input_ids = inputs["input_ids"]
        # Inference only: skip autograd bookkeeping to save memory and time.
        with torch.no_grad():
            outputs = model(**inputs)
        # Read the logits by name rather than by position in the output container.
        start_logits = outputs.start_logits
        end_logits = outputs.end_logits
        if verbose:
            print(f"start_logits.size={start_logits.size()}, end_logits.size={end_logits.size()}")

        answers = set()
        for window_idx in range(len(start_logits)):
            start_tok = int(torch.argmax(start_logits[window_idx]))
            end_tok = int(torch.argmax(end_logits[window_idx])) + 1  # exclusive end
            # Spans that start at position 0 or have no positive length are skipped.
            if 0 < start_tok < end_tok:
                tokens = tokenizer.convert_ids_to_tokens(input_ids[window_idx][start_tok:end_tok].tolist())
                answer = tokenizer.convert_tokens_to_string(tokens)
                answer = mylib.clean_text(answer)
                if verbose:
                    print(f"k={window_idx}, i={start_tok}, j={end_tok}, a={answer}, tokens={tokens}")
                # TODO if special token is present, discard answer (possibly truncated)
                if answer:
                    answers.add(answer)
        # Sort so the joined string does not depend on set iteration order.
        return "|".join(sorted(answers))

    return fn

# Inference

In [8]:
%%time
# Build the per-document predictor once, then apply it to every submission row.
# window_length / window_stride are character counts: qa_predict slices the raw text by
# character index, so consecutive windows overlap by 500 characters here.
predict_row = qa_predict(
    data_dir=f"{DATA}/test",
    model=model,
    tokenizer=tokenizer,
    question="what dataset",
    window_length=2000,
    window_stride=1500,
    max_windows=30,
    verbose=False,
)
sub["PredictionString"] = sub.apply(predict_row, axis=1)

CPU times: user 1min 17s, sys: 8.41 s, total: 1min 25s
Wall time: 44 s


In [9]:
# Preview the first rows of the submission frame with the predicted strings filled in.
sub.head()

Unnamed: 0,Id,PredictionString
0,2100032a-7c33-4bff-97ef-690822c43466,adni
1,2f392438-e215-4169-bebf-21ac4ff253e1,trends in international mathematics and science study
2,3f316b38-1a24-45a9-8d8c-4e05a42257c6,slosh model
3,8e6996b4-ca08-4c0b-bed2-aaf07a4c6a60,rural urban continuum codes


In [10]:
# Write the predictions to submission.csv in the current working directory,
# leaving out the DataFrame index column.
sub.to_csv(
    "submission.csv",
    index=False,
)

# Debug

In [11]:
#!pip list