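This notebook combines literal matching with Named Entity Recognition using BERT (base model from Hugging Face), and can optionally blend in predictions from a BERT question-answering model and a masked-language model (see the `HOW` setting below).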

The training phase of the BERT model was done in another kernel: Pytorch BERT for Named Entity Recognition.

In [None]:
# NOTE: FL_TH, LENGTH, MAX_LENGTH, OVERLAP, PREDICT_BATCH and SEED are assigned again in later
# cells (QA / MLM sections); the values here apply to the NER part that runs before them.
LENGTH = 1  # sentences are kept only when len(sentence) > LENGTH (characters)
FL_TH = 1.0  # a candidate label is kept only while its Jaccard score vs. every kept label is < FL_TH
PRETRAINED = '../input/coleridge-bert-ner-external-pseudo-labels'  # folder with the NER model output and train_ner.json
HOW = ['W_MLM'] # 'NER_ONLY', 'W_QA', 'W_MLM', 'W_MATCH', 'DOUBLE_BLENDING'
W_QA_TH = 0.75  # a QA label is merged only if its Jaccard score vs. every NER label is < W_QA_TH

ADNL_GOVT_LABELS_PATH = '../input/coleridge-additional-datasets/additional_datasets_v6.csv'  # read only when 'W_MATCH' is in HOW

MAX_LENGTH = 64 # max no. words for each sentence.
OVERLAP = 20 # if a sentence exceeds MAX_LENGTH, we split it to multiple sentences with overlapping

PREDICT_BATCH = 64000  # number of NER rows written to the input file per prediction run

SEED = 42  # passed to set_seed() below

# Setting

In [None]:
!pip install datasets --no-index --find-links=file:///kaggle/input/coleridge-packages/packages/datasets
!pip install ../input/coleridge-packages/seqeval-1.2.2-py3-none-any.whl
!pip install ../input/coleridge-packages/tokenizers-0.10.1-cp37-cp37m-manylinux1_x86_64.whl
!pip install ../input/coleridge-packages/transformers-4.5.0.dev0-py3-none-any.whl

import os
import re
import json
import time
import random
import glob
import importlib

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import tokenizers
from tokenizers import *
import transformers
from transformers import AutoTokenizer, DataCollatorForLanguageModeling, \
AutoModelForMaskedLM, Trainer, TrainingArguments, pipeline
from datasets import load_dataset

from functools import partial
from tqdm.notebook import tqdm
from torch.nn import functional as F
from sklearn.model_selection import StratifiedKFold

from IPython.display import clear_output

clear_output()

In [None]:
# https://huggingface.co/transformers/_modules/transformers/trainer_utils.html
def set_seed(seed: int) -> None:
    """
    Helper function for reproducible behavior to set the seed in ``random``, ``numpy`` and ``torch``.

    Args:
        seed (:obj:`int`): The seed to set.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # ^^ safe to call this function even if cuda is not available

    # report the value that was actually applied (the argument, not the global SEED)
    print(f'Setted Pipeline SEED = {seed}')


set_seed(SEED)

# Load Data

In [None]:
# The sample submission's 'Id' column lists the publications to predict for.
sample_submission_path = '../input/coleridgeinitiative-show-us-the-data/sample_submission.csv'
sample_submission = pd.read_csv(sample_submission_path)

# Parse every test paper once and keep it in memory, keyed by publication Id;
# the NER and MLM sections below both read from `papers`.
paper_test_folder = '../input/coleridgeinitiative-show-us-the-data/test'
papers = {}
for paper_id in sample_submission['Id']:
    with open(f'{paper_test_folder}/{paper_id}.json', 'r') as f:
        paper = json.load(f)
        papers[paper_id] = paper

# Bert NER Prediction

## Transform data to NER format
Group by publication, training labels should have the same form as expected output.

In [None]:
def clean_text(txt):
    """Lowercase ``txt`` and turn every run of non-alphanumeric characters into a single space."""
    lowered = str(txt).lower()
    return re.sub('[^A-Za-z0-9]+', ' ', lowered).strip()

def totally_clean_text(txt):
    """Apply ``clean_text`` and then collapse any remaining runs of spaces into one space."""
    return re.sub(' +', ' ', clean_text(txt))

def clean_training_text(txt):
    """
    Case-preserving variant of ``clean_text``: every run of non-alphanumeric
    characters becomes one space and the result is stripped.
    """
    as_text = str(txt)
    spaced = re.sub('[^A-Za-z0-9]+', ' ', as_text)
    return spaced.strip()

def shorten_sentences(sentences, max_length=None, overlap=None):
    """
    Split over-long sentences into overlapping windows of words.

    Args:
        sentences: iterable of sentence strings.
        max_length: maximum number of words per returned sentence.
            Defaults to the module-level ``MAX_LENGTH``.
        overlap: number of words shared by two consecutive windows.
            Defaults to the module-level ``OVERLAP``.

    Returns:
        list of str: sentences with at most ``max_length`` words are passed
        through unchanged; longer ones are replaced by their windows, in order.

    Raises:
        ValueError: if ``overlap`` is not smaller than ``max_length`` (the
            window stride would be zero or negative).
    """
    # resolve the defaults at call time so later re-assignments of the globals are honoured
    if max_length is None:
        max_length = MAX_LENGTH
    if overlap is None:
        overlap = OVERLAP

    stride = max_length - overlap
    if stride <= 0:
        raise ValueError(
            f'overlap ({overlap}) must be smaller than max_length ({max_length})'
        )

    short_sentences = []
    for sentence in sentences:
        words = sentence.split()
        if len(words) > max_length:
            short_sentences.extend(
                ' '.join(words[p:p + max_length])
                for p in range(0, len(words), stride)
            )
        else:
            short_sentences.append(sentence)
    return short_sentences

In [None]:
test_rows = []  # NER-format rows: {'tokens': [...], 'tags': [...]}
paper_length = []  # number of kept sentences contributed by each paper, in submission order

for paper_id in sample_submission['Id']:
    paper = papers[paper_id]

    # split each section on '.' and strip non-alphanumeric characters (case is kept)
    sentences = []
    for section in paper:
        for raw_sentence in section['text'].split('.'):
            sentences.append(clean_training_text(raw_sentence))

    # window over-long sentences, then keep the ones longer than LENGTH characters
    # that mention 'data' or 'study'
    sentences = [
        sentence for sentence in shorten_sentences(sentences)
        if len(sentence) > LENGTH
        and any(word in sentence.lower() for word in ['data', 'study'])
    ]

    # one row per sentence; the tags are placeholders because this is inference only
    for sentence in sentences:
        sentence_words = sentence.split()
        test_rows.append({'tokens': sentence_words, 'tags': ['O'] * len(sentence_words)})

    # remembered so predictions can be mapped back to their paper later
    paper_length.append(len(sentences))

print(f'total number of sentences w/ length{LENGTH}: {len(test_rows)}')

## Do Predict And Collect Results

In [None]:
# Locations of the fine-tuned NER model and of the files exchanged with the prediction script.
PRETRAINED_PATH = os.path.join(PRETRAINED, 'output')
TRAIN_PATH = os.path.join(PRETRAINED, 'train_ner.json')
# NOTE(review): the validation file is the same train_ner.json as TRAIN_PATH — presumably
# harmless because only --do_predict is used below; confirm.
VAL_PATH = os.path.join(PRETRAINED, 'train_ner.json')
TEST_INPUT_SAVE_PATH = './input_data'
TEST_NER_DATA_FILE = 'test_ner_input.json'
PREDICTION_SAVE_PATH = './pred'
PREDICTION_FILE = 'test_predictions.txt'

# Exported as environment variables so the shell commands below can expand them ("$MODEL_PATH", ...).
os.environ["MODEL_PATH"] = f"{PRETRAINED_PATH}"
os.environ["TRAIN_FILE"] = f"{TRAIN_PATH}"
os.environ["VALIDATION_FILE"] = f"{VAL_PATH}"
os.environ["TEST_FILE"] = f"{TEST_INPUT_SAVE_PATH}/{TEST_NER_DATA_FILE}"
os.environ["OUTPUT_DIR"] = f"{PREDICTION_SAVE_PATH}"

In [None]:
# copy my_seqeval.py to the working directory because the input directory is non-writable
!cp /kaggle/input/coleridge-packages/my_seqeval.py ./

# make the necessary directory for the NER input file (no error if it already exists)
os.makedirs(TEST_INPUT_SAVE_PATH, exist_ok=True)

In [None]:
def bert_predict():
    """
    Run the external ``kaggle_run_ner.py`` script in prediction mode through a shell command.

    The model, train/validation/test files and output directory are taken from the
    MODEL_PATH, TRAIN_FILE, VALIDATION_FILE, TEST_FILE and OUTPUT_DIR environment
    variables set in an earlier cell; the seed is fixed to 42 on the command line.
    Returns nothing — results are whatever the script writes under OUTPUT_DIR.
    """
    !python ../input/kaggle-ner-utils/kaggle_run_ner.py \
    --model_name_or_path "$MODEL_PATH" \
    --train_file "$TRAIN_FILE" \
    --validation_file "$VALIDATION_FILE" \
    --test_file "$TEST_FILE" \
    --output_dir "$OUTPUT_DIR" \
    --report_to 'none' \
    --seed 42 \
    --do_predict

In [None]:
# One list of predicted tags per test sentence, accumulated across batches in test_rows order.
bert_outputs = []

for batch_begin in range(0, len(test_rows), PREDICT_BATCH):
    # write data rows to input file (one JSON object per line; the file is overwritten each batch)
    with open(f'{TEST_INPUT_SAVE_PATH}/{TEST_NER_DATA_FILE}', 'w') as f:
        for row in test_rows[batch_begin:batch_begin+PREDICT_BATCH]:
            json.dump(row, f)
            f.write('\n')
    
    # remove output dir
    # NOTE(review): on the first batch the directory presumably does not exist yet, so `rm -r`
    # prints an error; `rm -rf` would be quieter.
    !rm -r "$OUTPUT_DIR"
    
    # do predict
    bert_predict()
    
    # read predictions: one line of whitespace-separated tags per sentence;
    # [:-1] drops the empty string that follows the final newline
    with open(f'{PREDICTION_SAVE_PATH}/{PREDICTION_FILE}') as f:
        this_preds = f.read().split('\n')[:-1]
        bert_outputs += [pred.split() for pred in this_preds]

## Restore Dataset Labels From Predictions

In [None]:
# get test sentences (token lists only; the dummy tags are no longer needed)
test_sentences = [row['tokens'] for row in test_rows]

# drop the rows themselves to free memory
del test_rows

In [None]:
bert_dataset_labels = []  # one set of predicted dataset phrases per publication

for length in paper_length:
    labels = set()

    # the first `length` sentences/predictions still in the lists belong to this paper
    for sentence, pred in zip(test_sentences[:length], bert_outputs[:length]):
        phrase_words = []  # words of the phrase currently being assembled
        for word, tag in zip(sentence, pred):
            if tag == 'B':
                # a new phrase starts; flush the one in progress first
                if phrase_words:
                    labels.add(' '.join(phrase_words))
                phrase_words = [word]
            elif tag == 'I' and phrase_words:
                # continuation of the open phrase
                phrase_words.append(word)
            elif phrase_words:
                # any other tag closes the open phrase
                labels.add(' '.join(phrase_words))
                phrase_words = []

        # a phrase may run up to the end of the sentence
        if phrase_words:
            labels.add(' '.join(phrase_words))

    # record dataset labels for this publication
    bert_dataset_labels.append(labels)

    # consume this paper's sentences so the next paper starts at index 0
    del test_sentences[:length], bert_outputs[:length]

bert_dataset_labels[:4]

## Filter Based On Jaccard Score And Clean

In [None]:
def jaccard_similarity(s1, s2):
    """
    Jaccard similarity between the word sets of two strings.

    Both strings are split on single spaces. The union is the size of the
    set union, so repeated words inside a string do not lower the score.

    Args:
        s1 (str): first string.
        s2 (str): second string.

    Returns:
        float: |intersection| / |union|, between 0.0 and 1.0.
    """
    words1 = set(s1.split(" "))
    words2 = set(s2.split(" "))
    # str.split(" ") always yields at least one item, so the union is never empty
    return float(len(words1 & words2)) / len(words1 | words2)

def jaccard(str1, str2):
    """
    Case-insensitive Jaccard similarity between the whitespace-separated words of two strings.

    Args:
        str1 (str): first string.
        str2 (str): second string.

    Returns:
        float: |intersection| / |union| of the two lowercase word sets.
        When neither string contains a word, the two (empty) sets are
        treated as identical and 1.0 is returned instead of dividing by zero.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    if not a and not b:
        return 1.0
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

In [None]:
filtered_bert_labels = []  # one '|'-joined prediction string per publication

for labels in bert_dataset_labels:
    filtered = []

    # go through the raw labels from shortest to longest; a cleaned label is kept only
    # while it stays below FL_TH against everything already kept
    for raw_label in sorted(labels, key=len):
        label = clean_text(raw_label)
        if all(jaccard(label, got_label) < FL_TH for got_label in filtered):
            filtered.append(label)

    filtered_bert_labels.append('|'.join(filtered))

print(f'filtered_bert_labels[:4] w/ FL{FL_TH}:')
filtered_bert_labels[:4]

# Match

In [None]:
def read_json_pub(filename, train_data_path=None, output='text'):
    """
    Read one publication JSON file and return its text as a single string.

    Args:
        filename (str): publication id, i.e. the file name without ``.json``.
        train_data_path (str, optional): folder containing the file. When omitted,
            the module-level ``paper_test_folder`` is used (looked up at call time).
        output (str): ``'text'`` returns the section texts joined by spaces,
            ``'head'`` returns the section titles joined by spaces, any other
            value returns titles and texts interleaved and joined by ``'. '``.

    Returns:
        str: the requested concatenation. A missing or null ``section_title`` /
        ``text`` entry contributes an empty string instead of raising.
    """
    if train_data_path is None:
        train_data_path = paper_test_folder
    json_path = os.path.join(train_data_path, (filename + '.json'))

    with open(json_path, 'r') as f:
        sections = json.load(f)

    def _as_text(value):
        # str.join needs strings; absent fields come back from .get() as None
        return '' if value is None else str(value)

    headings = [_as_text(section.get('section_title')) for section in sections]
    contents = [_as_text(section.get('text')) for section in sections]

    if output == 'text':
        return ' '.join(contents)
    if output == 'head':
        return ' '.join(headings)

    combined = []
    for heading, content in zip(headings, contents):
        combined.append(heading)
        combined.append(content)
    return '. '.join(combined)
    
    
def text_cleaning(text):
    '''
    Converts all text to lower case, Removes special charecters, emojis and multiple spaces
    text - Sentence that needs to be cleaned

    Returns the cleaned string: lowercase ASCII letters and digits separated by single spaces.
    '''
    # One pass is enough: every run of characters outside [A-Za-z0-9] (punctuation,
    # emojis, repeated whitespace, ...) is replaced by a single space, so no emoji
    # or double space can survive for a later pass to remove.
    return re.sub('[^A-Za-z0-9]+', ' ', str(text).lower()).strip()

In [None]:
if 'W_MATCH' in HOW:
    adnl_govt_labels = pd.read_csv(ADNL_GOVT_LABELS_PATH)

    literal_preds = []  # one '|'-joined string of matched titles per publication
    for _, row in tqdm(sample_submission.iterrows()):
        matched = ''
        large_string = str(read_json_pub(row['Id'], paper_test_folder))
        clean_string = text_cleaning(large_string)

        for _, row2 in adnl_govt_labels.iterrows():
            query_string = str(row2['title'])
            # the title is searched as-is inside the cleaned paper text
            if query_string not in clean_string:
                continue

            cleaned_query = clean_text(query_string)
            if matched == '':
                matched = cleaned_query
            elif cleaned_query not in matched:
                # substring test against everything matched so far
                matched = matched + '|' + cleaned_query

        literal_preds.append(matched)

else:
    literal_preds = None

literal_preds

# Parameters Check

In [None]:
# echo the settings that were used for this run
for name, value in (
    ('LENGTH', LENGTH),
    ('FL_TH', FL_TH),
    ('PRETRAINED', PRETRAINED),
    ('HOW', HOW),
):
    print(f'{name} = {value}')

# mode-specific settings are shown only when that mode is enabled
if 'W_MATCH' in HOW:
    print(f'ADNL_GOVT_LABELS_PATH = {ADNL_GOVT_LABELS_PATH}')
if 'W_QA' in HOW:
    print(f'W_QA_TH = {W_QA_TH}')

|   | CV | LB |
| - | -- | -- |
| NER only |   | 0.382 |
| NER only FL1.0 |   | 0.388 |
| NERv1 external+Pseudo length1 only FL1.0 (bug) |   | 0.397 |
| NERv1 external+Pseudo length1 only FL1.0 (fixed) |   | 0.397 |
| NERv1 external+Pseudo length1 FL1.0 W_MATCH |   | 0.583 |
| (NERv1 external+Pseudo length1 FL1.0 + QA 0.995) 0.75 |   | 0.449 |
| (NERv1 external+Pseudo length1 FL1.0 + QA 0.995) 0.75 WM |   | 0.582 |
| (NERv2 external+Pseudo length1 FL1.0 + QA 0.995) 0.75 WM DB |   | 0.560 |
| (NERv2 external+Pseudo length1 FL1.0 + QA 0.995) 0.75 |   | 0.451 |
| **(NERv2 external+Pseudo length1 FL1.0 + QA 0.995) 0.75 + MLMv1 external-pseudo length1 FOPP FL1.0** |   | **0.453** |

# Bert QA

In [None]:
THRESHOLD = 0.995  # passed as `threshold` to k_fold_inference below
CP_PATH = '../input/bert-for-question-answering-baseline-training' + '/'  # folder searched for *.pt checkpoints

# NOTE(review): this rebinds the SEED defined at the top of the notebook (42); nothing
# visible in this notebook reads it afterwards — confirm it is still needed.
SEED = 2020

DATA_PATH = "../input/coleridgeinitiative-show-us-the-data/"
DATA_PATH_TRAIN = DATA_PATH + 'train/'
DATA_PATH_TEST = DATA_PATH + 'test/'

NUM_WORKERS = 4  # DataLoader worker processes used by predict()

VOCABS = {
    "bert-base-uncased": "../input/vocabs/bert-base-uncased-vocab.txt",
}

# Folder per model name; vocab.txt and the model config JSON are read from these folders.
MODEL_PATHS = {
    'bert-base-uncased': '../input/bertconfigs/uncased_L-12_H-768_A-12/uncased_L-12_H-768_A-12/',
    'bert-large-uncased-whole-word-masking-finetuned-squad': '../input/bertconfigs/wwm_uncased_L-24_H-1024_A-16/wwm_uncased_L-24_H-1024_A-16/',
    'albert-large-v2': '../input/albert-configs/albert-large-v2/albert-large-v2/',
    'albert-base-v2': '../input/albert-configs/albert-base-v2/albert-base-v2/',
    'distilbert': '../input/albert-configs/distilbert/distilbert/',
}

In [None]:
class Config:
    """
    Settings container for the BERT question-answering part.

    In this notebook only ``selected_model``, ``lowercase`` and ``batch_size_val``
    are read (by create_tokenizer_and_tokens and k_fold_inference); the remaining
    fields are presumably carried over from the training kernel.
    """
    # General
    k = 5
    seed = 2021

    # Texts
    max_len = 256
    
    # Architecture
    selected_model = "bert-base-uncased"  # key into MODEL_PATHS / TRANSFORMERS
    lowercase = True  # forwarded to the WordPiece tokenizer
    
    # Training
    batch_size = 16
    batch_size_val = batch_size * 2  # batch size used at inference time

In [None]:
class EncodedText:
    """Plain holder pairing token ids with their character offsets."""

    def __init__(self, ids, offsets):
        self.ids, self.offsets = ids, offsets


def create_tokenizer_and_tokens(config):
    """
    Build the WordPiece tokenizer for ``config.selected_model`` and look up its special token ids.

    Returns a ``(tokenizer, tokens)`` pair where ``tokens`` maps 'cls', 'sep' and 'pad'
    to the ids of '[CLS]', '[SEP]' and '[PAD]'. Model names containing "roberta" or
    "albert" are not supported and raise NotImplementedError.
    """
    model_name = config.selected_model
    if "roberta" in model_name or "albert" in model_name:
        raise NotImplementedError

    vocab_file = MODEL_PATHS[model_name] + 'vocab.txt'
    tokenizer = BertWordPieceTokenizer(vocab_file, lowercase=config.lowercase)

    special_tokens = (('cls', '[CLS]'), ('sep', '[SEP]'), ('pad', '[PAD]'))
    tokens = {key: tokenizer.token_to_id(token) for key, token in special_tokens}

    return tokenizer, tokens

In [None]:
def load_text(id_, root=""):
    """Return the parsed JSON content of ``<root>/<id_>.json``."""
    path = os.path.join(root, id_ + ".json")
    with open(path) as handle:
        return json.load(handle)


def clean_text(txt):
    """Return ``txt`` lowercased, with each run of non-alphanumeric characters replaced by one space."""
    normalised = re.sub('[^A-Za-z0-9]+', ' ', str(txt).lower())
    return normalised.strip()


def locate_label_string(text, label):
    """
    Finds the label in the text

    Args:
        text (str): text to search in.
        label (str): label to find; it must start with a single leading space
            (as added by process_data), which is not part of the match.

    Returns:
        tuple: ``(idx_start, idx_end, char_targets)`` where ``text[idx_start:idx_end]``
        is the first occurrence of the label (without its leading space) and
        ``char_targets`` is a float array of ``len(text)`` with ones on that span.

    Raises:
        ValueError: if the label has no leading space, is empty after the
            leading space, or does not occur in the text.
    """
    if not label.startswith(" ") or len(label) < 2:
        raise ValueError(f'label must be a space followed by text, got "{label}"')

    needle = label[1:]
    idx_start = text.find(needle)  # first occurrence, like the original left-to-right scan
    if idx_start < 0:
        raise ValueError(f'"{label}" not found in "{text}"')
    idx_end = idx_start + len(needle)

    char_targets = np.zeros(len(text))
    char_targets[idx_start:idx_end] = 1

    return idx_start, idx_end, char_targets


def locate_label_tokens(offsets, char_targets):
    """
    Finds the tokens corresponding to the found labels

    Args:
        offsets: sequence of ``(start, end)`` character offsets, one per token.
        char_targets: per-character sequence that is non-zero on the label span.

    Returns:
        tuple: indices of the first and the last token overlapping the label.

    Raises:
        IndexError: if no token overlaps the label characters.
    """
    target_idx = [
        idx
        for idx, (offset1, offset2) in enumerate(offsets)
        if sum(char_targets[offset1:offset2]) > 0
    ]

    # A second identical scan could never find anything new, so fail with a clear message.
    if not target_idx:
        raise IndexError("no token overlaps the label characters")

    return target_idx[0], target_idx[-1]

In [None]:
def process_data(
    text,
    label,
    tokenizer,
    tokens,
    max_len=100,
    model_name="bert",
):
    """
    Prepares the data for the question answering task.
    Adapted from Abishek's work on the Tweet Sentiment extraction competition, 
    check his work for more details !

    Args:
        text: text to encode; whitespace is normalised and a leading space is added.
        label: label to locate in the text; an empty label means "no target".
        tokenizer: object whose ``encode(text)`` result exposes ``ids`` and ``offsets``.
        tokens: dict with the ids of the 'cls', 'sep' and 'pad' special tokens.
        max_len: length of the returned sequences, special tokens included.
        model_name: token type ids are all 0 when it contains "roberta", else 1.

    Returns:
        dict with keys "ids", "token_type_ids", "targets_start", "targets_end",
        "text", "label" and "offsets". The list-valued entries are padded to
        ``max_len`` when shorter.
    """
    target_start, target_end = 0, 0
    text = " " + " ".join(str(text).split())
    label = " " + " ".join(str(label).split())

    # an empty label collapses to the single leading space
    if label != " ":
        idx_start, idx_end, char_targets = locate_label_string(
            text, label
        )

    tokenized = tokenizer.encode(text)
    # drop the first and last ids/offsets added by the tokenizer (presumably [CLS]/[SEP]);
    # the special tokens are re-added below
    input_ids_text = tokenized.ids[1:-1]

    offsets = tokenized.offsets[1:-1]

    if label != " ":
        target_start, target_end = locate_label_tokens(offsets, char_targets)

    if target_end >= max_len - 2:  # target is too far in the sentence, we crop its beginning.
        # Never crop a negative number of tokens: for a target span longer than
        # max_len // 2 a negative value would index from the END of the sequence.
        n_tok_to_crop = max(target_start - max_len // 2, 0)
        new_str_start = offsets[n_tok_to_crop][0]

        input_ids_text = input_ids_text[n_tok_to_crop:]

        # shift the remaining offsets so they index into the cropped text
        offsets = [tuple(t) for t in np.array(offsets[n_tok_to_crop:]) - new_str_start]
        text = text[new_str_start:]

        target_start -= n_tok_to_crop
        target_end -= n_tok_to_crop

    input_ids = (
        [tokens["cls"]]
        + input_ids_text[:max_len - 2]
        + [tokens["sep"]]
    )

    if "roberta" in model_name:
        token_type_ids = [0] * len(input_ids)
    else:
        token_type_ids = [1] * len(input_ids)

    text_offsets = [(0, 0)] + offsets[:max_len - 2] + [(0, 0)]

    # account for the leading [CLS] position
    target_start += 1
    target_end += 1

    assert len(input_ids) == len(token_type_ids) and len(input_ids) == len(text_offsets), (len(input_ids), len(text_offsets))  # noqa

    padding_length = max_len - len(input_ids)
    if padding_length > 0:
        input_ids = input_ids + ([tokens["pad"]] * padding_length)
        token_type_ids = token_type_ids + ([0] * padding_length)
        text_offsets = text_offsets + ([(0, 0)] * padding_length)

    return {
        "ids": input_ids,
        "token_type_ids": token_type_ids,
        "targets_start": target_start,
        "targets_end": target_end,
        "text": text,
        "label": label,
        "offsets": text_offsets,
    }

In [None]:
from torch.utils.data import Dataset

class ArticleDataset(Dataset):
    """
    Inference-time dataset over one article: every item is one chunk of the
    article's cleaned text, encoded by ``process_data`` with an empty label.
    """

    def __init__(
        self,
        id_,
        tokenizer,
        tokens,
        max_len=512,
        words_per_split=300,
        margin=10,
        model_name="bert",
        root=""
    ):
        # encoding settings
        self.tokens = tokens
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.model_name = model_name

        # chunking settings
        self.words_per_split = words_per_split
        self.margin = margin

        # load the article and pre-compute its text chunks
        self.article = load_text(id_, root=root)
        self.texts = self.article_to_texts()

    def __len__(self):
        return len(self.texts)

    def article_to_texts(self):
        """
        Cut each section (first 5000 words only) into chunks of
        ``words_per_split`` words, extended by ``margin`` words on both sides.
        """
        step, pad = self.words_per_split, self.margin

        texts = []
        for section in self.article:
            words = clean_text(section['text']).split(' ')[:5000]  # only keep first 5k words
            n_chunks = len(words) // step + 1
            for chunk in range(n_chunks):
                begin = max(0, step * chunk - pad)
                finish = step * (chunk + 1) + pad
                texts.append(" ".join(words[begin: finish]))

        return texts

    def __getitem__(self, idx):
        # the label is empty at inference time, so the targets are placeholders
        data = process_data(
            self.texts[idx],
            "",
            self.tokenizer,
            self.tokens,
            max_len=self.max_len,
            model_name=self.model_name,
        )

        def as_long(values):
            return torch.tensor(values, dtype=torch.long)

        return {
            "ids": as_long(data["ids"]),
            "token_type_ids": as_long(data["token_type_ids"]),
            "target_start": as_long(data["targets_start"]),
            "target_end": as_long(data["targets_end"]),
            "text": data["text"],
            "label": data["label"],
            "offsets": as_long(data["offsets"]),
        }

In [None]:
from transformers import BertModel, BertConfig

# Per model name: (model class, model identifier, config class).
# QATransformer reads the model class and the config class from this table.
TRANSFORMERS = {   
    "bert-base-uncased": (BertModel, "bert-base-uncased", BertConfig),
}


class QATransformer(nn.Module):
    """
    Simple model for Question Answering: a transformer encoder followed by a
    small head that outputs a start logit and an end logit for every token.
    """
    def __init__(self, model):
        """
        Arguments:
            model {str} -- Key into TRANSFORMERS / MODEL_PATHS selecting the architecture
        """
        super().__init__()
        self.name = model

        self.pad_idx = 1 if "roberta" in self.name else 0

        model_class, _, config_class = TRANSFORMERS[model]

        # The config file is named either bert_config.json or config.json depending on
        # the checkpoint folder. Only "file missing/unreadable" and "not valid JSON"
        # trigger the fallback, so KeyboardInterrupt etc. are no longer swallowed.
        try:
            config = config_class.from_json_file(MODEL_PATHS[model] + 'bert_config.json')
        except (OSError, ValueError):
            config = config_class.from_json_file(MODEL_PATHS[model] + 'config.json')
        config.output_hidden_states = True

        # built from the config only; trained weights are loaded separately
        self.transformer = model_class(config)

        self.nb_features = self.transformer.pooler.dense.out_features

        self.logits = nn.Sequential(
            nn.Linear(self.nb_features, self.nb_features),
            nn.Tanh(),
            nn.Linear(self.nb_features, 2),
        )

    def forward(self, tokens, token_type_ids):
        """
        Usual torch forward function

        Arguments:
            tokens {torch tensor} -- Sentence tokens
            token_type_ids {torch tensor} -- Sentence tokens ids

        Returns:
            tuple -- (start_logits, end_logits), the two channels of the head output
        """

        # positions equal to the pad id are masked out of attention;
        # the last element of the model output is the collection of hidden states
        hidden_states = self.transformer(
            tokens,
            attention_mask=(tokens != self.pad_idx).long(),
            token_type_ids=token_type_ids,
        )[-1]

        features = hidden_states[-1]  # hidden states of the final layer
        logits = self.logits(features)

        start_logits, end_logits = logits[:, :, 0], logits[:, :, 1]

        return start_logits, end_logits

In [None]:
def load_model_weights(model, filename, verbose=1, cp_folder=""):
    """
    Loads the weights of a PyTorch model. The checkpoint is mapped to CPU while
    loading, so it can be read on machines with or without a GPU.

    Args:
        model (torch model): Model to load the weights to.
        filename (str): Name of the checkpoint.
        verbose (int, optional): Whether to display infos. Defaults to 1.
        cp_folder (str, optional): Folder to load from. Defaults to "".

    Returns:
        torch model: Model with loaded weights.
    """
    checkpoint_path = os.path.join(cp_folder, filename)

    if verbose:
        print(f"\n -> Loading weights from {checkpoint_path}\n")

    # load_state_dict needs the state dict itself, not the path to it;
    # copying into the model keeps the parameters on the model's own device
    state_dict = torch.load(checkpoint_path, map_location="cpu")
    model.load_state_dict(state_dict, strict=True)
    return model

In [None]:
import torch
import numpy as np
from torch.utils.data import DataLoader


def predict(model, dataset, batch_size=32, num_workers=None):
    """
    Usual predict torch function

    Arguments:
        model {torch model} -- Model to predict with
        dataset {torch dataset} -- Dataset to get predictions from; each item
            must provide "ids" and "token_type_ids"

    Keyword Arguments:
        batch_size {int} -- Batch size (default: {32})
        num_workers {int} -- DataLoader workers (default: the module-level NUM_WORKERS)

    Returns:
        tuple -- (start_probas, end_probas): for every item of the dataset, a list
        with the softmax over token positions of the start / end logits
    """
    if num_workers is None:
        num_workers = NUM_WORKERS

    # run on whatever device the model already lives on instead of assuming CUDA
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model.eval()
    start_probas = []
    end_probas = []

    loader = DataLoader(
        dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers
    )

    with torch.no_grad():
        for data in loader:
            ids = data["ids"].to(device)
            token_type_ids = data["token_type_ids"].to(device)

            start_logits, end_logits = model(ids, token_type_ids)

            # normalise over the token dimension
            start_probs = torch.softmax(start_logits, dim=1).cpu().numpy()
            end_probs = torch.softmax(end_logits, dim=1).cpu().numpy()

            start_probas.extend(list(s) for s in start_probs)
            end_probas.extend(list(e) for e in end_probs)

    return start_probas, end_probas

In [None]:
def get_string_from_idx(text, idx_start, idx_end, offsets):
    """
    Rebuild the predicted string from token indices ``idx_start`` to ``idx_end``
    (inclusive) using the character offsets of each token. A space is emitted after
    a token whenever the next token does not start right where this one ends.
    """
    # an end before the start degenerates to the single start token
    if idx_end < idx_start:
        idx_end = idx_start

    last_token = len(offsets) - 1
    pieces = []
    for i in range(idx_start, idx_end + 1):
        begin, stop = offsets[i][0], offsets[i][1]
        pieces.append(text[begin: stop])
        if i < last_token and stop < offsets[i + 1][0]:
            pieces.append(" ")

    return "".join(pieces)


def get_pred_from_probas(dataset, start_probas, end_probas, threshold=0.):
    """
    Turn start/end probabilities into predicted strings.

    An item yields a prediction when its best start or best end probability is
    above ``threshold`` and the most likely end lies 1 to 9 tokens after the
    most likely start.
    """
    preds = []
    for i in range(len(dataset)):
        starts, ends = start_probas[i], end_probas[i]
        if not (starts.max() > threshold or ends.max() > threshold):
            continue

        start_idx = np.argmax(starts)
        end_idx = np.argmax(ends)
        if not (start_idx < end_idx and end_idx - start_idx < 10):
            continue

        # the item is only materialised once a span was accepted
        data = dataset[i]
        preds.append(get_string_from_idx(data["text"], start_idx, end_idx, data["offsets"]))

    return preds

In [None]:
def post_process(preds):
    """
    Naive processing of prediction :
    keep each distinct prediction once (in sorted order) and join them with '|'.
    """
    return "|".join(np.unique(preds))


def k_fold_inference(config, df, tokenizer, tokens, weights, threshold=0.):
    """
    Predict dataset mentions for every article in ``df['Id']`` with an ensemble.

    One QATransformer is created per checkpoint in ``weights``; for each article the
    start/end probabilities of all models are averaged before spans are extracted.
    Returns one '|'-joined prediction string per article.
    """
    # one model per checkpoint, all placed on the GPU
    models = []
    for checkpoint in weights:
        net = QATransformer(config.selected_model).cuda()
        net.zero_grad()
        load_model_weights(net, checkpoint)
        models.append(net)

    preds = []
    for text_id in tqdm(df['Id']):
        dataset = ArticleDataset(
            text_id,
            tokenizer,
            tokens,
            max_len=512,
            model_name="bert",
            root=DATA_PATH_TEST
        )

        per_model = [
            predict(net, dataset, batch_size=config.batch_size_val)
            for net in models
        ]

        # average over the models
        start_probas = np.mean([starts for starts, _ in per_model], 0)
        end_probas = np.mean([ends for _, ends in per_model], 0)

        # here can do some FOPP
        pred = get_pred_from_probas(dataset, start_probas, end_probas, threshold=threshold)
        preds.append(post_process(pred))

    return preds

In [None]:
# the class itself (not an instance) is used as the settings object
config = Config
# the sample submission supplies the publication Ids to run QA inference on
df = pd.read_csv(DATA_PATH + 'sample_submission.csv')

In [None]:
# NOTE: `tokenizer` is rebound to the MLM tokenizer later when 'W_MLM' is in HOW.
tokenizer, tokens = create_tokenizer_and_tokens(config)

In [None]:
# Dataset for the first publication only.
# NOTE(review): k_fold_inference builds its own dataset per publication, so this looks
# like a sanity check rather than an input to inference — confirm before removing.
dataset = ArticleDataset(
    df['Id'][0],
    tokenizer,
    tokens,
    max_len=512,
    model_name="bert",
    root=DATA_PATH_TEST,
)

In [None]:
# Use a single checkpoint: the last *.pt file of CP_PATH in sorted (lexicographic) order.
# Raises IndexError when the folder contains no *.pt file.
weights = [sorted(glob.glob(CP_PATH + "*.pt"))[-1]] # -> list(model_paths)
weights

In [None]:
# Run QA inference over every publication of the sample submission.
# NOTE(review): this runs regardless of HOW, although the blending below only reads
# qa_preds_model when 'W_QA' is in HOW — consider guarding it like the MLM cells.
qa_preds_model = k_fold_inference(
    config,
    df,
    tokenizer,
    tokens,
    weights,
    threshold=THRESHOLD,
)

qa_preds_model[:4]

# MLM

In [None]:
# NOTE: FL_TH, LENGTH, MAX_LENGTH, OVERLAP and PREDICT_BATCH rebind the globals of the
# same name defined at the top of the notebook; from here on the MLM values apply.
PRED_TH = 2.0  # starting score ratio for the threshold sweep in the MLM prediction cell
FL_TH = 1.0  # Jaccard threshold used when filtering MLM labels

MODEL_PATH_PREFIX = '../input/coleridge-bert-mlm-external-pseudo-labels-v1'
MLM_PRETRAINED_PATH = 'mlm-model'  # sub-folder of MODEL_PATH_PREFIX with the masked-LM weights
TOKENIZER = 'model_tokenizer'  # sub-folder of MODEL_PATH_PREFIX with the tokenizer

LENGTH = 1  # sentences are kept only when len(sentence) > LENGTH (characters)
MAX_LENGTH = 64  # max number of words per sentence before it is split
OVERLAP = 20  # words shared between consecutive pieces of a split sentence

PREDICT_BATCH = 32 # a higher value requires higher GPU memory usage

DATASET_SYMBOL = '$' # this symbol represents a dataset name
NONDATA_SYMBOL = '#' # this symbol represents a non-dataset name

In [None]:
# Build the fill-mask pipeline only when the MLM model takes part in the blend.
if 'W_MLM' in HOW:
    TOKENIZER_PATH = os.path.join(MODEL_PATH_PREFIX, TOKENIZER)
    # NOTE: rebinds PRETRAINED_PATH, which previously pointed at the NER model output
    PRETRAINED_PATH = os.path.join(MODEL_PATH_PREFIX, MLM_PRETRAINED_PATH)
    
    # NOTE: `tokenizer` previously held the QA WordPiece tokenizer
    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH, use_fast=True)
    model = AutoModelForMaskedLM.from_pretrained(PRETRAINED_PATH)

    mlm = pipeline(
        'fill-mask', 
        model=model,
        tokenizer=tokenizer,
        device=0 if torch.cuda.is_available() else -1  # first GPU when available, otherwise CPU
    )

In [None]:
def jaccard_similarity(s1, s2):
    """
    Jaccard similarity between the word sets of two strings.

    Both strings are split on single spaces. The union is the size of the
    set union, so a word repeated inside a string is counted once and two
    strings with the same words always score 1.0.

    Args:
        s1 (str): first string.
        s2 (str): second string.

    Returns:
        float: |intersection| / |union|, between 0.0 and 1.0.
    """
    tokens1 = set(s1.split(" "))
    tokens2 = set(s2.split(" "))
    shared = tokens1 & tokens2
    # str.split(" ") always yields at least one item, so the union is never empty
    combined = tokens1 | tokens2
    return float(len(shared)) / len(combined)

def clean_paper_sentence(s):
    """
    Same cleaning as clean_text but the case is preserved: runs of
    non-alphanumeric characters become one space, the result is stripped,
    and repeated spaces are collapsed.
    """
    stripped = re.sub('[^A-Za-z0-9]+', ' ', str(s)).strip()
    return re.sub(' +', ' ', stripped)

def shorten_sentences(sentences):
    """
    Break every sentence of more than MAX_LENGTH words into windows of at most
    MAX_LENGTH words, each window starting MAX_LENGTH - OVERLAP words after the
    previous one. Shorter sentences are returned unchanged.
    """
    stride = MAX_LENGTH - OVERLAP  # range() raises ValueError if this is zero and a sentence needs splitting
    short_sentences = []
    for sentence in sentences:
        words = sentence.split()
        if len(words) <= MAX_LENGTH:
            short_sentences.append(sentence)
            continue
        for begin in range(0, len(words), stride):
            window = words[begin:begin + MAX_LENGTH]
            short_sentences.append(' '.join(window))
    return short_sentences

connection_tokens = {'s', 'of', 'and', 'in', 'on', 'for', 'data', 'dataset'}
def find_mask_candidates(sentence):
    """
    Extract masking candidates for Masked Dataset Modeling from a given $sentence.
    A candidate should be a continuous sequence of at least 2 words, 
    each of these words either has the first letter in uppercase or is one of
    the connection words ($connection_tokens). Furthermore, the connection 
    tokens are not allowed to appear at the beginning and the end of the
    sequence.
    """
    def candidate_qualified(words):
        while len(words) and words[0].lower() in connection_tokens:
            words = words[1:]
        while len(words) and words[-1].lower() in connection_tokens:
            words = words[:-1]
        
        return len(words) >= 2
    
    candidates = []
    
    phrase_start, phrase_end = -1, -1
    for id in range(1, len(sentence)):
        word = sentence[id]
        if word[0].isupper() or word in connection_tokens:
            if phrase_start == -1:
                phrase_start = phrase_end = id
            else:
                phrase_end = id
        else:
            if phrase_start != -1:
                if candidate_qualified(sentence[phrase_start:phrase_end+1]):
                    candidates.append((phrase_start, phrase_end))
                phrase_start = phrase_end = -1
    
    if phrase_start != -1:
        if candidate_qualified(sentence[phrase_start:phrase_end+1]):
            candidates.append((phrase_start, phrase_end))
    
    return candidates

In [None]:
# Build, for every paper, the list of (masked sentence, masked phrase) pairs for the MLM.
if 'W_MLM' in HOW:
    mask = mlm.tokenizer.mask_token
    all_test_data = []  # one list of (masked text, phrase) pairs per paper, in submission order
    
    for paper_id in tqdm(sample_submission['Id']):
        # load paper
        paper = papers[paper_id]

        # extract sentences (a set removes duplicates; note that set iteration order is not fixed)
        sentences = set([clean_paper_sentence(sentence) for section in paper 
                         for sentence in section['text'].split('.')
                        ])
        sentences = shorten_sentences(sentences) # make sentences short
        sentences = [sentence for sentence in sentences if len(sentence) > LENGTH] # only accept sentences longer than LENGTH characters
        sentences = [sentence for sentence in sentences if any(word in sentence.lower() for word in ['data', 'study'])] # must mention 'data' or 'study'
        sentences = [sentence.split() for sentence in sentences] # sentence = list of words

        # mask: replace each candidate phrase by a single mask token
        test_data = []
        for sentence in sentences:
            for phrase_start, phrase_end in find_mask_candidates(sentence):
                dt_point = sentence[:phrase_start] + [mask] + sentence[phrase_end+1:]
                test_data.append((' '.join(dt_point), ' '.join(sentence[phrase_start:phrase_end+1]))) # (masked text, phrase)

        all_test_data.append(test_data)

In [None]:
# Score every masked sentence with the fill-mask pipeline and collect, per paper,
# the phrases judged to be dataset names.
if 'W_MLM' in HOW:
    pred_mlm_labels = []  # one '|'-joined prediction string per paper

    for test_data in tqdm(all_test_data):
        pred_bag = set()

        if len(test_data):
            texts, phrases = list(zip(*test_data))
            mlm_pred = []
            for p_id in range(0, len(texts), PREDICT_BATCH):
                batch_texts = texts[p_id:p_id+PREDICT_BATCH]
                # only the two symbol tokens are scored for the masked position
                batch_pred = mlm(list(batch_texts), targets=[f' {DATASET_SYMBOL}', f' {NONDATA_SYMBOL}'])
                
                # presumably the pipeline returns a flat result for a single input;
                # wrap it so every entry of mlm_pred is one (result1, result2) pair
                if len(batch_texts) == 1:
                    batch_pred = [batch_pred]
                    
                mlm_pred.extend(batch_pred)

            # sweep the ratio threshold from PRED_TH downwards in steps of 0.05 (stopping
            # before 1.0) and keep the first threshold that yields at least one phrase
            for pred_th in np.arange(PRED_TH, 1.0, -0.05): # find-more PP
                if len(pred_bag) == 0:
                    for (result1, result2), phrase in zip(mlm_pred, phrases):
                        # NOTE(review): the second clause accepts a phrase when result2 is the
                        # NON-dataset symbol and outscores result1 — this looks like it was meant
                        # to test result2 against DATASET_SYMBOL; confirm against the training kernel.
                        if (result1['score'] > result2['score'] * pred_th and result1['token_str'] == DATASET_SYMBOL) or\
                           (result2['score'] > result1['score'] * pred_th and result2['token_str'] == NONDATA_SYMBOL):
                            pred_bag.add(clean_text(phrase))
                else: break

        # filter labels by jaccard score 
        filtered_labels = []
        for label in sorted(pred_bag, key=len, reverse=True): # long to short so that we keep the potential best
            if len(filtered_labels) == 0 or all(jaccard_similarity(label, got_label) < FL_TH for got_label in filtered_labels):
                filtered_labels.append(label)

        pred_mlm_labels.append('|'.join(filtered_labels))
        
    print(f'pred_mlm_labels[:4] w/ PT{PRED_TH} FL{FL_TH}: \n{pred_mlm_labels[:4]}')
        
else: print('MLM is not used.')

# Blending

In [None]:
def model_pred_blending(major_pred: list, second_pred: list) -> list:
    """
    Blend two lists of '|'-joined prediction strings, item by item.

    When only one side has a prediction it is used as is. When both have one, all
    labels of the major prediction are kept and a label of the second prediction
    is appended only if its Jaccard score against every major label is below
    W_QA_TH. When neither has one, the (empty) major prediction is kept.
    """
    final_predictions = []
    for major, second in tqdm(zip(major_pred, second_pred)):
        if not second:
            # nothing to merge in: covers "major only" and "neither"
            final_predictions.append(major)
            continue
        if not major:
            final_predictions.append(second)
            continue

        major_labels = major.split('|')
        merged = major_labels.copy()
        for candidate in second.split('|'):
            if all(jaccard(major_label, candidate) < W_QA_TH for major_label in major_labels):
                merged.append(candidate)
        final_predictions.append('|'.join(merged))
    return final_predictions

In [None]:
# Build the final prediction string for every publication.
# The branches form one if/elif chain, so only the FIRST matching mode in the order
# NER_ONLY -> W_QA -> W_MATCH -> W_MLM is applied. If HOW matches none of them,
# final_predictions stays empty.
final_predictions = []

if 'NER_ONLY' in HOW:
    final_predictions = filtered_bert_labels

elif 'W_QA' in HOW:
    # without literal matching, use a placeholder list so the zip below still lines up
    if 'W_MATCH' not in HOW: literal_preds = [None] * len(filtered_bert_labels)
    if 'DOUBLE_BLENDING' in HOW:
        # first merge QA labels into the NER labels, then merge that result into the literal matches
        ner_qa_predictions = model_pred_blending(filtered_bert_labels, qa_preds_model)
        final_predictions = model_pred_blending(literal_preds, ner_qa_predictions)
    else:
        for pred_ner, perd_qa, literal_match in tqdm(zip(filtered_bert_labels, qa_preds_model, literal_preds)):
            # MATCH: a literal match takes precedence over both models
            if literal_match:
                final_predictions.append(literal_match)
            # NER
            elif pred_ner and not perd_qa:
                final_predictions.append(pred_ner)
            # QA 
            elif perd_qa and not pred_ner:
                final_predictions.append(perd_qa)
            # All: keep the NER labels and add QA labels that are dissimilar to every NER label
            elif pred_ner and perd_qa:
                ner_labels, qa_labels = pred_ner.split('|'), perd_qa.split('|')
                all_temp = ner_labels.copy()
                for qa_label in qa_labels:
                    if all( jaccard(ner_label, qa_label) < W_QA_TH for ner_label in ner_labels ):
                        all_temp.append(qa_label)
                final_predictions.append('|'.join(all_temp))
            # None
            else:
                final_predictions.append(pred_ner)
    
elif 'W_MATCH' in HOW:
    # literal match when there is one, otherwise fall back to the NER prediction
    for literal_match, ner_pred in zip(literal_preds, filtered_bert_labels):
        if literal_match:
            final_predictions.append(literal_match)
        else:
            final_predictions.append(ner_pred)

elif 'W_MLM' in HOW:
    # NER prediction when there is one, otherwise fall back to the MLM prediction
    for ner_pred, mlm_pred in zip(filtered_bert_labels, pred_mlm_labels):
        if ner_pred:
            final_predictions.append(ner_pred)
        else:
            final_predictions.append(mlm_pred)
            
# Disabled alternative: fill publications left empty by any mode above with the MLM prediction.
# if 'W_MLM' in HOW:
#     for index, (ner_qa_pred, mlm_pred) in enumerate( zip(final_predictions, pred_mlm_labels) ):
#         if not ner_qa_pred:
#             final_predictions[index] = mlm_pred

final_predictions[:4]

In [None]:
# Attach the predictions (one per row; the lengths must match) and write the submission file.
sample_submission['PredictionString'] = final_predictions
sample_submission.to_csv(f'submission.csv', index=False)
sample_submission.head()