# showus

In [1]:
#default_exp showus

In [2]:
! pip install /kaggle/input/nlp-packages/datasets/datasets/fsspec-2021.4.0-py3-none-any.whl
! pip install datasets --no-index --find-links=file:///kaggle/input/coleridge-packages/packages/datasets
! pip install ../input/coleridge-packages/seqeval-1.2.2-py3-none-any.whl
! pip install ../input/coleridge-packages/tokenizers-0.10.1-cp37-cp37m-manylinux1_x86_64.whl
! pip install ../input/coleridge-packages/transformers-4.5.0.dev0-py3-none-any.whl

Processing /kaggle/input/nlp-packages/datasets/datasets/fsspec-2021.4.0-py3-none-any.whl
Installing collected packages: fsspec
  Attempting uninstall: fsspec
    Found existing installation: fsspec 0.8.7
    Uninstalling fsspec-0.8.7:
      Successfully uninstalled fsspec-0.8.7
Successfully installed fsspec-2021.4.0
Looking in links: file:///kaggle/input/coleridge-packages/packages/datasets
Processing /kaggle/input/coleridge-packages/packages/datasets/datasets-1.5.0-py3-none-any.whl
Processing /kaggle/input/coleridge-packages/packages/datasets/huggingface_hub-0.0.7-py3-none-any.whl
Processing /kaggle/input/coleridge-packages/packages/datasets/xxhash-2.0.0-cp37-cp37m-manylinux2010_x86_64.whl
Processing /kaggle/input/coleridge-packages/packages/datasets/tqdm-4.49.0-py2.py3-none-any.whl
Installing collected packages: tqdm, xxhash, huggingface-hub, datasets
  Attempting uninstall: tqdm
    Found existing installation: tqdm 4.59.0
    Uninstalling tqdm-4.59.0:
      Successf

In [3]:
#export
import os, shutil
from pathlib import Path
import itertools
import re
import json
import random
import numpy as np
import pandas as pd
import torch
import transformers, seqeval
from tqdm import tqdm

In [4]:
!cp /kaggle/input/coleridge-packages/my_seqeval.py ./

# Utilities

In [5]:
#export
Path.ls = lambda pth: list(pth.iterdir())

# Data I/O

In [6]:
#export
def load_train_meta(pth, group_id=True):
    df = pd.read_csv(pth)
    if group_id:
        df = df.groupby('Id').agg({'pub_title': 'first', 'dataset_title': '|'.join, 
                                   'dataset_label': '|'.join, 'cleaned_label': '|'.join}).reset_index()
    return df

In [7]:
pth = Path('/kaggle/input/coleridgeinitiative-show-us-the-data/train.csv')
df = load_train_meta(pth, group_id=True)
df_nogroup = load_train_meta(pth, group_id=False)
print(len(df), len(df_nogroup))
dup_ids = df_nogroup[df_nogroup.Id.duplicated()].Id.unique()
print(df[df.Id.isin(dup_ids)].dataset_label.values[-10:])

14316 19661
['Baltimore Longitudinal Study of Aging (BLSA)|Baltimore Longitudinal Study of Aging'
 'Beginning Postsecondary Students Longitudinal Study|Education Longitudinal Study|Beginning Postsecondary Students'
 "ADNI|Alzheimer's Disease Neuroimaging Initiative (ADNI)"
 "ADNI|Alzheimer's Disease Neuroimaging Initiative (ADNI)"
 "ADNI|Alzheimer's Disease Neuroimaging Initiative (ADNI)"
 "ADNI|Alzheimer's Disease Neuroimaging Initiative (ADNI)"
 "ADNI|Alzheimer's Disease Neuroimaging Initiative (ADNI)"
 'Baltimore Longitudinal Study of Aging (BLSA)|Baltimore Longitudinal Study of Aging'
 "ADNI|Alzheimer's Disease Neuroimaging Initiative (ADNI)"
 'Beginning Postsecondary Student|Beginning Postsecondary Students']


In [8]:
#export
def load_papers(dir_json, paper_ids):
    '''
    Load papers into a dictionary.
    
    `papers`: 
        {''}
    '''
    
    papers = {}
    for paper_id in paper_ids:
        with open(f'{dir_json}/{paper_id}.json', 'r') as f:
            paper = json.load(f)
            papers[paper_id] = paper
    return papers

In [9]:
df = pd.read_csv('/kaggle/input/coleridgeinitiative-show-us-the-data/train.csv')
papers = load_papers('/kaggle/input/coleridgeinitiative-show-us-the-data/train/', df.Id.unique()[:10])
print(type(papers))
print(
    papers[ random.choice(list(papers.keys())) ][0]
)

<class 'dict'>
{'section_title': 'Abstract', 'text': "This study examines the wage gender gap of young adults in the 1970s, 1980s, and 2000 in the US. Using quantile regression we estimate the gender gap across the entire wage distribution. We also study the importance of high school characteristics in predicting future labor market performance. We conduct analyses for three major racial/ethnic groups in the US: Whites, Blacks, and Hispanics, employing data from two rich longitudinal studies: NLS and NELS. Our results indicate that while some school characteristics are positive and significant predictors of future wages for Whites, they are less so for the two minority groups. We find significant wage gender disparities favoring men across all three surveys in the 1970s, 1980s, and 2000. The wage gender gap is more pronounced in higher paid jobs (90th quantile) for all groups, indicating the presence of a persistent and alarming ''glass ceiling.'' Ó 2007 Elsevier Inc. All rights reserv

In [10]:
#export
def load_sample_text(jpth):
    sections = json.loads(jpth.read_text())
    text = '\n'.join(section['text'] for section in sections)
    return text

In [11]:
jpths_trn = Path('/kaggle/input/coleridgeinitiative-show-us-the-data/train/').ls()
print(load_sample_text(jpths_trn[0])[:1_000])

The International Standard Classification of Education, known by its acronym ISCED, was developed by the United Nations Educational, Scientific, and Cultural Organization during the late 1960s and 1970s. ISCED was implemented in 1976 and is the recognized international standard for reporting and interpreting education program data. Creating a U.S. crosswalk to this system has been a goal of the National Center for Education Statistics and the Office of Research since the late 197,,s, when the National Institute of Education (the predecessor agency to the Office of Educational Research and Improvement) began exploring the idea. The design and implementation of a workable crosswalk, however, awaited the advent of changes to the Classification of Instructional Programs (CIP) system. The 1990 revision of the CIP system laid the foundation for a workable international crosswalk. Adoption of the National Education Goals set global consciousness and international educational comparisons firml

# Data processing

In [12]:
#export
def clean_training_text(txt, lower=False, total_clean=False):
    """
    similar to the default clean_text function but without lowercasing.
    """
    txt = str(txt).lower() if lower else str(txt)
    txt = re.sub('[^A-Za-z0-9]+', ' ', txt).strip()
    if total_clean:
        txt = re.sub(' +', ' ', txt)
    return txt

In [13]:
print(clean_training_text('@kaggle This competition awards $90,000!!!!.'))
print(clean_training_text('HoPKLd + 7 ! 11,002', total_clean=True, lower=True))

kaggle This competition awards 90 000
hopkld 7 11 002


In [14]:
#export
def shorten_sentences(sentences, max_length=64, overlap=20):
    '''
    Args:
        sentences (list): List of sentences.
        max_length (int): Maximum number of words allowed for each sentence.
        overlap (int): If a sentence exceeds `max_length`, we split it to multiple sentences with 
            this amount of overlapping.
    '''
    short_sentences = []
    for sentence in sentences:
        words = sentence.split()
        if len(words) > max_length:
            for p in range(0, len(words), max_length - overlap):
                short_sentences.append(' '.join(words[p:p+max_length]))
        else:
            short_sentences.append(sentence)
    return short_sentences

In [15]:
jpths_trn = Path('/kaggle/input/coleridgeinitiative-show-us-the-data/train/').ls()
sentences = load_sample_text(jpths_trn[0]).split('.')[:2]
short_sentences = shorten_sentences(sentences, max_length=10, overlap=2)
print('Before:', sentences)
print()
print('After:', short_sentences)

Before: ['The International Standard Classification of Education, known by its acronym ISCED, was developed by the United Nations Educational, Scientific, and Cultural Organization during the late 1960s and 1970s', ' ISCED was implemented in 1976 and is the recognized international standard for reporting and interpreting education program data']

After: ['The International Standard Classification of Education, known by its acronym', 'its acronym ISCED, was developed by the United Nations Educational,', 'Nations Educational, Scientific, and Cultural Organization during the late 1960s', 'late 1960s and 1970s', 'ISCED was implemented in 1976 and is the recognized international', 'recognized international standard for reporting and interpreting education program data', 'program data']


In [16]:
#export
def find_sublist(big_list, small_list):
    all_positions = []
    for i in range(len(big_list) - len(small_list) + 1):
        if small_list == big_list[i:i+len(small_list)]:
            all_positions.append(i)
    
    return all_positions

In [17]:
big_list = ['If', 'the', 'thing', 'above', 'is', 'below', 'that', 'thing', 'which', 'is',
            'not', 'as', 'high', 'up', 'on', 'the', 'thing', 'above', 'when', 'it', 'is', 
            'underneath', 'them.']
small_list = ['the', 'thing', 'above']

find_sublist(big_list, small_list)

[1, 15]

# Named Entity Recognition

In [18]:
#export
def tag_sentence(sentence, labels): 
    '''
    requirement: both sentence and labels are already cleaned
    '''
    sentence_words = sentence.split()
    
    if labels is not None and any(re.findall(f'\\b{label}\\b', sentence)
                                  for label in labels): # positive sample
        nes = ['O'] * len(sentence_words)
        for label in labels:
            label_words = label.split()

            all_pos = find_sublist(sentence_words, label_words)
            for pos in all_pos:
                nes[pos] = 'B'
                for i in range(pos+1, pos+len(label_words)):
                    nes[i] = 'I'

        return True, list(zip(sentence_words, nes))
        
    else: # negative sample
        nes = ['O'] * len(sentence_words)
        return False, list(zip(sentence_words, nes))

In [19]:
sentence = ("The International Standard Classification of Education, known by its acronym ISCED, "
            "was developed by the United Nations Educational, "
            "Scientific, and Cultural Organization during the late 1960s and 1970s")
labels = ['The International', 'Cultural Organization', 'United Nations Educational']

sentence = clean_training_text(sentence)
labels = [clean_training_text(label) for label in labels]
found_any, token_tags = tag_sentence(sentence, labels)

print('A label is found in the sentence:', found_any)
print('(token, tag) pairs:')
print(token_tags)

A label is found in the sentence: True
(token, tag) pairs:
[('The', 'B'), ('International', 'I'), ('Standard', 'O'), ('Classification', 'O'), ('of', 'O'), ('Education', 'O'), ('known', 'O'), ('by', 'O'), ('its', 'O'), ('acronym', 'O'), ('ISCED', 'O'), ('was', 'O'), ('developed', 'O'), ('by', 'O'), ('the', 'O'), ('United', 'B'), ('Nations', 'I'), ('Educational', 'I'), ('Scientific', 'O'), ('and', 'O'), ('Cultural', 'B'), ('Organization', 'I'), ('during', 'O'), ('the', 'O'), ('late', 'O'), ('1960s', 'O'), ('and', 'O'), ('1970s', 'O')]


In [20]:
#export
def get_ner_data(papers, df=None, shuffle=True):
    '''
    Args:
        papers (dict): Like that returned by `load_papers`.
        df (pd.DataFrame): Competition's train.csv or a subset of it.
    '''
    cnt_pos, cnt_neg = 0, 0 
    ner_data = []

    tqdm._instances.clear()
    pbar = tqdm(total=len(df))
    for i, id, dataset_label in df[['Id', 'dataset_label']].itertuples():
        paper = papers[id]

        labels = dataset_label.split('|')
        labels = [clean_training_text(label) for label in labels]

        sentences = set([clean_training_text(sentence) for section in paper 
                     for sentence in section['text'].split('.')])
        sentences = shorten_sentences(sentences) 
        sentences = [sentence for sentence in sentences if len(sentence) > 10] # only accept sentences with length > 10 chars

        # positive sample
        for sentence in sentences:
            is_positive, tags = tag_sentence(sentence, labels)
            if is_positive:
                cnt_pos += 1
                ner_data.append(tags)
            elif any(word in sentence.lower() for word in ['data', 'study']): 
                ner_data.append(tags)
                cnt_neg += 1

        pbar.update(1)
        pbar.set_description(f"Training data size: {cnt_pos} positives + {cnt_neg} negatives")
#         print(f"\rProcessing paper {i:05d} / {len(df)}. Training data size: {cnt_pos} positives + {cnt_neg} negatives", 
#               flush=True, end='')

    if shuffle:
        random.shuffle(ner_data)
    return ner_data

In [21]:
df = pd.read_csv('/kaggle/input/coleridgeinitiative-show-us-the-data/train.csv').iloc[:20]
papers = load_papers('/kaggle/input/coleridgeinitiative-show-us-the-data/train/', df.Id)
ner_data = get_ner_data(papers, df, shuffle=False)
print(ner_data[0])

Training data size: 54 positives + 406 negatives: 100%|██████████| 20/20 [00:00<00:00, 152.20it/s]

[('The', 'O'), ('statistical', 'O'), ('significance', 'O'), ('of', 'O'), ('the', 'O'), ('study', 'O'), ('s', 'O'), ('domain', 'O'), ('average', 'O'), ('was', 'O'), ('determined', 'O'), ('by', 'O'), ('the', 'O'), ('WWC', 'O'), ('the', 'O'), ('study', 'O'), ('is', 'O'), ('characterized', 'O'), ('as', 'O'), ('having', 'O'), ('a', 'O'), ('statistically', 'O'), ('significant', 'O'), ('positive', 'O'), ('effect', 'O'), ('because', 'O'), ('univariate', 'O'), ('statistical', 'O'), ('tests', 'O'), ('are', 'O'), ('reported', 'O'), ('for', 'O'), ('each', 'O'), ('outcome', 'O'), ('measure', 'O'), ('and', 'O'), ('both', 'O'), ('effects', 'O'), ('are', 'O'), ('positive', 'O'), ('and', 'O'), ('statistically', 'O'), ('significant', 'O'), ('accounting', 'O'), ('for', 'O'), ('multiple', 'O'), ('comparisons', 'O')]


In [22]:
#export
def write_ner_json(ner_data, pth=Path('train_ner.json')):
    with open(pth, 'w') as f:
        for row in ner_data:
            words, nes = list(zip(*row))
            row_json = {'tokens' : words, 'tags' : nes}
            json.dump(row_json, f)
            f.write('\n')    

In [23]:
ner_data = [
    [('There', 'O'), ('is', 'O'), ('no', 'O'), ('dataset', 'O'), ('here', 'O')], 
    [('Load', 'O'), ('the', 'O'), ('UN', 'B'), ('Trade', 'I'), ('Development', 'I'), ('into', 'O'), ('view', 'O')]
]
write_ner_json(ner_data, pth=Path('/kaggle/tmp_ner.json'))
! cat /kaggle/tmp_ner.json

{"tokens": ["There", "is", "no", "dataset", "here"], "tags": ["O", "O", "O", "O", "O"]}
{"tokens": ["Load", "the", "UN", "Trade", "Development", "into", "view"], "tags": ["O", "O", "B", "I", "I", "O", "O"]}


In [24]:
# from datasets import load_dataset


# datasets = load_dataset('json', data_files={'train': '/kaggle/tmp_ner.json', 
#                                             'valid': '/kaggle/tmp_ner.json'})

# from transformers import AutoTokenizer

# tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
# assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

# example = datasets['train'][0]

# tokenized_input = tokenizer(example['tokens'], is_split_into_words=True)

# tokenized_input

# print(example['tokens'])
# print(tokenizer.convert_ids_to_tokens(tokenized_input['input_ids']))

# type(tokenizer)

# def tokenize_and_align_labels(examples, label_all_tokens=True):
#     '''
#     Adds a new field called 'labels' that are the NER tags to tokenized input.
    
#     Args:
#         tokenizer (transformers.AutoTokenizer): Tokenizer.
#         examples (datasets.arrow_dataset.Dataset): Dataset.
#         label_all_tokens (bool): If True, all sub-tokens are given the same tag as the 
#             first sub-token, otherwise all but the first sub-token are given the tag
#             -100.
#     '''
#     tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
#     labels = []
#     for i, label in enumerate(examples["tags"]):
#         word_ids = tokenized_inputs.word_ids(batch_index=i)
#         previous_word_idx = None
#         label_ids = []
#         for word_idx in word_ids:
#             # Special tokens have a word id that is None. We set the label to -100 so they are automatically
#             # ignored in the loss function.
#             if word_idx is None:
#                 label_ids.append(-100)
#             # We set the label for the first token of each word.
#             elif word_idx != previous_word_idx:
#                 label_ids.append(label[word_idx])
#             # For the other tokens in a word, we set the label to either the current label or -100, depending on
#             # the label_all_tokens flag.
#             else:
#                 label_ids.append(label[word_idx] if label_all_tokens else -100)
#             previous_word_idx = word_idx

#         labels.append(label_ids)

#     tokenized_inputs["labels"] = labels
#     return tokenized_inputs

# examples = datasets['train']
# tokenize_and_align_labels(examples, label_all_tokens=True)

# datasets.Features

# datasets['train'].features['tags']

# from functools import partial

# tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True)

In [25]:
def kaggle_run_ner(model_name_or_path='bert-base-cased', 
                   train_file='./train_ner.json', validation_file='./train_ner.json',
                   num_train_epochs=1, per_device_train_batch_size=8, per_device_eval_batch_size=8,
                   save_steps=15000, output_dir='./output', report_to='none', seed=123):
    !python ../input/kaggle-ner-utils/kaggle_run_ner.py \
    --model_name_or_path {model_name_or_path} \
    --train_file {train_file} \
    --validation_file {validation_file} \
    --num_train_epochs {num_train_epochs} \
    --per_device_train_batch_size {per_device_train_batch_size} \
    --per_device_eval_batch_size {per_device_eval_batch_size} \
    --save_steps {save_steps} \
    --output_dir {output_dir} \
    --report_to {report_to} \
    --seed {seed} \
    --do_train 

In [26]:
df = pd.read_csv('/kaggle/input/coleridgeinitiative-show-us-the-data/train.csv').iloc[:2]
papers = load_papers('/kaggle/input/coleridgeinitiative-show-us-the-data/train', df.Id)
ner_data = get_ner_data(papers, df)
write_ner_json(ner_data, pth=Path('./train_ner.json'))
kaggle_run_ner(save_steps=16)

Training data size: 5 positives + 26 negatives: 100%|██████████| 2/2 [00:00<00:00, 202.50it/s]


Downloading and preparing dataset json/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /root/.cache/huggingface/datasets/json/default-a8513ebe8fafebcc/0.0.0/83d5b3a2f62630efc6b5315f00f20209b4ad91a00ac586597caee3a4da0bef02...
Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-a8513ebe8fafebcc/0.0.0/83d5b3a2f62630efc6b5315f00f20209b4ad91a00ac586597caee3a4da0bef02. Subsequent calls will reuse this data.
[INFO|file_utils.py:1402] 2021-05-17 08:28:40,417 >> https://huggingface.co/bert-base-cased/resolve/main/config.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmp1yu7n1ty
Downloading: 100%|██████████████████████████████| 570/570 [00:00<00:00, 484kB/s]
[INFO|file_utils.py:1406] 2021-05-17 08:28:40,689 >> storing https://huggingface.co/bert-base-cased/resolve/main/config.json in cache at /root/.cache/huggingface/transformers/a803

# Literal matching

In [27]:
#export
def create_knowledge_bank(pth):
    '''
    Args:
        pth (str): Path to meta data like 'train.csv', which
        needs to have columns: 'dataset_title', 'dataset_label', and 'cleaned_label'.
        
    Returns:
        all_labels (set): All possible strings associated with a dataset from the meta data.
    '''
    df = load_train_meta(pth, group_id=False)
    all_labels = set()
    for label_1, label_2, label_3 in df[['dataset_title', 'dataset_label', 'cleaned_label']].itertuples(index=False):
        all_labels.add(str(label_1).lower())
        all_labels.add(str(label_2).lower())
        all_labels.add(str(label_3).lower())
    return all_labels

In [28]:
pth = Path('/kaggle/input/coleridgeinitiative-show-us-the-data/train.csv')
all_labels = create_knowledge_bank(pth)
print(len(all_labels))
print(sorted(all_labels)[:10])

180
['2019 ncov complete genome sequences', '2019 ncov genome sequence', '2019 ncov genome sequences', '2019-ncov complete genome sequences', '2019-ncov genome sequence', '2019-ncov genome sequences', 'adni', 'advanced national seismic system (anss) comprehensive catalog (comcat)', 'advanced national seismic system anss comprehensive catalog comcat ', 'advanced national seismic system comprehensive catalog']


In [29]:
#export
def literal_match(paper, all_labels):
    '''
    Args:
        paper ()
    '''
    text_1 = '. '.join(section['text'] for section in paper).lower()
    text_2 = clean_training_text(text_1, lower=True, total_clean=True)
    
    labels = set()
    for label in all_labels:
        if label in text_1 or label in text_2:
            labels.add(clean_training_text(label, lower=True, total_clean=True))
    return labels

In [30]:
sample_submission = pd.read_csv('/kaggle/input/coleridgeinitiative-show-us-the-data/sample_submission.csv')
papers = load_papers('/kaggle/input/coleridgeinitiative-show-us-the-data/test/', sample_submission.Id)

pth = Path('/kaggle/input/coleridgeinitiative-show-us-the-data/train.csv')
all_labels = create_knowledge_bank(pth)

literal_preds = []
for paper_id in sample_submission.Id:
    paper = papers[paper_id]
    literal_preds.append('|'.join(literal_match(paper, all_labels)))
    
literal_preds

['adni|alzheimer s disease neuroimaging initiative adni',
 'nces common core of data|common core of data|trends in international mathematics and science study',
 'noaa storm surge inundation|slosh model|sea lake and overland surges from hurricanes',
 'rural urban continuum codes']

# Bert model prediction

In [31]:
#export
def get_ner_inference_data(papers, sample_submission):
    '''
    Args:
        papers (dict): Each list in this dictionary consists of the section of a paper.
        sample_submission (pd.DataFrame): Competition 'sample_submission.csv'.
    Returns:
        test_rows (list): Each dict in this list is of the form: 
            {'tokens': ['goat', 'win', ...], 'tags': ['O', 'O', ...]}
            and represents a sentence.  
        paper_length (list): Number of sentences in each paper.
    '''
    test_rows = [] # test data in NER format
    paper_length = [] # store the number of sentences each paper has

    for paper_id in sample_submission['Id']:
        # load paper
        paper = papers[paper_id]

        # extract sentences
        sentences = [clean_training_text(sentence) for section in paper 
                     for sentence in section['text'].split('.')
                    ]
        sentences = shorten_sentences(sentences) # make sentences short
        sentences = [sentence for sentence in sentences if len(sentence) > 10] # only accept sentences with length > 10 chars
        sentences = [sentence for sentence in sentences if any(word in sentence.lower() for word in ['data', 'study'])]

        # collect all sentences in json
        for sentence in sentences:
            sentence_words = sentence.split()
            dummy_tags = ['O']*len(sentence_words)
            test_rows.append({'tokens' : sentence_words, 'tags' : dummy_tags})

        # track which sentence belongs to which data point
        paper_length.append(len(sentences))

    print(f'total number of sentences: {len(test_rows)}')
    return test_rows, paper_length

In [32]:
sample_submission = pd.read_csv('/kaggle/input/coleridgeinitiative-show-us-the-data/sample_submission.csv')
papers = load_papers('/kaggle/input/coleridgeinitiative-show-us-the-data/test', sample_submission.Id)
test_rows, paper_length = get_ner_inference_data(papers, sample_submission)
print(test_rows[:3])
print(paper_length)

total number of sentences: 367
[{'tokens': ['A', 'recent', 'large', 'genomewide', 'association', 'study', 'GWAS', 'reported', 'a', 'genome', 'wide', 'significant', 'locus', 'for', 'years', 'of', 'education', 'which', 'subsequently', 'demonstrated', 'association', 'to', 'general', 'cognitive', 'ability', 'g', 'in', 'overlapping', 'cohorts'], 'tags': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']}, {'tokens': ['The', 'current', 'study', 'was', 'designed', 'to', 'test', 'whether', 'GWAS', 'hits', 'for', 'educational', 'attainment', 'are', 'involved', 'in', 'general', 'cognitive', 'ability', 'in', 'an', 'independent', 'large', 'scale', 'collection', 'of', 'cohorts'], 'tags': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']}, {'tokens': ['We', 'next', 'conducted', 'meta', 'analyses', 'with', '24', '189', 'in

In [33]:

def kaggle_run_ner_predict(model_name_or_path='/kaggle/input/coleridge-bert-models/output', 
                           train_file='/kaggle/input/coleridge-bert-models/train_ner.json', 
                           validation_file='/kaggle/input/coleridge-bert-models/train_ner.json', 
                           test_file='./input_data/test_ner_input.json', 
                           output_dir='./pred'):
    '''
    Args:
        test_file (Path, str): Path to json file in which each row represents an input
            sample to the model (representing a sentence in this context).  Each row
            is a dictionary of the form:
            {'tokens': ['hi', 'there', ...], 'tags': ['O', 'O', ...]}
        output_dir (Path, str): Path to the directory in which prediction results are saved.
    '''
    os.environ["MODEL_PATH"] = f"{model_name_or_path}"
    os.environ["TRAIN_FILE"] = f"{train_file}"
    os.environ["VALIDATION_FILE"] = f"{validation_file}"
    os.environ["TEST_FILE"] = f"{test_file}"
    os.environ["OUTPUT_DIR"] = f"{output_dir}"
    
    ! python /kaggle/input/kaggle-ner-utils/kaggle_run_ner.py \
    --model_name_or_path "$MODEL_PATH" \
    --validation_file "$VALIDATION_FILE" \
    --train_file "$TRAIN_FILE" \
    --test_file "$TEST_FILE" \
    --output_dir "$OUTPUT_DIR" \
    --report_to 'none' \
    --seed 123 \
    --do_predict

def run_inference(test_rows, predict_batch=64_000, 
                  model_name_or_path='/kaggle/input/coleridge-bert-models/output', 
                  train_file='/kaggle/input/coleridge-bert-models/train_ner.json', 
                  validation_file='/kaggle/input/coleridge-bert-models/train_ner.json', 
                  test_file='./input_data/test_ner_input.json', 
                  output_dir='./pred'):
    '''
    '''
    test_file = Path(test_file)
    test_file.parent.mkdir(exist_ok=True, parents=True)
    
    bert_outputs = []
    for batch_begin in range(0, len(test_rows), predict_batch):
        # write data rows to input file
        with open(test_file, 'w') as f:
            for row in test_rows[batch_begin:batch_begin + predict_batch]:
                json.dump(row, f)
                f.write('\n')

        # remove output dir
        if os.path.exists(output_dir):
            shutil.rmtree(output_dir)

        # do predict
        kaggle_run_ner_predict(
            model_name_or_path=model_name_or_path, 
            train_file=train_file, validation_file=validation_file, test_file=test_file, 
            output_dir=output_dir)

        # read predictions
        with open(f'{output_dir}/test_predictions.txt') as f:
            this_preds = f.read().split('\n')[:-1]
            bert_outputs += [pred.split() for pred in this_preds]
    return bert_outputs

In [34]:
predict_batch = 64_000 

model_name_or_path = '/kaggle/working/output/' #'/kaggle/input/coleridge-bert-models/output'
test_file = './input_data/test_ner_input.json'
train_file = 'train_ner.json' #'/kaggle/input/coleridge-bert-models/train_ner.json'
validation_file = 'train_ner.json' #'/kaggle/input/coleridge-bert-models/train_ner.json'
output_dir = './pred'

bert_outputs = run_inference(test_rows, predict_batch=predict_batch, 
                             model_name_or_path=model_name_or_path, 
                             test_file=test_file, train_file=train_file, validation_file=validation_file,
                             output_dir=output_dir)

Downloading and preparing dataset json/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /root/.cache/huggingface/datasets/json/default-6a3f3cd2a23edb39/0.0.0/83d5b3a2f62630efc6b5315f00f20209b4ad91a00ac586597caee3a4da0bef02...
Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-6a3f3cd2a23edb39/0.0.0/83d5b3a2f62630efc6b5315f00f20209b4ad91a00ac586597caee3a4da0bef02. Subsequent calls will reuse this data.
[INFO|configuration_utils.py:470] 2021-05-17 08:29:24,935 >> loading configuration file /kaggle/working/output/config.json
[INFO|configuration_utils.py:508] 2021-05-17 08:29:24,935 >> Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": "ner",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768

In [35]:
! ls {output_dir}/test_predictions.txt

./pred/test_predictions.txt


In [36]:
#export
def get_bert_dataset_labels(test_rows, paper_length, bert_outputs):
    '''
    Returns:
        bert_dataset_labels (list): Each element is a set consisting of labels predicted
            by the model.
    '''
    test_sentences = [row['tokens'] for row in test_rows]
    
    bert_dataset_labels = [] # store all dataset labels for each publication

    for length in paper_length:
        labels = set()
        for sentence, pred in zip(test_sentences[:length], bert_outputs[:length]):
            curr_phrase = ''
            for word, tag in zip(sentence, pred):
                if tag == 'B': # start a new phrase
                    if curr_phrase:
                        labels.add(curr_phrase)
                        curr_phrase = ''
                    curr_phrase = word
                elif tag == 'I' and curr_phrase: # continue the phrase
                    curr_phrase += ' ' + word
                else: # end last phrase (if any)
                    if curr_phrase:
                        labels.add(curr_phrase)
                        curr_phrase = ''
            # check if the label is the suffix of the sentence
            if curr_phrase:
                labels.add(curr_phrase)
                curr_phrase = ''

        # record dataset labels for this publication
        bert_dataset_labels.append(labels)

        del test_sentences[:length], bert_outputs[:length]
        
    return bert_dataset_labels

In [37]:
sentences = ['They do not present all the features', 
             'Despite the pretraining on the Tigers EcoNAX dataset',
             'Weirdly there has been lots of studies based on WGS Equality Definitiveness Dataset']
paper_length = [2, 1]
test_rows = [{'tokens': sentence.split(), 'tags': len(sentence.split()) * ['O']} 
             for sentence in sentences]
bert_outputs = [['O', 'O', 'O', 'B', 'I', 'I', 'O'],
                ['O', 'O', 'O', 'O', 'O', 'B', 'I', 'I'],
                ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'I', 'I', 'I']]

for i, row in enumerate(test_rows):
    assert len(row['tokens']) == len(row['tags']) == len(bert_outputs[i])

bert_dataset_labels = get_bert_dataset_labels(test_rows, paper_length, bert_outputs)

In [38]:
bert_dataset_labels

[{'Tigers EcoNAX dataset', 'present all the'},
 {'WGS Equality Definitiveness Dataset'}]

In [39]:
#export
def jaccard_similarity(s1, s2):
    l1 = set(s1.split(" "))
    l2 = set(s2.split(" "))
    intersection = len(list(l1.intersection(l2)))
    union = (len(l1) + len(l2)) - intersection
    return float(intersection) / union

In [40]:
jaccard_similarity('USGS Frog Counts Data', 'USGA Croc Counts Data') == 1 / 3

True

In [41]:
#export
def filter_bert_labels(bert_dataset_labels):
    '''
    When several labels for a paper are too similar, keep just one of them.
    '''
    filtered_bert_labels = []

    for labels in bert_dataset_labels:
        filtered = []

        for label in sorted(labels, key=len):
            label = clean_training_text(label, lower=True)
            if len(filtered) == 0 or all(jaccard_similarity(label, got_label) < 0.75 for got_label in filtered):
                filtered.append(label)

        filtered_bert_labels.append('|'.join(filtered))
    return filtered_bert_labels

In [42]:
bert_dataset_labels = [{'moma artists catalogue', 'moma artists', 'housing market'},
                       {'rhs flowers fertiliser index', 'deep sea rock salts', 'rhs fertiliser index'}]

filter_bert_labels(bert_dataset_labels)

['moma artists|housing market|moma artists catalogue',
 'deep sea rock salts|rhs fertiliser index']

# Overall prediction for submission

In [43]:
#export
def combine_matching_and_bert(literal_preds, filtererd_bert_labels):
    final_predictions = []
    for literal_match, bert_pred in zip(literal_preds, filtered_bert_labels):
        if literal_match:
            final_predictions.append(literal_match)
        else:
            final_predictions.append(bert_pred)
    return final_predictions

In [44]:
literal_preds = ['mongolian racing cars|reallife headphones', '']
filtered_bert_labels = ['data|dataset', 'hifi dataset|headphones collection data']
combine_matching_and_bert(literal_preds, filtered_bert_labels)

['mongolian racing cars|reallife headphones',
 'hifi dataset|headphones collection data']

# Reference
- https://www.kaggle.com/tungmphung/pytorch-bert-for-named-entity-recognition/notebook
- https://www.kaggle.com/tungmphung/coleridge-matching-bert-ner/notebook
- https://github.com/huggingface/notebooks/blob/master/examples/token_classification.ipynb