# showus

In [None]:
#default_exp showus

In [None]:
! pip install /kaggle/input/nlp-packages/datasets/datasets/fsspec-2021.4.0-py3-none-any.whl
! pip install datasets --no-index --find-links=file:///kaggle/input/coleridge-packages/packages/datasets
! pip install ../input/coleridge-packages/seqeval-1.2.2-py3-none-any.whl
! pip install ../input/coleridge-packages/tokenizers-0.10.1-cp37-cp37m-manylinux1_x86_64.whl
! pip install ../input/coleridge-packages/transformers-4.5.0.dev0-py3-none-any.whl

Processing /kaggle/input/nlp-packages/datasets/datasets/fsspec-2021.4.0-py3-none-any.whl
Installing collected packages: fsspec
  Attempting uninstall: fsspec
    Found existing installation: fsspec 0.8.7
    Uninstalling fsspec-0.8.7:
      Successfully uninstalled fsspec-0.8.7
Successfully installed fsspec-2021.4.0
Looking in links: file:///kaggle/input/coleridge-packages/packages/datasets
Processing /kaggle/input/coleridge-packages/packages/datasets/datasets-1.5.0-py3-none-any.whl
Processing /kaggle/input/coleridge-packages/packages/datasets/huggingface_hub-0.0.7-py3-none-any.whl
Processing /kaggle/input/coleridge-packages/packages/datasets/xxhash-2.0.0-cp37-cp37m-manylinux2010_x86_64.whl
Processing /kaggle/input/coleridge-packages/packages/datasets/tqdm-4.49.0-py2.py3-none-any.whl
Installing collected packages: tqdm, xxhash, huggingface-hub, datasets
  Attempting uninstall: tqdm
    Found existing installation: tqdm 4.59.0
    Uninstalling tqdm-4.59.0:
      Successf

In [None]:
#export
import os, shutil
from tqdm import tqdm
from pathlib import Path
import itertools
from functools import partial
import re
import json
import random
import numpy as np
import pandas as pd
import torch
import transformers, seqeval
from transformers import AutoTokenizer, DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification
from transformers import TrainingArguments, Trainer
from datasets import load_dataset, ClassLabel, load_metric

# Utilities

In [None]:
#export
# Convenience patch: `some_dir.ls()` returns the entries of a directory as a list.
Path.ls = lambda pth: [entry for entry in pth.iterdir()]

# Data I/O

In [None]:
#export
def load_train_meta(pth, group_id=True):
    '''
    Read the training metadata CSV.

    Args:
        pth (str or Path): Location of the CSV file.
        group_id (bool): If True, collapse the rows that share an `Id` into one row:
            `pub_title` keeps its first value while `dataset_title`, `dataset_label`
            and `cleaned_label` are concatenated with '|'.
    Returns:
        pd.DataFrame: The raw table, or the per-`Id` table when `group_id` is True.
    '''
    meta = pd.read_csv(pth)
    if not group_id:
        return meta

    join_with_pipe = '|'.join
    aggregations = {
        'pub_title': 'first',
        'dataset_title': join_with_pipe,
        'dataset_label': join_with_pipe,
        'cleaned_label': join_with_pipe,
    }
    return meta.groupby('Id').agg(aggregations).reset_index()

In [None]:
# Compare the grouped (one row per paper Id) and ungrouped views of train.csv.
pth = Path('/kaggle/input/coleridgeinitiative-show-us-the-data/train.csv')
df = load_train_meta(pth, group_id=True)
df_nogroup = load_train_meta(pth, group_id=False)
print(len(df), len(df_nogroup))
# Ids that occur on more than one row of the ungrouped table.
dup_ids = df_nogroup[df_nogroup.Id.duplicated()].Id.unique()
# For the last 10 such papers, show the '|'-joined labels produced by the grouping.
print(df[df.Id.isin(dup_ids)].dataset_label.values[-10:])

14316 19661
['Baltimore Longitudinal Study of Aging (BLSA)|Baltimore Longitudinal Study of Aging'
 'Beginning Postsecondary Students Longitudinal Study|Education Longitudinal Study|Beginning Postsecondary Students'
 "ADNI|Alzheimer's Disease Neuroimaging Initiative (ADNI)"
 "ADNI|Alzheimer's Disease Neuroimaging Initiative (ADNI)"
 "ADNI|Alzheimer's Disease Neuroimaging Initiative (ADNI)"
 "ADNI|Alzheimer's Disease Neuroimaging Initiative (ADNI)"
 "ADNI|Alzheimer's Disease Neuroimaging Initiative (ADNI)"
 'Baltimore Longitudinal Study of Aging (BLSA)|Baltimore Longitudinal Study of Aging'
 "ADNI|Alzheimer's Disease Neuroimaging Initiative (ADNI)"
 'Beginning Postsecondary Student|Beginning Postsecondary Students']


In [None]:
#export
def load_papers(dir_json, paper_ids):
    '''
    Load the JSON files of the requested papers into a dictionary.

    Args:
        dir_json (str or Path): Directory holding one `<paper_id>.json` file per paper.
        paper_ids (iterable): Ids of the papers to load.

    Returns:
        papers (dict): `{paper_id: parsed JSON content}`, in the order of `paper_ids`.
            In the competition data each paper is a list of sections of the form
            `{'section_title': ..., 'text': ...}`.

    Raises:
        FileNotFoundError: If a paper id has no matching JSON file in `dir_json`.
    '''
    dir_json = Path(dir_json)
    papers = {}
    for paper_id in paper_ids:
        # Read as UTF-8 explicitly (the standard JSON encoding) so the result
        # does not depend on the platform's default locale encoding.
        with open(dir_json / f'{paper_id}.json', 'r', encoding='utf-8') as f:
            papers[paper_id] = json.load(f)
    return papers

In [None]:
# Load the papers referenced by the last 10 rows of the grouped metadata.
df = load_train_meta('/kaggle/input/coleridgeinitiative-show-us-the-data/train.csv', group_id=True).iloc[-10:]
papers = load_papers('/kaggle/input/coleridgeinitiative-show-us-the-data/train/', df.Id)
print(type(papers))
# Show the first section of a randomly picked paper. No seed is set here, so the
# paper shown can change from run to run.
print(
    papers[ np.random.choice(df.Id.values) ][0]
)

<class 'dict'>
{'section_title': 'Abstract', 'text': "Water-quality data for nitrate, fecalindicator bacteria, pesticides, and volatile organic compounds collected in parts of Middle Tennessee and northern Alabama indicate that the Mississippian carbonate aquifer in these areas is susceptible to contamination from point and nonpoint sources. Thirty randomly located wells (predominantly domestic), two springs, and two additional publicsupply wells were sampled in the summer of 1999 as part of the U.S. Geological Survey's National Water-Quality Assessment (NAWQA) Program."}


In [None]:
#export
def load_sample_text(jpth):
    '''
    Read one paper's JSON file and return its full text.

    Args:
        jpth (str or Path): Path to a paper JSON file, i.e. a list of sections
            that each carry a 'text' entry.

    Returns:
        text (str): The 'text' of every section, joined with newlines.
    '''
    # `Path(...)` lets callers pass plain strings too; UTF-8 is given explicitly so
    # the result does not depend on the platform's default locale encoding.
    sections = json.loads(Path(jpth).read_text(encoding='utf-8'))
    text = '\n'.join(section['text'] for section in sections)
    return text

In [None]:
# Print the first 1,000 characters of the first training file returned by `ls()`.
jpths_trn = Path('/kaggle/input/coleridgeinitiative-show-us-the-data/train/').ls()
print(load_sample_text(jpths_trn[0])[:1_000])

The International Standard Classification of Education, known by its acronym ISCED, was developed by the United Nations Educational, Scientific, and Cultural Organization during the late 1960s and 1970s. ISCED was implemented in 1976 and is the recognized international standard for reporting and interpreting education program data. Creating a U.S. crosswalk to this system has been a goal of the National Center for Education Statistics and the Office of Research since the late 197,,s, when the National Institute of Education (the predecessor agency to the Office of Educational Research and Improvement) began exploring the idea. The design and implementation of a workable crosswalk, however, awaited the advent of changes to the Classification of Instructional Programs (CIP) system. The 1990 revision of the CIP system laid the foundation for a workable international crosswalk. Adoption of the National Education Goals set global consciousness and international educational comparisons firml

# Data processing

In [None]:
#export
def clean_training_text(txt, lower=False, total_clean=False):
    """
    Normalise text to ASCII letters and digits separated by single spaces.

    Args:
        txt: Value to clean; it is converted with `str()` first.
        lower (bool): If True, lowercase the text before cleaning.
        total_clean (bool): If True, additionally collapse runs of spaces.

    Returns:
        str: The text with every run of characters outside A-Z, a-z, 0-9 replaced
            by one space and with leading/trailing whitespace removed.
    """
    text = str(txt)
    if lower:
        text = text.lower()
    text = re.sub('[^A-Za-z0-9]+', ' ', text).strip()
    return re.sub(' +', ' ', text) if total_clean else text

In [None]:
# Default call: punctuation and symbols become single spaces, case is kept.
print(clean_training_text('@kaggle This competition awards $90,000!!!!.'))
# Same cleaning with lowercasing switched on.
print(clean_training_text('HoPKLd + 7 ! 11,002', total_clean=True, lower=True))

kaggle This competition awards 90 000
hopkld 7 11 002


In [None]:
#export
def shorten_sentences(sentences, max_length=64, overlap=20):
    '''
    Split over-long sentences into overlapping windows of words.

    Args:
        sentences (iterable): Sentences (strings).
        max_length (int): Maximum number of words allowed for each sentence.
        overlap (int): If a sentence exceeds `max_length`, we split it to multiple sentences with 
            this amount of overlapping. Must be smaller than `max_length`.

    Returns:
        short_sentences (list): Sentences of at most `max_length` words. Sentences that
            already fit are passed through unchanged; longer ones are replaced by
            their windows, each rebuilt by joining its words with single spaces.

    Raises:
        ValueError: If `overlap >= max_length`. The window could then never advance
            (long sentences used to be dropped silently or fail inside `range`).
    '''
    if overlap >= max_length:
        raise ValueError(
            f'`overlap` ({overlap}) must be smaller than `max_length` ({max_length}).')
    stride = max_length - overlap

    short_sentences = []
    for sentence in sentences:
        words = sentence.split()
        if len(words) <= max_length:
            short_sentences.append(sentence)
            continue

        for start in range(0, len(words), stride):
            short_sentences.append(' '.join(words[start:start + max_length]))
            # This window already reaches the last word. A further window would
            # only repeat words from its overlap region, so stop here.
            if start + max_length >= len(words):
                break
    return short_sentences

In [None]:
# Take the first two '.'-delimited sentences of one training paper ...
jpths_trn = Path('/kaggle/input/coleridgeinitiative-show-us-the-data/train/').ls()
sentences = load_sample_text(jpths_trn[0]).split('.')[:2]
# ... and cut them into windows of at most 10 words that share 2 words.
short_sentences = shorten_sentences(sentences, max_length=10, overlap=2)
print('Before:', sentences)
print()
print('After:', short_sentences)

Before: ['The International Standard Classification of Education, known by its acronym ISCED, was developed by the United Nations Educational, Scientific, and Cultural Organization during the late 1960s and 1970s', ' ISCED was implemented in 1976 and is the recognized international standard for reporting and interpreting education program data']

After: ['The International Standard Classification of Education, known by its acronym', 'its acronym ISCED, was developed by the United Nations Educational,', 'Nations Educational, Scientific, and Cultural Organization during the late 1960s', 'late 1960s and 1970s', 'ISCED was implemented in 1976 and is the recognized international', 'recognized international standard for reporting and interpreting education program data', 'program data']


In [None]:
#export
def find_sublist(big_list, small_list):
    '''
    Locate every occurrence of `small_list` as a contiguous run inside `big_list`.

    Args:
        big_list (list): Sequence to search in.
        small_list (list): Sequence to look for.

    Returns:
        list: Start indices of all matches, in increasing order
            (overlapping matches are all reported).
    '''
    width = len(small_list)
    last_start = len(big_list) - width
    return [start for start in range(last_start + 1)
            if small_list == big_list[start:start + width]]

In [None]:
# 'the thing above' occurs twice in the word list, starting at indices 1 and 15.
big_list = ['If', 'the', 'thing', 'above', 'is', 'below', 'that', 'thing', 'which', 'is',
            'not', 'as', 'high', 'up', 'on', 'the', 'thing', 'above', 'when', 'it', 'is', 
            'underneath', 'them.']
small_list = ['the', 'thing', 'above']

find_sublist(big_list, small_list)

[1, 15]

# Named Entity Recognition

In [None]:
#export
def get_ner_classlabel():
    '''
    Build the tag vocabulary used for dataset-mention NER.

    The position of a name in the list is its integer id:
        0 -> 'O': Token not part of a phrase that mentions a dataset.
        1 -> 'I': Intermediate token of a phrase mentioning a dataset.
        2 -> 'B': First token of a phrase mentioning a dataset.
    '''
    tag_names = ['O', 'I', 'B']
    return ClassLabel(names=tag_names)

In [None]:
# Round-trip between tag names and their integer ids (single values and lists).
classlabel = get_ner_classlabel()
print(classlabel)
print(classlabel.str2int(['I', 'O', 'B']), classlabel.str2int('I'))
print(classlabel.int2str(2), classlabel.int2str([2, 1, 0]))

ClassLabel(num_classes=3, names=['O', 'I', 'B'], names_file=None, id=None)
[1, 0, 2] 1
B ['B', 'I', 'O']


In [None]:
#export
def tag_sentence(sentence, labels, classlabel=None): 
    '''
    Tag every word of `sentence` with an 'O' / 'B' / 'I' class id.

    requirement: both sentence and labels are already cleaned

    Args:
        sentence (str): Cleaned sentence; it is split on whitespace into words.
        labels (list or None): Cleaned dataset labels to look for. Labels without
            any word (e.g. '') are ignored.
        classlabel: Object whose `str2int` maps 'O', 'I' and 'B' to integer ids.
            Defaults to `get_ner_classlabel()`.

    Returns:
        is_positive (bool): True if at least one label occurs in the sentence.
        token_tags (list): `(word, tag_id)` pairs, one per word of the sentence.

    Longer labels are tagged first, and an occurrence that overlaps an already
    tagged span is skipped. The result therefore does not depend on the order of
    `labels`, and a label nested inside a longer one (e.g. 'ADNI' at the end of the
    full ADNI name) cannot put a 'B' in the middle of the longer span.
    '''
    if classlabel is None:
        classlabel = get_ner_classlabel()
    tag_o, tag_i, tag_b = (classlabel.str2int(name) for name in ('O', 'I', 'B'))

    sentence_words = sentence.split()
    nes = [tag_o] * len(sentence_words)

    # A label with no words would match at every word boundary (marking every
    # sentence positive) and index past the end of `nes`, so drop such labels.
    labels = [label for label in (labels or []) if label.split()]

    # `re.escape` keeps a label containing regex metacharacters from being
    # interpreted as a pattern.
    is_positive = any(re.search(rf'\b{re.escape(label)}\b', sentence)
                      for label in labels)
    if not is_positive: # negative sample
        return False, list(zip(sentence_words, nes))

    # Longest label first; ties are broken alphabetically to stay deterministic.
    ordered_labels = sorted(set(labels), key=lambda label: (-len(label.split()), label))
    for label in ordered_labels:
        label_words = label.split()
        n_words = len(label_words)
        for pos in range(len(sentence_words) - n_words + 1):
            span = slice(pos, pos + n_words)
            if sentence_words[span] != label_words:
                continue
            if any(tag != tag_o for tag in nes[span]):
                continue # overlaps a span that is already tagged
            nes[span] = [tag_b] + [tag_i] * (n_words - 1)

    return True, list(zip(sentence_words, nes))

In [None]:
sentence = ("The International Standard Classification of Education, known by its acronym ISCED, "
            "was developed by the United Nations Educational, "
            "Scientific, and Cultural Organization during the late 1960s and 1970s")
labels = ['The International', 'Cultural Organization', 'United Nations Educational']

# `tag_sentence` expects cleaned input, so clean the sentence and the labels the same way.
sentence = clean_training_text(sentence)
labels = [clean_training_text(label) for label in labels]
classlabel = get_ner_classlabel()
found_any, token_tags = tag_sentence(sentence, labels, classlabel=classlabel)

print('A label is found in the sentence:', found_any)
print('(token, tag) pairs:')
print(token_tags)

A label is found in the sentence: True
(token, tag) pairs:
[('The', 2), ('International', 1), ('Standard', 0), ('Classification', 0), ('of', 0), ('Education', 0), ('known', 0), ('by', 0), ('its', 0), ('acronym', 0), ('ISCED', 0), ('was', 0), ('developed', 0), ('by', 0), ('the', 0), ('United', 2), ('Nations', 1), ('Educational', 1), ('Scientific', 0), ('and', 0), ('Cultural', 2), ('Organization', 1), ('during', 0), ('the', 0), ('late', 0), ('1960s', 0), ('and', 0), ('1970s', 0)]


In [None]:
#export
def extract_sentences(paper, sentence_definition='sentence'):
    '''
    Split a paper into cleaned, de-duplicated text samples.

    Args:
        paper (list): Sections of the form {'section_title': ..., 'text': ...}.
            Sections whose 'text' is empty are ignored.
        sentence_definition (str): What counts as one sample.
            'sentence': every '.'-delimited piece of a section's text.
            'section': a whole section, i.e. its title and text together.

    Returns:
        sentences (set): Samples after `clean_training_text`. Being a set, duplicates
            are merged and the iteration order is not defined.

    Raises:
        ValueError: If `sentence_definition` is neither 'sentence' nor 'section'.
    '''
    # Filter before touching the text, so an empty or missing text never reaches `.split`.
    sections = [sec for sec in paper if sec['text']]

    if sentence_definition == 'sentence':
        return set(clean_training_text(sentence)
                   for sec in sections for sentence in sec['text'].split('.'))
    if sentence_definition == 'section':
        return set(clean_training_text(sec['section_title'] + '\n' + sec['text'])
                   for sec in sections)
    raise ValueError(
        f"Unknown sentence_definition {sentence_definition!r}; expected 'sentence' or 'section'.")

In [None]:
# Extract samples from one paper under both sentence definitions.
df = load_train_meta('/kaggle/input/coleridgeinitiative-show-us-the-data/train.csv', group_id=True).iloc[100:110]
papers = load_papers('/kaggle/input/coleridgeinitiative-show-us-the-data/train', df.Id)
paper = papers[df.Id.iloc[3]]
print('..... Sentence definition = normal sentence')
sentences = extract_sentences(paper, sentence_definition='sentence')
print(len(sentences), list(sentences)[:2], end='\n\n')
print('..... Sentence definition = paper section')
sentences = extract_sentences(paper, sentence_definition='section')
# NOTE(review): `[:1_000]` slices the 2-element list again, not the strings, so the
# section texts are printed in full. Truncating the text was presumably intended.
print(len(sentences), list(sentences)[:2][:1_000], end='\n\n')

..... Sentence definition = normal sentence
302 ['', '7 percent so ignoring statistical significance halving the distance to the nearest charter would bring about an increase of just less than ten percent of the average achievement gain']

..... Sentence definition = paper section
12 ['IV panel models This elasticity might seem small at first blush but recall that it is conditional on the previous year performance composite The average performance composite gain in 1999 2000 for example is 1 11 points 1 7 percent so ignoring statistical significance halving the distance to the nearest charter would bring about an increase of just less than ten percent of the average achievement gain Models 2 through 6 control for charter school competition using indicators for whether a charter school was operating within a given distance In all of these models charter school competition raises the performance composite of the traditional school The effect is significant at standard significance levels

In [None]:
#export
def get_paper_ner_data(paper, labels, classlabel=None,
                       sentence_definition='sentence', max_length=64, overlap=20):
    '''
    Build the NER samples of one paper.

    Args:
        paper (list): Sections of the paper, as accepted by `extract_sentences`.
        labels (list): Raw dataset labels of the paper; they are cleaned here.
        classlabel: Tag vocabulary handed to `tag_sentence`.
        sentence_definition (str): Passed to `extract_sentences`.
        max_length (int), overlap (int): Passed to `shorten_sentences`.

    Returns:
        cnt_pos (int): Number of kept samples that mention a label.
        cnt_neg (int): Number of kept samples without a label. A negative sample
            is kept only if it contains 'data' or 'study' (case-insensitive).
        ner_data (list): The kept samples, each a list of (word, tag) pairs.
    '''
    cleaned_labels = [clean_training_text(label) for label in labels]
    candidates = shorten_sentences(
        extract_sentences(paper, sentence_definition=sentence_definition),
        max_length=max_length, overlap=overlap)

    negative_keywords = ['data', 'study']
    n_pos = n_neg = 0
    samples = []
    for text in candidates:
        if len(text) <= 10: # only accept sentences with length > 10 chars
            continue
        has_label, pairs = tag_sentence(text, cleaned_labels, classlabel=classlabel)
        if has_label:
            n_pos += 1
        elif any(keyword in text.lower() for keyword in negative_keywords):
            n_neg += 1
        else:
            continue # negative sample without any keyword: dropped
        samples.append(pairs)
    return n_pos, n_neg, samples

In [None]:
# Build section-level NER samples (up to 512 words each) for a single paper.
df = load_train_meta('/kaggle/input/coleridgeinitiative-show-us-the-data/train.csv', group_id=True).iloc[230:240]
papers = load_papers('/kaggle/input/coleridgeinitiative-show-us-the-data/train/', df.Id)
classlabel = get_ner_classlabel()

idx = 8
paper = papers[df.Id.iloc[idx]]
# Grouped metadata stores a paper's labels '|'-joined; split them back into a list.
labels = df.dataset_label.iloc[idx].split('|')
cnt_pos, cnt_neg, ner_data = get_paper_ner_data(paper, labels, classlabel=classlabel,
                                                sentence_definition='section', max_length=512, overlap=20)
print(cnt_pos, cnt_neg)
# Number of words in each sample.
print([len(sec) for sec in ner_data])
print(ner_data[-2])

9 1
[512, 512, 512, 512, 259, 512, 512, 49, 101, 303]
[('AUTHOR', 0), ('CONTRIBUTIONS', 0), ('Dr', 0), ('Shulman', 0), ('contributed', 0), ('to', 0), ('study', 0), ('concept', 0), ('and', 0), ('design', 0), ('acquisition', 0), ('of', 0), ('data', 0), ('analysis', 0), ('and', 0), ('interpretation', 0), ('critical', 0), ('revision', 0), ('of', 0), ('the', 0), ('manuscript', 0), ('for', 0), ('important', 0), ('intellectual', 0), ('content', 0), ('and', 0), ('study', 0), ('supervision', 0), ('Ms', 0), ('Harkins', 0), ('contributed', 0), ('to', 0), ('study', 0), ('concept', 0), ('and', 0), ('design', 0), ('acquisition', 0), ('of', 0), ('data', 0), ('analysis', 0), ('and', 0), ('interpretation', 0), ('and', 0), ('critical', 0), ('revision', 0), ('of', 0), ('the', 0), ('manuscript', 0), ('for', 0), ('important', 0), ('intellectual', 0), ('content', 0), ('Dr', 0), ('Green', 0), ('contributed', 0), ('to', 0), ('study', 0), ('concept', 0), ('and', 0), ('design', 0), ('analysis', 0), ('and', 0), 

In [None]:
#export
def get_ner_data(papers, df=None, classlabel=None, shuffle=True, 
                 sentence_definition='sentence', max_length=64, overlap=20):
    '''
    Get NER data for a list of papers.
    
    Args:
        papers (dict): Like that returned by `load_papers`.
        df (pd.DataFrame): Competition's train.csv or a subset of it. Required; its
            'Id' and 'dataset_label' columns are used ('|' separates several labels).
        classlabel: Tag vocabulary handed on to `get_paper_ner_data`.
        shuffle (bool): If True, shuffle the samples with the global `random` state.
        sentence_definition (str), max_length (int), overlap (int): Passed to
            `get_paper_ner_data`.
    Returns:
        cnt_pos (int): Number of samples (or 'sentences') that are tagged or partly
            tagged as datasets.
        cnt_neg (int): Number of samples (or 'sentences') that are not tagged
            or partly tagged as datasets.
        ner_data (list): List of samples, or 'sentences'. Each element is of the form:
            [('There', 0), ('has', 0), ('been', 0), ...]
    Raises:
        ValueError: If `df` is not given.
        KeyError: If an Id in `df` is missing from `papers`.
    '''
    if df is None:
        raise ValueError('`df` is required: it provides the Id and dataset_label of each paper.')

    cnt_pos, cnt_neg = 0, 0 
    ner_data = []

    # Kept from the original code; presumably it discards progress bars left over
    # from earlier (interrupted) runs in the same kernel.
    tqdm._instances.clear()
    # The context manager closes the bar even if a paper fails to process.
    with tqdm(total=len(df)) as pbar:
        for paper_id, dataset_label in zip(df['Id'], df['dataset_label']):
            paper = papers[paper_id]
            labels = dataset_label.split('|')

            cnt_pos_, cnt_neg_, ner_data_ = get_paper_ner_data(
                paper, labels, classlabel=classlabel, 
                sentence_definition=sentence_definition, max_length=max_length, overlap=overlap)
            cnt_pos += cnt_pos_
            cnt_neg += cnt_neg_
            ner_data.extend(ner_data_)

            pbar.update(1)
            pbar.set_description(f"Training data size: {cnt_pos} positives + {cnt_neg} negatives")

    if shuffle:
        random.shuffle(ner_data)
    return cnt_pos, cnt_neg, ner_data

In [None]:
%%time
# Sentence-level samples for the first 10 rows of the raw (ungrouped) train.csv.
# Rows are not grouped by Id here, so a paper listed on several rows is processed
# once per row.
df = pd.read_csv('/kaggle/input/coleridgeinitiative-show-us-the-data/train.csv').iloc[:10]
papers = load_papers('/kaggle/input/coleridgeinitiative-show-us-the-data/train/', df.Id)
classlabel = get_ner_classlabel()
cnt_pos, cnt_neg, ner_data = get_ner_data(papers, df, classlabel=classlabel, shuffle=False,
                                          sentence_definition='sentence', max_length=64, overlap=20)
print(f'Postive count: {cnt_pos}.   Negative count: {cnt_neg}')
print(ner_data[250])

Training data size: 29 positives + 223 negatives: 100%|██████████| 10/10 [00:00<00:00, 158.60it/s]

Postive count: 29.   Negative count: 223
[('Likewise', 0), ('there', 0), ('is', 0), ('a', 0), ('familiarity', 0), ('with', 0), ('a', 0), ('variety', 0), ('of', 0), ('studies', 0), ('that', 0), ('provide', 0), ('specific', 0), ('data', 0), ('on', 0), ('the', 0), ('academic', 0), ('achievement', 0), ('of', 0), ('Catholic', 0), ('school', 0), ('students', 0)]
CPU times: user 125 ms, sys: 5.07 ms, total: 130 ms
Wall time: 196 ms


In [None]:
%%time
# Same 10 rows as in the sentence-level demo, but one sample per paper section
# (split into windows of up to 512 words).
df = pd.read_csv('/kaggle/input/coleridgeinitiative-show-us-the-data/train.csv').iloc[:10]
papers = load_papers('/kaggle/input/coleridgeinitiative-show-us-the-data/train/', df.Id)
classlabel = get_ner_classlabel()
cnt_pos, cnt_neg, ner_data = get_ner_data(papers, df, classlabel=classlabel, shuffle=False, 
                                          sentence_definition='section', max_length=512, overlap=20)
print(f'Postive count: {cnt_pos}.   Negative count: {cnt_neg}')
print(ner_data[3])

Training data size: 27 positives + 70 negatives: 100%|██████████| 10/10 [00:00<00:00, 194.50it/s]

Postive count: 27.   Negative count: 70
[('WWC', 0), ('Rating', 0), ('The', 0), ('research', 0), ('described', 0), ('in', 0), ('this', 0), ('report', 0), ('meets', 0), ('WWC', 0), ('evidence', 0), ('standards', 0), ('with', 0), ('reservations', 0), ('Cautions', 0), ('Although', 0), ('the', 0), ('study', 0), ('matched', 0), ('students', 0), ('who', 0), ('participated', 0), ('in', 0), ('dual', 0), ('enrollment', 0), ('programs', 0), ('to', 0), ('those', 0), ('who', 0), ('did', 0), ('not', 0), ('students', 0), ('who', 0), ('self', 0), ('selected', 0), ('to', 0), ('participate', 0), ('in', 0), ('dual', 0), ('enrollment', 0), ('programs', 0), ('may', 0), ('have', 0), ('been', 0), ('different', 0), ('from', 0), ('students', 0), ('in', 0), ('general', 0), ('high', 0), ('school', 0), ('programs', 0), ('in', 0), ('ways', 0), ('that', 0), ('were', 0), ('unobserved', 0), ('in', 0), ('the', 0), ('study', 0), ('data', 0), ('Study', 0), ('sample', 0), ('A', 0), ('nationally', 0), ('representative', 

In [None]:
#export
def write_ner_json(ner_data, pth=Path('train_ner.json')):
    '''
    Write NER samples to a JSON-lines file (one JSON object per line).

    Args:
        ner_data (list): Samples, each a non-empty list of (word, tag) pairs.
        pth (str or Path): Output file; it is overwritten if it exists.

    Each line has the form {"tokens": [...], "ner_tags": [...]}.
    '''
    with open(pth, 'w') as out_file:
        for sample in ner_data:
            # Transpose [(word, tag), ...] into (words...), (tags...).
            tokens, tags = zip(*sample)
            record = {'tokens' : tokens, 'ner_tags' : tags}
            json.dump(record, out_file)
            out_file.write('\n')

In [None]:
# Two hand-made samples: one without any dataset mention, one with a
# three-word mention tagged B (2), I (1), I (1).
ner_data = [
    [('There', 0), ('is', 0), ('no', 0), ('dataset', 0), ('here', 0)], 
    [('Load', 0), ('the', 0), ('UN', 2), ('Trade', 1), ('Development', 1), ('into', 0), ('view', 0)]
]
write_ner_json(ner_data, pth=Path('/kaggle/tmp_ner.json'))
! cat /kaggle/tmp_ner.json

{"tokens": ["There", "is", "no", "dataset", "here"], "ner_tags": [0, 0, 0, 0, 0]}
{"tokens": ["Load", "the", "UN", "Trade", "Development", "into", "view"], "ner_tags": [0, 0, 2, 1, 1, 0, 0]}


In [None]:
#export
def load_ner_datasets(data_files=None):
    '''
    Load JSON-lines NER files into a `datasets` object and declare the tags
    as class labels.

    Args:
        data_files: Passed unchanged to `load_dataset('json', ...)`, e.g. a dict
            mapping split names to file paths.

    Returns:
        The loaded splits, with the element type of each split's `ner_tags`
        feature replaced by the NER ClassLabel from `get_ner_classlabel`.
    '''
    ner_classlabel = get_ner_classlabel()
    dataset_dict = load_dataset('json', data_files=data_files)
    for dataset in dataset_dict.values():
        dataset.features['ner_tags'].feature = ner_classlabel
    return dataset_dict

In [None]:
# Reuse the same small file for both splits, just to inspect the loaded structure.
datasets = load_ner_datasets(data_files={'train':'/kaggle/tmp_ner.json', 'valid':'/kaggle/tmp_ner.json'})
print()
print(datasets['valid'].features)
print(datasets['train'][1])

Downloading and preparing dataset json/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /root/.cache/huggingface/datasets/json/default-6d5cd0925189c967/0.0.0/83d5b3a2f62630efc6b5315f00f20209b4ad91a00ac586597caee3a4da0bef02...


HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-6d5cd0925189c967/0.0.0/83d5b3a2f62630efc6b5315f00f20209b4ad91a00ac586597caee3a4da0bef02. Subsequent calls will reuse this data.

{'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), 'ner_tags': Sequence(feature=ClassLabel(num_classes=3, names=['O', 'I', 'B'], names_file=None, id=None), length=-1, id=None)}
{'tokens': ['Load', 'the', 'UN', 'Trade', 'Development', 'into', 'view'], 'ner_tags': [0, 0, 2, 1, 1, 0, 0]}


In [None]:
#export
def create_tokenizer(model_checkpoint='distilbert-base-cased'):
    '''
    Load the tokenizer that belongs to a pretrained checkpoint.

    Args:
        model_checkpoint (str): Checkpoint name or path given to
            `AutoTokenizer.from_pretrained`.

    Returns:
        The loaded tokenizer. An AssertionError is raised if it is not a
        `transformers.PreTrainedTokenizerFast`.
    '''
    fast_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    # `tokenize_and_align_labels` calls `word_ids()` on the tokenizer output, so
    # insist on a fast tokenizer up front.
    assert isinstance(fast_tokenizer, transformers.PreTrainedTokenizerFast)
    return fast_tokenizer

In [None]:
tokenizer = create_tokenizer(model_checkpoint='distilbert-base-cased')
# Tokenize a raw string ...
print(
    tokenizer("A smattering of people descended from the midday boat on Monday."))
print()
# ... and a sentence that has already been split into words.
print(
    tokenizer("Giglio boasts several pristine bays with crystal clear water".split(), is_split_into_words=True)
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=411.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435797.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=29.0, style=ProgressStyle(description_w…


{'input_ids': [101, 138, 188, 21943, 9930, 1104, 1234, 9026, 1121, 1103, 2286, 6194, 3499, 1113, 6356, 119, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

{'input_ids': [101, 144, 6512, 9436, 24372, 1317, 185, 12937, 2042, 15520, 1114, 8626, 2330, 1447, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [None]:
# Show how sub-tokens map back to the original words: `word_ids()` gives, for each
# token, the index of the word it came from (None for the special tokens).
sentence = "A smattering of people descended from the midday boat on Monday.".split()
tokenized_sentence = tokenizer(sentence, is_split_into_words=True)

print(sentence)
print(tokenized_sentence.word_ids())
print(tokenizer.convert_ids_to_tokens(tokenized_sentence['input_ids']))

['A', 'smattering', 'of', 'people', 'descended', 'from', 'the', 'midday', 'boat', 'on', 'Monday.']
[None, 0, 1, 1, 1, 2, 3, 4, 5, 6, 7, 7, 8, 9, 10, 10, None]
['[CLS]', 'A', 's', '##mat', '##tering', 'of', 'people', 'descended', 'from', 'the', 'mid', '##day', 'boat', 'on', 'Monday', '.', '[SEP]']


In [None]:
#export
def tokenize_and_align_labels(examples, tokenizer=None, label_all_tokens=True):
    '''
    Tokenize pre-split sentences and spread their word-level NER tags over the
    resulting tokens.

    Args:
        examples: Batch with a "tokens" entry (one list of words per sentence) and
            an "ner_tags" entry (one list of tag ids per sentence).
        tokenizer: Tokenizer whose output provides `word_ids(batch_index=...)`.
        label_all_tokens (bool): If True, every sub-token gets the tag of its word;
            otherwise only the first sub-token of a word gets it and the remaining
            sub-tokens get -100.

    Returns:
        The tokenizer output with two added fields:
            'labels': per-token tag ids, -100 for special tokens.
            'word_ids': per-token index of the originating word (None for special tokens).
    '''
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    def align(word_ids, word_tags):
        # Pair every token's word index with that of the token before it.
        previous_ids = [None] + list(word_ids[:-1])
        aligned = []
        for word_idx, previous_idx in zip(word_ids, previous_ids):
            if word_idx is None:
                aligned.append(-100) # special token
            elif word_idx != previous_idx or label_all_tokens:
                aligned.append(word_tags[word_idx])
            else:
                aligned.append(-100) # trailing sub-token of a word
        return aligned

    batch_labels, batch_word_ids = [], []
    for row, word_tags in enumerate(examples["ner_tags"]):
        row_word_ids = encoded.word_ids(batch_index=row)
        batch_labels.append(align(row_word_ids, word_tags))
        batch_word_ids.append(row_word_ids)

    encoded["labels"] = batch_labels
    encoded['word_ids'] = batch_word_ids
    return encoded

In [None]:
# Reload the toy NER file and switch to the 'bert-base-cased' tokenizer for the alignment demo.
datasets = load_ner_datasets(data_files={'train':'/kaggle/tmp_ner.json', 'valid':'/kaggle/tmp_ner.json'})
tokenizer = create_tokenizer(model_checkpoint='bert-base-cased')

! cat /kaggle/tmp_ner.json

print()
# Direct call on the whole train split ...
print(tokenize_and_align_labels(datasets['train'][:], tokenizer, label_all_tokens=True), end='\n\n')

# ... and the same function applied batch-wise to every split through `map`.
tokenized_datasets = datasets.map(
    partial(tokenize_and_align_labels, tokenizer=tokenizer, label_all_tokens=True), batched=True)
print(tokenized_datasets['valid'][:])
print(tokenized_datasets['train'].features)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=570.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435797.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=29.0, style=ProgressStyle(description_w…


{"tokens": ["There", "is", "no", "dataset", "here"], "ner_tags": [0, 0, 0, 0, 0]}
{"tokens": ["Load", "the", "UN", "Trade", "Development", "into", "view"], "ner_tags": [0, 0, 2, 1, 1, 0, 0]}

{'input_ids': [[101, 1247, 1110, 1185, 2233, 9388, 1303, 102], [101, 10605, 3556, 1103, 7414, 5820, 3273, 1154, 2458, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[-100, 0, 0, 0, 0, 0, 0, -100], [-100, 0, 0, 0, 2, 1, 1, 0, 0, -100]], 'word_ids': [[None, 0, 1, 2, 3, 3, 4, None], [None, 0, 0, 1, 2, 3, 4, 5, 6, None]]}



HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


{'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'input_ids': [[101, 1247, 1110, 1185, 2233, 9388, 1303, 102], [101, 10605, 3556, 1103, 7414, 5820, 3273, 1154, 2458, 102]], 'labels': [[-100, 0, 0, 0, 0, 0, 0, -100], [-100, 0, 0, 0, 2, 1, 1, 0, 0, -100]], 'ner_tags': [[0, 0, 0, 0, 0], [0, 0, 2, 1, 1, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'tokens': [['There', 'is', 'no', 'dataset', 'here'], ['Load', 'the', 'UN', 'Trade', 'Development', 'into', 'view']], 'word_ids': [[None, 0, 1, 2, 3, 3, 4, None], [None, 0, 0, 1, 2, 3, 4, 5, 6, None]]}
{'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'labels': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), 'ner_tags': Sequence(feature=ClassLabel(num_classes=3, names=['O', 'I', 'B'], names_file=None, id=None), length=-1, id=None), 'token_

In [None]:
metric = load_metric('seqeval')

# Predictions identical to the references.
predictions = np.array([['O', 'O', 'B', 'I', 'I', 'O']])
references = [['O', 'O', 'B', 'I', 'I', 'O']]
print(metric.compute(predictions=predictions, references=references))

# Predicted span (tokens 2-4) is shifted relative to the reference span (tokens 0-2).
predictions = [['O', 'O', 'B', 'I', 'I', 'O']]
references = [['B', 'I', 'I', 'O', 'O', 'O']]
print(metric.compute(predictions=predictions, references=references))

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1961.0, style=ProgressStyle(description…


{'_': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1}, 'overall_precision': 1.0, 'overall_recall': 1.0, 'overall_f1': 1.0, 'overall_accuracy': 1.0}
{'_': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 1}, 'overall_precision': 0.0, 'overall_recall': 0.0, 'overall_f1': 0.0, 'overall_accuracy': 0.16666666666666666}


In [None]:
#export
def jaccard_similarity(s1, s2):
    '''
    Jaccard similarity between the word sets of two strings.

    Both strings are split on single spaces; the score is the number of distinct
    words they share divided by the number of distinct words in either.

    Returns:
        float: Value between 0.0 (no shared word) and 1.0 (identical word sets).
    '''
    words_1 = set(s1.split(" "))
    words_2 = set(s2.split(" "))
    return len(words_1 & words_2) / len(words_1 | words_2)

In [None]:
# 2 shared words ('Counts', 'Data') out of 6 distinct words overall -> 1/3.
jaccard_similarity('USGS Frog Counts Data', 'USGA Croc Counts Data') == 1 / 3

True

In [None]:
#export
def remove_nonoriginal_outputs(outputs, word_ids):
    '''
    Remove elements that correspond to special tokens or subtokens,
    retaining only those elements that correspond to a word in original
    text.
    
    Args:
        outputs (np.array or list of sequences): One row of per-token values
            (e.g. predicted or ground-truth class ids) per sample.
        word_ids (list): One list per sample giving, for each token, the index of
            the word it belongs to, or None for special tokens. A row of `outputs`
            may be longer than its word-id list; the extra elements are ignored.
        
    Returns:
        outputs (list): One list per sample holding the value at the first token
            of each word, in word order.

    Raises:
        AssertionError: If `outputs` and `word_ids` differ in number of samples, or
            if a retained value is -100.
    '''
    assert len(outputs) == len(word_ids)

    def first_token_positions(word_id):
        # One pass in token order, so the positions follow word order. Iterating a
        # `set` of word ids instead would not guarantee any particular order.
        seen, positions = set(), []
        for position, word_idx in enumerate(word_id):
            if word_idx is not None and word_idx not in seen:
                seen.add(word_idx)
                positions.append(position)
        return positions

    outputs = [np.asarray(output)[first_token_positions(word_id)].tolist()
               for output, word_id in zip(outputs, word_ids)]
    for output in outputs:
        assert -100 not in output
    return outputs

In [None]:
classlabel = get_ner_classlabel()

# Random class ids for 2 samples of 8 tokens each (no seed is set, so values vary per run).
predictions = np.random.randn(2, 8, classlabel.num_classes)
predictions = np.argmax(predictions, axis=2)

label_ids = np.array([[-100, 0, 0,    2, 1,    2,    1, -100],
                      [-100, 2, 1, -100, 0, -100, -100, -100]])

# The word-id lists are shorter than the 8-column rows; trailing columns are never selected.
word_ids = [[None, 0, 0, 1, 2, None],
            [None, 0, 1, 1, 2, 2, None]]

true_predictions = remove_nonoriginal_outputs(predictions, word_ids)
true_label_ids   = remove_nonoriginal_outputs(label_ids,   word_ids)

print('predictions')
print('BEFORE:')
print(predictions)
print('AFTER:')
print(true_predictions)
print(60 * '=')
print('label_ids')
print('BEFORE:')
print(label_ids)
print('AFTER:')
print(true_label_ids)

predictions
BEFORE:
[[2 1 2 0 2 0 2 2]
 [0 1 2 1 2 0 0 0]]
AFTER:
[[1, 0, 2], [1, 2, 2]]
label_ids
BEFORE:
[[-100    0    0    2    1    2    1 -100]
 [-100    2    1 -100    0 -100 -100 -100]]
AFTER:
[[0, 2, 1], [2, 1, 0]]


In [None]:
#export
def compute_metrics(p, metric=None, word_ids=None, label_list=None):
    '''
    Score model outputs against ground truth at the word level.

    Steps:
    1. Take the arg-max class id of every token.
    2. Drop the class ids of special tokens and sub-tokens.
    3. Map class ids to class labels (int ---> str).
    4. Compute the metric.

    Args:
        p (tuple): 2-tuple of model predictions (per-class scores with the
            classes on axis 2) and ground-truth label ids.  Both contain
            elements corresponding to special tokens and sub-tokens.
        metric: Object with a `compute(predictions=..., references=...)` 
            method whose result has the keys 'overall_precision', 
            'overall_recall', 'overall_f1' and 'overall_accuracy' (this 
            notebook passes `load_metric('seqeval')`).
        word_ids (list): Word IDs from the tokenizer's output, indicating
            which original word each sub-token belongs to.
        label_list (list): Class labels, indexed by class id.

    Returns:
        dict: Keys 'precision', 'recall', 'f1' and 'accuracy'.
    '''
    scores, reference_ids = p
    predicted_ids = scores.argmax(axis=2)

    word_level_predictions = remove_nonoriginal_outputs(predicted_ids, word_ids)
    word_level_references = remove_nonoriginal_outputs(reference_ids, word_ids)

    predicted_names = [[label_list[class_id] for class_id in sequence]
                       for sequence in word_level_predictions]
    reference_names = [[label_list[class_id] for class_id in sequence]
                       for sequence in word_level_references]

    scored = metric.compute(predictions=predicted_names, references=reference_names)
    summary = {}
    summary["precision"] = scored["overall_precision"]
    summary["recall"] = scored["overall_recall"]
    summary["f1"] = scored["overall_f1"]
    summary["accuracy"] = scored["overall_accuracy"]
    return summary

In [None]:
# Demo of `compute_metrics` on random scores and random labels (unseeded, so
# the numbers differ between runs).
batch_size = 2
max_example_length = 6

predictions = np.random.randn(batch_size, max_example_length, classlabel.num_classes)
label_ids = np.random.randint(low=0, high=classlabel.num_classes, 
                              size=(batch_size, max_example_length), dtype=np.int16)
# The second row has only four entries: positions past the end of a word-id
# row are ignored by `remove_nonoriginal_outputs`.
word_ids = [[None, 0, 0, 1, 2, None], 
            [None, 0, 1, None]]

print(predictions.argmax(axis=2))
print(label_ids)
p = (predictions, label_ids)
metric = load_metric('seqeval')
compute_metrics(p, metric=metric, label_list=classlabel.names, word_ids=word_ids)

[[1 2 1 0 0 2]
 [1 1 1 0 2 2]]
[[2 2 1 0 1 1]
 [0 2 2 1 0 1]]


{'precision': 0.5, 'recall': 0.25, 'f1': 0.3333333333333333, 'accuracy': 0.4}

## NER training

In [None]:
# Smoke test on a tiny subset: only the first three rows of the meta data.
train_meta = load_train_meta('/kaggle/input/coleridgeinitiative-show-us-the-data/train.csv').iloc[:3]
papers = load_papers('/kaggle/input/coleridgeinitiative-show-us-the-data/train', train_meta.Id)

# The leading 50% of the rows (rounded down) become the validation split,
# the remaining rows the training split.
valid_cutoff = int(.50 * len(train_meta))
valid_meta = train_meta.iloc[:valid_cutoff].reset_index(drop=True)
train_meta = train_meta.iloc[valid_cutoff:].reset_index(drop=True)

classlabel = get_ner_classlabel()
train_cnt_pos, train_cnt_neg, train_ner_data = get_ner_data(
    papers, df=train_meta, classlabel=classlabel, sentence_definition='section', max_length=300, overlap=100)
valid_cnt_pos, valid_cnt_neg, valid_ner_data = get_ner_data(
    papers, df=valid_meta, classlabel=classlabel, sentence_definition='section', max_length=300, overlap=100)
print(f'Train.  Positive count: {train_cnt_pos}.  Negative count: {train_cnt_neg}.')
print(f'Valid.  Positive count: {valid_cnt_pos}.  Negative count: {valid_cnt_neg}.')

# Written to the working directory; the next cell loads these files by name.
write_ner_json(train_ner_data, pth='train_ner.json')
write_ner_json(valid_ner_data, pth='valid_ner.json')

Training data size: 1 positives + 12 negatives: 100%|██████████| 1/1 [00:00<00:00, 153.67it/s]

Train.  Positive count: 3.  Negative count: 37.
Valid.  Positive count: 1.  Negative count: 12.


In [None]:
# Load the JSON files written by the previous cell.
datasets = load_ner_datasets(data_files={'train':'train_ner.json', 'valid':'valid_ner.json'})

model_checkpoint = 'distilbert-base-cased'
tokenizer = create_tokenizer(model_checkpoint)
tokenized_datasets = datasets.map(
    partial(tokenize_and_align_labels, tokenizer=tokenizer, label_all_tokens=True), batched=True)
data_collator = DataCollatorForTokenClassification(tokenizer)

model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=classlabel.num_classes)

metric = load_metric('seqeval')
# `compute_metrics` needs the word ids of the evaluation set, so they are
# bound here; they must stay in step with `eval_dataset` below.
word_ids = tokenized_datasets['valid']['word_ids']
compute_metrics_ = partial(compute_metrics, metric=metric, label_list=classlabel.names, word_ids=word_ids)

# Evaluate and save a checkpoint at the end of every epoch.
args = TrainingArguments(output_dir='test_training', num_train_epochs=2, 
                         learning_rate=2e-5, weight_decay=0.01,
                         per_device_train_batch_size=16, per_device_eval_batch_size=16,
                         evaluation_strategy='epoch', logging_steps=4, report_to='none', 
                         save_strategy='epoch', save_total_limit=6)

trainer = Trainer(model=model, args=args, 
                  train_dataset=tokenized_datasets['train'], eval_dataset=tokenized_datasets['valid'], 
                  data_collator=data_collator, tokenizer=tokenizer, compute_metrics=compute_metrics_)

Downloading and preparing dataset json/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /root/.cache/huggingface/datasets/json/default-65580daa9634842c/0.0.0/83d5b3a2f62630efc6b5315f00f20209b4ad91a00ac586597caee3a4da0bef02...


HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-65580daa9634842c/0.0.0/83d5b3a2f62630efc6b5315f00f20209b4ad91a00ac586597caee3a4da0bef02. Subsequent calls will reuse this data.


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=263273408.0, style=ProgressStyle(descri…




Training data size: 1 positives + 12 negatives: 100%|██████████| 1/1 [00:16<00:00, 16.87s/it] 
Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForTokenClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and 

In [None]:
# Train for the two epochs configured above; checkpoints go to 'test_training'.
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy,Runtime,Samples Per Second
1,No log,0.550555,0.0,0.0,0.0,0.994983,6.5561,1.983
2,0.815100,0.337894,0.0,0.0,0.0,0.996912,6.605,1.968


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=6, training_loss=0.6522814631462097, metrics={'train_runtime': 161.3567, 'train_samples_per_second': 0.037, 'total_flos': 11030692654800.0, 'epoch': 2.0, 'init_mem_cpu_alloc_delta': 0, 'init_mem_cpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 2319024128, 'train_mem_cpu_peaked_delta': 3832201216})

In [None]:
! ls -lrt test_training

total 8
drwxr-xr-x 2 root root 4096 Jun  3 23:25 checkpoint-3
drwxr-xr-x 2 root root 4096 Jun  3 23:27 checkpoint-6


In [None]:
# Same settings as the first run except `num_train_epochs=4`, so resuming from
# the checkpoint saved after epoch 2 trains epochs 3 and 4.
args = TrainingArguments(output_dir='test_training', num_train_epochs=4, 
                         learning_rate=2e-5, weight_decay=0.01,
                         per_device_train_batch_size=16, per_device_eval_batch_size=16,
                         evaluation_strategy='epoch', logging_steps=4, report_to='none', 
                         save_strategy='epoch', save_total_limit=6)

trainer = Trainer(model=model, args=args, 
                  train_dataset=tokenized_datasets['train'], eval_dataset=tokenized_datasets['valid'], 
                  data_collator=data_collator, tokenizer=tokenizer, 
                  compute_metrics=compute_metrics_)
# NOTE(review): absolute Kaggle working-directory path; 'checkpoint-6' only
# exists after the previous training cell has been run.
trainer.train(resume_from_checkpoint='/kaggle/working/test_training/checkpoint-6/')

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy,Runtime,Samples Per Second
3,0.2473,0.210579,0.0,0.0,0.0,0.996912,6.6056,1.968
4,0.134,0.149079,0.0,0.0,0.0,0.996912,6.3895,2.035


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=12, training_loss=0.08589735627174377, metrics={'train_runtime': 151.8757, 'train_samples_per_second': 0.079, 'total_flos': 21942472878144.0, 'epoch': 4.0, 'init_mem_cpu_alloc_delta': 0, 'init_mem_cpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 1893203968, 'train_mem_cpu_peaked_delta': 2585927680})

In [None]:
# Evaluate on the `eval_dataset` given to the trainer (the 'valid' split).
trainer.evaluate()

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.14907871186733246,
 'eval_precision': 0.0,
 'eval_recall': 0.0,
 'eval_f1': 0.0,
 'eval_accuracy': 0.996912389038981,
 'eval_runtime': 6.311,
 'eval_samples_per_second': 2.06,
 'epoch': 4.0,
 'eval_mem_cpu_alloc_delta': 28672,
 'eval_mem_cpu_peaked_delta': 0}

## NER inference

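**Turn off the Internet here.** Inference has to run offline, so the model, the tokenizer, and the metric must all be loaded from local files.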

In [None]:
#export
def get_ner_inference_data(papers, sample_submission, classlabel=None, 
                           sentence_definition='sentence', max_length=64, overlap=20,
                           keywords=('data', 'study'), min_chars=10):
    '''
    Build the NER inference rows for every paper listed in `sample_submission`.

    Each paper is turned into "sentences" with `extract_sentences` followed by
    `shorten_sentences`.  A sentence is kept only when it has more than
    `min_chars` characters and its lower-cased text contains at least one of
    `keywords` as a substring.

    Args:
        papers (dict): Maps a paper id to the list of sections of that paper.
        sample_submission (pd.DataFrame): Competition 'sample_submission.csv';
            only its 'Id' column is read.
        classlabel: Object with a `str2int` method; the id of label 'O' is
            used as the dummy tag of every word.  Only needed once at least
            one sentence is kept.
        sentence_definition (str): Forwarded to `extract_sentences`.
        max_length (int): Forwarded to `shorten_sentences`.
        overlap (int): Forwarded to `shorten_sentences`.
        keywords (iterable of str): Substrings to look for, matched
            case-insensitively.  Defaults to the previously hard-coded
            'data' and 'study'.
        min_chars (int): A sentence must be strictly longer than this many
            characters.  Defaults to the previously hard-coded 10.

    Returns:
        test_rows (list): Each list in this list is of the form: 
             [('goat', 0), ('win', 0), ...] and represents a sentence.  
        paper_length (list): Number of kept sentences in each paper, in the
            order of `sample_submission['Id']`.
    '''
    test_rows = []
    paper_length = []
    keywords = [keyword.lower() for keyword in keywords]
    dummy_tag = None  # looked up once, the first time a sentence is kept

    for paper_id in sample_submission['Id']:
        paper = papers[paper_id]

        sentences = extract_sentences(paper, sentence_definition=sentence_definition)
        sentences = shorten_sentences(sentences, max_length=max_length, overlap=overlap)

        kept_sentences = []
        for sentence in sentences:
            if len(sentence) <= min_chars:
                continue
            lowered = sentence.lower()  # lower-case once, not once per keyword
            if any(keyword in lowered for keyword in keywords):
                kept_sentences.append(sentence)

        if kept_sentences and dummy_tag is None:
            dummy_tag = classlabel.str2int('O')

        for sentence in kept_sentences:
            sentence_words = sentence.split()
            dummy_tags = [dummy_tag] * len(sentence_words)
            test_rows.append(list(zip(sentence_words, dummy_tags)))

        paper_length.append(len(kept_sentences))

    print(f'total number of "sentences": {len(test_rows)}')
    return test_rows, paper_length

In [None]:
# Build inference rows for the test papers listed in the sample submission.
sample_submission = pd.read_csv('/kaggle/input/coleridgeinitiative-show-us-the-data/sample_submission.csv')
papers = load_papers('/kaggle/input/coleridgeinitiative-show-us-the-data/test', sample_submission.Id)
classlabel = get_ner_classlabel()
test_rows, paper_length = get_ner_inference_data(papers, sample_submission, classlabel=classlabel, 
                                                 sentence_definition='section', max_length=300, overlap=100)
# One example row of (word, dummy tag) pairs, then the per-paper row counts.
print(test_rows[1])
print(paper_length)

total number of "sentences": 182
[('Authors', 0), ('the', 0), ('Laboratory', 0), ('for', 0), ('Neuro', 0), ('Imaging', 0), ('at', 0), ('the', 0), ('University', 0), ('of', 0), ('Southern', 0), ('California', 0), ('Finally', 0), ('several', 0), ('publicly', 0), ('available', 0), ('datasets', 0), ('were', 0), ('included', 0), ('we', 0), ('kindly', 0), ('thank', 0), ('the', 0), ('investigative', 0), ('teams', 0), ('and', 0), ('staffs', 0), ('of', 0), ('the', 0), ('Pediatric', 0), ('Imaging', 0), ('Neurocognition', 0), ('and', 0), ('Genetics', 0), ('PING', 0), ('study', 0), ('the', 0), ('Alzheimer', 0), ('s', 0), ('Disease', 0), ('Neuroimaging', 0), ('Initiative', 0), ('ADNI', 0), ('project', 0), ('and', 0), ('the', 0), ('studies', 0), ('who', 0), ('made', 0), ('their', 0), ('data', 0), ('available', 0), ('in', 0), ('dbGaP', 0)]
[16, 87, 40, 39]


In [None]:
#export
def ner_predict(pth=None, tokenizer=None, model=None, metric=None):
    '''
    Run the NER model on the samples stored in a JSON file and return
    word-level outputs.

    A `Trainer` is built only to get access to its `predict` method, which is
    why the training arguments are dummies.

    Args:
        pth (Path, str): Path to the NER JSON file, loaded as the 'test' split.
        tokenizer: Tokenizer passed to `tokenize_and_align_labels`, the data
            collator and the trainer.
        model: Token-classification model to predict with.
        metric: Metric forwarded to `compute_metrics`.

    Returns:
        predictions (list): For each sample, the predicted class id of each
            original word.
        label_ids (list): For each sample, the label id of each original word
            as stored in the file.
    '''
    classlabel = get_ner_classlabel()
    raw_datasets = load_ner_datasets(data_files={'test':pth})

    print('Tokenizing testset...')
    tokenize = partial(tokenize_and_align_labels,tokenizer=tokenizer, label_all_tokens=True)
    tokenized = raw_datasets.map(tokenize, batched=True)

    print('Creating data collator...')
    collator = DataCollatorForTokenClassification(tokenizer)

    print('Creating (dummy) training arguments...')
    dummy_args = TrainingArguments(
        output_dir='test_ner', 
        num_train_epochs=3,
        learning_rate=2e-5, 
        weight_decay=0.01,
        per_device_train_batch_size=16, 
        per_device_eval_batch_size=16,
        evaluation_strategy='epoch', 
        logging_steps=4, 
        report_to='none',
        save_strategy='epoch', 
        save_total_limit=6,
    )

    print('Creating trainer...')
    word_ids = tokenized['test']['word_ids']
    scorer = partial(compute_metrics, metric=metric, label_list=classlabel.names, word_ids=word_ids)
    predictor = Trainer(
        model=model, 
        args=dummy_args,
        train_dataset=tokenized['test'], 
        eval_dataset=tokenized['test'],
        data_collator=collator, 
        tokenizer=tokenizer, 
        compute_metrics=scorer,
    )

    print('Predicting on test samples...')
    scores, token_label_ids, _ = predictor.predict(tokenized['test'])
    token_class_ids = scores.argmax(axis=2)
    # Keep a single output per original word.
    word_predictions = remove_nonoriginal_outputs(token_class_ids, word_ids)
    word_label_ids = remove_nonoriginal_outputs(token_label_ids, word_ids)
    return word_predictions, word_label_ids

In [None]:
# This shows where to look for the cached metric `seqeval`.
# metric = load_metric('/root/.cache/huggingface/modules/datasets_modules/metrics/seqeval/ec5b7242a8c40468d189ca0b2b10612578dbcad311b2a134c99e3ded58a0d6e3/seqeval.py')

# Exporting the cached metric 

# %cd /root/.cache
# ! zip -r huggingface_cache.zip huggingface/modules/datasets_modules/metrics/seqeval/ec5b7242a8c40468d189ca0b2b10612578dbcad311b2a134c99e3ded58a0d6e3/
# %cd 

In [None]:
# Checkpoint written by the training cells above (relative to the working dir).
model_checkpoint = 'test_training/checkpoint-6/'

tokenizer = create_tokenizer(model_checkpoint=model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)
# metric = load_metric('seqeval')
# Load the metric script from the local Hugging Face cache instead of by name.
# NOTE(review): the hash in this path is specific to the cached copy - confirm
# it exists in the environment where this cell runs.
metric = load_metric('/root/.cache/huggingface/modules/datasets_modules/metrics/seqeval/ec5b7242a8c40468d189ca0b2b10612578dbcad311b2a134c99e3ded58a0d6e3/seqeval.py')

In [None]:
# Check that `ner_predict` returns exactly one output per whitespace-separated
# word of each sample.
samples = ['''Archaeologists estimate the carvings are between 4,000 and 5,000 years old''', 
           ('''I could see that I was looking at a deer stag upside down, '''
            '''and as I continued looking around, more animals appeared on the rock,” he said.''')]
# Every word gets the dummy tag 0.
test_rows = [list(zip(sample.split(), len(sample.split()) * [0])) for sample in samples]
write_ner_json(test_rows, pth='test_ner.json')

predictions, label_ids = ner_predict(pth='test_ner.json', tokenizer=tokenizer, model=model, metric=metric)
for i in range(len(predictions)):
    print(f'Sample {i}:', len(predictions[i]), len(label_ids[i]), len(samples[i].split()))

Downloading and preparing dataset json/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /root/.cache/huggingface/datasets/json/default-7e607d25dbe8fe83/0.0.0/83d5b3a2f62630efc6b5315f00f20209b4ad91a00ac586597caee3a4da0bef02...


HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-7e607d25dbe8fe83/0.0.0/83d5b3a2f62630efc6b5315f00f20209b4ad91a00ac586597caee3a4da0bef02. Subsequent calls will reuse this data.
Tokenizing testset...


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


Creating data collator...
Creating (dummy) training arguments...
Creating trainer...
Predicting on test samples...


Sample 0: 11 11 11
Sample 1: 27 27 27


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
#export
def get_paper_dataset_labels(pth, paper_length, predictions):
    '''
    Collect, for every paper, the phrases that the predicted tags mark as
    dataset labels.

    A phrase starts at a word tagged 'B' and is extended by each immediately
    following word tagged 'I'.  Any other tag, a new 'B', or the end of the
    sentence closes the phrase.  An 'I' that does not follow an open phrase is
    ignored.

    Args:
        pth (Path, str): Path to json file containing NER data.  Each row is 
            of form: {'tokens': ['Studying', 'human'], 'ner_tags': [0, 0, ...]}.
            Blank lines are skipped.
        paper_length (list): Number of sentences of each paper.  Sentences in
            the file are assigned to papers consecutively in this order.
        predictions (list): For each sentence in the file, the list of tag
            strings ('B', 'I' or anything else), one per token.

    Returns:
        paper_dataset_labels (list): Each element is a set consisting of labels predicted
            by the model.
    '''
    # Context manager so the file handle is closed deterministically.
    with open(pth) as f:
        test_sentences = [json.loads(line)['tokens'] for line in f if line.strip()]

    paper_dataset_labels = [] # store all dataset labels for each publication
    istart = 0  # running offset replaces the repeated sum(paper_length[:ipaper])
    for n_sentences in paper_length:
        iend = istart + n_sentences

        labels = set()
        for sentence, pred in zip(test_sentences[istart:iend], predictions[istart:iend]):
            curr_phrase = ''
            for word, tag in zip(sentence, pred):
                if tag == 'B': # close any open phrase and start a new one
                    if curr_phrase:
                        labels.add(curr_phrase)
                    curr_phrase = word
                elif tag == 'I' and curr_phrase: # continue the phrase
                    curr_phrase += ' ' + word
                elif curr_phrase: # end last phrase
                    labels.add(curr_phrase)
                    curr_phrase = ''
            # the phrase may run up to the end of the sentence
            if curr_phrase:
                labels.add(curr_phrase)

        # record dataset labels for this publication
        paper_dataset_labels.append(labels)
        istart = iend

    return paper_dataset_labels

In [None]:
# Toy example: three sentences split over two papers (2 + 1).
sentences = ['They do not present all the features', 
             'Despite the pretraining on the Tigers EcoNAX dataset',
             'Weirdly there has been lots of studies based on WGS Equality Definitiveness Dataset']
paper_length = [2, 1]
test_rows = [[(word, 0) for word in sentence.split()] for sentence in sentences]
# Hand-written tags, one per word.
predictions = [['O', 'O', 'O', 'B', 'I', 'I', 'O'],
               ['O', 'O', 'O', 'O', 'O', 'B', 'I', 'I'],
               ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'I', 'I', 'I']]
for i, row in enumerate(test_rows):
    assert len(row) == len(predictions[i])

# NOTE: overwrites any 'test_ner.json' written by an earlier cell.
write_ner_json(test_rows, pth='test_ner.json')

paper_dataset_labels = get_paper_dataset_labels('test_ner.json', paper_length, predictions)
print(paper_dataset_labels)

[{'Tigers EcoNAX dataset', 'present all the'}, {'WGS Equality Definitiveness Dataset'}]


In [None]:
#export
def filter_dataset_labels(paper_dataset_labels):
    '''
    When several labels for a paper are too similar, keep just one of them.

    The labels of a paper are visited from shortest to longest.  Each label is
    passed through `clean_training_text(label, lower=True)` and kept only if
    its Jaccard similarity to every label already kept is below 0.75.

    Args:
        paper_dataset_labels (list): One collection (e.g. a set) of label
            strings per paper.

    Returns:
        filtered_dataset_labels (list): One string per paper: the kept labels
            joined with '|' (an empty string when the paper has no label).
    '''
    filtered_dataset_labels = []

    for labels in paper_dataset_labels:
        filtered = []

        # Ties in length are broken alphabetically.  Sorting a set by length
        # alone leaves equal-length labels in set iteration order, which can
        # differ between runs and made the result non-deterministic.
        for label in sorted(labels, key=lambda raw_label: (len(raw_label), raw_label)):
            label = clean_training_text(label, lower=True)
            # `all` over an empty list is True, so the first label is always kept.
            if all(jaccard_similarity(label, got_label) < 0.75 for got_label in filtered):
                filtered.append(label)

        filtered_dataset_labels.append('|'.join(filtered))
    return filtered_dataset_labels

In [None]:
# Toy example: 'rhs flowers fertiliser index' shares 3 of 4 tokens with the 
# shorter 'rhs fertiliser index' (similarity 0.75), so it is dropped.
paper_dataset_labels = [{'moma artists catalogue', 'moma artists', 'housing market'},
                       {'rhs flowers fertiliser index', 'deep sea rock salts', 'rhs fertiliser index'}]

filter_dataset_labels(paper_dataset_labels)

['moma artists|housing market|moma artists catalogue',
 'deep sea rock salts|rhs fertiliser index']

# Literal matching

In [None]:
#export
def create_knowledge_bank(pth):
    '''
    Gather every known dataset name, lower-cased, from the training meta data.

    Args:
        pth (str): Path to meta data like 'train.csv', which
        needs to have columns: 'dataset_title', 'dataset_label', and 'cleaned_label'.
        
    Returns:
        all_labels (set): All possible strings associated with a dataset from the meta data.
    '''
    meta = load_train_meta(pth, group_id=False)
    label_columns = meta[['dataset_title', 'dataset_label', 'cleaned_label']]
    # Every cell of the three columns is stringified and lower-cased.
    return {str(label).lower()
            for row in label_columns.itertuples(index=False)
            for label in row}

In [None]:
# Build the knowledge bank from the competition meta data and peek at it.
pth = Path('/kaggle/input/coleridgeinitiative-show-us-the-data/train.csv')
all_labels = create_knowledge_bank(pth)
print(len(all_labels))
print(sorted(all_labels)[:10])

180
['2019 ncov complete genome sequences', '2019 ncov genome sequence', '2019 ncov genome sequences', '2019-ncov complete genome sequences', '2019-ncov genome sequence', '2019-ncov genome sequences', 'adni', 'advanced national seismic system (anss) comprehensive catalog (comcat)', 'advanced national seismic system anss comprehensive catalog comcat ', 'advanced national seismic system comprehensive catalog']


In [None]:
#export
def literal_match(paper, all_labels):
    '''
    Find the known labels that occur verbatim in a paper.

    The section texts are joined with '. ' and lower-cased.  A label matches
    when it is a substring of that text or of the same text after
    `clean_training_text(..., lower=True, total_clean=True)`.

    Args:
        paper (list): Sections of one paper; each section is a mapping with a
            'text' entry.
        all_labels (iterable of str): Candidate labels to search for.

    Returns:
        labels (set): The matching labels, each passed through
            `clean_training_text(..., lower=True, total_clean=True)`.
    '''
    raw_text = '. '.join(section['text'] for section in paper).lower()
    cleaned_text = clean_training_text(raw_text, lower=True, total_clean=True)

    return {clean_training_text(candidate, lower=True, total_clean=True)
            for candidate in all_labels
            if candidate in raw_text or candidate in cleaned_text}

In [None]:
# Literal matching on the test papers listed in the sample submission.
sample_submission = pd.read_csv('/kaggle/input/coleridgeinitiative-show-us-the-data/sample_submission.csv')
papers = load_papers('/kaggle/input/coleridgeinitiative-show-us-the-data/test/', sample_submission.Id)

pth = Path('/kaggle/input/coleridgeinitiative-show-us-the-data/train.csv')
all_labels = create_knowledge_bank(pth)

literal_preds = []
for paper_id in sample_submission.Id:
    paper = papers[paper_id]
    # `literal_match` returns a set, so the order of the joined labels is not
    # guaranteed to be the same on every run.
    literal_preds.append('|'.join(literal_match(paper, all_labels)))
    
literal_preds

['alzheimer s disease neuroimaging initiative adni|adni',
 'trends in international mathematics and science study|nces common core of data|common core of data',
 'sea lake and overland surges from hurricanes|slosh model|noaa storm surge inundation',
 'rural urban continuum codes']

# Overall prediction for submission

In [None]:
#export
def combine_matching_and_model(literal_preds, filtererd_dataset_labels):
    '''
    For a given paper, if there's a literal match, use that as the final
    prediction for the paper.  If there isn't a literal match,
    use what the model predicts.

    Args:
        literal_preds (list): One prediction string per paper from literal
            matching; an empty string means no match.
        filtererd_dataset_labels (list): One prediction string per paper from
            the model, aligned with `literal_preds`.  The misspelled name is
            kept so that existing keyword callers keep working.

    Returns:
        final_predictions (list): One prediction string per paper.
    '''
    # The body used to read the notebook global `filtered_dataset_labels`
    # instead of this (differently spelled) argument, so the argument was
    # ignored and the function failed without that global.
    return [literal_pred if literal_pred else model_pred
            for literal_pred, model_pred in zip(literal_preds, filtererd_dataset_labels)]

In [None]:
# Toy example.  NOTE: this rebinds the notebook-level names `literal_preds`
# and `filtered_dataset_labels` to toy values.
literal_preds = ['mongolian racing cars|reallife headphones', '']
filtered_dataset_labels = ['data|dataset', 'hifi dataset|headphones collection data']
combine_matching_and_model(literal_preds, filtered_dataset_labels)

['mongolian racing cars|reallife headphones',
 'hifi dataset|headphones collection data']

In [None]:
# The first assignment is immediately overridden by the second; keep only the
# checkpoint you want to predict with.
model_checkpoint = 'test_training/checkpoint-6/'
model_checkpoint = '/kaggle/input/showusdata-distilbert-base-cased-ner/ner_training_results/checkpoint-72822'

print('Preparing NER inference data...')
sample_submission = pd.read_csv('/kaggle/input/coleridgeinitiative-show-us-the-data/sample_submission.csv')
papers = load_papers('/kaggle/input/coleridgeinitiative-show-us-the-data/test/', sample_submission.Id)
# NOTE(review): `classlabel` is not defined in this cell; it must already exist
# from an earlier cell.
test_rows, paper_length = get_ner_inference_data(papers, sample_submission, classlabel=classlabel,
                                                 sentence_definition='section', max_length=300, overlap=100)
write_ner_json(test_rows, pth='test_ner.json')

print('Loading model, tokenizer, and metric...')
tokenizer = create_tokenizer(model_checkpoint=model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)
metric = load_metric('seqeval')

print('Predicting on each sentence...')
predictions, label_ids = ner_predict('test_ner.json', tokenizer=tokenizer, model=model, metric=metric)
# Class ids ---> tag strings, as expected by `get_paper_dataset_labels`.
predictions = [[classlabel.int2str(p) for p in pred] for pred in predictions]
label_ids   = [[classlabel.int2str(l) for l in label] for label in label_ids]

print('Getting predicted labels for each article...')
paper_dataset_labels = get_paper_dataset_labels('test_ner.json', paper_length, predictions)

print('Keeping just one of labels that are too similar to each other...')
filtered_dataset_labels = filter_dataset_labels(paper_dataset_labels)

# NOTE(review): only the model predictions are submitted here; the literal
# matches are not combined in via `combine_matching_and_model` - confirm
# that this is intended.
sample_submission['PredictionString'] = filtered_dataset_labels

sample_submission.to_csv('submission.csv', index=False)

Preparing NER inference data...
total number of "sentences": 182
Loading model, tokenizer, and metric...
Predicting on each sentence...
Downloading and preparing dataset json/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /root/.cache/huggingface/datasets/json/default-f677e8a91f0b8278/0.0.0/83d5b3a2f62630efc6b5315f00f20209b4ad91a00ac586597caee3a4da0bef02...


HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-f677e8a91f0b8278/0.0.0/83d5b3a2f62630efc6b5315f00f20209b4ad91a00ac586597caee3a4da0bef02. Subsequent calls will reuse this data.
Tokenizing testset...


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


Creating data collator...
Creating (dummy) training arguments...
Creating trainer...
Predicting on test samples...


Getting predicted labels for each article...
Keeping just one of labels that are too similar to each other...


In [None]:
! cat submission.csv

Id,PredictionString
2100032a-7c33-4bff-97ef-690822c43466,alzheimer s disease neuroimaging initiative adni
2f392438-e215-4169-bebf-21ac4ff253e1,trends in international mathematics and science study
3f316b38-1a24-45a9-8d8c-4e05a42257c6,slosh model
8e6996b4-ca08-4c0b-bed2-aaf07a4c6a60,rural urban continuum codes


# Error analysis

In [None]:
# Compare model predictions with ground truth on a few stored NER samples.
model_checkpoint = '/kaggle/input/showusdata-distilbert-base-cased-ner/ner_training_results/checkpoint-72822'
pth_train_json = '/kaggle/input/showus-data-ner-jsons/train_ner.json'
pth_valid_json = '/kaggle/input/showus-data-ner-jsons/valid_ner.json'


classlabel = get_ner_classlabel()

# Keep only the first 10 samples of each file and re-write them to the working
# directory in the (token, tag) row format taken by `write_ner_json`.
# NOTE(review): the files opened here are never explicitly closed.
ner_data_train = open(pth_train_json).readlines()[:10]
ner_data_valid = open(pth_valid_json).readlines()[:10]
ner_data_train = [json.loads(sample) for sample in ner_data_train]
ner_data_valid = [json.loads(sample) for sample in ner_data_valid]
ner_data_train = [list(zip(sample['tokens'], sample['ner_tags'])) for sample in ner_data_train]
ner_data_valid = [list(zip(sample['tokens'], sample['ner_tags'])) for sample in ner_data_valid]
write_ner_json(ner_data_train, pth='train_ner.json')
write_ner_json(ner_data_valid, pth='valid_ner.json')

tokenizer = create_tokenizer(model_checkpoint=model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=classlabel.num_classes)
metric = load_metric('seqeval')

predictions, label_ids = ner_predict(pth='valid_ner.json', tokenizer=tokenizer, model=model, metric=metric)
predictions = [[classlabel.int2str(p) for p in pred] for pred in predictions]
label_ids   = [[classlabel.int2str(l) for l in label] for label in label_ids]

# A `paper_length` of all ones treats every sample as its own "paper", giving
# one set of labels per sample.
paper_dataset_labels = get_paper_dataset_labels('valid_ner.json', len(predictions) * [1], predictions)
gt_paper_dataset_labels = get_paper_dataset_labels('valid_ner.json', len(label_ids) * [1], label_ids)

Downloading and preparing dataset json/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /root/.cache/huggingface/datasets/json/default-36caebbbae12bcf8/0.0.0/83d5b3a2f62630efc6b5315f00f20209b4ad91a00ac586597caee3a4da0bef02...


HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-36caebbbae12bcf8/0.0.0/83d5b3a2f62630efc6b5315f00f20209b4ad91a00ac586597caee3a4da0bef02. Subsequent calls will reuse this data.
Tokenizing testset...


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


Creating data collator...
Creating (dummy) training arguments...
Creating trainer...
Predicting on test samples...


In [None]:
# One set per validation sample; compare the two lists position by position.
print('Predicted labels:')
print(paper_dataset_labels)
print('Ground-truth labels:')
print(gt_paper_dataset_labels)

Predicted labels:
[set(), set(), {'ADNI'}, set(), set(), {'Beginning Postsecondary Students Longitudinal Study'}, {'IBTrACS'}, set(), set(), {'Program for the International Assessment of Adult Competencies'}]
Ground-truth labels:
[set(), set(), {'ADNI'}, set(), set(), {'Beginning Postsecondary Students Longitudinal Study'}, {'IBTrACS'}, set(), set(), set()]


In [None]:
# Score the word-level tag strings of the 10 validation samples.
metric.compute(predictions=predictions, references=label_ids)

{'_': {'precision': 0.8, 'recall': 1.0, 'f1': 0.888888888888889, 'number': 4},
 'overall_precision': 0.8,
 'overall_recall': 1.0,
 'overall_f1': 0.888888888888889,
 'overall_accuracy': 0.9974747474747475}

# References
- https://www.kaggle.com/tungmphung/pytorch-bert-for-named-entity-recognition/notebook
- https://www.kaggle.com/tungmphung/coleridge-matching-bert-ner/notebook
- https://github.com/huggingface/notebooks/blob/master/examples/token_classification.ipynb
- https://datascience.stackexchange.com/questions/15989/micro-average-vs-macro-average-performance-in-a-multiclass-classification-settin
- https://huggingface.co/docs/datasets/loading_metrics.html