# Showus: NER Inference (ensemble)

In [1]:
# Install the required wheels from local Kaggle input directories instead of PyPI.
# fsspec is installed on its own first, ahead of `datasets`.
# NOTE(review): presumably local wheels are used because the kernel runs without
# internet access — confirm before changing any of these paths.
! pip install /kaggle/input/nlp-packages/datasets/datasets/fsspec-2021.4.0-py3-none-any.whl
# --no-index / --find-links restrict pip to the local wheel directory when
# resolving `datasets` and its dependencies.
! pip install datasets --no-index --find-links=file:///kaggle/input/coleridge-packages/packages/datasets
! pip install ../input/coleridge-packages/seqeval-1.2.2-py3-none-any.whl
! pip install ../input/coleridge-packages/tokenizers-0.10.1-cp37-cp37m-manylinux1_x86_64.whl
! pip install ../input/coleridge-packages/transformers-4.5.0.dev0-py3-none-any.whl

Processing /kaggle/input/nlp-packages/datasets/datasets/fsspec-2021.4.0-py3-none-any.whl
Installing collected packages: fsspec
  Attempting uninstall: fsspec
    Found existing installation: fsspec 0.8.7
    Uninstalling fsspec-0.8.7:
      Successfully uninstalled fsspec-0.8.7
Successfully installed fsspec-2021.4.0
Looking in links: file:///kaggle/input/coleridge-packages/packages/datasets
Processing /kaggle/input/coleridge-packages/packages/datasets/datasets-1.5.0-py3-none-any.whl
Processing /kaggle/input/coleridge-packages/packages/datasets/tqdm-4.49.0-py2.py3-none-any.whl
Processing /kaggle/input/coleridge-packages/packages/datasets/huggingface_hub-0.0.7-py3-none-any.whl
Processing /kaggle/input/coleridge-packages/packages/datasets/xxhash-2.0.0-cp37-cp37m-manylinux2010_x86_64.whl
Installing collected packages: tqdm, xxhash, huggingface-hub, datasets
  Attempting uninstall: tqdm
    Found existing installation: tqdm 4.59.0
    Uninstalling tqdm-4.59.0:
      Successf

In [2]:
import sys
from functools import partial
import random
import json
import pandas as pd

from tokenizers.pre_tokenizers import BertPreTokenizer
from transformers import AutoTokenizer, DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification
from datasets import load_metric
from transformers import TrainingArguments, Trainer

sys.path.append('/kaggle/input/showus-package')
from showus import load_ner_datasets, get_ner_classlabel
from showus import tokenize_and_align_labels
from showus import compute_metrics

from showus import load_papers
from showus import batched_write_ner_inference_json
from showus import create_tokenizer
from showus import batched_ner_predict, get_paper_dataset_labels
from showus import create_knowledge_bank, literal_match
from showus import combine_matching_and_model, filter_dataset_labels

In [3]:
# Copy the cached seqeval metric script into the working directory so that the
# inference loop can load it from a local path with `load_metric('seqeval.py')`.
! cp ../input/huggingface-cache/huggingface/modules/datasets_modules/metrics/seqeval/ec5b7242a8c40468d189ca0b2b10612578dbcad311b2a134c99e3ded58a0d6e3/seqeval.py .

In [4]:
# Load the submission template, then the test papers whose ids it lists.
sample_submission = pd.read_csv('/kaggle/input/coleridgeinitiative-show-us-the-data/sample_submission.csv')
papers = load_papers('/kaggle/input/coleridgeinitiative-show-us-the-data/test/', sample_submission.Id)

# File name handed to batched_write_ner_inference_json as `pth`.
pth_json = 'test_ner.json'

classlabel = get_ner_classlabel()

# Keyword options for batched_write_ner_inference_json, gathered in one place
# so they are easy to scan and tune.
ner_json_options = dict(
    pth=pth_json,
    batch_size=3_000,
    classlabel=classlabel,
    pretokenizer=BertPreTokenizer(),
    sentence_definition='paper',
    max_length=250,
    overlap=20,
    min_length=0,
    contains_keywords=None,
)

print('Preparing NER inference data...')
# The return value is kept because get_paper_dataset_labels needs it later.
paper_length = batched_write_ner_inference_json(papers, sample_submission, **ner_json_options)

Preparing NER inference data...
total number of "sentences": 269


In [5]:
# Batch settings that are identical for every ensemble member.
_shared_batch_settings = {'per_device_batch_size': 300, 'batch_size': 64_000}

# (model name, checkpoint directory) for each ensemble member, in run order.
_ensemble_members = [
    ('roberta-base',
     '../input/showusdata-roberta-base-ner/training_results_roberta-base/checkpoint-114561'),
    ('distilbert-base-cased',
     '../input/showusdata-distilbert-base-cased-ner/training_results_distilbert-base-cased/checkpoint-56997'),
    ('distilbert-base-cased',
     '../input/showusdata-distilbert-base-cased-ner-v22/training_results_distilbert-base-cased/checkpoint-103245'),
]

# One independent kwargs dict per member; the inference loop reads these by key.
ensembles_inference_kwargs = [
    {'model_name': name, 'model_checkpoint': checkpoint, **_shared_batch_settings}
    for name, checkpoint in _ensemble_members
]

In [6]:
# Run every ensemble member over the same NER inference file and collect, per
# model, the dataset labels it predicts for each paper.
ensemble_model_preds = []

for member_kwargs in ensembles_inference_kwargs:
    print(f">>>> {member_kwargs['model_name']}")

    print('Loading model, tokenizer, and metric...')
    model_checkpoint = member_kwargs['model_checkpoint']
    tokenizer = create_tokenizer(model_checkpoint=model_checkpoint)
    model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)
    # 'seqeval.py' is the metric script copied into the working directory earlier.
    metric = load_metric('seqeval.py')

    print('Predicting on each sentence...')
    per_device_bs = member_kwargs['per_device_batch_size']
    # Use `pth_json` (set when the inference file was prepared) instead of
    # repeating the literal file name, so the producer and this consumer cannot
    # drift apart if the path is ever changed.
    # The second return value of batched_ner_predict is not needed here.
    predictions, _ = batched_ner_predict(
        pth_json, tokenizer=tokenizer, model=model, metric=metric,
        batch_size=member_kwargs['batch_size'],
        per_device_train_batch_size=per_device_bs,
        per_device_eval_batch_size=per_device_bs)
    # Convert each predicted class id to its string form via the class label.
    predictions = [[classlabel.int2str(p) for p in pred] for pred in predictions]

    print('Getting predicted labels for each article...')
    paper_dataset_labels = get_paper_dataset_labels(pth_json, paper_length, predictions)

    ensemble_model_preds.append(paper_dataset_labels)

>>>> roberta-base
Loading model, tokenizer, and metric...
Predicting on each sentence...
Downloading and preparing dataset json/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /root/.cache/huggingface/datasets/json/default-289df9b14a7caf9b/0.0.0/83d5b3a2f62630efc6b5315f00f20209b4ad91a00ac586597caee3a4da0bef02...


HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-289df9b14a7caf9b/0.0.0/83d5b3a2f62630efc6b5315f00f20209b4ad91a00ac586597caee3a4da0bef02. Subsequent calls will reuse this data.
Tokenizing testset...

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


completed in 0.01 mins.
Creating data collator...
Creating (dummy) training arguments...
Creating trainer...
Predicting on test samples...


  _warn_prf(average, modifier, msg_start, len(result))


completed in 0.11 mins.
Argmaxing...
completed in 0.00 mins.
Removing non-original outputs...completed in 0.00 mins.
Getting predicted labels for each article...
>>>> distilbert-base-cased
Loading model, tokenizer, and metric...
Predicting on each sentence...
Downloading and preparing dataset json/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /root/.cache/huggingface/datasets/json/default-57a7cb1a864281a9/0.0.0/83d5b3a2f62630efc6b5315f00f20209b4ad91a00ac586597caee3a4da0bef02...


HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-57a7cb1a864281a9/0.0.0/83d5b3a2f62630efc6b5315f00f20209b4ad91a00ac586597caee3a4da0bef02. Subsequent calls will reuse this data.
Tokenizing testset...

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


completed in 0.01 mins.
Creating data collator...
Creating (dummy) training arguments...
Creating trainer...
Predicting on test samples...


completed in 0.06 mins.
Argmaxing...
completed in 0.00 mins.
Removing non-original outputs...

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


completed in 0.00 mins.
Getting predicted labels for each article...
>>>> distilbert-base-cased
Loading model, tokenizer, and metric...
Predicting on each sentence...
Downloading and preparing dataset json/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /root/.cache/huggingface/datasets/json/default-16718f96bd33f12e/0.0.0/83d5b3a2f62630efc6b5315f00f20209b4ad91a00ac586597caee3a4da0bef02...


HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-16718f96bd33f12e/0.0.0/83d5b3a2f62630efc6b5315f00f20209b4ad91a00ac586597caee3a4da0bef02. Subsequent calls will reuse this data.
Tokenizing testset...

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


completed in 0.01 mins.
Creating data collator...
Creating (dummy) training arguments...
Creating trainer...
Predicting on test samples...


completed in 0.06 mins.
Argmaxing...
completed in 0.00 mins.
Removing non-original outputs...completed in 0.00 mins.
Getting predicted labels for each article...


In [7]:
print('String matching...')
knowledge_bank = create_knowledge_bank('/kaggle/input/coleridgeinitiative-show-us-the-data/train.csv')

# One literal_match result per paper, in the order of sample_submission.Id.
literal_preds = [
    literal_match(papers[paper_id], knowledge_bank)
    for paper_id in sample_submission.Id
]

String matching...


In [8]:
print('Combining all sets of predictions...')

# Walk the literal-match results and every model's results in lockstep, and
# concatenate the label collections found at each position into one flat list.
all_labels = []
for per_paper_label_sets in zip(literal_preds, *ensemble_model_preds):
    merged_labels = []
    for label_set in per_paper_label_sets:
        merged_labels.extend(label_set)
    all_labels.append(merged_labels)

Combining all sets of predictions...


In [9]:
# Threshold passed to filter_dataset_labels as `max_similarity`.
# NOTE(review): the exact meaning of this value is defined inside
# showus.filter_dataset_labels — confirm there before tuning it.
MAX_LABEL_SIMILARITY = 1

print('Keeping just one of labels that are too similar to each other...')
filtered_dataset_labels = filter_dataset_labels(
    all_labels,
    max_similarity=MAX_LABEL_SIMILARITY,
)

Keeping just one of labels that are too similar to each other...


In [10]:
# Fill the prediction column of the submission frame and write it out
# without the index column.
prediction_column = 'PredictionString'
submission_path = 'submission.csv'

sample_submission[prediction_column] = filtered_dataset_labels
sample_submission.to_csv(submission_path, index=False)

In [11]:
# Print the submission file that was just written, as a quick visual check.
! cat submission.csv

Id,PredictionString
2100032a-7c33-4bff-97ef-690822c43466,alzheimer s disease neuroimaging initiative adni|adni
2f392438-e215-4169-bebf-21ac4ff253e1,trends in international mathematics and science study|nces common core of data|common core of data
3f316b38-1a24-45a9-8d8c-4e05a42257c6,slosh model|sea lake and overland surges from hurricanes|noaa storm surge inundation
8e6996b4-ca08-4c0b-bed2-aaf07a4c6a60,rural urban continuum codes|rural
