# Showus: NER Inference

In [1]:
! pip install /kaggle/input/nlp-packages/datasets/datasets/fsspec-2021.4.0-py3-none-any.whl
! pip install datasets --no-index --find-links=file:///kaggle/input/coleridge-packages/packages/datasets
! pip install ../input/coleridge-packages/seqeval-1.2.2-py3-none-any.whl
! pip install ../input/coleridge-packages/tokenizers-0.10.1-cp37-cp37m-manylinux1_x86_64.whl
! pip install ../input/coleridge-packages/transformers-4.5.0.dev0-py3-none-any.whl

Processing /kaggle/input/nlp-packages/datasets/datasets/fsspec-2021.4.0-py3-none-any.whl
Installing collected packages: fsspec
  Attempting uninstall: fsspec
    Found existing installation: fsspec 0.8.7
    Uninstalling fsspec-0.8.7:
      Successfully uninstalled fsspec-0.8.7
Successfully installed fsspec-2021.4.0
Looking in links: file:///kaggle/input/coleridge-packages/packages/datasets
Processing /kaggle/input/coleridge-packages/packages/datasets/datasets-1.5.0-py3-none-any.whl
Processing /kaggle/input/coleridge-packages/packages/datasets/tqdm-4.49.0-py2.py3-none-any.whl
Processing /kaggle/input/coleridge-packages/packages/datasets/huggingface_hub-0.0.7-py3-none-any.whl
Processing /kaggle/input/coleridge-packages/packages/datasets/xxhash-2.0.0-cp37-cp37m-manylinux2010_x86_64.whl
Installing collected packages: tqdm, xxhash, huggingface-hub, datasets
  Attempting uninstall: tqdm
    Found existing installation: tqdm 4.59.0
    Uninstalling tqdm-4.59.0:
      Successf

In [2]:
import sys
from functools import partial
import random
import json
import pandas as pd

from tokenizers.pre_tokenizers import BertPreTokenizer
from transformers import AutoTokenizer, DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification
from datasets import load_metric
from transformers import TrainingArguments, Trainer

sys.path.append('/kaggle/input/showus-package')
from showus import load_ner_datasets, get_ner_classlabel
from showus import tokenize_and_align_labels
from showus import compute_metrics

from showus import load_papers
from showus import batched_write_ner_inference_json
from showus import create_tokenizer
from showus import batched_ner_predict, get_paper_dataset_labels
from showus import create_knowledge_bank, literal_match
from showus import combine_matching_and_model, filter_dataset_labels

In [3]:
# Copy the cached seqeval metric script into the working directory; the
# prediction cell loads it by local path via load_metric('seqeval.py').
! cp ../input/huggingface-cache/huggingface/modules/datasets_modules/metrics/seqeval/ec5b7242a8c40468d189ca0b2b10612578dbcad311b2a134c99e3ded58a0d6e3/seqeval.py .

In [4]:
# Read the sample submission and load the test papers for the ids it lists.
sample_submission = pd.read_csv(
    '/kaggle/input/coleridgeinitiative-show-us-the-data/sample_submission.csv'
)
papers = load_papers(
    '/kaggle/input/coleridgeinitiative-show-us-the-data/test/',
    sample_submission.Id,
)

# File name handed to the writer below and reused by the prediction cell.
pth_json = 'test_ner.json'

classlabel = get_ner_classlabel()

print('Preparing NER inference data...')

# Keyword settings forwarded unchanged to batched_write_ner_inference_json.
# NOTE(review): presumably the papers are cut into windows of `max_length`
# tokens with `overlap` tokens shared between neighbours -- confirm in showus.
ner_json_options = dict(
    pth=pth_json,
    batch_size=3_000,
    classlabel=classlabel,
    pretokenizer=BertPreTokenizer(),
    sentence_definition='paper',
    max_length=200,
    overlap=20,
    min_length=0,
    contains_keywords=None,
)
paper_length = batched_write_ner_inference_json(papers, sample_submission, **ner_json_options)

Preparing NER inference data...
total number of "sentences": 344


In [5]:
# ---- Inference settings ----
model_checkpoint = '../input/showusdata-roberta-base-ner/training_results_roberta-base/checkpoint-63645'
bs = 300             # forwarded as both the per-device train and eval batch size
batch_size = 64_000  # `batch_size` argument of batched_ner_predict (separate from `bs`)
max_similarity = 1   # forwarded to filter_dataset_labels


def _to_label_names(sequences):
    """Map every integer class id in each sequence to its string name via `classlabel`."""
    return [[classlabel.int2str(class_id) for class_id in sequence]
            for sequence in sequences]


# ---- Model, tokenizer and metric ----
print('Loading model, tokenizer, and metric...')
tokenizer = create_tokenizer(model_checkpoint=model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)
metric = load_metric('seqeval.py')

# ---- Model predictions ----
print('Predicting on test set with model...')
predict_options = dict(
    tokenizer=tokenizer,
    model=model,
    metric=metric,
    batch_size=batch_size,
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs,
)
predictions, label_ids = batched_ner_predict(pth_json, **predict_options)
predictions = _to_label_names(predictions)
label_ids = _to_label_names(label_ids)

print('Getting predicted labels for each article...')
paper_dataset_labels = get_paper_dataset_labels(pth_json, paper_length, predictions)

# ---- Literal matching against the training labels ----
print('String matching...')
knowledge_bank = create_knowledge_bank('/kaggle/input/coleridgeinitiative-show-us-the-data/train.csv')
# One literal_match result per paper, in the row order of sample_submission.
literal_preds = [
    literal_match(papers[submission_id], knowledge_bank)
    for submission_id in sample_submission.Id
]

# ---- Merge both sources and write the submission ----
print('Combining literal matches and model predictions...')
all_labels = combine_matching_and_model(literal_preds, paper_dataset_labels)

print('Keeping just one of labels that are too similar to each other...')
filtered_dataset_labels = filter_dataset_labels(all_labels, max_similarity=max_similarity)

sample_submission['PredictionString'] = filtered_dataset_labels
sample_submission.to_csv('submission.csv', index=False)

Loading model, tokenizer, and metric...
Predicting on test set with model...
Downloading and preparing dataset json/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /root/.cache/huggingface/datasets/json/default-1d60788d72fd7450/0.0.0/83d5b3a2f62630efc6b5315f00f20209b4ad91a00ac586597caee3a4da0bef02...


HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-1d60788d72fd7450/0.0.0/83d5b3a2f62630efc6b5315f00f20209b4ad91a00ac586597caee3a4da0bef02. Subsequent calls will reuse this data.
Tokenizing testset...

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


completed in 0.01 mins.
Creating data collator...
Creating (dummy) training arguments...
Creating trainer...
Predicting on test samples...


  _warn_prf(average, modifier, msg_start, len(result))


completed in 0.12 mins.
Argmaxing...
completed in 0.00 mins.
Removing non-original outputs...completed in 0.00 mins.
Getting predicted labels for each article...
String matching...
Combining literal matches and model predictions...
Keeping just one of labels that are too similar to each other...


In [6]:
# Print the generated submission file for a quick visual check.
! cat submission.csv

Id,PredictionString
2100032a-7c33-4bff-97ef-690822c43466,alzheimer s disease neuroimaging initiative adni|adni
2f392438-e215-4169-bebf-21ac4ff253e1,nces common core of data|common core of data|trends in international mathematics and science study
3f316b38-1a24-45a9-8d8c-4e05a42257c6,noaa storm surge inundation|sea lake and overland surges from hurricanes|slosh model
8e6996b4-ca08-4c0b-bed2-aaf07a4c6a60,rural urban continuum codes
