# This notebook drafts a sample NLP pipeline on the CORD-19 dataset

Sample structure

Loader => Sentencer => NER => Analysis => Evaluation metrics

In [2]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import json
import os

In [2]:
# Switch that selects how the article JSON is produced:
#   True  -> the CORD loader cell runs and converts the CORD metadata.csv to JSON
#   False -> the downloader cell runs and fetches the articles listed by PMID
exc_cord_loader = True

### The Cord Loader script creates a JSON file from the metadata.csv file from the CORD-19 dataset of journals
The dataset can be found at: https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/historical_releases.html

In [3]:
# Convert the CORD-19 metadata.csv into the JSON file consumed by the sentencer cell.
# The flag is tested by truthiness, the idiomatic boolean check (PEP 8 advises
# against comparing to True with `==`).
if exc_cord_loader:
    # Imported here so the loader module is only needed when this path is taken.
    from scripts import cord_loader

    cord_loader_input = '../data/cord/metadata.csv'
    cord_loader_output = '../data/cord/metadata.json'

    cord_loader.run(cord_loader_input, cord_loader_output)
    print('fin')

The cord loader seems to execute without issue on the latest version of CORD

### Alternatively, the downloader script reads a text file of PubMed IDs (PMIDs) and saves those articles to a JSON file

In [4]:
# Fetch only the articles whose PMIDs are listed in a text file and store them as JSON.
# The flag is tested by truthiness, the idiomatic boolean check (PEP 8 advises
# against comparing to False with `==`).
if not exc_cord_loader:
    # Imported here so the downloader module is only needed when this path is taken.
    from scripts import downloader

    downloader_input = "../data/example_pmid_list.txt"
    downloader_output = "../data/example_pmid_list.json"
    downloader_batch_size = 4

    downloader.run(
        input_file=downloader_input,
        output_file=downloader_output,
        batch_size=downloader_batch_size,
    )


Downloading and saving batch 1...
Saved 4/5 articles so far.

Downloading and saving batch 2...
Saved 5/5 articles so far.



### We run the sentencer script based on the JSON files.
The result is a UTF-8 encoded JSON file that stores, for each article, its title and the list of sentences from its abstract.

In [4]:
from scripts import splitter

sentencer_input = "../data/cord/metadata.json"
sentencer_output = "../data/cord/metadata-sentences.json"

# Load the metadata.json file with abstracts (one entry per article id).
with open(sentencer_input, "r", encoding='utf8') as f:
    full_articles = json.load(f)

# Split every abstract into sentences with the splitter script.
# The loop variable is `article_id` rather than `id` so the builtin `id()`
# is not shadowed for the rest of the kernel session.
articles = {}
for article_id, article in tqdm(full_articles.items()):
    articles[article_id] = {
        # Only the title is carried over; add `**article` here to keep the other fields.
        "title": article["title"],
        "sentences": [
            {"text": sentence}
            for sentence in splitter.split_into_sentences(article["abstract"])
        ],
    }

# Written as UTF-8 with ensure_ascii=False, so non-ASCII characters are stored
# verbatim instead of as \u escapes. json.dump streams to the file rather than
# first building the whole document as one string.
with open(sentencer_output, "w", encoding='utf8') as f:
    json.dump(articles, f, indent=2, ensure_ascii=False)



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=279304.0), HTML(value='')))




### Run Named Entity Recognition with the help of BioBERT
Download the BioBERT model from https://drive.google.com/drive/folders/1neThCq4MqFPd0133WDDC4MYUycE84fT7?usp=sharing
and save it in the `models` directory (the NER cell below loads it from `../models/biobert/`). This model has been fine-tuned on the BC5CDR-chem dataset.

Make sure to install tf2onnx, onnxruntime and transformers:

```
pip install -U tf2onnx
pip install onnxruntime
pip install transformers
```

For fine tuning and training BERT, I need to look at the utils/chemprot/bert_finetune.py code in depth

Note: changed util f.read to utf8

In [2]:
from scripts import util
from scripts.ner_inference import NERInferenceSession
from scripts.entity_parser import co_occurrence_extractor, detokenize


ner_input = "../data/cord/metadata-sentences.json"
ner_output = "../data/cord/metadata-ner-done.json"
model_dir = "../models/biobert/"
model_name = "biobert_ner.onnx"
model_vocab = "vocab.txt"
labels = ["[PAD]", "B", "I", "O", "X", "[CLS]", "[SEP]"]
clear_old_results = True
article_limit = 2  # values <= 0 disable the limit and process every article

# Load the sentence-split articles produced by the sentencer cell.
with open(ner_input, "r", encoding='utf8') as f:
    articles = json.load(f)

print("Creating NER session...")
ner_session = NERInferenceSession(
    model_dir=model_dir,
    model_name=model_name,
    model_vocab=model_vocab,
    labels=labels,
)
print("Created NER session.")

# For experimentation: limit number of articles to process (and to output).
# The first `limit` keys are kept in the order they appear in the loaded JSON.
limit = article_limit
if limit > 0:
    print(f"Limiting NER to {limit} articles.")
    articles = {pmid: articles[pmid] for pmid in list(articles)[:limit]}

# Clear old results so this run does not append to output from a previous run.
if clear_old_results:
    try:
        os.remove(ner_output)
    except FileNotFoundError:
        pass  # nothing to clear, e.g. on a first run
    except OSError as err:
        # Still best-effort, but report it: the new results would otherwise be
        # appended to stale output without any indication.
        print(f"Could not remove old results at {ner_output}: {err}")

# Because we want to save the result periodically.
batch_index = 0
batch_size = 2

# Run prediction on each sentence in each article.
for pmid in tqdm(articles):
    # Flush once `batch_size` articles have been processed since the last save
    # (`>=`; the previous `>` only flushed after batch_size + 1 articles).
    # NOTE(review): every flush passes the whole `articles` dict, including
    # articles not yet processed - confirm util.append_to_json_file merges by
    # key rather than duplicating entries.
    if batch_index >= batch_size:
        util.append_to_json_file(ner_output, articles)
        batch_index = 0
    sentences = articles[pmid]["sentences"]
    for sentence in sentences:
        token_label_pairs = ner_session.predict(sentence["text"])
        parsed = co_occurrence_extractor(detokenize(token_label_pairs))
        # `sentence` is the dict stored inside `articles`, so these assignments
        # annotate the article data in place.
        sentence["entities"] = parsed["entities"]
        sentence["text_new"] = parsed["text"]
    batch_index += 1

# Final save covers the articles processed since the last periodic flush.
util.append_to_json_file(ner_output, articles)

print("Finished running NER script.")



Calling BertTokenizer.from_pretrained() with the path to a single file or url is deprecated


Creating NER session...
Loading model:
  ../models/biobert/biobert_ner.onnx
Model loaded succesfully

Created NER session.
Limiting NER to 2 articles.


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2.0), HTML(value='')))

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.



Finished running NER script.


### Relationship extraction code, even though described by the authors, was not available in the repo. 

### Run the analysis code from the repo

In [6]:
from scripts import analysis

# Read the NER results from the location the NER cell writes them to
# (`ner_output`, "../data/cord/metadata-ner-done.json").
analysis_input = "../data/cord/metadata-ner-done.json"
# NOTE(review): value taken from the previously commented-out setting; confirm
# whether analysis.run expects a file path or a filename prefix here.
analysis_output = "../data/metadata-analysis-"

# The cell previously referenced `analysis_config["input_path"]` and `output_path`,
# neither of which is defined in this notebook; use the variables defined above.
# UTF-8 is given explicitly, matching how the other cells read the pipeline's JSON.
with open(analysis_input, "r", encoding='utf8') as f:
    articles = json.load(f)

analysis.run(articles, analysis_output)

ImportError: cannot import name 'ft2font' from partially initialized module 'matplotlib' (most likely due to a circular import) (C:\Users\rafsa\anaconda3\envs\tf_gpu\lib\site-packages\matplotlib\__init__.py)

### Get evaluation metrics

In [3]:
from scripts import metrics
from scripts.ner_inference import NERInferenceSession
from scripts.entity_parser import co_occurrence_extractor, detokenize

# Directory holding the gold-standard files to evaluate against.
# Alternative location used previously: "../data/gold-standard/BC4CHEMD/"
gold_standard_path = '../data/NER_data/BC4CHEMD/'
metrics_output_path = "../data/metrics_BC4CHEMD.json"
biobert_path = "../models/biobert"
# NOTE(review): the three flags below are defined but not consulted anywhere in
# this cell; the gold-standard and BioBERT metrics always run.
biobert_metrics = True
bilstm_metrics = True
co_occurrence_metrics = True

model_dir = "../models/biobert/"
model_name = "biobert_ner.onnx"
model_vocab = "vocab.txt"
labels = ["[PAD]", "B", "I", "O", "X", "[CLS]", "[SEP]"]

ner_session = NERInferenceSession(
    model_dir=model_dir,
    model_name=model_name,
    model_vocab=model_vocab,
    labels=labels,
)

# Start from an empty metrics file; a header per gold-standard file is appended below.
with open(metrics_output_path, "w"):
    pass

# Regular files only, sorted so the evaluation order (and the output file) is
# the same on every run; os.listdir returns entries in arbitrary order.
# Named `files` / `file_name` rather than `dir` / `file` so the builtin `dir()`
# is not shadowed for the rest of the kernel session.
files = sorted(
    name
    for name in os.listdir(gold_standard_path)
    if os.path.isfile(os.path.join(gold_standard_path, name))
)
for file_name in files:
    # os.path.join works whether or not gold_standard_path ends with a separator.
    file_path = os.path.join(gold_standard_path, file_name)
    with open(metrics_output_path, "a+") as out_f:
        out_f.write("\n\n" + "-"*10 + file_name + "-"*10)
    metrics.gs_metrics(file_path)
    metrics.biobert_metrics(ner_session, file_path, metrics_output_path)

print("Finished running metrics script.")

Calling BertTokenizer.from_pretrained() with the path to a single file or url is deprecated


Loading model:
  ../models/biobert/biobert_ner.onnx
Model loaded succesfully

 - - - Gold standard metrics - - -
Label count:
	O label count: 823456
	B label count: 29486
	I label count: 34863

Occurrence count:
	0_occurrence count: 16071
	1_occurrence count: 7286
	2_occurrence count: 3893
	3_occurrence count: 1629
	4_occurrence count: 780
	5_occurrence count: 445
	6_occurrence count: 216
	7_occurrence count: 108
	8_occurrence count: 81
	9_occurrence count: 43
	10_occurrence count: 29
	11_occurrence count: 19
	12_occurrence count: 8
	13_occurrence count: 11
	14_occurrence count: 5
	15_occurrence count: 5
	16_occurrence count: 5
	20_occurrence count: 2
	25_occurrence count: 1
	29_occurrence count: 1
	38_occurrence count: 1
 - - - - - - - - - - - - - - - - - 

Running over 30639 sentences
Predicted 1/30639 sentences so far.

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Predicted 3/30639 sentences so far.



processed 882816 tokens with 29277 phrases; found: 24917 phrases; correct: 17417.
accuracy:  96.76%; precision:  69.90%; recall:  59.49%; FB1:  64.28

Confusion matrix:
{'true_negative': 813027, 'true_positive': 42216, 'false_positive': 6867, 'false_negative': 20601}
Recall: 0.6720473757103969
Precision: 0.8600941262759

Token matrix:
{'O': defaultdict(<class 'int'>, {'O': 813027, 'B': 4301, 'I': 2566, '[SEP]': 100, '[PAD]': 4}), 'B': defaultdict(<class 'int'>, {'B': 18561, 'O': 10325, 'I': 390, '[SEP]': 1}), 'I': defaultdict(<class 'int'>, {'I': 22601, 'O': 10276, 'B': 664})}

 - - - Gold standard metrics - - -
Label count:
	O label count: 712648
	B label count: 25346
	I label count: 29642

Occurrence count:
	0_occurrence count: 13936
	1_occurrence count: 6141
	2_occurrence count: 3276
	3_occurrence count: 1451
	4_occurrence count: 749
	5_occurrence count: 363
	6_occurrence count: 181
	7_occurrence count: 101
	8_occurrence count: 56
	9_occurrence count: 37
	10_occurrence count: 22
	11

0

<torch.cuda.device at 0x1ffb0749610>

1

'GeForce RTX 2060'

True

RuntimeError: cudaGetDevice() failed. Status: cudaGetErrorString symbol not found.

AttributeError: module 'tensorflow' has no attribute 'Session'

'abcabc'

abcabc abc xyz
['', '', '']
abc
abcxyz


'bc'