## load the model which has been trained in the other notebook and use for inference

In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import tensorflow as tf
print(tf.config.list_physical_devices("GPU"))

from IPython.display import display, HTML

from transformers import create_optimizer
from transformers import pipeline
from transformers import AutoTokenizer
from transformers import TFAutoModelForTokenClassification
from tqdm import tqdm
import pickle

model_name = "google/electra-base-discriminator"

import pandas as pd
from datetime import datetime

tokenizer = AutoTokenizer.from_pretrained(model_name, return_offsets_mapping=True)

import spacy
nlp = spacy.load("en_core_web_lg")

model_name = "google/electra-base-discriminator"


# used for common BERT model -> can retrieve chemicals and roles
id2label = {
    0: "O",
    1: "B-chemical",
    2: "I-chemical",
    3: "B-role",
    4: "I-role"
}
label2id = {
    "O": 0,
    "B-chemical": 1,
    "I-chemical": 2,
    "B-role": 3,
    "I-role": 4,    
}

html_elems = {
    1: ("<b style='font-size:1.5em;'>", "</b>"),
    3: ("<b style='color:blue; font-size:1.5em;'><i>","</i></b>")    
}

def load_model(folder, filename):
    model = TFAutoModelForTokenClassification.from_pretrained(
        os.path.join(folder, filename), num_labels=len(id2label.keys()), id2label=id2label, label2id=label2id
    )
    
    return model



[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [2]:
# model = load_model("./model", f"chemical_substances_roles_extract_{model_name.replace('/', '-')}")
model = load_model("./model", f"chemical_extract_{model_name.replace('/', '-')}")
print("model:", model)
model.summary()

All model checkpoint layers were used when initializing TFElectraForTokenClassification.

All the layers of TFElectraForTokenClassification were initialized from the model checkpoint at ./model/chemical_extract_google-electra-base-discriminator.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFElectraForTokenClassification for predictions without further training.


model: <transformers.models.electra.modeling_tf_electra.TFElectraForTokenClassification object at 0x7fd561b50d50>
Model: "tf_electra_for_token_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 electra (TFElectraMainLaye  multiple                  108891648 
 r)                                                              
                                                                 
 dropout_37 (Dropout)        multiple                  0 (unused)
                                                                 
 classifier (Dense)          multiple                  3845      
                                                                 
Total params: 108895493 (415.40 MB)
Trainable params: 108895493 (415.40 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [3]:
from transformers import TFAutoModelForTokenClassification
import json
from IPython.display import display, HTML



# return spans with start and end index and a class
def infer_text(model, tokenizer, text):
    specials = []
        
    tokenized = tokenizer(text, return_tensors="tf")
    offset_mapping = tokenizer(text, return_offsets_mapping=True)["offset_mapping"]    
    
    logits = model(**tokenized).logits
    predicted_probs = tf.nn.softmax(logits, axis=2)    
    
    predicted_token_class_ids = tf.math.argmax(logits, axis=-1)[0]  
    predicted_token_class = [model.config.id2label[t] for t in predicted_token_class_ids.numpy().tolist()]
    probs = tf.math.reduce_max(predicted_probs, axis=-1)

    # for l, t in zip(predicted_token_class_ids, tokenizer.convert_ids_to_tokens(tokenized["input_ids"][0])):        
    #     print(id2label[int(l)], t)

    label, start, end = 0, 0, 0    

    i = 0
    while i < len(predicted_token_class_ids):
        l = predicted_token_class_ids[i]
        if l % 2 == 1:
            label = int(l)
            start = offset_mapping[i][0]
            while i+1 < len(predicted_token_class_ids) and predicted_token_class_ids[i + 1] == l+1:
                i += 1
            end = offset_mapping[i][1]
            specials.append((label, start, end))
        i += 1

    return specials

def render_as_html(text, specials):
    start_dict = {}
    end_dict = {}
    for s in specials:
        start_dict[s[1]] = html_elems[s[0]][0]
        end_dict[s[2]] = html_elems[s[0]][1]
    html = ""    
    for i, c  in enumerate(text):                
        start_elem = start_dict.get(i, None)
        end_elem = end_dict.get(i, None)

        if start_elem:
            html += start_elem
        if end_elem:
            html += end_elem
        html += c
    display(HTML(html))

text = """Due to the aprotic nature the solvent with not be able to form hydrogen bond with Pro and Hyp Interstitially, when we assessed self-assembling behavior of Pro and Hyp 
in this solvent system we could not assess any structure formation for both Pro and Hyp."""

specials = infer_text(model, tokenizer, text)
render_as_html(text, specials)


In [4]:
# tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
tokenizer = AutoTokenizer.from_pretrained(model_name)

text = """Based on our experience in the β- and γ-C(sp3)-H\nfunctionalization of free carboxylic acids, we thus expected that\nthe identification of a suitable ligand would be crucial for the\ndevelopment of the desired alkynylation process.\n\n'"""

tokens = tokenizer.convert_ids_to_tokens(tokenizer(text, return_offsets_mapping=True)["input_ids"])
print(tokens)
classifier = pipeline("ner", tokenizer=tokenizer, model=model)
classifier(text)[0:3]

specials = infer_text(model, tokenizer, text)
print(specials)
render_as_html(text, specials)

['[CLS]', 'based', 'on', 'our', 'experience', 'in', 'the', 'β', '-', 'and', 'γ', '-', 'c', '(', 'sp', '##3', ')', '-', 'h', 'functional', '##ization', 'of', 'free', 'car', '##box', '##yl', '##ic', 'acids', ',', 'we', 'thus', 'expected', 'that', 'the', 'identification', 'of', 'a', 'suitable', 'ligand', 'would', 'be', 'crucial', 'for', 'the', 'development', 'of', 'the', 'desired', 'al', '##ky', '##ny', '##lation', 'process', '.', "'", '[SEP]']
[(1, 0, 0), (1, 31, 48), (1, 75, 91), (3, 148, 154), (1, 0, 0)]


In [5]:
print("\n\n\n")
text = """Atomic Fe in N-doped carbon (FeNC) electrocatalysts for oxygen (O2) reduction at the cathode of proton exchange membrane fuel cells (PEMFCs) 
are the most promising alternative to platinum-group-metal catalysts. Despite recent progress on atomic FeNC O2 reduction, their controlled synthesis and stability 
for practical applications remains challenging. A two-step synthesis approach has recently led to significant advances in terms of Fe-loading and mass activity; 
however, the Fe utilisation remains low owing to the difficulty of building scaffolds with sufficient porosity that electrochemically exposes the active sites. 
Herein, we addressed this issue by coordinating Fe in a highly porous nitrogen doped carbon support (~3295 m2 g-1), prepared by pyrolysis of inexpensive 
2,4,6-triaminopyrimidine and a Mg2+ salt active site template and porogen. Upon Fe coordination, a high electrochemical active site density of 2.54×10^19 
sites gFeNC-1 and a record 52% FeNx electrochemical utilisation based on in situ nitrite stripping was achieved. The Fe single atoms are characterised pre- 
and post-electrochemical accelerated stress testing by aberration-corrected high-angle annular dark field scanning transmission electron microscopy, showing no Fe 
clustering. Moreover, ex situ X-ray absorption spectroscopy and low-temperature Mössbauer spectroscopy suggest the presence of penta-coordinated Fe sites, which 
were further studied by density functional theory calculations.
"""
specials = infer_text(model, tokenizer, text)
render_as_html(text, specials)








In [6]:
print("\n\n\n")
text = """Direct β- and γ-C(sp3)–H Alkynylation of Free Carboxylic Acids\nFrancesca Ghiringhelli,[a] Manuel van Gemmeren*[a]\n\n[a]\n\nF."""
specials = infer_text(model, tokenizer, text)
render_as_html(text, specials)







In [7]:
CACHED_ARTICLES_DIR = "/local/sps-local/docs"

# read json document and return content as a json object
def get_json_from_file(json_file):
    with open(json_file, "r") as f:
        return json.loads(f.read())
    
def recursively_collect_files():
    filepaths = []
    for root, dirs, files in os.walk(CACHED_ARTICLES_DIR):
        for filename in files:            
            if filename.endswith(".json") and not filename.endswith("-substances.json"):                   
                filepaths.append(os.path.join(root, filename))
    return filepaths


# def recursively_collect_sentences():
#     sentences_words, role_labels = [], []
#     file_counter = 0
#     for root, dirs, files in os.walk(CACHED_ARTICLES_DIR):
#         for filename in files:            
#             if filename.endswith(".json") and not filename.endswith("-substances.json"):                                
#                 file_counter += 1
#                 if file_counter % 10 == 0:
#                     print(f"collected sentences from {file_counter} files.")
#                 json_data = get_json_from_file(os.path.join(root, filename))
#                 filehash = json_data["fileHash"]
#                 contenthash = json_data["contentHash"]
#                 texthash = json_data["textHash"]
#                 origpath = json_data["filepath"]
    
#                 pages = [page for page in json_data["pages"]]
                  
#                 for page in pages:
#                     doc = nlp(page["text"])
#                     for sentence in doc.sents:                        
#                         pageNumber = int(page["pageNumber"])
#                         infer(model, tokenizer, sentence.text)
    
#     return filename, origpath, sentences_words

# recursively_collect_sentences()

filepaths = recursively_collect_files()

In [8]:
def collect_relevant_sentences(filepath):    
    """
    collect sentences which have at least one chemical and one role
    """
    sentences = []    
    json_data = get_json_from_file(filepath)
    filehash = json_data["fileHash"]
    contenthash = json_data["contentHash"]
    texthash = json_data["textHash"]
    origpath = json_data["filepath"]

    pages = [page for page in json_data["pages"]]
    
    for page in pages:
        doc = nlp(page["text"])
        for sentence in doc.sents:                        
            page_number = int(page["pageNumber"])
            specials = infer_text(model, tokenizer, sentence.text)
            contains_chem = False
            contains_role = False
            for s in specials:
                if s[0] == 1:
                    contains_chem = True
                if s[0] % 2 == 1 and s[0] > 1:
                    contains_role = True
            if contains_chem and contains_role:                
                sentences.append((filepath, page_number, doc[sentence.start].idx, specials, sentence.text))
    return sentences
     

def load_relevant_sentences():
    if os.path.isfile("/local/sps-local/ner-role-extraction/relevant_sentences.pkl"):
        with open("/local/sps-local/ner-role-extraction/relevant_sentences.pkl", "rb") as f:
            return pickle.load(f)
    else:
        return []

def pickle_relevant_sentences():
    with open("/local/sps-local/ner-role-extraction/relevant_sentences.pkl", "wb") as f:
        pickle.dump(sentences, f)


# offset and limit for training

roughly 1000 files take 10 hours and 10 minutes
there is probably a memory leak as the process freezes after 1950 documents have been processed

In [None]:
offset = 7000
limit = 1000
sentences = load_relevant_sentences()

with open("/local/sps-local/ner-role-extraction/ner-role-inferer.log", "w") as log:    
    log.write(f"{datetime.now()}: starting at offset {offset} and stopping at {offset+limit}\n")
    log.flush()
    for filepath in tqdm(filepaths[offset:offset+limit]):    
        offset += 1
        if offset%10 == 0:
            log.write(f"{datetime.now()}: attempting saving {offset}\n")
            log.flush()
            pickle_relevant_sentences()
            log.write(f"{datetime.now()}: done {offset}\n")
            log.flush()
        sentences.extend(collect_relevant_sentences(filepath))

pickle_relevant_sentences()

  0%|▏                                                                                                                                                    | 2/1200 [01:13<12:07:47, 36.45s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (529 > 512). Running this sequence through the model will result in indexing errors
 28%|█████████████████████████████████████████▏                                                                                                        | 339/1200 [3:24:50<8:14:14, 34.44s/it]

In [None]:
text = """Biomolecules in microbes related to CO2 -sensitive pathways or acting as a CO2 trans-
ducer have been proposed as appealing targets for medicines, since they control cell devel-
opment and the subsequent synthesis of chemicals, enhancing the pathogen persistence
in the host [26,27]. In this context, a crucial role is played by a superfamily of molecules
known as carbonic anhydrases (CAs, EC 4.2.1.1). CAs can be thought as molecules that,
rather than instantly detecting a change in CO2 , serve as CO2 transducers, adjusting its
levels [23,28]. With their activity, the CAs encoded by the bacterial genome of pathogenic
and non-pathogenic bacteria provide the indispensable CO2 and HCO3 − /protons to micro-
bial biosynthetic pathways, catalyzing the reversible reaction of CO2 hydration to HCO3 −
and H+(CO2+H2OHCO3−+H+)"""
specials = infer_text(model, tokenizer, text)
render_as_html(text, specials)

print()
text = """Moreover, LPE of pristine biochars in dimethyl carbonate, ethyl acetate, and solketal gave similar yields to more commonly used solvent for this process, N-methyl-2-pyrrolidone (NMP) a known reprotoxic molecule."""
specials = infer_text(model, tokenizer, text)
render_as_html(text, specials)


In [None]:
from numba import cuda
device = cuda.get_current_device()
device.reset()