In [None]:
!pip install -U spacy[cuda,transformers,lookups]

# Import needed packages

In [None]:
import pandas as pd


import numpy as np

import random

from tqdm import tqdm

import spacy
from spacy import displacy
from spacy.training import Example
from spacy.util import minibatch
from spacy.tokens import Doc

import json 


import warnings


# Suppress every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation and data-quality
# warnings raised by the libraries used below — consider narrowing it.
warnings.filterwarnings('ignore')

In [None]:
spacy.prefer_gpu() # ask spaCy to run on the GPU; the returned value is shown as the cell output

In [None]:
# Read the annotated corpus from disk; the records sit under the top-level "examples" key.
with open('../input/medical-ner/Corona2.json', 'rb') as corpus_file:
    corpus = json.load(corpus_file)
data = corpus['examples']

In [None]:
# Convert the raw records into [text, {"entities": [(start, end, label), ...]}] pairs.
# An annotation is kept only when its value has no leading/trailing whitespace
# and it carries at least one human annotation; records left without any
# entity are dropped.
train_data = []
for record in data:
    spans = [
        (mark['start'], mark['end'], mark['tag_name'])
        for mark in record['annotations']
        if mark['value'] == mark['value'].strip()
        and len(mark['human_annotations']) > 0
    ]
    if spans:
        train_data.append([record['content'], {'entities': spans}])

In [None]:
nlp = spacy.blank('en') # start from a blank 'en' pipeline (no components yet)

In [None]:
# Configure the transformer component that wraps the "bert-base-cased" weights.
config = {
    "model": {
        "@architectures": "spacy-transformers.TransformerModel.v1",
        "name": "bert-base-cased",
        "tokenizer_config": {"use_fast": True},
        # The span getter is spelled out so the config is complete on its own:
        # factory defaults are only merged in when the default architecture is
        # the same registered function, which depends on the installed
        # spacy-transformers version. These are the library's usual
        # strided-window settings.
        "get_spans": {
            "@span_getters": "spacy-transformers.strided_spans.v1",
            "window": 128,
            "stride": 96,
        },
    }
}
# Add the BERT-backed transformer as the first component of the pipeline.
trf = nlp.add_pipe("transformer", config=config)

In [None]:
# Add the NER head on top of the transformer.
# With its default config the "ner" factory builds its own CNN tok2vec and
# never reads the transformer output. Replacing the tok2vec sub-layer with a
# TransformerListener makes the NER head consume (and back-propagate into)
# the upstream "transformer" component.
ner_config = {
    "model": {
        "tok2vec": {
            "@architectures": "spacy-transformers.TransformerListener.v1",
            "grad_factor": 1.0,
            "pooling": {"@layers": "reduce_mean.v1"},
            "upstream": "*",
        }
    }
}
# Bound to `ner` (not `trf`) so the name describes the component it holds.
ner = nlp.add_pipe("ner", config=ner_config)

In [None]:
nlp.pipeline # display the components currently in the pipeline

In [None]:
ner = nlp.get_pipe('ner') # handle to the NER component, used below to register its labels

In [None]:
# Register every entity label that occurs in the training data with the NER component.
# dict.fromkeys drops duplicates while keeping first-seen order, so labels are
# added in the same order as a plain pass over all entities would add them.
label_names = dict.fromkeys(
    span[2]
    for pair in train_data
    for span in pair[1].get('entities')
)
for label_name in label_names:
    ner.add_label(label_name)

In [None]:
# Display the labels now registered on the NER component.
ner.labels

In [None]:
# Training hyper-parameters, named so they can be tuned in one place.
N_EPOCHS = 20     # full passes over train_data
BATCH_SIZE = 32   # examples per optimizer step; lower it if the GPU runs out of memory
DROPOUT = 0.3     # dropout rate applied during each update

# initialize() is the spaCy v3 replacement for the deprecated begin_training()
# alias; it initializes the component models and returns the optimizer.
optimizer = nlp.initialize()
for itn in range(N_EPOCHS):
    random.shuffle(train_data)  # new example order each epoch (shuffles train_data in place)
    losses = {}  # per-component loss accumulated over this epoch
    batches = list(minibatch(train_data, size=BATCH_SIZE))
    # One progress bar per epoch instead of one per batch.
    for batch in tqdm(batches, desc=f"epoch {itn + 1}/{N_EPOCHS}"):
        # Build the gold-standard examples for the whole batch and make a
        # single update with all of them; passing examples one at a time
        # would make the batch size meaningless (effective batch size 1).
        examples = [
            Example.from_dict(nlp.make_doc(text), annotations)
            for text, annotations in batch
        ]
        nlp.update(examples, drop=DROPOUT, sgd=optimizer, losses=losses)
    print(losses)

In [None]:
nlp.to_disk('./') # serialize the trained pipeline into the current working directory


In [None]:
from spacy.scorer import Scorer

# Set up the scoring pipeline.
bert = spacy.load('./') # reload the pipeline that was saved to the current directory
scorer = Scorer() # Scorer instance used below to compute precision, recall and F1 for the NER model

In [None]:
# Score the reloaded pipeline on the training examples.
# Each Example pairs the gold annotations with the pipeline's own prediction
# for the same text.
examples = []
for text, gold in train_data:
    example = Example.from_dict(bert.make_doc(text), gold)
    # Swap the untouched reference-side doc for the model's prediction on its text.
    example.predicted = bert(example.predicted.text)
    examples.append(example)
scorer.score(examples)

In [None]:
doc = train_data[4][0] # text of the training example at index 4 (train_data was shuffled in place during training)

In [None]:
doc = bert(doc) # run the reloaded pipeline on that text; `doc` now holds the pipeline's output instead of the raw string

In [None]:
def render(doc):
    """Render `doc` with displaCy's entity visualizer.

    Only the "Pathogen", "MedicalCondition" and "Medicine" labels are shown,
    each highlighted with its own color. The result of displacy.render is not
    returned.
    """
    # A single label -> color mapping drives both the label filter and the colors.
    palette = {
        "Pathogen": "linear-gradient(90deg, #aa9cfc, #fc9ce7)",
        "MedicalCondition": "#ff99b6",
        "Medicine": "#99f3ff",
    }
    displacy.render(
        doc,
        style='ent',
        options={"ents": list(palette), "colors": palette},
    )

In [None]:
# Visualize the entities the pipeline assigned to the sample document.
render(doc)