In [1]:
import json
import pickle
from pathlib import Path

import spacy
import random
from tqdm import tqdm

In [2]:
with open('Corona2.json') as cs:
    data = json.load(cs)


In [3]:
### Sample Data
### [
###  ('Content1',{'entities':[(7,19,'PERSON),(21,28,'LOC)]}),
###  ('Content2',{'entities':[(7,19,'PERSON),(21,28,'LOC)]}),
### ]

In [17]:
spacy_data=[]
for element in data['examples']:
    ent_dict={}
    ent_list=[]
    index=[]
    for ent in element['annotations']:
        if ent['human_annotations']!=[] and ent['start'] not in index and ent['end'] not in index:
            index.append(ent['start'])
            index.append(ent['end'])
            ent_list.append((ent['start'],ent['end'],ent['tag_name']))
    ent_dict['entities']=ent_list
    spacy_data.append((element['content'],ent_dict))

In [18]:
spacy_data

[("While bismuth compounds (Pepto-Bismol) decreased the number of bowel movements in those with travelers' diarrhea, they do not decrease the length of illness.[91] Anti-motility agents like loperamide are also effective at reducing the number of stools but not the duration of disease.[8] These agents should be used only if bloody diarrhea is not present.[92]\n\nDiosmectite, a natural aluminomagnesium silicate clay, is effective in alleviating symptoms of acute diarrhea in children,[93] and also has some effects in chronic functional diarrhea, radiation-induced diarrhea, and chemotherapy-induced diarrhea.[45] Another absorbent agent used for the treatment of mild diarrhea is kaopectate.\n\nRacecadotril an antisecretory medication may be used to treat diarrhea in children and adults.[86] It has better tolerability than loperamide, as it causes less constipation and flatulence.[94]",
  {'entities': [(360, 371, 'Medicine'),
    (383, 408, 'Medicine'),
    (104, 112, 'MedicalCondition'),
 

In [6]:
with open('spacy_data.pkl', 'wb') as file:
      
    # A new file will be created
    pickle.dump(spacy_data, file)

In [19]:
model = None
output_dir=Path("model")
n_iter=100

In [20]:
#load the model

if model is not None:
    nlp = spacy.load(model)  
    print("Loaded model '%s'" % model)
else:
    nlp = spacy.blank('en')  
    print("Created blank 'en' model")

#set up the pipeline

if 'ner' not in nlp.pipe_names:
    ner = nlp.create_pipe('ner')
    nlp.add_pipe('ner', last=True)
else:
    ner = nlp.get_pipe('ner')

Created blank 'en' model


In [21]:
from spacy.training.example import Example

In [22]:

for _, annotations in spacy_data:
    for ent in annotations.get('entities'):
        ner.add_label(ent[2])

other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):  # only train NER
    optimizer = nlp.begin_training()
    for itn in range(n_iter):
        random.shuffle(spacy_data)
        losses = {}
        for text, annotations in tqdm(spacy_data):
            doc = nlp.make_doc(text)
            example = Example.from_dict(doc, annotations)
            nlp.update(
                [example],  
                drop=0.5,  
                sgd=optimizer,
                losses=losses)
        print(losses)

100%|██████████| 31/31 [00:02<00:00, 14.27it/s]


{'ner': 1487.6876494548826}


100%|██████████| 31/31 [00:01<00:00, 17.87it/s]


{'ner': 409.6899568421771}


100%|██████████| 31/31 [00:01<00:00, 17.79it/s]


{'ner': 410.29770855103436}


100%|██████████| 31/31 [00:01<00:00, 17.14it/s]


{'ner': 368.3209995887102}


100%|██████████| 31/31 [00:01<00:00, 18.13it/s]


{'ner': 434.19761062086513}


100%|██████████| 31/31 [00:01<00:00, 17.53it/s]


{'ner': 336.4529637178209}


100%|██████████| 31/31 [00:01<00:00, 17.85it/s]


{'ner': 311.87992183529434}


100%|██████████| 31/31 [00:01<00:00, 18.54it/s]


{'ner': 300.4967065427707}


100%|██████████| 31/31 [00:02<00:00, 15.42it/s]


{'ner': 299.7246766335697}


100%|██████████| 31/31 [00:01<00:00, 17.52it/s]


{'ner': 283.86304456648315}


100%|██████████| 31/31 [00:01<00:00, 18.13it/s]


{'ner': 399.01656775192396}


100%|██████████| 31/31 [00:01<00:00, 17.79it/s]


{'ner': 336.30730202690984}


100%|██████████| 31/31 [00:01<00:00, 17.89it/s]


{'ner': 254.49999200279734}


100%|██████████| 31/31 [00:01<00:00, 16.93it/s]


{'ner': 262.3687446038126}


100%|██████████| 31/31 [00:02<00:00, 15.30it/s]


{'ner': 378.2856029932058}


100%|██████████| 31/31 [00:01<00:00, 18.58it/s]


{'ner': 295.153453018392}


100%|██████████| 31/31 [00:02<00:00, 14.87it/s]


{'ner': 272.4212470928402}


100%|██████████| 31/31 [00:02<00:00, 14.95it/s]


{'ner': 209.60336728600566}


100%|██████████| 31/31 [00:01<00:00, 19.03it/s]


{'ner': 186.5048068318966}


100%|██████████| 31/31 [00:01<00:00, 15.73it/s]


{'ner': 193.23918041846264}


100%|██████████| 31/31 [00:01<00:00, 19.27it/s]


{'ner': 193.91510586374847}


100%|██████████| 31/31 [00:01<00:00, 19.04it/s]


{'ner': 182.24471260717362}


100%|██████████| 31/31 [00:01<00:00, 16.05it/s]


{'ner': 174.94007524454912}


100%|██████████| 31/31 [00:01<00:00, 18.01it/s]


{'ner': 186.99569236447778}


100%|██████████| 31/31 [00:02<00:00, 14.27it/s]


{'ner': 186.04348307845441}


100%|██████████| 31/31 [00:02<00:00, 13.97it/s]


{'ner': 168.83417446257536}


100%|██████████| 31/31 [00:01<00:00, 17.70it/s]


{'ner': 152.6255824840854}


100%|██████████| 31/31 [00:01<00:00, 18.42it/s]


{'ner': 199.81538871196634}


100%|██████████| 31/31 [00:01<00:00, 18.55it/s]


{'ner': 139.5365448083582}


100%|██████████| 31/31 [00:01<00:00, 18.93it/s]


{'ner': 149.465723819517}


100%|██████████| 31/31 [00:01<00:00, 18.89it/s]


{'ner': 155.5501178670923}


100%|██████████| 31/31 [00:01<00:00, 18.85it/s]


{'ner': 149.86793407775613}


100%|██████████| 31/31 [00:01<00:00, 18.65it/s]


{'ner': 145.0198050065652}


100%|██████████| 31/31 [00:01<00:00, 17.82it/s]


{'ner': 112.36208550810377}


100%|██████████| 31/31 [00:01<00:00, 18.32it/s]


{'ner': 129.7637124621374}


100%|██████████| 31/31 [00:01<00:00, 18.32it/s]


{'ner': 129.99615414460936}


100%|██████████| 31/31 [00:01<00:00, 18.11it/s]


{'ner': 108.57149729719622}


100%|██████████| 31/31 [00:01<00:00, 16.76it/s]


{'ner': 105.89051451989657}


100%|██████████| 31/31 [00:01<00:00, 18.36it/s]


{'ner': 113.09321377489002}


100%|██████████| 31/31 [00:01<00:00, 17.37it/s]


{'ner': 118.20057571160038}


100%|██████████| 31/31 [00:01<00:00, 17.74it/s]


{'ner': 99.5165604966031}


100%|██████████| 31/31 [00:01<00:00, 17.97it/s]


{'ner': 119.06650662525679}


100%|██████████| 31/31 [00:01<00:00, 18.38it/s]


{'ner': 116.43481204434536}


100%|██████████| 31/31 [00:01<00:00, 17.77it/s]


{'ner': 134.84378828236856}


100%|██████████| 31/31 [00:01<00:00, 17.12it/s]


{'ner': 103.426892757242}


100%|██████████| 31/31 [00:01<00:00, 17.94it/s]


{'ner': 98.47449270583178}


100%|██████████| 31/31 [00:01<00:00, 18.34it/s]


{'ner': 91.57430931086508}


100%|██████████| 31/31 [00:01<00:00, 18.04it/s]


{'ner': 94.53288237527117}


100%|██████████| 31/31 [00:02<00:00, 14.88it/s]


{'ner': 113.77556824401115}


100%|██████████| 31/31 [00:02<00:00, 13.96it/s]


{'ner': 94.8467335334057}


100%|██████████| 31/31 [00:01<00:00, 16.90it/s]


{'ner': 91.02861689287217}


100%|██████████| 31/31 [00:01<00:00, 17.66it/s]


{'ner': 95.53783199707262}


100%|██████████| 31/31 [00:01<00:00, 18.41it/s]


{'ner': 92.73906556500553}


100%|██████████| 31/31 [00:02<00:00, 14.37it/s]


{'ner': 83.07967971430159}


100%|██████████| 31/31 [00:02<00:00, 14.79it/s]


{'ner': 93.51813934437689}


100%|██████████| 31/31 [00:01<00:00, 18.74it/s]


{'ner': 92.15895211475883}


100%|██████████| 31/31 [00:01<00:00, 18.09it/s]


{'ner': 91.5033050702616}


100%|██████████| 31/31 [00:01<00:00, 18.60it/s]


{'ner': 77.4986417932769}


100%|██████████| 31/31 [00:01<00:00, 17.84it/s]


{'ner': 73.79004474982605}


100%|██████████| 31/31 [00:01<00:00, 16.93it/s]


{'ner': 94.98744402517431}


100%|██████████| 31/31 [00:02<00:00, 13.13it/s]


{'ner': 78.6462980610883}


100%|██████████| 31/31 [00:02<00:00, 13.91it/s]


{'ner': 67.00836409434957}


100%|██████████| 31/31 [00:01<00:00, 16.98it/s]


{'ner': 62.18813415514236}


100%|██████████| 31/31 [00:02<00:00, 14.33it/s]


{'ner': 82.44405587144769}


100%|██████████| 31/31 [00:04<00:00,  6.41it/s]


{'ner': 78.33282674510409}


100%|██████████| 31/31 [00:02<00:00, 12.41it/s]


{'ner': 71.13420475316575}


100%|██████████| 31/31 [00:01<00:00, 17.49it/s]


{'ner': 73.40656924884237}


100%|██████████| 31/31 [00:01<00:00, 17.62it/s]


{'ner': 61.8519214272052}


100%|██████████| 31/31 [00:01<00:00, 17.44it/s]


{'ner': 69.30066068154497}


100%|██████████| 31/31 [00:01<00:00, 17.11it/s]


{'ner': 83.70180038545729}


100%|██████████| 31/31 [00:01<00:00, 17.10it/s]


{'ner': 76.28888842519024}


100%|██████████| 31/31 [00:01<00:00, 17.78it/s]


{'ner': 55.0900054900595}


100%|██████████| 31/31 [00:01<00:00, 17.18it/s]


{'ner': 58.359025911666656}


100%|██████████| 31/31 [00:01<00:00, 16.95it/s]


{'ner': 70.33018293597927}


100%|██████████| 31/31 [00:01<00:00, 17.02it/s]


{'ner': 57.49857404369167}


100%|██████████| 31/31 [00:01<00:00, 17.25it/s]


{'ner': 76.52820332063408}


100%|██████████| 31/31 [00:01<00:00, 18.19it/s]


{'ner': 67.08280859121086}


100%|██████████| 31/31 [00:01<00:00, 17.18it/s]


{'ner': 67.73138803380542}


100%|██████████| 31/31 [00:01<00:00, 17.62it/s]


{'ner': 40.037439592704736}


100%|██████████| 31/31 [00:01<00:00, 16.77it/s]


{'ner': 55.20963857766617}


100%|██████████| 31/31 [00:01<00:00, 16.76it/s]


{'ner': 40.64173234070166}


100%|██████████| 31/31 [00:01<00:00, 16.67it/s]


{'ner': 54.51202012139417}


100%|██████████| 31/31 [00:02<00:00, 15.45it/s]


{'ner': 45.22875839571336}


100%|██████████| 31/31 [00:02<00:00, 11.07it/s]


{'ner': 68.55228415174996}


100%|██████████| 31/31 [00:01<00:00, 16.74it/s]


{'ner': 54.14270300810058}


100%|██████████| 31/31 [00:01<00:00, 17.34it/s]


{'ner': 46.73296137870115}


100%|██████████| 31/31 [00:01<00:00, 17.03it/s]


{'ner': 37.89997874299135}


100%|██████████| 31/31 [00:01<00:00, 16.31it/s]


{'ner': 38.87583665777869}


100%|██████████| 31/31 [00:01<00:00, 16.74it/s]


{'ner': 45.14650646069094}


100%|██████████| 31/31 [00:01<00:00, 16.99it/s]


{'ner': 43.30093049616621}


100%|██████████| 31/31 [00:02<00:00, 13.06it/s]


{'ner': 42.37637426584144}


100%|██████████| 31/31 [00:02<00:00, 12.67it/s]


{'ner': 34.30052278968459}


100%|██████████| 31/31 [00:01<00:00, 16.97it/s]


{'ner': 39.42431048744938}


100%|██████████| 31/31 [00:01<00:00, 16.90it/s]


{'ner': 34.73513999262922}


100%|██████████| 31/31 [00:01<00:00, 16.21it/s]


{'ner': 48.826881785204314}


100%|██████████| 31/31 [00:02<00:00, 13.24it/s]


{'ner': 50.38261038287482}


100%|██████████| 31/31 [00:02<00:00, 12.24it/s]


{'ner': 43.489847326828865}


100%|██████████| 31/31 [00:02<00:00, 11.69it/s]


{'ner': 42.84927176066932}


100%|██████████| 31/31 [00:02<00:00, 12.09it/s]


{'ner': 49.95885315503507}


100%|██████████| 31/31 [00:01<00:00, 16.75it/s]

{'ner': 42.330493922585774}





In [23]:

if output_dir is not None:
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

Saved model to model


In [24]:
for text, _ in spacy_data:
    doc = nlp(text)
    print('Entities', [(ent.text, ent.label_) for ent in doc.ents])

Entities [('amantadine', 'Medicine'), ('rimantadine', 'Medicine'), ('influenza', 'MedicalCondition'), ('M2 inhibitors', 'Medicine')]
Entities [('bedaquiline', 'Medicine'), ('TB therapy', 'Medicine'), ('bedaquiline', 'Medicine')]
Entities [('severe acute respiratory syndrome', 'MedicalCondition'), ('SARS', 'MedicalCondition'), ('coronavirus', 'Pathogen'), ('SARS', 'MedicalCondition'), ('SARS coronavirus', 'Pathogen'), ('SARS-CoV', 'Pathogen')]
Entities [('bacteria', 'Pathogen'), ('pathogenic bacteria', 'Pathogen'), ('Streptococcus', 'Pathogen'), ('Pseudomonas', 'Pathogen'), ('foodborne illnesses', 'MedicalCondition'), ('Shigella', 'Pathogen'), ('Campylobacter', 'Pathogen'), ('Salmonella', 'Pathogen'), ('tetanus', 'MedicalCondition'), ('typhoid fever', 'MedicalCondition'), ('diphtheria', 'MedicalCondition'), ('syphilis', 'MedicalCondition'), ("Hansen's disease", 'MedicalCondition')]
Entities [('herpes simplex virus', 'Pathogen'), ('Yersinia pestis.[155', 'Pathogen')]
Entities [('bismuth 

#### Load trained model

In [26]:
nlp1 = spacy.load(r"model") #load the best model
     

In [28]:
doc = nlp1("Mrs. Smith was prescribed 500mg of amoxicillin three times a day for her bacterial infection.") # input sample text

print('Entities', [(ent.text, ent.label_) for ent in doc.ents])

Entities [('amoxicillin', 'Medicine')]


In [30]:
text = str(input("Enter the text to extract the entities: "))

In [31]:
text

'ghhaf akdhf'