In [1]:
#import and load spacy
import spacy
nlp=spacy.load('en_core_web_sm')

#for NER visualization
from spacy import displacy

import pandas as pd
import re

In [2]:
# List the loaded pipeline's component names to confirm that "ner" is among them
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [3]:
# Sample passage about the Tokyo 2020 Olympics, used as input for the pretrained NER
text_olympics = """ The 2020 Summer Olympics (Japanese: 2020年夏季オリンピック, Hepburn: Nisen Nijū-nen Kaki Orinpikku), officially the Games of the XXXII Olympiad (第三十二回オリンピック競技大会, Dai Sanjūni-kai Orinpikku Kyōgi Taikai) and branded as Tokyo 2020 (東京2020), was an international multi-sport event held from 23 July to 8 August 2021 in Tokyo, Japan, with some preliminary events that began on 21 July.

Tokyo was selected as the host city during the 125th IOC Session in Buenos Aires, Argentina, on 7 September 2013.[2] Originally scheduled to take place from 24 July to 9 August 2020, the event was postponed to 2021 in March 2020 as a result of the COVID-19 pandemic, the first such instance in the history of the Olympic Games
 """

In [4]:
# Run the pipeline on the Olympics passage and print every entity with its label
doc = nlp(text_olympics)
for ent in doc.ents:
    print(f"{ent.text} {ent.label_}")

The 2020 Summer Olympics WORK_OF_ART
Japanese NORP
2020年夏季オリンピック CARDINAL
Hepburn PERSON
Nisen Nijū-nen PERSON
Kaki Orinpikku PERSON
the Games of the XXXII Olympiad ( EVENT
Dai Sanjūni-kai PERSON
Orinpikku Kyōgi Taikai PERSON
Tokyo GPE
2020 DATE
23 July to 8 August 2021 DATE
Tokyo GPE
Japan GPE
21 July DATE
Tokyo GPE
the 125th IOC Session FAC
Buenos Aires GPE
Argentina GPE
7 September 2013.[2 DATE
24 July to 9 August 2020 DATE
2021 DATE
March 2020 DATE
first ORDINAL
the Olympic Games
  EVENT


In [5]:
# Look up spaCy's description of the EVENT entity label
spacy.explain("EVENT")

'Named hurricanes, battles, wars, sports events, etc.'

In [6]:
# Visualize the entities of `doc` (the Olympics passage processed by the NER cell above) on the text itself
displacy.render(doc,style="ent",jupyter=True)

In [7]:
# Sample passage on common infectious diseases.
# The character offsets in TRAIN_DATA index into this exact string, so any edit
# here (even whitespace) requires recomputing those offsets.
# NOTE(review): the Malaria paragraph appears twice (at the end of the second
# line of the literal and again as the third line) — confirm this is intended.
text_disease = """Based on the statistics from WHO and the Centers for Disease Control and Prevention,here are the five most common infectious diseases.
According to current statistics, hepatitis B is the most common infectious disease in the world, affecting some 2 billion people -- that's more than one-quarter of the world's population. This disease, which is characterized by an inflammation of the liver that leads to jaundice, nausea, and fatigue, can lead to long-term complications such as cirrhosis of the liver or even liver cancer. The concern is primarily for those who carry the chronic form of the disease, which is estimated to be about 350 million people.Malaria, a mosquito-borne disease that tends to affect children the most in tropical and subtropical climates, affects more than 500 million people annually and results in anywhere between 1 million and 3 million deaths. Behind hepatitis B, it appears to be the second most-common infectious disease, and it certainly is one of the most deadly on an annual basis.
Malaria, a mosquito-borne disease that tends to affect children the most in tropical and subtropical climates, affects more than 500 million people annually and results in anywhere between 1 million and 3 million deaths. Behind hepatitis B, it appears to be the second most-common infectious disease, and it certainly is one of the most deadly on an annual basis.
Hepatitis C is a less common and less severe form of hepatitis, but it almost always develops into a chronic, not acute, condition, unlike hepatitis B. Although only 3 million to 4 million new cases are reported each year, some 180 million people worldwide suffer from this chronic condition, which can lead to liver cancer or cirrhosis of the liver over time.
It's at times like these that we curse mosquitoes, because a very specific type of mosquito (Aedes aegypti) is responsible for the transmission of dengue to approximately 50 million people each year. Dengue is most common in Africa and Asia and thankfully occurs in only mild to moderate forms, which can cause high fever, severe headaches, and joint and muscle pain, but rarely leads to the death of the infected patient.
As I mentioned previously, estimating new and ongoing cases for some of these diseases can be downright difficult, and perhaps none more so than tuberculosis. TB is caused by a bacteria found in the lungs that can cause chest pain and a bad cough, as well as lead to a number of other nasty side effects. According to WHO, it's also the second-leading global killer behind AIDS as a single infectious agent.The majority of TB-associated deaths (95%) occur in low- to middle-income countries where TB awareness and prevention simply aren't where they need to be. The good news is that TB death rates on a global basis are falling; however, there were still 8.6 million new cases of TB reported last year, and roughly one-third of the world's population carries a latent form of TB,
meaning they've been infected but aren't ill and can't transmit the disease yet. """

In [8]:
# Run the pretrained pipeline on the disease passage and print every entity with its label
doc = nlp(text_disease)
for ent in doc.ents:
    print(f"{ent.text} {ent.label_}")

WHO ORG
the Centers for Disease Control and Prevention ORG
five CARDINAL
some 2 billion CARDINAL
more than one-quarter CARDINAL
about 350 million CARDINAL
Malaria GPE
more than 500 million CARDINAL
annually DATE
between 1 million and 3 million QUANTITY
second ORDINAL
annual DATE
Malaria GPE
more than 500 million CARDINAL
annually DATE
between 1 million and 3 million QUANTITY
second ORDINAL
annual DATE
only 3 million to 4 million CARDINAL
each year DATE
some 180 million CARDINAL
mosquito PERSON
approximately 50 million CARDINAL
each year DATE
Dengue ORG
Africa LOC
Asia LOC
TB ORG
second ORDINAL
TB ORG
95% PERCENT
TB ORG
TB ORG
8.6 million CARDINAL
TB ORG
last year DATE
roughly one-third CARDINAL
TB ORG


## Updating NER

In [9]:
# Reload the pretrained pipeline so `nlp` is a freshly loaded model before its
# NER component is updated (spacy is already imported in the first cell; the
# import is repeated here).
import spacy
nlp=spacy.load('en_core_web_sm')

In [10]:
# Fetch the pipeline's "ner" component; the bare `ner` on the last line displays it as the cell output
ner=nlp.get_pipe("ner")
ner

<spacy.pipeline.ner.EntityRecognizer at 0x2c1156bfca0>

In [11]:
# Custom training annotations: one example that pairs the disease passage with
# (start_char, end_char, label) spans marking disease mentions in it.
TRAIN_DATA = [
    (
        text_disease,
        {
            "entities": [
                (654, 661, "DISEASE"),
                (1890, 1896, "DISEASE"),
                (2311, 2323, "DISEASE"),
                (1382, 1391, "DISEASE"),
                (406, 414, "DISEASE"),
                (2539, 2543, "DISEASE"),
                (168, 177, "DISEASE"),
                (2325, 2327, "DISEASE"),
                (518, 524, "DISEASE"),
            ]
        },
    )
]
# Smaller alternative example, kept for reference:
#TRAIN_DATA = [("According to current statistics, malaria and hepatitis are the most common infectious disease in the world, affecting some 2 billion people" , {"entities": [(33,40, "DISEASE"),(45,54, "DISEASE") ]})]

In [12]:
# Find the first "Malaria" in the passage and print its start,end character offsets
string = text_disease
match = re.search("Malaria", string)
print("{},{}".format(*match.span()))

654,661


In [13]:
# Register the label of every annotated span with the NER component
for _text, annots in TRAIN_DATA:
    for span in annots.get("entities"):
        # each span is a tuple whose third item is the label
        ner.add_label(span[2])

In [14]:
# Names in `pipe_exceptions` stay active during training; every other pipe in
# the loaded pipeline is collected into `unaffected_pipes` so it can be disabled.
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
unaffected_pipes = []
for pipe_name in nlp.pipe_names:
    if pipe_name not in pipe_exceptions:
        unaffected_pipes.append(pipe_name)

## Training the NER model

In [15]:
# Imports needed for the training loop
import random
from spacy.util import minibatch, compounding
from spacy.training import Example

In [16]:
# Fine-tune the NER component on TRAIN_DATA.
# The pipes listed in `unaffected_pipes` are disabled for the duration of the
# `with` block. `select_pipes(disable=...)` is the spaCy v3 replacement for the
# deprecated `disable_pipes(*names)`.
N_ITERATIONS = 100  # number of passes over TRAIN_DATA

with nlp.select_pipes(disable=unaffected_pipes):

    # Batch-size schedule handed to minibatch()
    sizes = compounding(1.0, 4.0, 1.001)
    for itn in range(N_ITERATIONS):
        # Shuffle the examples before each pass (in place)
        random.shuffle(TRAIN_DATA)
        # Dictionary that accumulates the losses of this pass
        losses = {}
        # Batch up the examples using spaCy's minibatch
        for batch in minibatch(TRAIN_DATA, size=sizes):
            # Build one Example per (text, annotations) pair and update on the
            # whole batch at once, so the batching actually takes effect.
            examples = [
                Example.from_dict(nlp.make_doc(text), annotations)
                for text, annotations in batch
            ]
            nlp.update(examples, drop=0.5, losses=losses)
            print("Losses", losses)

Losses {'ner': 71.16944546145112}
Losses {'ner': 58.741132901598036}
Losses {'ner': 62.10449638205967}
Losses {'ner': 35.57393221243143}
Losses {'ner': 33.836670909181635}
Losses {'ner': 32.30285139325153}
Losses {'ner': 27.586703860583505}
Losses {'ner': 32.13765710394284}
Losses {'ner': 30.334683642074125}
Losses {'ner': 20.841288943407783}
Losses {'ner': 19.04068992610985}
Losses {'ner': 21.125159881830996}
Losses {'ner': 18.979337074870056}
Losses {'ner': 20.394084833692773}
Losses {'ner': 20.25169690113944}
Losses {'ner': 18.075847051909705}
Losses {'ner': 18.610023264074698}
Losses {'ner': 19.909433807744335}
Losses {'ner': 18.160041433617153}
Losses {'ner': 18.045635228239917}
Losses {'ner': 17.20359768749404}
Losses {'ner': 17.050407409260515}
Losses {'ner': 17.426241328998003}
Losses {'ner': 17.85346345131984}
Losses {'ner': 17.37776712939376}
Losses {'ner': 16.228256308007985}
Losses {'ner': 16.547508589428617}
Losses {'ner': 15.598951879364904}
Losses {'ner': 16.911050660302

In [17]:
# Try the updated NER on unseen sentences and print the (text, label) pairs it finds
doc = nlp(
    "Hepatitis is a disease which causes inflammation of the liver and it can also cause jaundice."
    "Tuberculosis is caused by a bacterium called Mycobacterium tuberculosis."
    "AIDS is the late stage of HIV infection that occurs when the body's immune system is badly damaged because of the virus."
    "Typhoid is a bacterial infection that can lead to a high fever, diarrhea, and vomiting."
    "Cancer is a disease in which some of the body's cells grow uncontrollably and spread to other parts of the body."
    "Chikungunya is a viral disease transmitted to humans by infected mosquitoes."
    "Pneumonia is an infection that inflames the air sacs in one or both lungs."
    "Malaria is a disease caused by a parasite."
)
found_entities = [(ent.text, ent.label_) for ent in doc.ents]
print("Entities", found_entities)

Entities [('Hepatitis', 'DISEASE'), ('jaundice', 'DISEASE'), ('Tuberculosis', 'DISEASE'), ('AIDS', 'DISEASE'), ('Cancer', 'DISEASE'), ('Chikungunya', 'DISEASE'), ('Pneumonia', 'DISEASE'), ('Malaria', 'DISEASE')]
