In [123]:
#Named Entity Recognition gives name/entity to a specific word in sentence to make our model classify that particular word correctly into specific category
#it uses spacy

In [124]:
for ent in ner.pipe_labels['ner']:
  print(ent, '=>', spacy.explain(ent))

CARDINAL => Numerals that do not fall under another type
DATE => Absolute or relative dates or periods
EVENT => Named hurricanes, battles, wars, sports events, etc.
FAC => Buildings, airports, highways, bridges, etc.
GPE => Countries, cities, states
LANGUAGE => Any named language
LAW => Named documents made into laws.
LOC => Non-GPE locations, mountain ranges, bodies of water
MONEY => Monetary values, including unit
NORP => Nationalities or religious or political groups
ORDINAL => "first", "second", etc.
ORG => Companies, agencies, institutions, etc.
PERCENT => Percentage, including "%"
PERSON => People, including fictional
PHONE => None
PRODUCT => Objects, vehicles, foods, etc. (not services)
QUANTITY => Measurements, as of weight or distance
TIME => Times smaller than a day
WORK_OF_ART => Titles of books, songs, etc.




In [125]:
import spacy

# Load the small English model
ner = spacy.load('en_core_web_sm')  #inbuilt pipeline

# Process a text
doc = ner("bill gates founded microsoft")

# Print named entities
for ent in doc.ents:
    print(ent.text, ent.label_,spacy.explain(ent.label_))

bill gates PERSON People, including fictional
microsoft ORG Companies, agencies, institutions, etc.


In [126]:
doc.ents[0].label_

'PERSON'

In [127]:
doc.ents[1].label_

'ORG'

In [128]:
from spacy import displacy
displacy.render(doc.ents, style='ent',jupyter=True)

In [129]:
#Ways to build NER on custom entities
#1. Dictionary
#2. Rule based  : EntityRuler===>pattern based
#3. ML : Finetuning of spacy3 model using bert model

# 1. Dictionary

In [130]:
text1 = "we are learning nlp in campusx"
doc=ner(text1)
for entity in doc.ents:
  print(entity.text,entity.label_)

In [131]:
doc[5:6]

campusx

In [132]:
from spacy.tokens import Span
span1=Span(doc, 5,6, label='ORG')
doc.set_ents([span1],default='unmodified')
for entity in doc.ents:
  print(entity.text,'=>',entity.label_)

campusx => ORG


#2. Rule based

In [133]:
from spacy.pipeline import EntityRuler              #pattern based

In [134]:
ner = spacy.load("en_core_web_sm")
ner.pipe_names                           #these are the things inside en core web model/pipeline=> tok2vec tager, parser, etc

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [135]:
#pattern- number

In [136]:
ner = spacy.load("en_core_web_sm")
# pattern define
patterns = [{"label": "PHONE", "pattern" : [{"TEXT" : {"REGEX": "[0-9]{10}"}}]}]
#add entityruler component to pipeline
ruler = ner.add_pipe("entity_ruler", before="ner")
print(ner.pipe_names)           #entityruler will be added before 'ner'
#add pattern in ruler
ruler.add_patterns(patterns)
text = "We are learning ml in campusx and contact is 9876543210"
doc = ner(text)
for entity in doc.ents:
  print(entity.text, '=>',entity.label_)

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'entity_ruler', 'ner']
9876543210 => PHONE


In [137]:
#pattern- person,org

In [138]:
ner = spacy.load("en_core_web_sm")
patterns = [{"label": "PERSON", "pattern" : [{"TEXT" : "narayan"}, {"TEXT" : "murty"}]},
            {"label": "ORG", "pattern" : [{"TEXT" : "infosys"}]},
            {"label": "PERSON", "pattern" : [{"TEXT" : "Murty"}]}]
ruler = ner.add_pipe("entity_ruler", before="ner")
]ruler.add_patterns(patterns)
text = "Murty founded infosys"
doc = ner(text)
for entity in doc.ents:
  print(entity.text, '=>',entity.label_,  '=>',spacy.explain(entity.label_))

Murty => PERSON => People, including fictional
infosys => ORG => Companies, agencies, institutions, etc.


In [139]:
#pattern- person,phone,email

In [140]:
ner = spacy.load("en_core_web_sm")
patterns = [{"label": "PHONE", "pattern" : [{"TEXT" : {"REGEX": "[0-9]{10}"}}]},
            {"label": "EMAIL", "pattern" : [{"TEXT" : {"REGEX": "[a-z]@[a-z.]"}}]}]
ruler = ner.add_pipe("entity_ruler", before="ner")
ruler.add_patterns(patterns)
text = "narayan murty founded infosys and his contact is 9876543210 and email is murty@infosys.com"
doc = ner(text)
for entity in doc.ents:
  print(entity.text, '=>',entity.label_,)

narayan murty => PERSON
9876543210 => PHONE
murty@infosys.com => EMAIL


# 3. **FineTuning**



In [None]:
#labelling tools

# prodi.gy --> paid
# https://tecoholic.github.io/ner-annotator/ --> opensource

In [141]:
import spacy
from spacy.tokens import DocBin
db = DocBin()

In [142]:
nlp = spacy.blank('en')   #blank-> blank pipeline nothing in it and language is 'en' i.e english

In [143]:
import json
f = open("/content/annotations.json")
TRAIN_DATA = json.load(f)

In [144]:
TRAIN_DATA

{'classes': ['VIRUS', 'DIESESE'],
 'annotations': [["Coronavirus disease 2019 (COVID-19) is a contagious disease caused by the virus SARS-CoV-2. The first known case was identified in Wuhan, China, in December 2019.[6] The disease quickly spread worldwide, resulting in the COVID-19 pandemic.\r\n\r\nThe symptoms of COVID‑19 are variable but often include fever,[7] cough, headache,[8] fatigue, breathing difficulties, loss of smell, and loss of taste.[9][10][11] Symptoms may begin one to fourteen days after exposure to the virus. At least a third of people who are infected do not develop noticeable symptoms.[12][13] Of those who develop symptoms noticeable enough to be classified as patients, most (81%) develop mild to moderate symptoms (up to mild pneumonia), while 14% develop severe symptoms (dyspnea, hypoxia, or more than 50% lung involvement on imaging), and 5% develop critical symptoms (respiratory failure, shock, or multiorgan dysfunction).[14] Older people are at a higher risk of d

In [145]:
for text, annot in TRAIN_DATA["annotations"]:
  doc = nlp.make_doc(text)
  ent = []
  for start, end, label in annot['entities']:
    span = doc.char_span(start, end, label, alignment_mode='contract')   #contract will remove start space or end space
    if span is None:
      print('none')
    else:
      ent.append(span)
    doc.ents = ent
    db.add(doc)
db.to_disk("training_data.spacy")

none
none


In [146]:
!python -m spacy init config config.cfg --lang en --pipeline ner --optimize efficiency

[38;5;3m⚠ To generate a more effective transformer-based config (GPU-only),
install the spacy-transformers package and re-run this command. The config
generated now does not use transformers.[0m
[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: ner
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [147]:
!python -m spacy train config.cfg --output ./ --paths.train ./training_data.spacy --paths.dev ./training_data.spacy

[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00    166.70    0.00    0.00    0.00    0.00
 15     200       5211.11   4614.28   77.14   83.08   72.00    0.77
 30     400         25.14    818.85   72.44   88.46   61.33    0.72
 46     600         27.61    792.73   77.14   83.08   72.00    0.77
 61     800         32.84    792.10   77.14   83.08   72.00    0.77
 76    1000         37.06    798.88   77.14   83.08   72.00    0.77
 92    1200         36.49    780.57   77.14   83.08   72.00    0.77
107    1400         57.02    799.80   77.14   83.08   72.00    0.77
123    1600         59.25    781.63   78.43   76.92   80.00    0.78
138    1800         58.97    764.81   77.14   83.08

In [148]:
!python -m spacy train --help

[1m                                                                                                    [0m
[1m [0m[1;33mUsage: [0m[1mpython [0m[1;32m-m[0m[1m spacy train [OPTIONS] CONFIG_PATH[0m[1m                                                [0m[1m [0m
[1m                                                                                                    [0m
 Train or update a spaCy pipeline. Requires data in spaCy's binary format. To convert data from     
 other formats, use the `spacy convert` command. The config file includes all settings and          
 hyperparameters used during training. To override settings in the config, e.g. settings that point 
 to local paths or that you want to experiment with, you can override them as command line options. 
 For instance, [1;36m-[0m[1;36m-training[0m.batch_size 128 overrides the value of "batch_size" in the block           
 "[training]". The [1;36m-[0m[1;36m-code[0m argument lets you pass in a Python file that

In [142]:
##making inferences/predictions on trained model

In [149]:
trained_ner = spacy.load("/content/model-best")            #this is our trained pipeline not web-core-sm one we will load ours

In [154]:
text = "covid is disease."
doc = trained_ner(text)
doc.ents

()

# Transformers (doing NER using transformers)

In [155]:
!pip install transformers



In [157]:
from transformers import pipeline
ner_transformer = pipeline(model="dslim/bert-base-NER-uncased")

Some weights of the model checkpoint at dslim/bert-base-NER-uncased were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [158]:
text = "bill gates founded microsoft"
ner_transformer(text)

[{'entity': 'B-PER',
  'score': 0.99582714,
  'index': 1,
  'word': 'bill',
  'start': 0,
  'end': 4},
 {'entity': 'I-PER',
  'score': 0.99281466,
  'index': 2,
  'word': 'gates',
  'start': 5,
  'end': 10},
 {'entity': 'B-ORG',
  'score': 0.98988754,
  'index': 4,
  'word': 'microsoft',
  'start': 19,
  'end': 28}]

In [None]:
#tagging format IOB
#IOB format :
#I --> inside
#O --> outside
#B --> beginning

In [159]:
text = "narayan murthy founded infosys"
ner_transformer(text)

[{'entity': 'B-PER',
  'score': 0.99548846,
  'index': 1,
  'word': 'narayan',
  'start': 0,
  'end': 7},
 {'entity': 'I-PER',
  'score': 0.9973943,
  'index': 2,
  'word': 'mu',
  'start': 8,
  'end': 10},
 {'entity': 'I-PER',
  'score': 0.984729,
  'index': 3,
  'word': '##rth',
  'start': 10,
  'end': 13},
 {'entity': 'I-PER',
  'score': 0.98603475,
  'index': 4,
  'word': '##y',
  'start': 13,
  'end': 14},
 {'entity': 'B-ORG',
  'score': 0.99705446,
  'index': 6,
  'word': 'info',
  'start': 23,
  'end': 27},
 {'entity': 'I-ORG',
  'score': 0.9965019,
  'index': 7,
  'word': '##sy',
  'start': 27,
  'end': 29},
 {'entity': 'I-ORG',
  'score': 0.9972197,
  'index': 8,
  'word': '##s',
  'start': 29,
  'end': 30}]