# Entity Extraction using spacy.

https://www.machinelearningplus.com/spacy-tutorial-nlp/

https://www.machinelearningplus.com/nlp/training-custom-ner-model-in-spacy/

https://github.com/susanli2016/NLP-with-Python/blob/master/NER_NLTK_Spacy.ipynb

https://github.com/akshayashokbhor/Custom-Trained-Named-entity-recognizer-using-Spacy-library-for-resume-Data-extraction/blob/main/resume_auto_ML.ipynb

https://akshay-bhor.medium.com/custom-trained-named-entity-recognizer-using-spacy-library-for-resume-data-extraction-d419cfd3fba0

https://manivannan-ai.medium.com/how-to-train-ner-with-custom-training-data-using-spacy-188e0e508c6

https://medium.com/@radu.gheorghe/entity-extraction-with-spacy-234d3d11e3ba

https://github.com/DataTurks-Engg/Entity-Recognition-In-Resumes-SpaCy

https://www.kaggle.com/dattaraj/demo-of-using-custom-ner-model-on-covid-19-dataset

https://github.com/rsreetech/CustomNERwithspaCy/blob/master/CustomNERwithSpacy.ipynb

https://stackoverflow.com/questions/63297351/building-a-custom-named-entity-recognition-with-spacy-using-random-text-as-a-s

https://stackoverflow.com/questions/63297607/improving-the-recall-of-a-custom-named-entity-recognition-ner-in-spacy

https://stackoverflow.com/questions/61266715/how-do-you-build-training-dataset-from-scratch-for-a-custom-multi-class-standfor


In [29]:
# !pip install -U spacy
# !python -m spacy download en_core_web_sm

In [75]:
import spacy
from spacy.util import minibatch, compounding
from pathlib import Path
from spacy.training import Example
from spacy import displacy

nlp=spacy.load('en_core_web_sm')

In [84]:
import spacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span

animals = ['cat', 'dog', 'fish', 'bird']

nlp = spacy.load('en_core_web_sm')  # or any other model
patterns = [nlp(animal) for animal in animals]  # process each word to create phrase pattern
matcher = PhraseMatcher(nlp.vocab)
matcher.add('ANIMAL', None, *patterns)  # add patterns to matcher

doc = nlp("These are the 11 best dog breeds, based on factors including health, personality, and overall popularity")
matches = matcher(doc)

for match_id, start, end in matches:
    # create a new Span for each match and use the match_id (ANIMAL) as the label
    span = Span(doc, start, end, label=match_id)
    doc.ents = list(doc.ents) + [span]  # add span to doc.ents

print([(ent.text, ent.label_) for ent in doc.ents])  # [('cat', 'ANIMAL')]

[('11', 'CARDINAL'), ('dog', 'ANIMAL')]


In [89]:
# from spacy.pipeline import EntityRuler
# # Initialize
# ruler = EntityRuler(nlp)
# pattern=[{"label": "WORK_OF_ART", "pattern": "My guide to statistics"}]

# ruler.add_patterns(pattern)

# nlp.add_pipe(ruler)

# doc = nlp(" I recently published my work fanfiction by Dr.X . Right now I'm studying the book of my friend .You should try My guide to statistics for clear concepts.")
# print([(ent.text, ent.label_) for ent in doc.ents])

In [69]:
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)
    
print("Pipelines ", nlp.pipe_names)


Apple Apple PROPN NNP nsubj Xxxxx True False
is be AUX VBZ aux xx True True
looking look VERB VBG ROOT xxxx True False
at at ADP IN prep xx True True
buying buy VERB VBG pcomp xxxx True False
U.K. U.K. PROPN NNP dobj X.X. False False
startup startup NOUN NN advcl xxxx True False
for for ADP IN prep xxx True True
$ $ SYM $ quantmod $ False False
1 1 NUM CD compound d False False
billion billion NUM CD pobj xxxx True False
Pipelines  ['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']


In [70]:
# Process whole documents
text = ("When Sebastian Thrun started working on self-driving cars at "
        "Google in 2007, few people outside of the company took him "
        "seriously. “I can tell you very senior CEOs of major American "
        "car companies would shake my hand and turn away because I wasn’t "
        "worth talking to,” said Thrun, in an interview with Recode earlier "
        "this week.")
doc = nlp(text)

# Analyze syntax
print("Noun phrases:", [chunk.text for chunk in doc.noun_chunks])
print("Verbs:", [token.lemma_ for token in doc if token.pos_ == "VERB"])

# Find named entities, phrases and concepts
for entity in doc.ents:
    print([entity.text, entity.label_])

print("\n")

for ent in doc.ents:
    print([ent.text, ent.start_char, ent.end_char, ent.label_])

print("\n")

print([(i, i.label_, str(spacy.explain(i.label_))) for i in doc.ents])

# [(i.pos_, i.tag_) for i in doc] 

Noun phrases: ['Sebastian Thrun', 'self-driving cars', 'Google', 'few people', 'the company', 'him', 'I', 'you', 'very senior CEOs', 'major American car companies', 'my hand', 'I', 'Thrun', 'an interview', 'Recode']
Verbs: ['start', 'work', 'drive', 'take', 'tell', 'shake', 'turn', 'be', 'talk', 'say']
['Sebastian Thrun', 'PERSON']
['2007', 'DATE']
['American', 'NORP']
['Thrun', 'PERSON']
['Recode', 'PERSON']
['earlier this week', 'DATE']


['Sebastian Thrun', 5, 20, 'PERSON']
['2007', 71, 75, 'DATE']
['American', 173, 181, 'NORP']
['Thrun', 271, 276, 'PERSON']
['Recode', 299, 305, 'PERSON']
['earlier this week', 306, 323, 'DATE']


[(Sebastian Thrun, 'PERSON', 'People, including fictional'), (2007, 'DATE', 'Absolute or relative dates or periods'), (American, 'NORP', 'Nationalities or religious or political groups'), (Thrun, 'PERSON', 'People, including fictional'), (Recode, 'PERSON', 'People, including fictional'), (earlier this week, 'DATE', 'Absolute or relative dates or periods')]


In [None]:
doc = nlp("San Francisco considers banning sidewalk delivery robots")

# document level
ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
print(ents)

# token level
ent_san = [doc[0].text, doc[0].ent_iob_, doc[0].ent_type_]
ent_francisco = [doc[1].text, doc[1].ent_iob_, doc[1].ent_type_]
print(ent_san)  # ['San', 'B', 'GPE']
print(ent_francisco)  # ['Francisco', 'I', 'GPE']

displacy.serve(doc, style="ent")

[('San Francisco', 0, 13, 'GPE')]
['San', 'B', 'GPE']
['Francisco', 'I', 'GPE']





Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...



In [82]:
ner=nlp.get_pipe("ner")
TRAIN_DATA=[('ABC is a worldwide organization',{'entities':[(0,3,'CRORG')]}),
           ('we stand with ABC',{'entities':[(24,26,'CRORG')]}),
           ('we supports ABC',{'entities':[(15,17,'CRORG')]})]
ner.add_label('CRORG')
# Disable pipeline components that dont need to change
pipe_exceptions = ["ner"]
unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

with nlp.disable_pipes(*unaffected_pipes):
    for iteration in range(30):
        random.shuffle(TRAIN_DATA)
        for raw_text,entity_offsets in TRAIN_DATA:
            doc=nlp.make_doc(raw_text)
            nlp.update([Example.from_dict(doc,entity_offsets)])
            
test_data = "i am working in ABC as senior data science"
doc=nlp(test_data)
for ent in doc.ents:
    print([ent.text,ent.label_])
    
# print(nlp.pipe_names)
displacy.serve(doc, style="ent")

['ABC', 'CRORG']



Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [61]:
# Performing NER on E-commerce article

article_text="""India that previously comprised only a handful of players in the e-commerce space, is now home to many biggies and giants battling out with each other to reach the top. This is thanks to the overwhelming internet and smartphone penetration coupled with the ever-increasing digital adoption across the country. These new-age innovations not only gave emerging startups a unique platform to deliver seamless shopping experiences but also provided brick and mortar stores with a level-playing field to begin their online journeys without leaving their offline legacies.
In the wake of so many players coming together on one platform, the Indian e-commerce market is envisioned to reach USD 84 billion in 2021 from USD 24 billion in 2017. Further, with the rate at which internet penetration is increasing, we can expect more and more international retailers coming to India in addition to a large pool of new startups. This, in turn, will provide a major Philip to the organized retail market and boost its share from 12% in 2017 to 22-25% by 2021. 
Here’s a view to the e-commerce giants that are dominating India’s online shopping space:
Amazon – One of the uncontested global leaders, Amazon started its journey as a simple online bookstore that gradually expanded its reach to provide a large suite of diversified products including media, furniture, food, and electronics, among others. And now with the launch of Amazon Prime and Amazon Music Limited, it has taken customer experience to a godly level, which will remain undefeatable for a very long time. 
Flipkart – Founded in 2007, Flipkart is recognized as the national leader in the Indian e-commerce market. Just like Amazon, it started operating by selling books and then entered other categories such as electronics, fashion, and lifestyle, mobile phones, etc. And now that it has been acquired by Walmart, one of the largest leading platforms of e-commerce in the US, it has also raised its bar of customer offerings in all aspects and giving huge competition to Amazon. 
Snapdeal – Started as a daily deals platform in 2010, Snapdeal became a full-fledged online marketplace in 2011 comprising more than 3 lac sellers across India. The platform offers over 30 million products across 800+ diverse categories from over 125,000 regional, national, and international brands and retailers. The Indian e-commerce firm follows a robust strategy to stay at the forefront of innovation and deliver seamless customer offerings to its wide customer base. It has shown great potential for recovery in recent years despite losing Freecharge and Unicommerce. 
ShopClues – Another renowned name in the Indian e-commerce industry, ShopClues was founded in July 2011. It’s a Gurugram based company having a current valuation of INR 1.1 billion and is backed by prominent names including Nexus Venture Partners, Tiger Global, and Helion Ventures as its major investors. Presently, the platform comprises more than 5 lac sellers selling products in nine different categories such as computers, cameras, mobiles, etc. 
Paytm Mall – To compete with the existing e-commerce giants, Paytm, an online payment system has also launched its online marketplace – Paytm Mall, which offers a wide array of products ranging from men and women fashion to groceries and cosmetics, electronics and home products, and many more. The unique thing about this platform is that it serves as a medium for third parties to sell their products directly through the widely-known app – Paytm. 
Reliance Retail – Given Reliance Jio’s disruptive venture in the Indian telecom space along with a solid market presence of Reliance, it is no wonder that Reliance will soon be foraying into retail space. As of now, it has plans to build an e-commerce space that will be established on online-to-offline market program and aim to bring local merchants on board to help them boost their sales and compete with the existing industry leaders. 
Big Basket – India’s biggest online supermarket, Big Basket provides a wide variety of imported and gourmet products through two types of delivery services – express delivery and slotted delivery. It also offers pre-cut fruits along with a long list of beverages including fresh juices, cold drinks, hot teas, etc. Moreover, it not only provides farm-fresh products but also ensures that the farmer gets better prices. 
Grofers – One of the leading e-commerce players in the grocery segment, Grofers started its operations in 2013 and has reached overwhelming heights in the last 5 years. Its wide range of products includes atta, milk, oil, daily need products, vegetables, dairy products, juices, beverages, among others. With its growing reach across India, it has become one of the favorite supermarkets for Indian consumers who want to shop grocery items from the comforts of their homes. 
Digital Mall of Asia – Going live in 2020, Digital Mall of Asia is a very unique concept coined by the founders of Yokeasia Malls. It is designed to provide an immersive digital space equipped with multiple visual and sensory elements to sellers and shoppers. It will also give retailers exclusive rights to sell a particular product category or brand in their respective cities. What makes it unique is its zero-commission model enabling retailers to pay only a fixed amount of monthly rental instead of paying commissions. With its one-of-a-kind features, DMA is expected to bring
never-seen transformation to the current e-commerce ecosystem while addressing all the existing e-commerce worries such as counterfeiting. """

doc=nlp(article_text)
# for ent in doc.ents:
#     print([ent.text,ent.label_]) # Mistagged for flipkart, paytm etc

In [48]:
# training data
TRAIN_DATA = [
              ("Walmart is a leading e-commerce company", {"entities": [(0, 7, "ORG")]}),
              ("I reached Chennai yesterday.", {"entities": [(19, 28, "GPE")]}),
              ("I recently ordered a book from Amazon", {"entities": [(24,32, "ORG")]}),
              ("I was driving a BMW", {"entities": [(16,19, "PRODUCT")]}),
              ("I ordered this from ShopClues", {"entities": [(20,29, "ORG")]}),
              ("Fridge can be ordered in Amazon ", {"entities": [(0,6, "PRODUCT")]}),
              ("I bought a new Washer", {"entities": [(16,22, "PRODUCT")]}),
              ("I bought a old table", {"entities": [(16,21, "PRODUCT")]}),
              ("I bought a fancy dress", {"entities": [(18,23, "PRODUCT")]}),
              ("I rented a camera", {"entities": [(12,18, "PRODUCT")]}),
              ("I rented a tent for our trip", {"entities": [(12,16, "PRODUCT")]}),
              ("I rented a screwdriver from our neighbour", {"entities": [(12,22, "PRODUCT")]}),
              ("I repaired my computer", {"entities": [(15,23, "PRODUCT")]}),
              ("I got my clock fixed", {"entities": [(16,21, "PRODUCT")]}),
              ("I got my truck fixed", {"entities": [(16,21, "PRODUCT")]}),
              ("Flipkart started it's journey from zero", {"entities": [(0,8, "ORG")]}),
              ("I recently ordered from Max", {"entities": [(24,27, "ORG")]}),
              ("Flipkart is recognized as leader in market",{"entities": [(0,8, "ORG")]}),
              ("I recently ordered from Swiggy", {"entities": [(24,29, "ORG")]})
              ]

# Adding labels to the `ner`

for _, annotations in TRAIN_DATA:
    for ent in annotations.get("entities"):
        ner.add_label(ent[2])
        
# Disable pipeline components you dont need to change
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

In [40]:
from spacy.tokens import Doc
from spacy.training import Example

nlp = spacy.load('en_core_web_sm')
doc = nlp(u"#bbuzz 2016: Rafał Kuć - Running High Performance And Fault Tolerant Elasticsearch")
for entity in doc.ents:
    print(entity.label_, ' | ', entity.text)
    
#  #bbuzz isn’t money. And I doubt that Rafał was Running High while giving that presentation

nlp = spacy.blank('en') # new, empty model. Let's say it's for the English language

nlp.vocab.vectors.name = 'example_model_training' # give a name to our list of vectors

# add NER pipeline
ner = nlp.create_pipe('ner') # our pipeline would just do NER

nlp.add_pipe('ner', last=True) # we add the pipeline to the model

DATA = [
  (u"Search Analytics: Business Value & BigData NoSQL Backend, Otis Gospodnetic ", {'entities': [(58,75,'PERSON')]}),
  (u"Introduction to Elasticsearch by Radu ", {'entities': [(16,29,'TECH'), (32, 36, 'PERSON')]}),
  # ...
]

ner.add_label('PERSON')
ner.add_label('TECH')
# # ...

optimizer = nlp.begin_training()

# nlp.update([text], [annotations], sgd = optimizer)

for i in range(20):
    random.shuffle(DATA)
    for text, annotations in DATA:
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, {"entities": annotations})
        nlp.update([example], sgd=optimizer)
#         nlp.update([text], [annotations], sgd=optimizer)
        

doc = nlp(u"#bbuzz 2016: Rafał Kuć - Running High Performance And Fault Tolerant Elasticsearch")
for entity in doc.ents:
    print(entity.label_, ' | ', entity.text)

DATE  |  2016
WORK_OF_ART  |  Rafał Kuć - Running High Performance


ValueError: [E973] Unexpected type for NER data