In [45]:
import spacy
# Load the pretrained "en_core_web_sm" pipeline into `nlp`.
nlp = spacy.load("en_core_web_sm")

In [12]:
# Run the pipeline on a sample sentence to get a processed Doc.
doc = nlp("hi , I am Aman Agrawal")

In [13]:
print(nlp.pipe_names) # names of the components in the loaded pipeline, in order

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


In [14]:
# nlp.pipeline pairs each component name with the component object itself.
print(nlp.pipeline)

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec object at 0x14dbc3ad0>), ('tagger', <spacy.pipeline.tagger.Tagger object at 0x14d6cec30>), ('parser', <spacy.pipeline.dep_parser.DependencyParser object at 0x14dbca030>), ('attribute_ruler', <spacy.pipeline.attributeruler.AttributeRuler object at 0x14df7df10>), ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer object at 0x14d65b010>), ('ner', <spacy.pipeline.ner.EntityRecognizer object at 0x14dbca260>)]


In [15]:
# Bare expression: the list of component names is shown as the cell's value.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [16]:
from spacy.language import Language

@Language.component("length_component")
def length_function(doc):
    """Print how many tokens the incoming Doc holds, then hand the Doc back unchanged.

    Registered under the name "length_component" so it can be added to a
    pipeline with ``nlp.add_pipe``.
    """
    token_count = len(doc)
    print(f"Length of the token : {token_count}")
    return doc

# first=True inserts the custom component at the front of the pipeline.
nlp.add_pipe("length_component" , first = True) 

<function __main__.length_function(doc)>

In [17]:
doc1 = nlp("Hi am studying") # the custom component prints the token count while the text is processed

Length of the token : 3


In [18]:
# Check where the custom component ended up in the pipeline.
nlp.pipe_names

['length_component',
 'tok2vec',
 'tagger',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner']

In [23]:
# Revision of Matcher and PhraseMatcher
from spacy.matcher import Matcher , PhraseMatcher

# NOTE: "Retriver" is misspelled the same way in the text and in the pattern, so they still match.
doc = nlp("I love dogs and Golden Retriver are my favs")

# A Matcher pattern is a list of per-token dicts; LOWER compares against the
# lowercased token text, so the casing used in the sentence does not matter.
matcher = Matcher(nlp.vocab)
matcher.add("DOG_by_Matcher" , [[{"LOWER": "golden"}, {"LOWER": "retriver"}]])

# Each match is a (match_id, start, end) triple; start/end slice tokens out of doc.
for match_id , start , end in matcher(doc):
    span = doc[start:end]
    print(span.text)

Length of the token : 9
Golden Retriver


In [38]:
# PhraseMatcher patterns are Doc objects rather than lists of token dicts.
# The lowercase pattern below produces no match in this run: the loop prints nothing,
# because the comparison is case-sensitive here.
matcher = PhraseMatcher(nlp.vocab)
matcher.add("DOG_by_PM" , [nlp("golden retriver")])

for match_id , start , end in matcher(doc):
    span = doc[start:end]
    print(span.text)

Length of the token : 2


In [26]:
# PhraseMatcher patterns are Doc objects rather than lists of token dicts.
# With the pattern cased exactly like the text, the phrase is found.
matcher = PhraseMatcher(nlp.vocab)
matcher.add("DOG_by_PM" , [nlp("Golden Retriver")])

for match_id , start , end in matcher(doc):
    span = doc[start:end]
    print(span.text)

Length of the token : 2
Golden Retriver


In [42]:
import json
# Parse the country-name list straight from the open file handle.
with open("/Users/0xr4plh/Documents/Machine Learning/my-nlp-basics/spaCy/Code/countries.json" , encoding="utf8") as f:
    COUNTRIES = json.load(f)

In [43]:
# Preview only the first ten names instead of the whole list.
COUNTRIES[:10]

['Afghanistan',
 'Åland Islands',
 'Albania',
 'Algeria',
 'American Samoa',
 'Andorra',
 'Angola',
 'Anguilla',
 'Antarctica',
 'Antigua and Barbuda']

In [44]:
# nlp.pipe returns a generator; nothing is processed until it is iterated.
nlp.pipe(COUNTRIES)

<generator object Language.pipe at 0x14c557e10>

In [46]:
# Consuming the generator yields one processed Doc per country name.
list(nlp.pipe(COUNTRIES))

[Afghanistan,
 Åland Islands,
 Albania,
 Algeria,
 American Samoa,
 Andorra,
 Angola,
 Anguilla,
 Antarctica,
 Antigua and Barbuda,
 Argentina,
 Armenia,
 Aruba,
 Australia,
 Austria,
 Azerbaijan,
 Bahamas,
 Bahrain,
 Bangladesh,
 Barbados,
 Belarus,
 Belgium,
 Belize,
 Benin,
 Bermuda,
 Bhutan,
 Bolivia (Plurinational State of),
 Bonaire, Sint Eustatius and Saba,
 Bosnia and Herzegovina,
 Botswana,
 Bouvet Island,
 Brazil,
 British Indian Ocean Territory,
 United States Minor Outlying Islands,
 Virgin Islands (British),
 Virgin Islands (U.S.),
 Brunei Darussalam,
 Bulgaria,
 Burkina Faso,
 Burundi,
 Cambodia,
 Cameroon,
 Canada,
 Cabo Verde,
 Cayman Islands,
 Central African Republic,
 Chad,
 Chile,
 China,
 Christmas Island,
 Cocos (Keeling) Islands,
 Colombia,
 Comoros,
 Congo,
 Congo (Democratic Republic of the),
 Cook Islands,
 Costa Rica,
 Croatia,
 Cuba,
 Curaçao,
 Cyprus,
 Czech Republic,
 Denmark,
 Djibouti,
 Dominica,
 Dominican Republic,
 Ecuador,
 Egypt,
 El Salvador,
 Equa

In [50]:
matcher = PhraseMatcher(nlp.vocab)

# One Doc pattern per country name, all registered under a single match key.
patterns = list(nlp.pipe(COUNTRIES))
matcher.add("BY_PM_COUNTRIES" , patterns)

doc = nlp("Czech Republic may help Slovakia protect its airspace")

# matcher(doc) yields (match_id, start, end) triples; slice doc to recover each matched span.
matches = matcher(doc)
print([doc[start:end] for match_id, start, end in matches]) # matching is case-sensitive here

[Czech Republic, Slovakia]


In [55]:
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span

# Country names: parsed directly from the JSON file handle.
with open("/Users/0xr4plh/Documents/Machine Learning/my-nlp-basics/spaCy/Code/countries.json" , encoding="utf8") as f:
    COUNTRIES = json.load(f)

# Raw text that will be searched for those country names.
with open("/Users/0xr4plh/Documents/Machine Learning/my-nlp-basics/spaCy/Code/country_text.txt" , encoding="utf8") as f:
    TEXT = f.read()

# Tag every country mention in TEXT as a GPE entity, replacing whatever the
# pipeline's own entity recognizer predicted.
matcher = PhraseMatcher(nlp.vocab)

matcher.add("match" , list(nlp.pipe(COUNTRIES)))

doc = nlp(TEXT)

# Collect all matched spans first instead of growing doc.ents one span at a time.
country_spans = [
    Span(doc, start, end, label="GPE")
    for match_id, start, end in matcher(doc)
]

# PhraseMatcher reports every hit, including overlapping ones (the list holds both
# "Congo" and "Congo (Democratic Republic of the)"), and doc.ents rejects overlapping
# spans. filter_spans keeps the longest span of each overlapping group, so the single
# assignment below cannot fail on such text. The assignment also discards the
# entities predicted by the pipeline, as the original `doc.ents = []` did.
doc.ents = spacy.util.filter_spans(country_spans)

for ent in doc.ents:
    print(ent.label_ , ent.text)

GPE Namibia
GPE South Africa
GPE Cambodia
GPE Kuwait
GPE Somalia
GPE Haiti
GPE Mozambique
GPE Somalia
GPE Rwanda
GPE Singapore
GPE Sierra Leone
GPE Afghanistan
GPE Iraq
GPE Sudan
GPE Congo
GPE Haiti


In [82]:
# In this exercise, you’ll be writing a custom component that uses the PhraseMatcher to find animal names in the document and adds the matched spans to the doc.ents. A PhraseMatcher with the animal patterns has already been created as the variable matcher.

# Define the custom component and apply the matcher to the doc.
# Create a Span for each match, assign the label ID for "ANIMAL" and overwrite the doc.ents with the new spans.
# Add the new component to the pipeline after the "ner" component.
# Process the text and print the entity text and entity label for the entities in doc.ents.

from spacy.matcher import PhraseMatcher

# Reload the pipeline so components added to the previous `nlp` object are not carried over.
nlp = spacy.load("en_core_web_sm")
animals = ["Golden Retriever", "cat", "turtle", "Rattus norvegicus"]

# One Doc pattern per animal name; the component defined below reads this `matcher`.
matcher = PhraseMatcher(nlp.vocab)
matcher.add("animal_matching" , list(nlp.pipe(animals)))

@Language.component("animal")
def animal_matcher(doc):
    """Set doc.ents to one ANIMAL span per PhraseMatcher hit and return the doc.

    Relies on the notebook-level ``matcher``. Any entities already present on
    the doc are replaced by the assignment.
    """
    animal_spans = []
    for _match_id, start, end in matcher(doc):
        animal_spans.append(Span(doc, start, end, label="ANIMAL"))

    # Assign the finished list once. A comprehension of
    # `list(doc.ents) + [Span(...)]` only builds throwaway lists and never
    # updates doc.ents.
    doc.ents = animal_spans
    return doc

# The component runs after "ner", so its doc.ents assignment replaces the entities
# that ner predicted; only the ANIMAL spans remain.
nlp.add_pipe("animal" , after = "ner")
print(nlp.pipe_names)

doc = nlp("I have a cat and a Golden Retriever , I like New York city and I work in SV company")
print([(ent.text, ent.label_) for ent in doc.ents])

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner', 'animal']
[('cat', 'ANIMAL'), ('Golden Retriever', 'ANIMAL')]


In [84]:
# added before ner

from spacy.matcher import PhraseMatcher

# Reload the pipeline so the "animal" component can be added again at a different position.
nlp = spacy.load("en_core_web_sm")
animals = ["Golden Retriever", "cat", "turtle", "Rattus norvegicus"]

# One Doc pattern per animal name; the component defined below reads this `matcher`.
matcher = PhraseMatcher(nlp.vocab)
matcher.add("animal_matching" , list(nlp.pipe(animals)))

@Language.component("animal")
def animal_matcher(doc):
    """Set doc.ents to one ANIMAL span per PhraseMatcher hit and return the doc.

    Relies on the notebook-level ``matcher``.
    """
    animal_spans = []
    for _match_id, start, end in matcher(doc):
        animal_spans.append(Span(doc, start, end, label="ANIMAL"))

    # Assign the finished list once. A comprehension of
    # `list(doc.ents) + [Span(...)]` only builds throwaway lists and never
    # updates doc.ents.
    doc.ents = animal_spans
    return doc

# The component runs before "ner", so ner receives a doc that already carries the
# ANIMAL spans and adds its own predictions; doc.ents ends up with both.
nlp.add_pipe("animal" , before = "ner")

doc = nlp("I have a cat and a Golden Retriever , I like New York city and I work in SV company")
print([(ent.text, ent.label_) for ent in doc.ents])

[('cat', 'ANIMAL'), ('Golden Retriever', 'ANIMAL'), ('New York', 'GPE'), ('SV', 'GPE')]


In [86]:
# Extension attributes - very important: they let you attach custom attributes to Token, Span and Doc objects, which makes them a powerful customization tool

In [87]:
# Extensions -> attribute extensions, property extensions and method extensions
# Attribute extension -> a plain attribute with a default value; can be overwritten
# Property extension -> defined via a getter function that takes one argument (e.g. the token)
# Method extension -> makes the attribute callable; can take one or more arguments