In [1]:
%pip install -U pip setuptools wheel
%pip install -U spacy

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import spacy
nlp = spacy.load("en_core_web_sm")
import en_core_web_sm
nlp = en_core_web_sm.load()
doc = nlp("This is a sentence.")
print([(w.text, w.pos_) for w in doc])

[('This', 'PRON'), ('is', 'AUX'), ('a', 'DET'), ('sentence', 'NOUN'), ('.', 'PUNCT')]


In [3]:
import spacy
from spacy.symbols import ORTH

nlp = spacy.load("en_core_web_sm")
doc = nlp("gimme that")  # phrase to tokenize
print([w.text for w in doc])  # ['gimme', 'that']

# Add special case rule
special_case = [{ORTH: "gim"}, {ORTH: "me"}]
nlp.tokenizer.add_special_case("gimme", special_case)

# Check new tokenization
print([w.text for w in nlp("gimme that")])  # ['gim', 'me', 'that']


['gimme', 'that']
['gim', 'me', 'that']


In [4]:
from spacy.lang.en import English

nlp = English()
text = '''"Let's go!"'''
doc = nlp(text)
tok_exp = nlp.tokenizer.explain(text)
assert [t.text for t in doc if not t.is_space] == [t[1] for t in tok_exp]
for t in tok_exp:
    print(t[1], "\\t", t[0])

" \t PREFIX
Let \t SPECIAL-1
's \t SPECIAL-2
go \t TOKEN
! \t SUFFIX
" \t SUFFIX


In [5]:
import spacy

nlp = spacy.load("en_core_web_sm")
print("Pipeline:", nlp.pipe_names)
doc = nlp("I was reading the paper.")
token = doc[0]  # 'I'
print(token.morph)  # 'Case=Nom|Number=Sing|Person=1|PronType=Prs'
print(token.morph.get("PronType"))  # ['Prs']


Pipeline: ['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']
Case=Nom|Number=Sing|Person=1|PronType=Prs
['Prs']


In [6]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)
    


Apple Apple PROPN NNP nsubj Xxxxx True False
is be AUX VBZ aux xx True True
looking look VERB VBG ROOT xxxx True False
at at ADP IN prep xx True True
buying buy VERB VBG pcomp xxxx True False
U.K. U.K. PROPN NNP dobj X.X. False False
startup startup NOUN NN dep xxxx True False
for for ADP IN prep xxx True True
$ $ SYM $ quantmod $ False False
1 1 NUM CD compound d False False
billion billion NUM CD pobj xxxx True False


In [7]:
doc = nlp("Wo bist du?") # English: 'Where are you?'
print(doc[2].morph)  # 'Case=Nom|Number=Sing|Person=2|PronType=Prs'
print(doc[2].pos_) # 'PRON'

Number=Sing
PROPN


In [8]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)

Apple Apple PROPN NNP nsubj Xxxxx True False
is be AUX VBZ aux xx True True
looking look VERB VBG ROOT xxxx True False
at at ADP IN prep xx True True
buying buy VERB VBG pcomp xxxx True False
U.K. U.K. PROPN NNP dobj X.X. False False
startup startup NOUN NN dep xxxx True False
for for ADP IN prep xxx True True
$ $ SYM $ quantmod $ False False
1 1 NUM CD compound d False False
billion billion NUM CD pobj xxxx True False


In [9]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Where are you?")
print(doc[2].morph)  # 'Case=Nom|Person=2|PronType=Prs'
print(doc[2].pos_)  # 'PRON'

Case=Nom|Person=2|PronType=Prs
PRON


In [10]:
import spacy

# English pipelines include a rule-based lemmatizer
nlp = spacy.load("en_core_web_sm")
lemmatizer = nlp.get_pipe("lemmatizer")
print(lemmatizer.mode)  # 'rule'

doc = nlp("I was reading the paper.")
print([token.lemma_ for token in doc])
# ['I', 'be', 'read', 'the', 'paper', '.']

rule
['I', 'be', 'read', 'the', 'paper', '.']


In [11]:
%pip install -U spacy[lookups]

zsh:1: no matches found: spacy[lookups]
Note: you may need to restart the kernel to use updated packages.


In [12]:

import spacy

nlp = spacy.blank("sv")
nlp.add_pipe("lemmatizer", config={"mode": "lookup"})

<spacy.pipeline.lemmatizer.Lemmatizer at 0x3116c1ad0>

In [13]:
# pip install -U spacy[lookups]
import spacy

nlp = spacy.blank("de")
# Morphologizer (note: model is not yet trained!)
nlp.add_pipe("morphologizer")
# Rule-based lemmatizer
nlp.add_pipe("lemmatizer", config={"mode": "rule"})

<spacy.pipeline.lemmatizer.Lemmatizer at 0x3116c3910>

In [14]:
import spacy

nlp = spacy.blank("de")
nlp.add_pipe("trainable_lemmatizer", name="lemmatizer")

<spacy.pipeline.edit_tree_lemmatizer.EditTreeLemmatizer at 0x30c201970>

In [15]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Autonomous cars shift insurance liability toward manufacturers")
for chunk in doc.noun_chunks:
    print(chunk.text, chunk.root.text, chunk.root.dep_,
            chunk.root.head.text)

Autonomous cars cars nsubj shift
insurance liability liability dobj shift
manufacturers manufacturers pobj toward


In [16]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Autonomous cars shift insurance liability toward manufacturers")
for token in doc:
    print(token.text, token.dep_, token.head.text, token.head.pos_,
            [child for child in token.children])

Autonomous amod cars NOUN []
cars nsubj shift VERB [Autonomous]
shift ROOT shift VERB [cars, liability, toward]
insurance compound liability NOUN []
liability dobj shift VERB [insurance]
toward prep shift VERB [manufacturers]
manufacturers pobj toward ADP []


In [17]:
import spacy
from spacy.symbols import nsubj, VERB

nlp = spacy.load("en_core_web_sm")
doc = nlp("Autonomous cars shift insurance liability toward manufacturers")

# Finding a verb with a subject from below — good
verbs = set()
for possible_subject in doc:
    if possible_subject.dep == nsubj and possible_subject.head.pos == VERB:
        verbs.add(possible_subject.head)
print(verbs)

{shift}


In [18]:
# Finding a verb with a subject from above — less good
verbs = []
for possible_verb in doc:
    if possible_verb.pos == VERB:
        for possible_subject in possible_verb.children:
            if possible_subject.dep == nsubj:
                verbs.append(possible_verb)
                break

In [19]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("bright red apples on the tree")
print([token.text for token in doc[2].lefts])  # ['bright', 'red']
print([token.text for token in doc[2].rights])  # ['on']
print(doc[2].n_lefts)  # 2
print(doc[2].n_rights)  # 1

['bright', 'red']
['on']
2
1


In [21]:

doc = nlp("schöne rote Äpfel auf dem Baum")
print([token.text for token in doc[2].lefts])  # ['schöne', 'rote']
print([token.text for token in doc[2].rights])  # ['auf']

[]
[]


In [22]:
doc = nlp("Credit and mortgage account holders must submit their requests")

root = [token for token in doc if token.head == token][0]
subject = list(root.lefts)[0]
for descendant in subject.subtree:
    assert subject is descendant or subject.is_ancestor(descendant)
    print(descendant.text, descendant.dep_, descendant.n_lefts,
            descendant.n_rights,
            [ancestor.text for ancestor in descendant.ancestors])

Credit nmod 0 2 ['account', 'holders', 'submit']
and cc 0 0 ['Credit', 'account', 'holders', 'submit']
mortgage conj 0 0 ['Credit', 'account', 'holders', 'submit']
account compound 1 0 ['holders', 'submit']
holders nsubj 1 0 ['submit']


In [23]:
doc = nlp("Credit and mortgage account holders must submit their requests")
span = doc[doc[4].left_edge.i : doc[4].right_edge.i+1]
with doc.retokenize() as retokenizer:
    retokenizer.merge(span)
for token in doc:
    print(token.text, token.pos_, token.dep_, token.head.text)

Credit and mortgage account holders NOUN nsubj submit
must AUX aux submit
submit VERB ROOT submit
their PRON poss requests
requests NOUN dobj submit


In [24]:
# Merge noun phrases and entities for easier analysis
nlp.add_pipe("merge_entities")
nlp.add_pipe("merge_noun_chunks")

TEXTS = [
    "Net income was $9.4 million compared to the prior year of $2.7 million.",
    "Revenue exceeded twelve billion dollars, with a loss of $1b.",
]
for doc in nlp.pipe(TEXTS):
    for token in doc:
        if token.ent_type_ == "MONEY":
            # We have an attribute and direct object, so check for subject
            if token.dep_ in ("attr", "dobj"):
                subj = [w for w in token.head.lefts if w.dep_ == "nsubj"]
                if subj:
                    print(subj[0], "-->", token)
            # We have a prepositional object with a preposition
            elif token.dep_ == "pobj" and token.head.dep_ == "prep":
                print(token.head.head, "-->", token)

Net income --> $9.4 million
the prior year --> $2.7 million
Revenue --> twelve billion dollars
a loss --> 1b


In [26]:
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Autonomous cars shift insurance liability toward manufacturers")
# Since this is an interactive Jupyter environment, we can use displacy.render here
displacy.render(doc, style='dep')

In [27]:
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 44 54 MONEY


In [28]:
doc = nlp("San Francisco considers banning sidewalk delivery robots")

# document level
ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
print(ents)

# token level
ent_san = [doc[0].text, doc[0].ent_iob_, doc[0].ent_type_]
ent_francisco = [doc[1].text, doc[1].ent_iob_, doc[1].ent_type_]
print(ent_san)  # ['San', 'B', 'GPE']
print(ent_francisco)  # ['Francisco', 'I', 'GPE']

[('San Francisco', 0, 13, 'GPE')]
['San', 'B', 'GPE']
['Francisco', 'I', 'GPE']


In [31]:
from spacy.tokens import Span


doc = nlp("fb is hiring a new vice president of global policy")
ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
print('Before', ents)
# The model didn't recognize "fb" as an entity :(

# Create a span for the new entity
fb_ent = Span(doc, 0, 1, label="ORG")
orig_ents = list(doc.ents)

# Option 1: Modify the provided entity spans, leaving the rest unmodified
doc.set_ents([fb_ent], default="unmodified")

# Option 2: Assign a complete list of ents to doc.ents
doc.ents = orig_ents + [fb_ent]

ents = [(e.text, e.start, e.end, e.label_) for e in doc.ents]
print('After', ents)
# [('fb', 0, 1, 'ORG')] 🎉

Before []
After [('fb', 0, 1, 'ORG')]


In [32]:
fb_ent = doc.char_span(0, 2, label="ORG")


In [34]:
import numpy
import spacy
from spacy.attrs import ENT_IOB, ENT_TYPE

doc = nlp.make_doc("London is a big city in the United Kingdom.")
print("Before", doc.ents)  # []

header = [ENT_IOB, ENT_TYPE]
attr_array = numpy.zeros((len(doc), len(header)), dtype="uint64")
attr_array[0, 0] = 3  # B
attr_array[0, 1] = doc.vocab.strings["GPE"]
doc.from_array(header, attr_array)
print("After", doc.ents)  # [London]

Before ()
After (London,)


In [42]:
# Don't forget to install a trained pipeline, e.g.: python -m spacy download en

# In[1]:
import spacy
from spacy import displacy

# In[2]:
doc = nlp("Rats are various medium-sized, long-tailed rodents.")
displacy.render(doc, style="dep")

# In[3]:
doc2 = nlp(LONG_NEWS_ARTICLE)
displacy.render(doc2, style="ent")

NameError: name 'LONG_NEWS_ARTICLE' is not defined

In [44]:
from IPython.core.display import display, HTML

html = displacy.render(doc, style="dep")
display(HTML(html))

  from IPython.core.display import display, HTML
https://spacy.io/usage/models


<IPython.core.display.HTML object>

In [48]:
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc1 = nlp("This is a sentence.")
doc2 = nlp("This is another sentence.")
html = displacy.render([doc1, doc2], style="dep", page=True)




In [50]:
# pip install -U spacy
# python -m spacy download en_core_web_sm
import spacy

# Load English tokenizer, tagger, parser and NER
nlp = spacy.load("en_core_web_sm")

# Process whole documents
text = ("When Sebastian Thrun started working on self-driving cars at "
        "Google in 2007, few people outside of the company took him "
        "seriously. “I can tell you very senior CEOs of major American "
        "car companies would shake my hand and turn away because I wasn’t "
        "worth talking to,” said Thrun, in an interview with Recode earlier "
        "this week.")
doc = nlp(text)

# Analyze syntax
print("Noun phrases:", [chunk.text for chunk in doc.noun_chunks])
print("Verbs:", [token.lemma_ for token in doc if token.pos_ == "VERB"])

# Find named entities, phrases and concepts
for entity in doc.ents:
    print(entity.text, entity.label_)


Noun phrases: ['Sebastian Thrun', 'self-driving cars', 'Google', 'few people', 'the company', 'him', 'I', 'you', 'very senior CEOs', 'major American car companies', 'my hand', 'I', 'Thrun', 'an interview', 'Recode']
Verbs: ['start', 'work', 'drive', 'take', 'tell', 'shake', 'turn', 'talk', 'say']
Sebastian Thrun PERSON
Google ORG
2007 DATE
American NORP
Thrun PERSON
Recode ORG
earlier this week DATE


In [53]:

import spacy
nlp = spacy.load("nb_core_news_sm")
import nb_core_news_sm
nlp = nb_core_news_sm.load()
doc = nlp("Dette er en setning.")
print([(w.text, w.pos_) for w in doc])

[('Dette', 'PRON'), ('er', 'AUX'), ('en', 'DET'), ('setning', 'NOUN'), ('.', 'PUNCT')]


In [None]:
"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.hi.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


sentences = [
    "एप्पल 1 अरब डॉलर के लिए यू.के. स्टार्टअप खरीदने पर विचार कर रहा है।",
    "स्वायत्त कारें निर्माताओं की ओर बीमा दायित्व रखतीं हैं।",
    "सैन फ्रांसिस्को फुटपाथ वितरण रोबोटों पर प्रतिबंध लगाने का विचार कर रहा है।",
    "लंदन यूनाइटेड किंगडम का विशाल शहर है।",
    "आप कहाँ हो?",
    "फ्रांस के राष्ट्रपति कौन हैं?",
    "संयुक्त राज्यों की राजधानी क्या है?",
    "बराक ओबामा का जन्म कब हुआ था?",
    "जवाहरलाल नेहरू भारत के पहले प्रधानमंत्री हैं।",
    "राजेंद्र प्रसाद, भारत के पहले राष्ट्रपति, दो कार्यकाल के लिए कार्यालय रखने वाले एकमात्र व्यक्ति हैं।",
]

In [66]:
from spacy.lang.hi.examples import sentences
docs = nlp.pipe(sentences)
sentences = [
    "एप्पल 1 अरब डॉलर के लिए यू.के. स्टार्टअप खरीदने पर विचार कर रहा है।",
    "स्वायत्त कारें निर्माताओं की ओर बीमा दायित्व रखतीं हैं।",
    "सैन फ्रांसिस्को फुटपाथ वितरण रोबोटों पर प्रतिबंध लगाने का विचार कर रहा है।",
    "लंदन यूनाइटेड किंगडम का विशाल शहर है।",
    "आप कहाँ हो?",
    "फ्रांस के राष्ट्रपति कौन हैं?",
    "संयुक्त राज्यों की राजधानी क्या है?",
    "बराक ओबामा का जन्म कब हुआ था?",
    "जवाहरलाल नेहरू भारत के पहले प्रधानमंत्री हैं।",
    "राजेंद्र प्रसाद, भारत के पहले राष्ट्रपति, दो कार्यकाल के लिए कार्यालय रखने वाले एकमात्र व्यक्ति हैं।",
]
for token in docs:
    print(token)



एप्पल 1 अरब डॉलर के लिए यू.के. स्टार्टअप खरीदने पर विचार कर रहा है।
स्वायत्त कारें निर्माताओं की ओर बीमा दायित्व रखतीं हैं।
सैन फ्रांसिस्को फुटपाथ वितरण रोबोटों पर प्रतिबंध लगाने का विचार कर रहा है।
लंदन यूनाइटेड किंगडम का विशाल शहर है।
आप कहाँ हो?
फ्रांस के राष्ट्रपति कौन हैं?
संयुक्त राज्यों की राजधानी क्या है?
बराक ओबामा का जन्म कब हुआ था?
जवाहरलाल नेहरू भारत के पहले प्रधानमंत्री हैं।
राजेंद्र प्रसाद, भारत के पहले राष्ट्रपति, दो कार्यकाल के लिए कार्यालय रखने वाले एकमात्र व्यक्ति हैं।


In [63]:
nlp = spacy.blank("hi")

In [59]:
"""
Example sentences to test spaCy and its language models.

>>> from spacy.lang.hi.examples import sentences
>>> docs = nlp.pipe(sentences)
"""


sentences = [
    "एप्पल 1 अरब डॉलर के लिए यू.के. स्टार्टअप खरीदने पर विचार कर रहा है।",
    "स्वायत्त कारें निर्माताओं की ओर बीमा दायित्व रखतीं हैं।",
    "सैन फ्रांसिस्को फुटपाथ वितरण रोबोटों पर प्रतिबंध लगाने का विचार कर रहा है।",
    "लंदन यूनाइटेड किंगडम का विशाल शहर है।",
    "आप कहाँ हो?",
    "फ्रांस के राष्ट्रपति कौन हैं?",
    "संयुक्त राज्यों की राजधानी क्या है?",
    "बराक ओबामा का जन्म कब हुआ था?",
    "जवाहरलाल नेहरू भारत के पहले प्रधानमंत्री हैं।",
    "राजेंद्र प्रसाद, भारत के पहले राष्ट्रपति, दो कार्यकाल के लिए कार्यालय रखने वाले एकमात्र व्यक्ति हैं।",
]

In [52]:
from ...language import BaseDefaults, Language
from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS


class HindiDefaults(BaseDefaults):
    stop_words = STOP_WORDS
    lex_attr_getters = LEX_ATTRS


class Hindi(Language):
    lang = "hi"
    Defaults = HindiDefaults


__all__ = ["Hindi"]

ImportError: attempted relative import with no known parent package