In [7]:
from spacy.lang.en import English
from spacy.tokens import Doc
from spacy.tokens import Span

In [8]:
# Instantiate the English language class; its vocab is passed to every Doc built below
nlp = English()

In [9]:
# Inputs for building a Doc by hand: the token texts, plus one boolean per
# token saying whether that token is followed by a space. The last token
# needs an entry as well.
words = ["Hello", "world", "!"]
spaces = [
    True,   # "Hello" is followed by a space
    False,  # "world" is directly followed by "!"
    False,  # "!" ends the text
]

In [10]:
# Build the Doc by hand with the Doc class from spacy.tokens: it takes the
# vocab plus the parallel words / spaces lists
doc = Doc(
    nlp.vocab,
    words=words,
    spaces=spaces,
)
doc

Hello world!

In [11]:
# A Span is a view of a slice of the doc, given by a start index (inclusive)
# and an end index (exclusive). Here it is created explicitly from the Span
# class; indices 0 and 2 select the first two tokens.
span = Span(doc, 0, 2)
span

Hello world

In [12]:
# A span can also be given a label through the label keyword; it covers the
# same two tokens as the unlabelled span.
span_with_label = Span(doc, 0, 2, label='GREETING')
span_with_label

Hello world

In [14]:
# doc.ents (the doc's entities) is writable, so it can be overwritten with
# arbitrary spans. It is an empty tuple before the assignment and contains
# the labelled span afterwards.
print(doc.ents)
doc.ents = [span_with_label]
doc.ents

()


(Hello world,)

In [15]:
# Target text: "spaCy is cool!"
# Only "spaCy" and "is" are followed by a space.
words = ["spaCy", "is", "cool", "!"]
spaces = [True, True, False, False]

# Assemble the Doc from the parallel lists and print the reconstructed text
doc = Doc(
    nlp.vocab,
    words=words,
    spaces=spaces,
)
print(doc.text)

spaCy is cool!


In [17]:
# Target text: "Go, get started!"
# The comma attaches directly to "Go", so "Go" has no trailing space while
# the comma itself does.
words = ["Go", ",", "get", "started", "!"]
spaces = [False, True, True, False, False]

# Assemble the Doc from the parallel lists and print the reconstructed text
doc = Doc(
    nlp.vocab,
    words=words,
    spaces=spaces,
)
print(doc.text)

Go, get started!


In [18]:
# Target text: "Oh, really?!"
# Only the comma is followed by a space; "?" and "!" are separate tokens
# with no space between them.
words = ["Oh", ",", "really", "?", "!"]
spaces = [False, True, False, False, False]

# Assemble the Doc from the parallel lists and print the reconstructed text
doc = Doc(
    nlp.vocab,
    words=words,
    spaces=spaces,
)
print(doc.text)

Oh, really?!


In [19]:
# Manually add entities
words = ["I", "like", "David", "Bowie"]
spaces = [True, True, True, False]

doc = Doc(
    nlp.vocab,
    words=words,
    spaces=spaces,
)
print(doc.text)

# Tokens 2 and 3 ("David Bowie") make up the PERSON span; the end index 4 is exclusive.
# Note: span.label prints the label's integer ID, whereas span.label_ (used
# for the entities below) gives the readable string.
span = Span(doc, 2, 4, label="PERSON")
print(span.text, span.label)

# Overwrite the doc's entities with the new span and list them as (text, label) pairs
doc.ents = [span]
print([(ent.text, ent.label_) for ent in doc.ents])

I like David Bowie
David Bowie 380
[('David Bowie', 'PERSON')]
