# nlp.vocab.strings

In [2]:
# Import spaCy
import spacy

# Load the small English pipeline.
# NOTE: spacy.load() expects the "en_core_web_sm" package to be installed
# already (e.g. via `python -m spacy download en_core_web_sm`).
nlp = spacy.load("en_core_web_sm")

What's the hash code of `coffee` in the pretrained model?

In [3]:
# Indexing the string store with a string yields its integer hash.
nlp.vocab.strings["coffee"]

3197928453018144401

Is the lookup reversible? Yes — indexing the string store with the hash returns the original string.

In [4]:
# Derive the hash from the string instead of pasting the 19-digit literal,
# so the cell stays correct if the example word is ever changed.
coffee_hash = nlp.vocab.strings["coffee"]

# Reverse lookup: hash -> string.
nlp.vocab.strings[coffee_hash]

'coffee'

# Lexemes

In [5]:
# Indexing the vocab (not vocab.strings) by a string gives the lexeme entry
# whose attributes are inspected in the following cells.
lexeme = nlp.vocab['coffee']

In [6]:
# Lexical flag on the lexeme; evaluates to True for 'coffee'.
lexeme.is_alpha

True

In [7]:
# The lexeme's string form.
lexeme.text

'coffee'

`lexeme.orth` returns the hash of the lexeme's text — the same value the string store returned for `coffee` above.

In [8]:
# Integer hash of the lexeme's text; equals nlp.vocab.strings["coffee"].
lexeme.orth

3197928453018144401

In [9]:
import spacy

# Two blank pipelines, one per language (English first, then German)
nlp, nlp_de = spacy.blank("en"), spacy.blank("de")

# Ask the English string store for the hash of 'Bowie' and show it
bowie_id = nlp.vocab.strings["Bowie"]
print(bowie_id)

2644858412616767388


The German pipeline's string store does not contain `Bowie`, so looking up its hash there fails:

In [10]:
# The German string store has not seen 'Bowie' yet, so resolving the hash
# raises KeyError. The error is the point of this cell: catch it and print
# the message so that "Restart & Run All" can continue past this cell.
try:
    nlp_de.vocab.strings[bowie_id]
except KeyError as err:
    print(f"KeyError: {err}")

KeyError: "[E018] Can't retrieve string for hash '2644858412616767388'. This usually refers to an issue with the `Vocab` or `StringStore`."

Add the string to the German pipeline's string store:

In [11]:
# add() stores the string and returns its hash (the same value as bowie_id).
nlp_de.vocab.strings.add('Bowie')

2644858412616767388

Now the reverse lookup works:

In [12]:
# Resolve the hash back to its string in the German string store.
nlp_de.vocab.strings[bowie_id]

'Bowie'

# Create a Doc manually

In [13]:
import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")

# Target text: "Go, get started!"
# Each token is paired with a flag: is it followed by a space?
token_specs = [
    ("Go", False),
    (",", True),
    ("get", True),
    ("started", False),
    ("!", False),
]
words = [word for word, _ in token_specs]
spaces = [followed_by_space for _, followed_by_space in token_specs]

# Build the Doc directly from the tokens and their trailing-space flags
doc = Doc(nlp.vocab, words=words, spaces=spaces)
print(doc.text)

Go, get started!


Example: analyze a text and find all proper nouns that are immediately followed by a verb.

In [14]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Berlin looks like a nice city")

# Plain-Python views of the doc; the following cells display them.
token_texts = [token.text for token in doc]
pos_tags = [token.pos_ for token in doc]

# Pair every token with its successor. zip() stops at the shorter sequence,
# so the final token is never used as the "current" one. Indexing
# pos_tags[index + 1] instead raises IndexError whenever the text ends
# with a proper noun (e.g. "I like Berlin").
tokens = list(doc)
for token, next_token in zip(tokens, tokens[1:]):
    # A proper noun immediately followed by a verb
    if token.pos_ == "PROPN" and next_token.pos_ == "VERB":
        result = token.text
        print("Found proper noun before a verb:", result)

Found proper noun before a verb: Berlin


In [15]:
# Token texts collected from the doc in the previous cell.
token_texts

['Berlin', 'looks', 'like', 'a', 'nice', 'city']

In [16]:
# Part-of-speech tags (token.pos_) collected in the same order as token_texts.
pos_tags

['PROPN', 'VERB', 'ADP', 'DET', 'ADJ', 'NOUN']

In [17]:
# token.tag_ for every token, in document order
[tok.tag_ for tok in doc]

['NNP', 'VBZ', 'IN', 'DT', 'JJ', 'NN']

In [18]:
# Pair each tag with spaCy's human-readable description of it
[(tag, spacy.explain(tag)) for tag in (tok.tag_ for tok in doc)]

[('NNP', 'noun, proper singular'),
 ('VBZ', 'verb, 3rd person singular present'),
 ('IN', 'conjunction, subordinating or preposition'),
 ('DT', 'determiner'),
 ('JJ', 'adjective (English), other noun-modifier (Chinese)'),
 ('NN', 'noun, singular or mass')]