In [1]:
# Hashing: look a word up in the vocab's string store, in both directions
import spacy

nlp = spacy.blank("it")

# Run a sentence through the blank Italian pipeline first; the result itself
# is not needed, only the lookups below are.
_ = nlp("Lunedì andrò in ufficio in automobile")

# Keep one handle on the string store and use it for both lookups.
string_store = nlp.vocab.strings

# string -> hash
word_hash = string_store["automobile"]
print(word_hash)

# hash -> string
print(string_store[word_hash])

7211811266693931283
automobile


In [2]:
from spacy.tokens import Doc, Span

# Token texts, plus one flag per token saying whether a space follows it
words = ["Hello", "world", "!"]
spaces = [True, False, False]

# Build the doc by hand on top of the vocab of the current `nlp` object
doc = Doc(nlp.vocab, words=words, spaces=spaces)

# A labelled span over tokens 0 and 1 (the end index 2 is exclusive) ...
span_with_label = Span(doc, 0, 2, label="GREETING")

# ... and the same token range without a label
span = Span(doc, 0, 2)

# Register the labelled span as the doc's only entity
greeting_entities = [span_with_label]
doc.ents = greeting_entities

print(doc.text)
for ent in doc.ents:
    print(ent)

Hello world!
Hello world


In [3]:
nlp = spacy.load("en_core_web_sm")
doc = nlp("Berlin looks like a nice city")

# Report every proper noun that is immediately followed by a verb.
# Pairing each token with its successor via zip() stops at the second-to-last
# token, so a text that *ends* in a proper noun no longer raises IndexError
# (the previous `doc[token.i + 1]` lookup ran past the end of the doc).
for token, next_token in zip(doc, doc[1:]):
    if token.pos_ == "PROPN" and next_token.pos_ == "VERB":
        print("Found proper noun before a verb:", token)

Found proper noun before a verb: Berlin


In [4]:
# Similarity between whole documents

# Switch to the medium pipeline, which the original cell loads for its vectors
nlp = spacy.load("en_core_web_md")

doc1 = nlp("Milano looks like a nice city")
doc2 = nlp("Firenze looks like a nice city")
doc3 = nlp("Roma is the larger city")

# Compare the first doc against each of the other two, in order
for other_doc in (doc2, doc3):
    print(doc1.similarity(other_doc))

0.9758854418212424
0.5835631668074686


In [5]:
# Recap: rule-based matching with token patterns
from spacy.matcher import Matcher

# The matcher shares the vocab of the currently loaded pipeline
matcher = Matcher(nlp.vocab)

# Each pattern is a list of dicts, one dict per token to match.
# "OP": "+" is an operator controlling how often that token may be matched.
patterns_by_label = {
    "LOVE_CATS": [{"LEMMA": "love", "POS": "VERB"}, {"LOWER": "cats"}],
    "VERY_HAPPY": [{"TEXT": "very", "OP": "+"}, {"TEXT": "happy"}],
}

# Register the patterns in the order they are declared above
for label, pattern in patterns_by_label.items():
    matcher.add(label, [pattern])

doc = nlp("I love cats and I'm very very happy")

# The matcher yields (match_id, start, end) tuples; slice the doc to get the span
matches = matcher(doc)
for match_id, start, end in matches:
    matched_span = doc[start:end]
    print(matched_span)

love cats
very happy
very very happy


In [6]:
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

text = "Amazon has earned a reputation as a disruptor of well-established industries through technological innovation and 'aggressive' reinvestment of profits into capital expenditures. As of 2023, it is the world's largest online retailer and marketplace, smart speaker provider, cloud computing service through AWS, live-streaming service through Twitch, and Internet company as measured by revenue and market share."
doc = nlp(text)

# Three single-token patterns: "cloud" tagged as an adjective, the exact
# text "company", and any token made up of digits.
cloud_pattern = [{"LEMMA": "cloud", "POS": "ADJ"}]
company_pattern = [{"TEXT": "company"}]
scalar_pattern = [{"IS_DIGIT": True}]

# Register each pattern under its own label, keeping the original order
labelled_patterns = (
    ("CLOUD", cloud_pattern),
    ("COMPANY", company_pattern),
    ("DIGIT", scalar_pattern),
)
for label, token_pattern in labelled_patterns:
    matcher.add(label, [token_pattern])

matches = matcher(doc)

# Print the text span covered by every match
for match_id, start, end in matches:
    print(doc[start:end])

2023
cloud
company


In [9]:
# Phrase matching
from spacy.matcher import PhraseMatcher
nlp = spacy.load("en_core_web_sm")
matcher = PhraseMatcher(nlp.vocab)

# Build the pattern with the tokenizer only: this matcher is created with its
# default settings and compares token texts, so running the whole pipeline
# (tagger, parser, NER, ...) on the pattern would be wasted work.
pattern = nlp.make_doc("Golden Retriever")
matcher.add("DOG", [pattern])
doc = nlp("I have a Golden Retriever")

# Iterate over the (match_id, start, end) tuples returned by the matcher
for match_id, start, end in matcher(doc):
    # Slice the doc to get the matched span
    span = doc[start:end]
    print("Matched span:", span.text)

Matched span: Golden Retriever
