In [1]:
import spacy 
# spaCy is the NLP library used for every example in this notebook.

In [2]:
# Load the small English spaCy pipeline once; the cells below call `nlp` on raw text.
nlp = spacy.load("en_core_web_sm")

In [3]:
# Run the pipeline on a short sentence and list each token's text alongside
# its part-of-speech tag (`pos_`).
doc = nlp("This is a sentence.")
token_pos_pairs = []
for token in doc:
    token_pos_pairs.append((token.text, token.pos_))
print(token_pos_pairs)

[('This', 'PRON'), ('is', 'AUX'), ('a', 'DET'), ('sentence', 'NOUN'), ('.', 'PUNCT')]


In [5]:
# The rest of this notebook is from https://colab.research.google.com/github/futuremojo/nlp-demystified/blob/main/notebooks/nlpdemystified_preprocessing.ipynb#scrollTo=BIoEJZ-IkHQ4
# A sentence with a contraction ("didn't") and a price ("$20") shows where the
# tokenizer places token boundaries.
s = "He didn't want to pay $20 for this book."
doc = nlp(s)
token_texts = [token.text for token in doc]
print(token_texts)

['He', 'did', "n't", 'want', 'to', 'pay', '$', '20', 'for', 'this', 'book', '.']


In [6]:
# Integer indexing on a Doc gives a single Token, slicing gives a Span, and
# `.text` gives back the full string.
first_token = doc[0]
opening_span = doc[0:3]
print(first_token)
print(type(first_token))
print(opening_span)
print(type(opening_span))
print(doc.text)

He
<class 'spacy.tokens.token.Token'>
He didn't
<class 'spacy.tokens.span.Span'>
He didn't want to pay $20 for this book.


In [None]:
# A hard-wrapped passage used to demonstrate sentence segmentation.
s = """Either the well was very deep, or she fell very slowly, for she 
had plenty of time as she went down to look about her and to wonder what 
was going to happen next. First, she tried to look down and make out what 
she was coming to, but it was too dark to see anything; then she looked at 
the sides of the well, and noticed that they were filled with cupboards and 
book-shelves; here and there she saw maps and pictures hung upon pegs."""

doc = nlp(s)

# Collect `doc.sents` into a list once so it can be printed whole and indexed.
# Two 'Span' objects are expected for this passage.
sentences = list(doc.sents)
print(sentences)
print(sentences[0])


[Either the well was very deep, or she fell very slowly, for she 
had plenty of time as she went down to look about her and to wonder what 
was going to happen next., First, she tried to look down and make out what 
she was coming to, but it was too dark to see anything; then she looked at 
the sides of the well, and noticed that they were filled with cupboards and 
book-shelves; here and there she saw maps and pictures hung upon pegs.]


In [None]:
# Some exercises.

# Exercise: get all currency amounts in the sentence.
# Expected output: "$20".
s = "He didn't want to pay $20 for this book."
doc = nlp(s)
# Pair every token with the one that follows it; keep the pair when a currency
# symbol is immediately followed by a number-like token, and join their texts.
ans = [
    symbol.text + amount.text
    for symbol, amount in zip(doc, doc[1:])
    if symbol.is_currency and amount.like_num
]
print(ans)

He didn't want to pay $20 for this book
['$20']


In [None]:
# Custom tokenizer rules: add special cases that override the default splitting.
from spacy.symbols import ORTH

# Reload the pipeline so the "before" tokenization below comes from a tokenizer
# that does not yet carry the special cases added further down in this cell.
nlp = spacy.load("en_core_web_sm")
doc = nlp("gimme that")  # phrase to tokenize
print([w.text for w in doc])  # ['gimme', 'that']

# Add special case rule: "gimme" is split into the two tokens "gim" and "me".
special_case = [{ORTH: "gim"}, {ORTH: "me"}]
nlp.tokenizer.add_special_case("gimme", special_case)

# Check new tokenization
print([w.text for w in nlp("gimme that")])  # ['gim', 'me', 'that']

# Register the punctuation-wrapped form as a one-token special case. It is keyed
# with the ORTH symbol, like the rule above, so both rules in this cell use the
# same key style.
nlp.tokenizer.add_special_case("...gimme...?", [{ORTH: "...gimme...?"}])

['gimme', 'that']
['gim', 'me', 'that']
