In [1]:
import spacy 
# some powerful stuff

In [2]:
# Load the small English pipeline; later cells call `nlp(...)` to build Doc objects.
nlp = spacy.load("en_core_web_sm")

In [3]:
# Run the pipeline on a short sentence and print each token's text paired
# with its coarse part-of-speech tag.
doc = nlp("This is a sentence.")
print([(token.text, token.pos_) for token in doc])

[('This', 'PRON'), ('is', 'AUX'), ('a', 'DET'), ('sentence', 'NOUN'), ('.', 'PUNCT')]


In [4]:
# The remaining cells follow https://colab.research.google.com/github/futuremojo/nlp-demystified/blob/main/notebooks/nlpdemystified_preprocessing.ipynb#scrollTo=BIoEJZ-IkHQ4
# Tokenize a sentence containing a contraction and a currency amount, then
# print the text of every token.
s = "He didn't want to pay $20 for this book."
doc = nlp(s)
print([token.text for token in doc])

['He', 'did', "n't", 'want', 'to', 'pay', '$', '20', 'for', 'this', 'book', '.']


In [5]:
# Case folding: lowercase every token except the ones that start a sentence,
# which keep their original casing.
# Caveat: this collapses "Cook" (the name) and "cook" (the verb) into the same
# string; that loses a distinction but shrinks the vocabulary.
s = "He told Dr. Cook that he was done with the tests and would cook the results shortly."
doc = nlp(s)
# Use `t.text` (a str) for sentence-initial tokens so the printed list holds
# only strings; emitting the Token object itself mixed Token and str values.
print([t.text if t.is_sent_start else t.lower_ for t in doc])

[He, 'told', 'dr.', 'cook', 'that', 'he', 'was', 'done', 'with', 'the', 'tests', 'and', 'would', 'cook', 'the', 'results', 'shortly', '.']


In [6]:
# Print the pipeline's default stop-word set followed by its size.
print(nlp.Defaults.stop_words, len(nlp.Defaults.stop_words), sep="\n")
# Keep only the tokens of the current `doc` that are not flagged as stop words.
print([token for token in doc if not token.is_stop])

{'ourselves', 'those', 'really', 'thereby', 'without', 'between', 'n’t', 'few', 'where', 'when', 'six', "'d", 'go', 'anyway', 'eight', 'may', 'show', 'to', 'were', 'again', 'therefore', 'beyond', 'toward', 'former', 'often', 'each', 'its', 'throughout', 'whereupon', 'formerly', 'take', 'please', 're', 'has', 'least', 'besides', 'whether', 'via', '’re', 'several', 'move', 'who', 'more', 'can', '’s', 'quite', 'third', '‘m', 'therein', 'whereafter', 'anyhow', 'because', 'herein', 'among', '’d', 'part', 'never', 'though', 'unless', 'such', 'there', 'during', 'once', 'along', 'over', 'he', 'both', 'side', 'you', 'meanwhile', 'above', 'from', 'would', 'done', '‘ve', '‘s', 'first', 'twelve', 'except', 'amongst', 'since', 'say', 'whence', 'could', 'only', 'noone', 'three', 'beside', 'your', 'hereafter', 'alone', 'a', 'nevertheless', 'i', 'itself', 'him', 'wherein', 'through', "'ll", 'very', 'using', 'become', 'their', 'thereafter', 'it', 'did', 'our', 'should', 'anyone', 'twenty', 'used', 'out

In [7]:
# Indexing a Doc with an int and with a slice: print each result and its
# type, then the Doc's full text, one item per line.
print(
    doc[0],
    type(doc[0]),
    doc[0:3],
    type(doc[0:3]),
    doc.text,
    sep="\n",
)

He
<class 'spacy.tokens.token.Token'>
He told Dr.
<class 'spacy.tokens.span.Span'>
He told Dr. Cook that he was done with the tests and would cook the results shortly.


In [8]:
# Sentence segmentation on a two-sentence passage.
s = """Either the well was very deep, or she fell very slowly, for she 
had plenty of time as she went down to look about her and to wonder what 
was going to happen next. First, she tried to look down and make out what 
she was coming to, but it was too dark to see anything; then she looked at 
the sides of the well, and noticed that they were filled with cupboards and 
book-shelves; here and there she saw maps and pictures hung upon pegs."""

doc = nlp(s)

# Print the list of sentences (two 'Span' objects are expected), then the
# first sentence on its own.
print(list(doc.sents), list(doc.sents)[0], sep="\n")


[Either the well was very deep, or she fell very slowly, for she 
had plenty of time as she went down to look about her and to wonder what 
was going to happen next., First, she tried to look down and make out what 
she was coming to, but it was too dark to see anything; then she looked at 
the sides of the well, and noticed that they were filled with cupboards and 
book-shelves; here and there she saw maps and pictures hung upon pegs.]
Either the well was very deep, or she fell very slowly, for she 
had plenty of time as she went down to look about her and to wonder what 
was going to happen next.


In [9]:
# Some exercises.

# Exercise: collect every currency amount in the sentence.
# Expected output: "$20".
s = "He didn't want to pay $20 for this book."
doc = nlp(s)
# Walk adjacent token pairs; keep a pair when the first token is a currency
# symbol and the token right after it looks like a number.
ans = [
    symbol.text + amount.text
    for symbol, amount in zip(doc, doc[1:])
    if symbol.is_currency and amount.like_num
]
print(ans)

['$20']


In [10]:
# Custom tokenizer rules (special cases).
from spacy.symbols import ORTH

# Load the pipeline again and tokenize the phrase before adding any rule.
nlp = spacy.load("en_core_web_sm")
doc = nlp("gimme that")
print([token.text for token in doc])  # ['gimme', 'that']

# Register a special case that splits "gimme" into the pieces "gim" + "me".
special_case = [{ORTH: "gim"}, {ORTH: "me"}]
nlp.tokenizer.add_special_case("gimme", special_case)

# Tokenize the same phrase again to see the new rule applied.
print([token.text for token in nlp("gimme that")])  # ['gim', 'me', 'that']

# Register a second special case whose single piece is the whole string
# "...gimme...?" (the resulting tokenization is not printed in this cell).
nlp.tokenizer.add_special_case("...gimme...?", [{"ORTH": "...gimme...?"}])

['gimme', 'that']
['gim', 'me', 'that']
