# Tokenization in spaCy


In [1]:
# Sample sentence used to compare the tokenizers below
text = "I'm working as a Data Scientist in the U.S. and earning $140,000."

In [2]:
# Baseline tokenizer: str.split() with no separator breaks the sentence on
# whitespace only, so punctuation stays attached to its word (e.g. '$140,000.').
whitespace_tokens = text.split()
print(whitespace_tokens)

["I'm", 'working', 'as', 'a', 'Data', 'Scientist', 'in', 'the', 'U.S.', 'and', 'earning', '$140,000.']


In [None]:
import spacy

English-language models available in spaCy as of release 2.3.1, with their sizes:
- **en_core_web_sm:** 11MB
- **en_core_web_md:** 48MB
- **en_core_web_lg:** 746MB

In [None]:
# Load the en_core_web_sm English model (the smallest of the models listed above);
# the returned object is called on text below to build a Doc.
nlp=spacy.load('en_core_web_sm')

In [None]:
# Create the Doc object by running the loaded model on the sample text
doc=nlp(text)

In [None]:
# Displaying the Doc echoes the original sentence
doc

I'm working as a Data Scientist in the U.S. and earning $140,000.

In [None]:
# Check the type of the object returned by nlp(text)
type(doc)

spacy.tokens.doc.Doc

In [None]:
# spaCy tokenizer: iterating over the Doc yields its tokens one by one
print(list(doc))

[I, 'm, working, as, a, Data, Scientist, in, the, U.S., and, earning, $, 140,000, .]


In [None]:
# Each element of the Doc is a Token object rather than a plain string
list(map(type, doc))

[spacy.tokens.token.Token,
 spacy.tokens.token.Token,
 spacy.tokens.token.Token,
 spacy.tokens.token.Token,
 spacy.tokens.token.Token,
 spacy.tokens.token.Token,
 spacy.tokens.token.Token,
 spacy.tokens.token.Token,
 spacy.tokens.token.Token,
 spacy.tokens.token.Token,
 spacy.tokens.token.Token,
 spacy.tokens.token.Token,
 spacy.tokens.token.Token,
 spacy.tokens.token.Token,
 spacy.tokens.token.Token]

In [None]:
# Token.text gives the plain-string form of each token
print([tok.text for tok in doc])

['I', "'m", 'working', 'as', 'a', 'Data', 'Scientist', 'in', 'the', 'U.S.', 'and', 'earning', '$', '140,000', '.']


In [None]:
# Side-by-side comparison of the two tokenizers on the same sentence
whitespace_tokens = text.split()
spacy_tokens = [tok.text for tok in doc]

# First line: whitespace tokenizer; second line: spaCy tokenizer
print(whitespace_tokens)
print(spacy_tokens)

["I'm", 'working', 'as', 'a', 'Data', 'Scientist', 'in', 'the', 'U.S.', 'and', 'earning', '$140,000.']
['I', "'m", 'working', 'as', 'a', 'Data', 'Scientist', 'in', 'the', 'U.S.', 'and', 'earning', '$', '140,000', '.']


In [None]:
# Token counts, one per line:
#   1st — whitespace tokenizer (length of text.split())
#   2nd — spaCy tokenizer (length of the Doc)
print(len(text.split()), len(doc), sep="\n")

12
15


In [None]:
# Example: a percentage
text = "My phone has a 20% battery left."

# Whitespace tokenizer keeps '20%' and 'left.' as single tokens
print(text.split())

# spaCy tokenizer separates the number from '%' and splits off the final period
print([tok.text for tok in nlp(text)])

['My', 'phone', 'has', 'a', '20%', 'battery', 'left.']
['My', 'phone', 'has', 'a', '20', '%', 'battery', 'left', '.']


In [None]:
# Example: a number fused with its unit
text = "I ran 10km today."

# Whitespace tokenizer keeps '10km' and 'today.' as single tokens
print(text.split())

# spaCy tokenizer splits '10km' into '10' and 'km' and detaches the period
print([tok.text for tok in nlp(text)])

['I', 'ran', '10km', 'today.']
['I', 'ran', '10', 'km', 'today', '.']


In [None]:
# Example: technology names containing punctuation
text = "I know HTML, CSS, JavaScript, C++, and Node.js."

# Whitespace tokenizer leaves the commas and final period attached to the words
print(text.split())

# spaCy tokenizer detaches the commas and final period but keeps 'C++' and 'Node.js' whole
print([tok.text for tok in nlp(text)])

['I', 'know', 'HTML,', 'CSS,', 'JavaScript,', 'C++,', 'and', 'Node.js.']
['I', 'know', 'HTML', ',', 'CSS', ',', 'JavaScript', ',', 'C++', ',', 'and', 'Node.js', '.']


# Lemmatization in spaCy

In [None]:
# Sample sentence for the lemmatization example
text = "The sky is clear and the stars are twinkling."

In [None]:
import spacy

In [None]:
# Load the en_core_web_sm model; the resulting nlp object is called on text below
nlp=spacy.load('en_core_web_sm')

In [None]:
# Create the Doc object by running the loaded model on text
doc=nlp(text)

In [None]:
# Pair every token with its lemma (e.g. 'is' -> 'be', 'stars' -> 'star')
[(tok.text, tok.lemma_) for tok in doc]

[('The', 'the'),
 ('sky', 'sky'),
 ('is', 'be'),
 ('clear', 'clear'),
 ('and', 'and'),
 ('the', 'the'),
 ('stars', 'star'),
 ('are', 'be'),
 ('twinkling', 'twinkle'),
 ('.', '.')]

In [None]:
# A second, two-sentence example for the lemmatizer
text = "The moon looks beautiful at night. it's hard to resist its beauty."

# Re-run the model so that doc reflects the new text
doc = nlp(text)

# Pair every token with its lemma
[(tok.text, tok.lemma_) for tok in doc]


'moon'