# Tokenization in Spacy

In [1]:
import spacy

In [2]:
# Creating a blank language object and tokenize words in a sentence

nlp = spacy.blank("en")

doc = nlp("Dr. Strange loves pav bhaji of mumbai as it costs only 2$ per plate.")

In [3]:
for token in doc:
    print(token)

Dr.
Strange
loves
pav
bhaji
of
mumbai
as
it
costs
only
2
$
per
plate
.


### Using index to grab tokens

In [4]:
doc[-1]

.

In [5]:
doc2 = nlp('''"Let's go to N.Y.!"''')

for token in doc2:
    print(token)

"
Let
's
go
to
N.Y.
!
"


In [6]:
span = doc[1:5]
type(span)

spacy.tokens.span.Span

In [7]:
type(nlp)

spacy.lang.en.English

In [8]:
type(doc)

spacy.tokens.doc.Doc

In [9]:
type(doc2)

spacy.tokens.doc.Doc

In [10]:
type(token)

spacy.tokens.token.Token

In [11]:
doc3 = nlp("Tony gave two $ to Peter.")

### Token Attributes

In [12]:
token0 = doc3[0]
token0

Tony

In [13]:
dir(token0)

['_',
 '__bytes__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 'ancestors',
 'check_flag',
 'children',
 'cluster',
 'conjuncts',
 'dep',
 'dep_',
 'doc',
 'ent_id',
 'ent_id_',
 'ent_iob',
 'ent_iob_',
 'ent_kb_id',
 'ent_kb_id_',
 'ent_type',
 'ent_type_',
 'get_extension',
 'has_dep',
 'has_extension',
 'has_head',
 'has_morph',
 'has_vector',
 'head',
 'i',
 'idx',
 'iob_strings',
 'is_alpha',
 'is_ancestor',
 'is_ascii',
 'is_bracket',
 'is_currency',
 'is_digit',
 'is_left_punct',
 'is_lower',
 'is_oov',
 'is_punct',
 'is_quote',
 'is_right_punct',
 'is_sent_end',
 'is_sent_start',
 'is_space',
 'is_stop',
 'is_title',
 'is_upper',
 'lang',
 'lang_',
 'le

In [14]:
type(token0)

spacy.tokens.token.Token

In [15]:
token0.like_num

False

In [16]:
token2 = doc3[2]
token2.text

'two'

In [17]:
token2.like_num

True

In [18]:
token3 = doc3[3]
token3.text

'$'

In [19]:
token3.is_currency

True

In [20]:
for token in doc3:
    print(token, '==>', 'index: ', token.i,
          "is_alpha:", token.is_alpha,
          "is_punct:", token.is_punct,
          "like_num:", token.like_num,
          "is_currency:", token.is_currency
         )

Tony ==> index:  0 is_alpha: True is_punct: False like_num: False is_currency: False
gave ==> index:  1 is_alpha: True is_punct: False like_num: False is_currency: False
two ==> index:  2 is_alpha: True is_punct: False like_num: True is_currency: False
$ ==> index:  3 is_alpha: False is_punct: False like_num: False is_currency: True
to ==> index:  4 is_alpha: True is_punct: False like_num: False is_currency: False
Peter ==> index:  5 is_alpha: True is_punct: False like_num: False is_currency: False
. ==> index:  6 is_alpha: False is_punct: True like_num: False is_currency: False


### Collecting email ids of students from students information sheet

In [21]:
with open("students.txt") as f:
    text = f.readlines()
text

['Dayton high school, 8th grade students information\n',
 '\n',
 'Name\tbirth day   \temail\n',
 '-----\t------------\t------\n',
 'Virat   5 June, 1882    virat@kohli.com\n',
 'Maria\t12 April, 2001  maria@sharapova.com\n',
 'Serena  24 June, 1998   serena@williams.com \n',
 'Joe      1 May, 1997    joe@root.com\n',
 '\n',
 '\n',
 '\n']

In [22]:
text = ' '.join(text)
text



In [23]:
doc4 = nlp(text)
emails = []

for token in doc4:
    if token.like_email:
        emails.append(token.text)
        
emails

['virat@kohli.com',
 'maria@sharapova.com',
 'serena@williams.com',
 'joe@root.com']

### Support in other languages

In [24]:
nlp = spacy.blank("hi")

doc = nlp("भैया जी! 5000 ₹ उधार थे वो वापस देदो")

for token in doc:
    print(token, token.is_currency, token.like_num)

भैया False False
जी False False
! False False
5000 False True
₹ True False
उधार False False
थे False False
वो False False
वापस False False
देदो False False


### Customizing Tokenizer

In [25]:
doc4 = nlp("gimme double cheese extra large healthy pizza")

tokens = [token.text for token in doc4]
tokens

['gimme', 'double', 'cheese', 'extra', 'large', 'healthy', 'pizza']

In [26]:
from spacy.symbols import ORTH

nlp.tokenizer.add_special_case("gimme", [
#     {ORTH: 'give'},
    {ORTH: 'gim'},
    {ORTH: 'me'}
])

doc4 = nlp("gimme double cheese extra large healthy pizza")

tokens = [token.text for token in doc4]
tokens

['gim', 'me', 'double', 'cheese', 'extra', 'large', 'healthy', 'pizza']

In [27]:
nlp.add_pipe('sentencizer')

<spacy.pipeline.sentencizer.Sentencizer at 0x1dc50298080>

In [28]:
nlp.pipe_names

['sentencizer']

In [29]:
doc5 = nlp("Dr. Strange loves pav bhaji of mumbai. Hulk loves chat of delhi")

for sentence in doc5.sents:
    print(sentence)

Dr.
Strange loves pav bhaji of mumbai.
Hulk loves chat of delhi


In [30]:
nlp.pipeline

[('sentencizer', <spacy.pipeline.sentencizer.Sentencizer at 0x1dc50298080>)]

## Exercise

### Collecting dataset websites from a book paragraph

(1) Think stats is a free book to study statistics (https://greenteapress.com/thinkstats2/thinkstats2.pdf) 

This book has references to many websites from where you can download free datasets. You are an NLP engineer working for some company and you want to collect all dataset websites from this book. To keep exercise simple you are given a paragraph from this book and you want to grab all urls from this paragraph using spacy

In [31]:
text='''
Look for data to help you address the question. Governments are good
sources because data from public research is often freely available. Good
places to start include http://www.data.gov/, and http://www.science.
gov/, and in the United Kingdom, http://data.gov.uk/.
Two of my favorite data sets are the General Social Survey at http://www3.norc.org/gss+website/, 
and the European Social Survey at http://www.europeansocialsurvey.org/.
'''

In [32]:
# dir(token)

In [33]:
new_doc = nlp(text)

websites = [token.text for token in new_doc if token.like_url]
websites

['http://www.data.gov/',
 'http://www.science',
 'http://data.gov.uk/.',
 'http://www3.norc.org/gss+website/',
 'http://www.europeansocialsurvey.org/.']

### Figuring out all transactions from this text with amount and currency

(2) Extract all money transaction from below sentence along with currency. Output should be,

two $

500 €

In [34]:
transactions = "Tony gave two $ to Peter, Bruce gave 500 € to Steve"

In [35]:
new_doc = nlp(transactions)

for token in new_doc:
    if token.like_num and new_doc[token.i+1].is_currency:
        print(token.text, new_doc[token.i+1].text)  

500 €
