## ***Tokenization***
> It is a preprocessing technique of text data to devide a paragraph / text information into meaningful segments

In [1]:
import spacy

***Basic blank nlp object having only tokenization prop***

In [None]:
nlp = spacy.blank("en") # simple object can understand basic english

In [11]:
doc = nlp("\"Hi, Dr. Sumana you're N.A. for this three wheeler toy car ride!\"")
for token in doc:
    print(token)

"
Hi
,
Dr.
Sumana
you
're
N.A.
for
this
three
wheeler
toy
car
ride
!
"


***Attributes of tokens***

In [9]:
type(doc), type(token)

(spacy.tokens.doc.Doc, spacy.tokens.token.Token)

In [16]:
for token in doc:
    print(f"{token.i}: {token}")
    document = {
       'like_num': {token.like_num},
       'is_alphabet':  {token.is_alpha},
       'is_currency': {token.is_currency}
    }

    print(document)

0: "
{'like_num': {False}, 'is_alphabet': {False}, 'is_currency': {False}}
1: Hi
{'like_num': {False}, 'is_alphabet': {True}, 'is_currency': {False}}
2: ,
{'like_num': {False}, 'is_alphabet': {False}, 'is_currency': {False}}
3: Dr.
{'like_num': {False}, 'is_alphabet': {False}, 'is_currency': {False}}
4: Sumana
{'like_num': {False}, 'is_alphabet': {True}, 'is_currency': {False}}
5: you
{'like_num': {False}, 'is_alphabet': {True}, 'is_currency': {False}}
6: 're
{'like_num': {False}, 'is_alphabet': {False}, 'is_currency': {False}}
7: N.A.
{'like_num': {False}, 'is_alphabet': {False}, 'is_currency': {False}}
8: for
{'like_num': {False}, 'is_alphabet': {True}, 'is_currency': {False}}
9: this
{'like_num': {False}, 'is_alphabet': {True}, 'is_currency': {False}}
10: three
{'like_num': {True}, 'is_alphabet': {True}, 'is_currency': {False}}
11: wheeler
{'like_num': {False}, 'is_alphabet': {True}, 'is_currency': {False}}
12: toy
{'like_num': {False}, 'is_alphabet': {True}, 'is_currency': {False}}

***Customised tokenization***

In [17]:
from spacy.symbols import ORTH

In [None]:
nlp.tokenizer.add_special_case("wheeler",[
    {ORTH: "wheel"},
    {ORTH: "er"}
])  # this only helps to break a token into more subtokens (wheeler -> wheel + er), 
# don't permit to modigy the tokens (N.A. !-> Not + Applicable)

In [19]:
doc = nlp("\"Hi, Dr. Sumana you're N.A. for this three wheeler toy car ride!\"")
for token in doc:
    print(token)

"
Hi
,
Dr.
Sumana
you
're
N.A.
for
this
three
wheel
er
toy
car
ride
!
"


***Excersices***

In [20]:
text='''
Look for data to help you address the question. Governments are good
sources because data from public research is often freely available. Good
places to start include http://www.data.gov/, and http://www.science.
gov/, and in the United Kingdom, http://data.gov.uk/.
Two of my favorite data sets are the General Social Survey at http://www3.norc.org/gss+website/, 
and the European Social Survey at http://www.europeansocialsurvey.org/.
''' # grab all urls from this paragraph

In [21]:
_doc1 = nlp(text)
urls = [token.text for token in _doc1 if token.like_url == True]
urls

['http://www.data.gov/',
 'http://www.science',
 'http://data.gov.uk/.',
 'http://www3.norc.org/gss+website/',
 'http://www.europeansocialsurvey.org/.']

In [22]:
transactions = "Tony gave two $ to Peter, Bruce gave 500 € to Steve" 
# Extract all money transaction from below sentence along with currency

In [25]:
_doc2 = nlp(transactions)
ledger = [{'amount': token, 'curr': _doc2[token.i+1]} 
          for token in _doc2 
          if token.like_num and _doc2[token.i + 1].is_currency]
ledger

[{'amount': two, 'curr': $}, {'amount': 500, 'curr': €}]