Tokenization: Splitting text into meaningful segments

In [1]:
import spacy

In [2]:
nlp = spacy.blank("en")

In [3]:
doc = nlp("I will be going to bangalore soon in april")

for token in doc:
    print(token)

I
will
be
going
to
bangalore
soon
in
april


In [4]:
type(nlp)

spacy.lang.en.English

In [5]:
type(doc)

spacy.tokens.doc.Doc

In [6]:
substr = doc[1:5]
substr

will be going to

In [7]:
type(substr)

spacy.tokens.span.Span

In [8]:
type(doc[0])

spacy.tokens.token.Token

In [9]:
#Token as lot of method like is alpha, like num, is email etc.

In [10]:
dir(doc[0])

['_',
 '__bytes__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 'ancestors',
 'check_flag',
 'children',
 'cluster',
 'conjuncts',
 'dep',
 'dep_',
 'doc',
 'ent_id',
 'ent_id_',
 'ent_iob',
 'ent_iob_',
 'ent_kb_id',
 'ent_kb_id_',
 'ent_type',
 'ent_type_',
 'get_extension',
 'has_dep',
 'has_extension',
 'has_head',
 'has_morph',
 'has_vector',
 'head',
 'i',
 'idx',
 'iob_strings',
 'is_alpha',
 'is_ancestor',
 'is_ascii',
 'is_bracket',
 'is_currency',
 'is_digit',
 'is_left_punct',
 'is_lower',
 'is_oov',
 'is_punct',
 'is_quote',
 'is_right_punct',
 'is_sent_end',
 'is_sent_start',
 'is_space',
 'is_stop',
 'is_title',
 'is_upper',
 'lang',
 'lang_',
 'le

In [12]:
nlp_hindi = spacy.blank("hi")

doc = nlp_hindi("प्रथम एनएलपी कक्षा में आपका स्वागत है। मुझे मेरे 5000 रुपये वापस चाहिए।")

for token in doc:
    print(token)
    print(token.is_currency, token.like_num)

प्रथम
False True
एनएलपी
False False
कक्षा
False False
में
False False
आपका
False False
स्वागत
False False
है
False False
।
False False
मुझे
False False
मेरे
False False
5000
False True
रुपये
False False
वापस
False False
चाहिए
False False
।
False False


In [13]:
#Custom rules

sample_string = "gimme double chesse extra large healthy pizza"

for tok in nlp(sample_string):
    print(tok)

gimme
double
chesse
extra
large
healthy
pizza


In [14]:
#now we want to add a custom rule to make gimme into give me, since it is not a valid word
# so we can add a custom rule
# we can't provide 'give' in the rule as it is not a part of the word
#but we can provide rule to split it as gim and me (split single word into simpler words)    
from spacy.symbols import ORTH

nlp.tokenizer.add_special_case("gimme", [
    {ORTH: "gim"},
    {ORTH: "me"}
]
)

for tok in nlp(sample_string):
    print(tok)

gim
me
double
chesse
extra
large
healthy
pizza


## Exercise

(1) Think stats is a free book to study statistics (https://greenteapress.com/thinkstats2/thinkstats2.pdf)

This book has references to many websites from where you can download free datasets. You are an NLP engineer working for some company and you want to collect all dataset websites from this book. To keep exercise simple you are given a paragraph from this book and you want to grab all urls from this paragraph using spacy

In [24]:
text='''
Look for data to help you address the question. Governments are good
sources because data from public research is often freely available. Good
places to start include http://www.data.gov/, and http://www.science.
gov/, and in the United Kingdom, http://data.gov.uk/.
Two of my favorite data sets are the General Social Survey at http://www3.norc.org/gss+website/, 
and the European Social Survey at http://www.europeansocialsurvey.org/.
'''

# TODO: Write code here
nlp = spacy.blank("en")

url_list = []
for token in nlp(text.replace('\n','')):
    if token.like_url:
        url_list.append(token)
        
print(url_list)

[http://www.data.gov/, http://www.science.gov/, http://data.gov.uk/.Two, http://www3.norc.org/gss+website/, http://www.europeansocialsurvey.org/.]


(2) Extract all money transaction from below sentence along with currency. Output should be,

two $

500 €

In [28]:
transactions = "Tony gave two $ to Peter, Bruce gave 500 € to Steve"

# TODO: Write code here

doc = nlp(transactions)
currency_list = []
for idx, token in enumerate(doc):
    if token.is_currency and doc[idx-1].like_num:
        currency_list.append(doc[idx-1: idx+1])
        
print(currency_list)

[two $, 500 €]
