In [1]:
import spacy

In [2]:
nlp = spacy.blank('en')

In [3]:
# Run the sample sentence through the pipeline; the resulting Doc is the
# sequence of tokens that the following cells index and slice.
doc = nlp('Dr. Strange loves pav bhaji of mumbai as it costs only 2$ per plate.')

# Show every token on its own line.
print(*doc, sep="\n")

Dr.
Strange
loves
pav
bhaji
of
mumbai
as
it
costs
only
2
$
per
plate
.


In [4]:
doc[0]

Dr.

In [5]:
doc[1]

Strange

In [6]:
token = doc[1]

In [8]:
#dir(token)

In [9]:
token.is_alpha

True

In [10]:
token.is_digit

False

In [11]:
token.is_lower

False

In [12]:
type(token)

spacy.tokens.token.Token

In [13]:
span = doc[4:9]

In [14]:
span

bhaji of mumbai as it

In [15]:
type(span)

spacy.tokens.span.Span

In [16]:
# For each token, report its position in the Doc together with a few of its
# boolean lexical flags, one token per line.
for token in doc:
    print(
        f"{token} ==> {token.i} "
        f"is_alpha: {token.is_alpha} "
        f"is_punct: {token.is_punct} "
        f"like_num: {token.like_num} "
        f"is_currency: {token.is_currency}"
    )

Dr. ==> 0 is_alpha: False is_punct: False like_num: False is_currency: False
Strange ==> 1 is_alpha: True is_punct: False like_num: False is_currency: False
loves ==> 2 is_alpha: True is_punct: False like_num: False is_currency: False
pav ==> 3 is_alpha: True is_punct: False like_num: False is_currency: False
bhaji ==> 4 is_alpha: True is_punct: False like_num: False is_currency: False
of ==> 5 is_alpha: True is_punct: False like_num: False is_currency: False
mumbai ==> 6 is_alpha: True is_punct: False like_num: False is_currency: False
as ==> 7 is_alpha: True is_punct: False like_num: False is_currency: False
it ==> 8 is_alpha: True is_punct: False like_num: False is_currency: False
costs ==> 9 is_alpha: True is_punct: False like_num: False is_currency: False
only ==> 10 is_alpha: True is_punct: False like_num: False is_currency: False
2 ==> 11 is_alpha: False is_punct: False like_num: True is_currency: False
$ ==> 12 is_alpha: False is_punct: False like_num: False is_currency: True
p

In [21]:
# Find the e-mail addresses in the given text file.
# Step 1: load the file as a list of lines (each line keeps its trailing "\n").
# The encoding is stated explicitly so the result does not depend on the
# platform's default locale encoding.
with open("student.txt", encoding="utf-8") as f:
    text = f.readlines()
text

['Dayton high school, 8th grade students information\n',
 '\n',
 'Name\tbirth day   \temail\n',
 '-----\t------------\t------\n',
 'Virat   5 June, 1882    virat@kohli.com\n',
 'Maria\t12 April, 2001  maria@sharapova.com\n',
 'Serena  24 June, 1998   serena@williams.com \n',
 'Joe      1 May, 1997    joe@root.com']

In [22]:
text = "".join(text)
text



In [23]:
# Tokenize the file contents and keep the text of every token that spaCy
# flags as looking like an e-mail address.
doc = nlp(text)
emails = [token.text for token in doc if token.like_email]

emails

['virat@kohli.com',
 'maria@sharapova.com',
 'serena@williams.com',
 'joe@root.com']

In [24]:
#customizing tokenizer

from spacy.tokenizer import ORTH

In [25]:
# Baseline: a fresh blank English pipeline, before any tokenizer rule is added.
nlp = spacy.blank('en')

# Tokenize the order phrase and collect the plain text of each token.
doc = nlp('gimme double cheese extra large healthy pizza')
tokens = [piece.text for piece in doc]
tokens

['gimme', 'double', 'cheese', 'extra', 'large', 'healthy', 'pizza']

In [26]:
# Register a tokenizer special case: the single word "gimme" is to be emitted
# as the two tokens "gim" and "me".
gimme_pieces = [{ORTH: "gim"}, {ORTH: "me"}]
nlp.tokenizer.add_special_case("gimme", gimme_pieces)

# Tokenize the same phrase again to see the effect of the new rule.
doc = nlp('gimme double cheese extra large healthy pizza')
tokens = [piece.text for piece in doc]
tokens

['gim', 'me', 'double', 'cheese', 'extra', 'large', 'healthy', 'pizza']

In [27]:
#Sentence tokenizer or segmentation in spacy

# At this point the blank pipeline has nothing that sets sentence boundaries,
# so iterating `doc.sents` raises ValueError [E030]. The error is the point of
# this cell, so it is caught and printed instead of being left to abort a
# "Restart & Run All" of the notebook.
doc = nlp("Dr. Strange loves pav bhaji of mumbai. Hulk loves chat of delhi")
try:
    for sentence in doc.sents:
        print(sentence)
except ValueError as err:
    # Same "ExceptionName: message" form as the final line of a traceback.
    print(f"{type(err).__name__}: {err}")

ValueError: [E030] Sentence boundaries unset. You can add the 'sentencizer' component to the pipeline with: `nlp.add_pipe('sentencizer')`. Alternatively, add the dependency parser or sentence recognizer, or set sentence boundaries by setting `doc[i].is_sent_start`.

In [28]:
nlp.pipe_labels

{}

In [29]:
# Add the rule-based sentencizer, but only if it is not in the pipeline yet:
# adding a component under a name that already exists fails, which would make
# this cell break when it is run a second time on the same `nlp` object.
if "sentencizer" not in nlp.pipe_names:
    nlp.add_pipe("sentencizer")

# Display the component, whether it was just added or was already present.
nlp.get_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x1b4ae61e908>

In [30]:
# Re-run the same text now that the sentencizer has been added to the
# pipeline, and show each detected sentence on its own line.
doc = nlp("Dr. Strange loves pav bhaji of mumbai. Hulk loves chat of delhi")
print(*doc.sents, sep="\n")

Dr. Strange loves pav bhaji of mumbai.
Hulk loves chat of delhi


In [31]:
text='''
Look for data to help you address the question. Governments are good
sources because data from public research is often freely available. Good
places to start include http://www.data.gov/, and http://www.science.
gov/, and in the United Kingdom, http://data.gov.uk/.
Two of my favorite data sets are the General Social Survey at http://www3.norc.org/gss+website/, 
and the European Social Survey at http://www.europeansocialsurvey.org/.
'''

In [32]:
# Tokenize the paragraph and keep the text of every token flagged as URL-like.
doc = nlp(text)

data_websites = [
    candidate.text
    for candidate in doc
    if candidate.like_url
]
data_websites

['http://www.data.gov/',
 'http://www.science',
 'http://data.gov.uk/.',
 'http://www3.norc.org/gss+website/',
 'http://www.europeansocialsurvey.org/.']

In [33]:
transactions = "Tony gave two $ to Peter, Bruce gave 500 € to Steve"

doc = nlp(transactions)

# Print every number-like token that is immediately followed by a currency
# symbol, together with that symbol.
for token in doc:
    # The last token has no successor; without this check a text ending in a
    # number would make doc[token.i + 1] raise IndexError.
    has_next = token.i + 1 < len(doc)
    if token.like_num and has_next and doc[token.i + 1].is_currency:
        print(token.text, doc[token.i + 1].text)

two $
500 €
