In [1]:
!pip install nltk --quiet

In [2]:
!pip install spacy  --quiet

In [3]:
!python -m spacy download en --quiet

[38;5;3m⚠ As of spaCy v3.0, shortcuts like 'en' are deprecated. Please use the
full pipeline package name 'en_core_web_sm' instead.[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [4]:
# Download the small English pipeline package so it can be loaded with spacy.load('en_core_web_sm').
!python -m spacy download en_core_web_sm --quiet

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


# Now let's see spaCy and NLTK in action

## Spacy

In [7]:
import spacy

In [8]:
# Load the spaCy English language model.

nlp = spacy.load("en_core_web_sm")

# "en_core_web_sm" is a small pre-trained English model in spaCy which includes a
# tokenizer, part-of-speech (POS) tagger, named entity recognizer (NER), and dependency parser.

doc = nlp("Doing some random spacy vs nltk to show the difference between them. Spacy is object oriented and nltk is string processing library")

# doc.sents yields the processed document one sentence at a time.
for sentence in doc.sents:
    print(sentence)

Doing some random spacy vs nltk to show the difference between them.
Spacy is object oriented and nltk is string processing library


### This is called sentence tokenization in spaCy

In [10]:
# Iterating over a sentence yields its individual tokens (words and punctuation),
# so the nested loop prints every token of the document, one per line.
for sentence in doc.sents:
    for word in sentence:
        print(word)

Doing
some
random
spacy
vs
nltk
to
show
the
difference
between
them
.
Spacy
is
object
oriented
and
nltk
is
string
processing
library


### And this is called word tokenization using spaCy

## nltk

In [13]:
import nltk

nltk.download("punkt_tab")

# nltk.download("punkt_tab") → Downloads the Punkt tokenizer data, which is used for sentence and word tokenization.

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/satishadhikari/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

### sentence tokenizer using nltk

In [15]:
from nltk.tokenize import sent_tokenize

# sent_tokenize takes a plain string and returns a list of sentence strings.
sent_tokenize("Doing some random spacy vs nltk to show the difference between them. Spacy is object oriented and nltk is string processing library")


['Doing some random spacy vs nltk to show the difference between them.',
 'Spacy is object oriented and nltk is string processing library']

### word tokenizer using nltk

In [17]:
from nltk.tokenize import word_tokenize

# word_tokenize takes a plain string and returns a flat list of token strings
# (punctuation such as '.' becomes its own token).
word_tokenize("Doing some random spacy vs nltk to show the difference between them. Spacy is object oriented and nltk is string processing library")


['Doing',
 'some',
 'random',
 'spacy',
 'vs',
 'nltk',
 'to',
 'show',
 'the',
 'difference',
 'between',
 'them',
 '.',
 'Spacy',
 'is',
 'object',
 'oriented',
 'and',
 'nltk',
 'is',
 'string',
 'processing',
 'library']

# Let's go deeper with spaCy

In [19]:
import spacy


In [20]:
# Create a blank English pipeline (the spaCy language object for English).
# NOTE: this rebinds `nlp`, replacing the "en_core_web_sm" pipeline loaded earlier.
nlp = spacy.blank("en")


In [21]:
# Triple-quoted so the sample text can itself contain double quotes and an apostrophe.
# NOTE: this rebinds `doc`, replacing the document created earlier.
doc = nlp('''"Dr. X has consulted me to visit the hospital for the time interval of 6 months. But I don't seem to obey."''')

In [22]:
# Iterating over a Doc yields its tokens; quotation marks, full stops and the
# "n't" of "don't" come out as separate tokens, while "Dr." stays in one piece.
for token in doc:
    print(token)

"
Dr.
X
has
consulted
me
to
visit
the
hospital
for
the
time
interval
of
6
months
.
But
I
do
n't
seem
to
obey
.
"


In [23]:
# Integer indexing returns a single token (here the opening quotation mark).
doc[0]

"

In [24]:
# In NLP we sometimes need a span: a slice of the tokens in a document, taken
# with the same slice syntax used for index slicing in Python.
span = doc[0:7]
span

"Dr. X has consulted me to

In [25]:
# Negative slice bounds work too: everything except the last token
# (the closing quotation mark is dropped).
doc[:-1]

"Dr. X has consulted me to visit the hospital for the time interval of 6 months. But I don't seem to obey.

In [26]:
# A slice holding only the last token (the closing quotation mark).
doc[-1:]

"

In [27]:
# The pipeline object returned by spacy.blank("en").
type(nlp)

spacy.lang.en.English

In [28]:
# Calling the pipeline on text produces a Doc.
type(doc)

spacy.tokens.doc.Doc

In [29]:
# Slicing a Doc produces a Span.
type(span)

spacy.tokens.span.Span

## Token Attributes

In [31]:
# A short sentence containing a number and a currency symbol, used to explore token attributes.
doc1 = nlp("I have 1$ in my pocket")
token1 = doc1[1]  # second token of doc1
token1

have

In [32]:
dir(token1)
# dir() lists the names of all attributes and methods available on the Token object

['_',
 '__bytes__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 'ancestors',
 'check_flag',
 'children',
 'cluster',
 'conjuncts',
 'dep',
 'dep_',
 'doc',
 'ent_id',
 'ent_id_',
 'ent_iob',
 'ent_iob_',
 'ent_kb_id',
 'ent_kb_id_',
 'ent_type',
 'ent_type_',
 'get_extension',
 'has_dep',
 'has_extension',
 'has_head',
 'has_morph',
 'has_vector',
 'head',
 'i',
 'idx',
 'iob_strings',
 'is_alpha',
 'is_ancestor',
 'is_ascii',
 'is_bracket',
 'is_currency',
 'is_digit',
 'is_left_punct',
 'is_lower',
 'is_oov',
 'is_punct',
 'is_quote',
 'is_right_punct',
 'is_sent_end',
 'is_sent_start',
 'is_space',
 'is_stop',
 'is_title',
 'is_upper',
 'lang

In [33]:
# Each element of a Doc is a Token.
type(token1)

spacy.tokens.token.Token

In [34]:
# token1 is the word "have", so it is not a currency symbol.
token1.is_currency

False

In [35]:
token1.is_lower  # True here: the text of token1 ('have') is all lower case

True

In [36]:
token1.is_alpha  # True here: token1 ('have') consists only of alphabetic characters

True

In [37]:
# NOTE: this indexes `doc` (the "Dr. X ..." text), not `doc1`; its first token
# is the opening quotation mark, which is not a lower-case word.
token0  = doc[0]
token0.is_lower

False

In [38]:
# Also taken from `doc` (not `doc1`); .text gives the token's text as a plain Python string.
token2 = doc[2]
token2.text

'X'

In [39]:
# Fourth token of doc1: the "$" sign, which the tokenizer split off from "1".
token3 = doc1[3]
token3

$

In [40]:
# token3 is the "$" sign, which is flagged as a currency symbol.
token3.is_currency

True

In [41]:
# Print a one-line summary of a few lexical flags for every token in doc1.
for x in doc1:
    print(x, '==>', 'index: ', x.i,
          'is_alpha:', x.is_alpha,
          'is_punct:', x.is_punct,  # label previously ended in ';' instead of ':'
          'like_num:', x.like_num,
          'is_currency:', x.is_currency)

I ==> index:  0 is_alpha: True is_punct; False like_num: False is_currency: False
have ==> index:  1 is_alpha: True is_punct; False like_num: False is_currency: False
1 ==> index:  2 is_alpha: False is_punct; False like_num: True is_currency: False
$ ==> index:  3 is_alpha: False is_punct; False like_num: False is_currency: True
in ==> index:  4 is_alpha: True is_punct; False like_num: False is_currency: False
my ==> index:  5 is_alpha: True is_punct; False like_num: False is_currency: False
pocket ==> index:  6 is_alpha: True is_punct; False like_num: False is_currency: False


### Earlier we used regular expressions to extract the required text, emails, and phone numbers from a large body of text. Now let's use spaCy for the same task, because it is much more powerful and convenient

In [43]:
# Sample support-chat transcript containing a repeated order number and two dollar amounts.
text = 'satish: Hello,I need the info of my order # 826398312 satish: I got an issue with my order number 826398312 satish: My order 826398312 is having an issue, I was charged 300$ when online it says 280$'

In [44]:
doc2 = nlp(text)
# Keep the text of every token that looks like a number (token.like_num),
# so no hand-written regular expression is needed.
number = [token.text for token in doc2 if token.like_num]
number

['826398312', '826398312', '826398312', '300', '280']

## Customize Tokenization Rule

In [46]:
# With the default rules the slang word "imma" is kept as a single token.
doc9 = nlp('imma do this task now ')
tokens = [token.text for token in doc9]  # plain-string text of each token
tokens

['imma', 'do', 'this', 'task', 'now']

## What if we want to split the slang word "imma"?

In [None]:
# ORTH lives in spacy.symbols ("symbold" was a typo that raises ModuleNotFoundError).
from spacy.symbols import ORTH
nlp.tokenizer.add_special_case("imma", [
    {ORTH : "im"