# SpaCy Models
https://spacy.io/usage/models

In [0]:
# !pip install -U spacy
# !pip install -U spacy-lookups-data
# !python -m spacy download en_core_web_sm

### 
- Processing raw text intelligently is difficult: most words are rare, and it’s common for words that look completely different to mean almost the same thing. 
- The same words in a different order can mean something completely different.
- While it’s possible to solve some problems starting from only the raw characters, it’s usually better to use linguistic knowledge to add useful information.
![alt text](https://spacy.io/pipeline-7a14d4edd18f3edfee8f34393bff2992.svg)




## Tokenizer
Tokenization is the task of splitting a text into meaningful segments, called tokens. The input to the tokenizer is a unicode text, and the output is a Doc object.*italicized text*

In [0]:
import spacy

In [3]:
nlp = spacy.load('en_core_web_sm') # spacy trained model
nlp

<spacy.lang.en.English at 0x7fa3b0633f28>

In [0]:
doc = nlp('Apple is looking at buying U.K. startup for $1 Billion')

In [5]:
for token in doc:
  print(token.text)

Apple
is
looking
at
buying
U.K.
startup
for
$
1
Billion


In [6]:
doc = nlp("Apple isn't looking at buying U.K. startup for $1 Billion")
for token in doc:
  print(token.text)

Apple
is
n't
looking
at
buying
U.K.
startup
for
$
1
Billion


## Tagger - Part of Speech Tagging -- POS




In [7]:
doc

Apple isn't looking at buying U.K. startup for $1 Billion

In [8]:
for token in doc:
  print(token.text, token.lemma_)

Apple Apple
is be
n't not
looking look
at at
buying buy
U.K. U.K.
startup startup
for for
$ $
1 1
Billion billion


In [9]:
for token in doc:
  print(f'{token.text:{15}} {token.lemma_:{15}} {token.pos_:{10}} {token.is_stop}')

Apple           Apple           PROPN      False
is              be              AUX        True
n't             not             PART       True
looking         look            VERB       False
at              at              ADP        True
buying          buy             VERB       False
U.K.            U.K.            PROPN      False
startup         startup         NOUN       False
for             for             ADP        True
$               $               SYM        False
1               1               NUM        False
Billion         billion         NUM        False


## Parser - Dependency Parsing

In [10]:
# print(doc.noun_chunks)
for chunk in doc.noun_chunks:
  print(f'{chunk.text:{15}} {chunk.root.text:{15}} {chunk.root.dep_}')


Apple           Apple           nsubj
U.K. startup    startup         dobj


## Named Entity Recognition
#### Example 1: "Apple is looking at buying U.K. startup for $1 billion"
#### Example 2: "Ishu ate the Apple"

In [11]:
doc

Apple isn't looking at buying U.K. startup for $1 Billion

In [12]:
# print(doc.ents)
for ent in doc.ents:
  print(f'{ent.text:{15}} {ent.label_:{5}}')

Apple           ORG  
U.K.            GPE  
$1 Billion      MONEY


## Sentence Segmentation

In [13]:
doc.sents

<generator at 0x7fa3b837d438>

In [14]:
doc

Apple isn't looking at buying U.K. startup for $1 Billion

In [15]:
for sent in doc.sents:
  print(sent)

Apple isn't looking at buying U.K. startup for $1 Billion


In [0]:
doc1 = nlp("Welcome to 714 E. 10th Street. Thanks for watching. Please bring food and gifts.")

In [17]:
for sent in doc1.sents:
  print(sent)

Welcome to 714 E. 10th Street.
Thanks for watching.
Please bring food and gifts.


In [18]:
doc2 = nlp("Welcome to 714 E. 10th Street? Thanks for watching.")
for sent in doc2.sents:
  print(sent)

Welcome to 714 E. 10th Street?
Thanks for watching.


In [19]:
doc3 = nlp("Welcome to 714 E. 10th Street_ Thanks for watching.")
for sent in doc3.sents:
  print(sent)

Welcome to 714 E. 10th Street_ Thanks for watching.


In [20]:
doc4 = nlp("Welcome to 714 E.... 10th Street .... Thanks for watching.")
for sent in doc4.sents:
  print(sent)

Welcome to 714 E....
10th Street ....
Thanks for watching.


In [32]:
doc5 = nlp("Welcome to.*.KGP Talkie.*.Thanks for watching")
for sent in doc5.sents:
  print(sent)

Welcome to.*.KGP
Talkie.*.Thanks for watching


In [0]:
# custom rule
def set_rule(doc):
  for token in doc[:-1]:
    if token.text == '...':
      doc[token.i + 1].is_sent_start = True
  return doc


In [64]:
nlp.remove_pipe('set_rule')

('set_rule', <function __main__.set_rule>)

In [0]:
nlp.add_pipe(set_rule, before='parser')

In [66]:
doc6 = nlp("Welcome to KGP Talkie...Thanks... Like and Subsribe!")
for sent in doc6.sents:
  print(sent)

Welcome to KGP Talkie...
Thanks...
Like and Subsribe!


## Visualization 

In [0]:
from spacy import displacy

In [69]:
doc6

Welcome to KGP Talkie...Thanks... Like and Subsribe!

In [72]:
displacy.render(doc6, style='dep', jupyter=True, options={'distance': 90})

In [74]:
displacy.render(doc6, style='dep', options={'compact':True, 'distance': 90}, jupyter=True)

In [75]:
doc

Apple isn't looking at buying U.K. startup for $1 Billion

In [78]:
displacy.render(doc, style='ent', jupyter=True)