# spaCy

In [None]:
import spacy

nlp = spacy.load("en_core_web_sm")
# https://spacy.io/models/en#en_core_web_sm

nlp

<spacy.lang.en.English at 0x7bf6ac333c70>

# Tokenization

In [None]:
doc1 = nlp("This tutorial is about Natural Language Processing in spaCy.")


for token in doc1 :
  print(token)


This
tutorial
is
about
Natural
Language
Processing
in
spaCy
.


In [None]:
for token in doc1:
  print(f'{token.text} ---- {token.idx  }')


This ---- 0
tutorial ---- 5
is ---- 14
about ---- 17
Natural ---- 23
Language ---- 31
Processing ---- 40
in ---- 51
spaCy ---- 54
. ---- 59


In [None]:
for token in doc1:
    print(
        f"{token.text:<50}",
        f"{str(token.is_alpha):<30}",
        f"{str(token.is_punct):<30}",
        f"{str(token.is_stop)}"
    )

This                                               True                           False                          True
tutorial                                           True                           False                          False
is                                                 True                           False                          True
about                                              True                           False                          True
Natural                                            True                           False                          False
Language                                           True                           False                          False
Processing                                         True                           False                          False
in                                                 True                           False                          True
spaCy                                              T

In [None]:
from spacy.tokenizer import Tokenizer

tokenizer = Tokenizer(nlp.vocab)

doc2 = tokenizer("This tutorial is about Natural Language Processing in spaCy.")

for token in doc2:
    print(token)

This
tutorial
is
about
Natural
Language
Processing
in
spaCy.


In [None]:
doc1 = nlp("This tutorial is about Natural Language Processing in spaCy. My email is john@doe.com")

for token in doc1 :
  print(token)


This
tutorial
is
about
Natural
Language
Processing
in
spaCy
.
My
email
is
john@doe.com


In [None]:
import re

infix_re = re.compile("|".join(nlp.Defaults.infixes + ["@"]))

nlp.tokenizer = Tokenizer(
    nlp.vocab,
    infix_finditer=infix_re.finditer,
)

doc1 = nlp("This tutorial is about Natural Language Processing in spaCy. My email is john@doe.com")

for token in doc1 :
  print(token)

This
tutorial
is
about
Natural
Language
Processing
in
spaCy.
My
email
is
john
@
doe.com


**Splitting Sentence**

In [None]:
doc = nlp(
    "This is a sentence. This is another sentence. This is the last sentence.")
for sent in doc.sents:
  print(sent)


This is a sentence.
This is another sentence.
This is the last sentence.


# Removing Stop Words

In [None]:
stopwords = spacy.lang.en.stop_words.STOP_WORDS

print(type(stopwords) , len(stopwords))

<class 'set'> 326


In [None]:
for stop_word in list(stopwords)[:10]:
     print(stop_word)

everywhere
had
must
for
many
sometimes
bottom
four
their
nothing


In [None]:
text = ''' stop words are typically defined as the most common words in a language. In the English language, some examples of stop words are the are but and they
Most sentences need to contain stop words in order to be full sentences that make grammatical sense.
'''
doc2 = nlp(text)

print([token for token in doc2 if  token.text not in stopwords])

[ , stop, words, typically, defined, common, words, language., In, English, language,, examples, stop, words, 
, Most, sentences, need, contain, stop, words, order, sentences, grammatical, sense., 
]


# Lemmatization

In [None]:
text = ('''Lemmatization is the process of reducing inflected forms of a word while still ensuring that the reduced form belongs to the language.
This reduced form, or root word, is called a lemma went'''
)
doc3 = nlp(text)

for token in doc3:
     if str(token) != str(token.lemma_):
         print(f"{str(token):<40} : {str(token.lemma_):<30} : {str(token.lemma):>60}")

Lemmatization                            : lemmatization                  :                                           438229042533536439
is                                       : be                             :                                         10382539506755952630
reducing                                 : reduce                         :                                         13655207319209475655
inflected                                : inflect                        :                                          6529725988763922079
forms                                    : form                           :                                         16052413196044414349
ensuring                                 : ensure                         :                                         15159758025291269046
reduced                                  : reduce                         :                                         13655207319209475655
belongs                                  

In [None]:
print(" ".join([token.lemma_ for token in doc3]))

lemmatization be the process of reduce inflect form of a word while still ensure that the reduce form belong to the language. 
 this reduce form, or root word, be call a lemma go


# Part-of-Speech Tagging


In [None]:
text = '''Part-of-speech tagging is the process of assigning a POS tag to each token depending on its usage in the sentence.
'''

doc4 = nlp(text)


for token in doc4:
     print(f""" TOKEN: {str(token):15}   TAG: {str(token.tag_):10}   POS: {token.pos_:10}  EXPLANATION: {spacy.explain(token.tag_):>10}"""
 )

 TOKEN: Part              TAG: NN           POS: NOUN        EXPLANATION: noun, singular or mass
 TOKEN: -                 TAG: HYPH         POS: PUNCT       EXPLANATION: punctuation mark, hyphen
 TOKEN: of                TAG: IN           POS: ADP         EXPLANATION: conjunction, subordinating or preposition
 TOKEN: -                 TAG: HYPH         POS: PUNCT       EXPLANATION: punctuation mark, hyphen
 TOKEN: speech            TAG: NN           POS: NOUN        EXPLANATION: noun, singular or mass
 TOKEN: tagging           TAG: NN           POS: NOUN        EXPLANATION: noun, singular or mass
 TOKEN: is                TAG: VBZ          POS: AUX         EXPLANATION: verb, 3rd person singular present
 TOKEN: the               TAG: DT           POS: DET         EXPLANATION: determiner
 TOKEN: process           TAG: NN           POS: NOUN        EXPLANATION: noun, singular or mass
 TOKEN: of                TAG: IN           POS: ADP         EXPLANATION: conjunction, subordinating or p

In [None]:
nouns = []
adjectives = []
for token in doc4:
     if token.pos_ == "NOUN":
         nouns.append(token)
     if token.pos_ == "ADJ":
         adjectives.append(token)

print(nouns)
print(adjectives)

[Part, speech, tagging, process, tag, usage, sentence.]
[token]


# Named Entity Recognition (NER)

In [None]:
text = " Elon Musk , CEO of Tesla , was born in South Africa and moved to the United States in 1992. He founded Tesla Motors in 2003 and SpaceX in 2002."

doc5 = nlp(text)

for ent in doc5.ents :
     print(f""" Toekn : {ent.text :40} Label: {ent.label_  :<40}  Explaination := {spacy.explain(ent.label_):<20}"""
 )

 Toekn : Elon Musk                                Label: PERSON                                    Explaination := People, including fictional
 Toekn : Tesla                                    Label: ORG                                       Explaination := Companies, agencies, institutions, etc.
 Toekn : South Africa                             Label: GPE                                       Explaination := Countries, cities, states
 Toekn : the United States                        Label: GPE                                       Explaination := Countries, cities, states
 Toekn : Tesla Motors                             Label: ORG                                       Explaination := Companies, agencies, institutions, etc.
 Toekn : 2003                                     Label: DATE                                      Explaination := Absolute or relative dates or periods
 Toekn : 2002.                                    Label: CARDINAL                                  Explaination 

In [None]:
org = []
person = []
for ent in doc5.ents:
     if ent.label_ == "ORG":
         org.append(ent.text)
     if ent.label_ == "PERSON":
         person.append(ent.text)

print(org)
print(person)

['Tesla', 'Tesla Motors']
['Elon Musk']


# Visualization

In [None]:
from spacy import displacy

text = "He is interested in learning Natural Language Processing."
doc6 = nlp(text)


displacy.render(doc6, style="dep" , jupyter=True)

In [None]:
# Define a color scheme for the visualization
options = {
    'compact': True,
    'bg': '#09a3d5',  # Background color
    'color': '#FFFFFF',  # Arrow color
    'font': 'Source Sans Pro',
    'node_color': '#fa8072',  # Node background color
    'edge_color': '#ffffff'  # Edge/arrow color
}

displacy.render(doc6, style="dep", options=options, jupyter=True)