# Installation

In [3]:
# pip install spacy
# python -m spacy download en_core_web_sm
# python -m spacy download en_core_web_md
# python -m spacy download en_core_web_lg

# Importing modules and libraries

In [4]:
import spacy 

nlp = spacy.load('en_core_web_sm')

text = """Queen Elizabeth II made personal additions to plans for her funeral day, Buckingham Palace has said.
Among the touches requested by the Queen is the playing of a lament by her piper.
The state funeral at Westminster Abbey on Monday is likely to be one of the biggest single ceremonial events staged in Britain since World War Two."""

doc = nlp(text)



# Tokenization & Span

In [8]:
print([token.text for token in doc])
print(len(doc))
print(len([token.text for token in doc]))

['Queen', 'Elizabeth', 'II', 'made', 'personal', 'additions', 'to', 'plans', 'for', 'her', 'funeral', 'day', ',', 'Buckingham', 'Palace', 'has', 'said', '.', '\n', 'Among', 'the', 'touches', 'requested', 'by', 'the', 'Queen', 'is', 'the', 'playing', 'of', 'a', 'lament', 'by', 'her', 'piper', '.', '\n', 'The', 'state', 'funeral', 'at', 'Westminster', 'Abbey', 'on', 'Monday', 'is', 'likely', 'to', 'be', 'one', 'of', 'the', 'biggest', 'single', 'ceremonial', 'events', 'staged', 'in', 'Britain', 'since', 'World', 'War', 'Two', '.']
64
64


In [11]:
for token in doc:
    pass

print(type(token))

<class 'spacy.tokens.token.Token'>


In [13]:
span = doc[:5]
print(span.text, type(span))

Queen Elizabeth II made personal <class 'spacy.tokens.span.Span'>


# Read from a file

In [15]:
news_text = open('news.txt').read()
news_doc = nlp(news_text)
print([token.text for token in news_doc])

['Queen', 'Elizabeth', 'II', 'made', 'personal', 'additions', 'to', 'plans', 'for', 'her', 'funeral', 'day', ',', 'Buckingham', 'Palace', 'has', 'said', '.', '\n', 'Among', 'the', 'touches', 'requested', 'by', 'the', 'Queen', 'is', 'the', 'playing', 'of', 'a', 'lament', 'by', 'her', 'piper', '.', '\n', 'The', 'state', 'funeral', 'at', 'Westminster', 'Abbey', 'on', 'Monday', 'is', 'likely', 'to', 'be', 'one', 'of', 'the', 'biggest', 'single', 'ceremonial', 'events', 'staged', 'in', 'Britain', 'since', 'World', 'War', 'Two', '.']


# Communication in Spacy

In [18]:
type(nlp.vocab.strings)
spacy.strings.StringStore
nlp.vocab.strings["Elizabeth"]  # Hash code of the string

5479190301435931735

In [19]:
nlp.vocab.strings[5479190301435931735]

'Elizabeth'

# Sentence detection

In [22]:
sentences = list(doc.sents)
print(len(sentences))
for s in sentences:
    print(s)

3
Queen Elizabeth II made personal additions to plans for her funeral day, Buckingham Palace has said.

Among the touches requested by the Queen is the playing of a lament by her piper.

The state funeral at Westminster Abbey on Monday is likely to be one of the biggest single ceremonial events staged in Britain since World War Two.


In [29]:
text = """I went to visit the Niagara waterfalls last weekend. It was a magnificient show of nature at its glory. The horse shoe falls moves south to east and has several whirls along its rocky terrain. The white water walk was the highest part of this once in a lifetime tour...I would definitely want to come back again"""
doc = nlp(text)
for s in list(doc.sents):
    print(s)

I went to visit the Niagara waerfalls last weekend.
It was a magnificient show of nature at its glory.
The horse shoe falls moves south to east and has several whirls along its rocky terrain.
The white water walk was the highest part of this once in a lifetime tour...I would definitely want to come back again


In [33]:
from spacy.language import Language

@Language.component("component")
def set_custom_boundaries(doc):
    for token in doc[:-1]:
        if token.text == "...":
            doc[token.i+1].is_sent_start = True 
    return doc

st_nlp = spacy.load('en_core_web_sm')
st_nlp.add_pipe("component", before="parser")

<function __main__.set_custom_boundaries(doc)>

In [34]:
text = """I went to visit the Niagara waterfalls last weekend. It was a magnificient show of nature at its glory. The horse shoe falls moves south to east and has several whirls along its rocky terrain. The white water walk was the highest part of this once in a lifetime tour...I would definitely want to come back again"""
doc = st_nlp(text)
print([token.text for token in doc])

for s in list(doc.sents):
    print(s)

['I', 'went', 'to', 'visit', 'the', 'Niagara', 'waerfalls', 'last', 'weekend', '.', 'It', 'was', 'a', 'magnificient', 'show', 'of', 'nature', 'at', 'its', 'glory', '.', 'The', 'horse', 'shoe', 'falls', 'moves', 'south', 'to', 'east', 'and', 'has', 'several', 'whirls', 'along', 'its', 'rocky', 'terrain', '.', 'The', 'white', 'water', 'walk', 'was', 'the', 'highest', 'part', 'of', 'this', 'once', 'in', 'a', 'lifetime', 'tour', '...', 'I', 'would', 'definitely', 'want', 'to', 'come', 'back', 'again']
I went to visit the Niagara waerfalls last weekend.
It was a magnificient show of nature at its glory.
The horse shoe falls moves south to east and has several whirls along its rocky terrain.
The white water walk was the highest part of this once in a lifetime tour...
I would definitely want to come back again


## Tokenization Details
* text_with_ws : Prints token with trailing space
* is_alpha : Consists of alphabetic characters or not
* is_space : Detects a space
* is_punct : Punctutaion symbol or not
* shape_ : Shape of the word. Also distinguishes numerals and punctutaions.

In [45]:
nlp = spacy.load('en_core_web_sm')

text = """Queen Elizabeth II (1926 - 2022) made personal additions to plans for her funeral day, Buckingham Palace has said.
Among the touches requested by the Queen is the playing of a lament by her piper.
The state funeral at Westminster Abbey on Monday is likely to be one of the biggest single ceremonial events staged in Britain since World War Two."""

doc = nlp(text)

for token in doc:
    print(f"{token.text:<15} {token.idx:<4} {token.text_with_ws:<20}\
        {token.is_alpha:<6} {token.is_punct:<6} {token.is_space:<6} {token.shape_:<20} {token.is_stop}")


Queen           0    Queen                       1      0      0      Xxxxx                False
Elizabeth       6    Elizabeth                   1      0      0      Xxxxx                False
II              16   II                          1      0      0      XX                   False
(               19   (                           0      1      0      (                    False
1926            20   1926                        0      0      0      dddd                 False
-               25   -                           0      1      0      -                    False
2022            27   2022                        0      0      0      dddd                 False
)               31   )                           0      1      0      )                    False
made            33   made                        1      0      0      xxxx                 True
personal        38   personal                    1      0      0      xxxx                 False
additions       47   additions 

In [48]:
print("Index :        ", [token.i for token in doc[:10]])
print("Text :         ", [token.text for token in doc[:10]])
print("is_alpha :     ", [token.is_alpha for token in doc[:10]])
print("like_num :     ", [token.like_num for token in doc[:10]])
print("Base word :    ", [token.lemma_ for token in doc[:10]])

Index :         [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
Text :          ['Queen', 'Elizabeth', 'II', '(', '1926', '-', '2022', ')', 'made', 'personal']
is_alpha :      [True, True, True, False, False, False, False, False, True, True]
like_num :      [False, False, False, False, True, False, True, False, False, False]
Base word :     ['Queen', 'Elizabeth', 'II', '(', '1926', '-', '2022', ')', 'make', 'personal']


# Stopwords

In [None]:
import spacy 
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
print(len(spacy_stopwords))

list(spacy_stopwords)[:20]

In [58]:
nlp = spacy.load('en_core_web_sm')

text = """Queen Elizabeth II made personal additions to plans for her funeral day, Buckingham Palace has said.
Among the touches requested by the Queen is the playing of a lament by her piper.
The state funeral at Westminster Abbey on Monday is likely to be one of the biggest single ceremonial events staged in Britain since World War Two.
A national two-minute silence will be held as the service draws to a close just before midday.
The order of service, with its choice of music and readings, is expected to reflect more of the Queen's personal choices for the funeral.
Palace aides say the Queen had been consulted on all the arrangements.
Among those to confirm their attendance at the state funeral, which starts at 11:00 BST, are US President Joe Biden and French President Emmanuel Macron.
Alongside royalty, politicians and world leaders will be 200 people who were recognised in the Queen's Birthday Honours, including those who helped with the response to the coronavirus pandemic.
A former police officer awarded the George Cross after being shot 15 times is among those due to attend the state funeral.
Tony Gledhill, 84, said: "I'm incredibly moved to be involved."
The official organiser of Monday's events, the Earl Marshal, the Duke of Norfolk, said his role was "both humbling and daunting"."""

doc = nlp(text)

# tokens_wo_sw = []
# for token in doc:
#     if not token.is_stop:
#         tokens_wo_sw.append(token)
# #print(tokens_wo_sw)

print([token for token in doc if  token.is_stop])

[made, to, for, her, has, Among, the, by, the, is, the, of, a, by, her, The, at, on, is, to, be, one, of, the, in, since, Two, A, two, will, be, as, the, to, a, just, before, The, of, with, its, of, and, is, to, more, of, the, 's, for, the, say, the, had, been, on, all, the, Among, those, to, their, at, the, which, at, are, US, and, and, will, be, who, were, in, the, 's, those, who, with, the, to, the, A, former, the, after, being, is, among, those, due, to, the, I, 'm, to, be, The, of, 's, the, the, of, his, was, both, and]


# Lemmatization
Reduced form or root word is called lemma.
Changes tense, number, etc.

In [59]:
for token in doc:
    print(f"{token.text:<20}  {token.lemma_:<20}")

Queen                 Queen               
Elizabeth             Elizabeth           
II                    II                  
made                  make                
personal              personal            
additions             addition            
to                    to                  
plans                 plan                
for                   for                 
her                   her                 
funeral               funeral             
day                   day                 
,                     ,                   
Buckingham            Buckingham          
Palace                Palace              
has                   have                
said                  say                 
.                     .                   

                     
                   
Among                 among               
the                   the                 
touches               touch               
requested             request             
by         

# Word Frequency

In [62]:
from collections import Counter 

words = [token.text for token in doc if not token.is_stop and not token.is_punct and not token.text == '\n']

word_freq = Counter(words)

common_words = word_freq.most_common(10)
print(common_words)

[('Queen', 5), ('funeral', 5), ('said', 3), ('state', 3), ('personal', 2), ('Palace', 2), ('Monday', 2), ('events', 2), ('service', 2), ('President', 2)]


In [66]:
# Unique words
unique_words = {word for (word, freq) in word_freq.items() }
print(unique_words)

{'those', 'are', 'be', 'one', 'at', ':', 'attendance', 'Earl', 'people', 'Tony', 'Duke', 'likely', 'former', 'Monday', 'personal', 'Emmanuel', 'helped', 'just', 'consulted', 'expected', 'arrangements', 'since', 'staged', 'more', '-', 'biggest', 'said', 'reflect', 'two', 'both', 'World', 'officer', 'which', 'French', 'among', 'on', "'s", 'pandemic', 'A', '15', 'with', 'Among', 'national', 'readings', '200', 'music', 'minute', ',', 'lament', 'playing', 'I', 'Norfolk', 'was', 'times', 'held', '"', 'world', 'close', 'silence', 'plans', '.', '84', 'moved', 'requested', 'for', 'who', 'Marshal', 'Britain', 'Westminster', 'official', 'as', 'Macron', 'will', 'Gledhill', '11:00', 'by', 'midday', 'War', 'day', 'politicians', 'involved', 'choices', 'due', 'confirm', 'Honours', 'made', 'recognised', 'of', 'touches', 'order', 'to', 'ceremonial', 'and', 'Palace', 'his', 'Biden', 'President', 'Queen', 'shot', 'the', 'including', 'daunting', 'additions', 'The', 'role', 'police', 'a', 'response', 'were'

In [64]:
# Without reviewing stopwords
words = [token.text for token in doc ]

word_freq = Counter(words)

common_words = word_freq.most_common(10)
print(common_words)

[('the', 17), (',', 12), ('.', 11), ('\n', 10), ('to', 8), ('of', 7), ('Queen', 5), ('funeral', 5), ('is', 4), ('be', 4)]


# POS Tagging

Eight parts of speech:
1. Noun
2. Pronoun
3. verb
4. Adjective
5. Adverb
6. Preposition
7. COnjunction
8. Interjection

* pos_ : Coarse grained POS
* tag_ : Fine grained POS

In [70]:
for token in doc:
    print(f"{token.text :<20} {token.pos_:<20} {token.tag_:<20} {token.head.text:<20} {spacy.explain(token.tag_)}")

Queen                PROPN                NNP                  II                   noun, proper singular
Elizabeth            PROPN                NNP                  II                   noun, proper singular
II                   PROPN                NNP                  made                 noun, proper singular
made                 VERB                 VBD                  said                 verb, past tense
personal             ADJ                  JJ                   additions            adjective (English), other noun-modifier (Chinese)
additions            NOUN                 NNS                  made                 noun, plural
to                   ADP                  IN                   made                 conjunction, subordinating or preposition
plans                NOUN                 NNS                  to                   noun, plural
for                  ADP                  IN                   plans                conjunction, subordinating or preposition


In [71]:
from spacy import displacy

displacy.render(doc, style="dep", jupyter=True)