# Installation

In [90]:
# pip install spacy
# python -m spacy download en_core_web_sm
# python -m spacy download en_core_web_md
# python -m spacy download en_core_web_lg

# Importing modules and libraries

In [91]:
import spacy 

nlp = spacy.load('en_core_web_sm')

text = """Queen Elizabeth II made personal additions to plans for her funeral day, Buckingham Palace has said.
Among the touches requested by the Queen is the playing of a lament by her piper.
The state funeral at Westminster Abbey on Monday is likely to be one of the biggest single ceremonial events staged in Britain since World War Two."""

doc = nlp(text)



# Tokenization & Span

In [92]:
print([token.text for token in doc])
print(len(doc))
print(len([token.text for token in doc]))

['Queen', 'Elizabeth', 'II', 'made', 'personal', 'additions', 'to', 'plans', 'for', 'her', 'funeral', 'day', ',', 'Buckingham', 'Palace', 'has', 'said', '.', '\n', 'Among', 'the', 'touches', 'requested', 'by', 'the', 'Queen', 'is', 'the', 'playing', 'of', 'a', 'lament', 'by', 'her', 'piper', '.', '\n', 'The', 'state', 'funeral', 'at', 'Westminster', 'Abbey', 'on', 'Monday', 'is', 'likely', 'to', 'be', 'one', 'of', 'the', 'biggest', 'single', 'ceremonial', 'events', 'staged', 'in', 'Britain', 'since', 'World', 'War', 'Two', '.']
64
64


In [93]:
for token in doc:
    pass

print(type(token))

<class 'spacy.tokens.token.Token'>


In [94]:
span = doc[:5]
print(span.text, type(span))

Queen Elizabeth II made personal <class 'spacy.tokens.span.Span'>


# Read from a file

In [95]:
news_text = open('news.txt').read()
news_doc = nlp(news_text)
print([token.text for token in news_doc])

['Queen', 'Elizabeth', 'II', 'made', 'personal', 'additions', 'to', 'plans', 'for', 'her', 'funeral', 'day', ',', 'Buckingham', 'Palace', 'has', 'said', '.', '\n', 'Among', 'the', 'touches', 'requested', 'by', 'the', 'Queen', 'is', 'the', 'playing', 'of', 'a', 'lament', 'by', 'her', 'piper', '.', '\n', 'The', 'state', 'funeral', 'at', 'Westminster', 'Abbey', 'on', 'Monday', 'is', 'likely', 'to', 'be', 'one', 'of', 'the', 'biggest', 'single', 'ceremonial', 'events', 'staged', 'in', 'Britain', 'since', 'World', 'War', 'Two', '.']


# Communication in Spacy

In [96]:
type(nlp.vocab.strings)
spacy.strings.StringStore
nlp.vocab.strings["Elizabeth"]  # Hash code of the string

5479190301435931735

In [97]:
nlp.vocab.strings[5479190301435931735]

'Elizabeth'

# Sentence detection

In [98]:
sentences = list(doc.sents)
print(len(sentences))
for s in sentences:
    print(s)

3
Queen Elizabeth II made personal additions to plans for her funeral day, Buckingham Palace has said.

Among the touches requested by the Queen is the playing of a lament by her piper.

The state funeral at Westminster Abbey on Monday is likely to be one of the biggest single ceremonial events staged in Britain since World War Two.


In [99]:
text = """I went to visit the Niagara waterfalls last weekend. It was a magnificient show of nature at its glory. The horse shoe falls moves south to east and has several whirls along its rocky terrain. The white water walk was the highest part of this once in a lifetime tour...I would definitely want to come back again"""
doc = nlp(text)
for s in list(doc.sents):
    print(s)

I went to visit the Niagara waterfalls last weekend.
It was a magnificient show of nature at its glory.
The horse shoe falls moves south to east and has several whirls along its rocky terrain.
The white water walk was the highest part of this once in a lifetime tour...I would definitely want to come back again


In [100]:
from spacy.language import Language

@Language.component("component")
def set_custom_boundaries(doc):
    for token in doc[:-1]:
        if token.text == "...":
            doc[token.i+1].is_sent_start = True 
    return doc

st_nlp = spacy.load('en_core_web_sm')
st_nlp.add_pipe("component", before="parser")

<function __main__.set_custom_boundaries(doc)>

In [101]:
text = """I went to visit the Niagara waterfalls last weekend. It was a magnificient show of nature at its glory. The horse shoe falls moves south to east and has several whirls along its rocky terrain. The white water walk was the highest part of this once in a lifetime tour...I would definitely want to come back again"""
doc = st_nlp(text)
print([token.text for token in doc])

for s in list(doc.sents):
    print(s)

['I', 'went', 'to', 'visit', 'the', 'Niagara', 'waterfalls', 'last', 'weekend', '.', 'It', 'was', 'a', 'magnificient', 'show', 'of', 'nature', 'at', 'its', 'glory', '.', 'The', 'horse', 'shoe', 'falls', 'moves', 'south', 'to', 'east', 'and', 'has', 'several', 'whirls', 'along', 'its', 'rocky', 'terrain', '.', 'The', 'white', 'water', 'walk', 'was', 'the', 'highest', 'part', 'of', 'this', 'once', 'in', 'a', 'lifetime', 'tour', '...', 'I', 'would', 'definitely', 'want', 'to', 'come', 'back', 'again']
I went to visit the Niagara waterfalls last weekend.
It was a magnificient show of nature at its glory.
The horse shoe falls moves south to east and has several whirls along its rocky terrain.
The white water walk was the highest part of this once in a lifetime tour...
I would definitely want to come back again


## Tokenization Details
* text_with_ws : Prints token with trailing space
* is_alpha : Consists of alphabetic characters or not
* is_space : Detects a space
* is_punct : Punctutaion symbol or not
* shape_ : Shape of the word. Also distinguishes numerals and punctutaions.

In [102]:
nlp = spacy.load('en_core_web_sm')

text = """Queen Elizabeth II (1926 - 2022) made personal additions to plans for her funeral day, Buckingham Palace has said.
Among the touches requested by the Queen is the playing of a lament by her piper.
The state funeral at Westminster Abbey on Monday is likely to be one of the biggest single ceremonial events staged in Britain since World War Two."""

doc = nlp(text)

for token in doc:
    print(f"{token.text:<15} {token.idx:<4} {token.text_with_ws:<20}\
        {token.is_alpha:<6} {token.is_punct:<6} {token.is_space:<6} {token.shape_:<20} {token.is_stop}")


Queen           0    Queen                       1      0      0      Xxxxx                False
Elizabeth       6    Elizabeth                   1      0      0      Xxxxx                False
II              16   II                          1      0      0      XX                   False
(               19   (                           0      1      0      (                    False
1926            20   1926                        0      0      0      dddd                 False
-               25   -                           0      1      0      -                    False
2022            27   2022                        0      0      0      dddd                 False
)               31   )                           0      1      0      )                    False
made            33   made                        1      0      0      xxxx                 True
personal        38   personal                    1      0      0      xxxx                 False
additions       47   additions 

In [103]:
print("Index :        ", [token.i for token in doc[:10]])
print("Text :         ", [token.text for token in doc[:10]])
print("is_alpha :     ", [token.is_alpha for token in doc[:10]])
print("like_num :     ", [token.like_num for token in doc[:10]])
print("Base word :    ", [token.lemma_ for token in doc[:10]])

Index :         [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
Text :          ['Queen', 'Elizabeth', 'II', '(', '1926', '-', '2022', ')', 'made', 'personal']
is_alpha :      [True, True, True, False, False, False, False, False, True, True]
like_num :      [False, False, False, False, True, False, True, False, False, False]
Base word :     ['Queen', 'Elizabeth', 'II', '(', '1926', '-', '2022', ')', 'make', 'personal']


# Stopwords

In [104]:
import spacy 
spacy_stopwords = spacy.lang.en.stop_words.STOP_WORDS
print(len(spacy_stopwords))

list(spacy_stopwords)[:20]

326


['top',
 'his',
 'fifteen',
 'what',
 'onto',
 'although',
 'three',
 'however',
 'over',
 'or',
 'namely',
 'how',
 'hereupon',
 'do',
 'whose',
 'our',
 'be',
 'full',
 'fifty',
 'until']

In [105]:
nlp = spacy.load('en_core_web_sm')

text = """Queen Elizabeth II made personal additions to plans for her funeral day, Buckingham Palace has said.
Among the touches requested by the Queen is the playing of a lament by her piper.
The state funeral at Westminster Abbey on Monday is likely to be one of the biggest single ceremonial events staged in Britain since World War Two.
A national two-minute silence will be held as the service draws to a close just before midday.
The order of service, with its choice of music and readings, is expected to reflect more of the Queen's personal choices for the funeral.
Palace aides say the Queen had been consulted on all the arrangements.
Among those to confirm their attendance at the state funeral, which starts at 11:00 BST, are US President Joe Biden and French President Emmanuel Macron.
Alongside royalty, politicians and world leaders will be 200 people who were recognised in the Queen's Birthday Honours, including those who helped with the response to the coronavirus pandemic.
A former police officer awarded the George Cross after being shot 15 times is among those due to attend the state funeral.
Tony Gledhill, 84, said: "I'm incredibly moved to be involved."
The official organiser of Monday's events, the Earl Marshal, the Duke of Norfolk, said his role was "both humbling and daunting"."""

doc = nlp(text)

# tokens_wo_sw = []
# for token in doc:
#     if not token.is_stop:
#         tokens_wo_sw.append(token)
# #print(tokens_wo_sw)

print([token for token in doc if  token.is_stop])

[made, to, for, her, has, Among, the, by, the, is, the, of, a, by, her, The, at, on, is, to, be, one, of, the, in, since, Two, A, two, will, be, as, the, to, a, just, before, The, of, with, its, of, and, is, to, more, of, the, 's, for, the, say, the, had, been, on, all, the, Among, those, to, their, at, the, which, at, are, US, and, and, will, be, who, were, in, the, 's, those, who, with, the, to, the, A, former, the, after, being, is, among, those, due, to, the, I, 'm, to, be, The, of, 's, the, the, of, his, was, both, and]


# Lemmatization
Reduced form or root word is called lemma.
Changes tense, number, etc.

In [106]:
for token in doc:
    print(f"{token.text:<20}  {token.lemma_:<20}")

Queen                 Queen               
Elizabeth             Elizabeth           
II                    II                  
made                  make                
personal              personal            
additions             addition            
to                    to                  
plans                 plan                
for                   for                 
her                   her                 
funeral               funeral             
day                   day                 
,                     ,                   
Buckingham            Buckingham          
Palace                Palace              
has                   have                
said                  say                 
.                     .                   

                     
                   
Among                 among               
the                   the                 
touches               touch               
requested             request             
by         

# Word Frequency

In [107]:
from collections import Counter 

words = [token.text for token in doc if not token.is_stop and not token.is_punct and not token.text == '\n']

word_freq = Counter(words)

common_words = word_freq.most_common(10)
print(common_words)

[('Queen', 5), ('funeral', 5), ('said', 3), ('state', 3), ('personal', 2), ('Palace', 2), ('Monday', 2), ('events', 2), ('service', 2), ('President', 2)]


In [108]:
# Unique words
unique_words = {word for (word, freq) in word_freq.items() }
print(unique_words)

{'piper', 'Emmanuel', 'Elizabeth', 'awarded', 'role', 'starts', 'Buckingham', '11:00', 'including', 'Palace', 'choices', 'touches', 'biggest', 'Birthday', '200', 'reflect', 'Earl', 'requested', 'response', 'organiser', 'Gledhill', 'leaders', 'silence', 'held', 'Norfolk', 'Macron', 'police', 'times', 'service', 'events', 'official', 'consulted', 'additions', '84', 'attend', 'single', 'Alongside', 'royalty', 'Honours', 'draws', 'readings', 'BST', 'moved', 'ceremonial', 'officer', 'War', 'day', 'Westminster', 'plans', 'said', 'choice', 'aides', 'minute', 'Britain', 'pandemic', 'World', 'Tony', 'shot', 'French', 'lament', 'arrangements', 'confirm', 'politicians', 'expected', 'Cross', 'George', 'playing', 'state', 'attendance', 'people', 'Biden', 'personal', 'helped', 'humbling', 'order', 'daunting', 'staged', 'President', '15', 'music', 'close', 'likely', 'Marshal', 'incredibly', 'Duke', 'midday', 'II', 'Joe', 'coronavirus', 'funeral', 'national', 'world', 'involved', 'Monday', 'Abbey', 'Q

In [109]:
# Without reviewing stopwords
words = [token.text for token in doc ]

word_freq = Counter(words)

common_words = word_freq.most_common(10)
print(common_words)

[('the', 17), (',', 12), ('.', 11), ('\n', 10), ('to', 8), ('of', 7), ('Queen', 5), ('funeral', 5), ('is', 4), ('be', 4)]


# POS Tagging

Eight parts of speech:
1. Noun
2. Pronoun
3. verb
4. Adjective
5. Adverb
6. Preposition
7. COnjunction
8. Interjection

* pos_ : Coarse grained POS
* tag_ : Fine grained POS

In [110]:
for token in doc:
    print(f"{token.text :<20} {token.pos_:<20} {token.tag_:<20} {token.head.text:<20} {spacy.explain(token.pos_):<40} {spacy.explain(token.tag_)}")

Queen                PROPN                NNP                  II                   proper noun                              noun, proper singular
Elizabeth            PROPN                NNP                  II                   proper noun                              noun, proper singular
II                   PROPN                NNP                  made                 proper noun                              noun, proper singular
made                 VERB                 VBD                  said                 verb                                     verb, past tense
personal             ADJ                  JJ                   additions            adjective                                adjective (English), other noun-modifier (Chinese)
additions            NOUN                 NNS                  made                 noun                                     noun, plural
to                   ADP                  IN                   made                 adposition         

# Extracting category of words

In [111]:
nouns = []
adjectives = []
for token in doc:
    if token.pos_ == 'NOUN':
        nouns.append(token)
    if token.pos_ == 'ADJ':
        adjectives.append(token)

print(nouns, adjectives)

[additions, plans, day, touches, playing, lament, piper, state, funeral, events, minute, silence, service, close, midday, order, service, choice, music, readings, choices, funeral, aides, arrangements, attendance, state, funeral, royalty, politicians, world, leaders, people, response, coronavirus, police, officer, times, state, funeral, organiser, events, role] [personal, funeral, likely, biggest, single, ceremonial, national, more, personal, French, pandemic, former, due, official]


# Visualisation

In [112]:
from spacy import displacy
simple_text = "He is interested to learn Natural Language Processing"
simple_doc = nlp(simple_text)
# displacy.serve(simple_doc, style="dep")
displacy.render(simple_doc, style="dep", jupyter = True)


In [113]:
dep_tags = ["nsubj","acomp", "xcomp", "aux", "dobj"]

for tag in dep_tags:
    print(tag, " : ", spacy.explain(tag))

nsubj  :  nominal subject
acomp  :  adjectival complement
xcomp  :  open clausal complement
aux  :  auxiliary
dobj  :  direct object


In [114]:
'''
ORTH : Exact text of the token 
SHAPE : Transforms the token string to orthographic features
OP : operators. Using ? indicates that the pattern is optional.
'''
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)

roger_fed_text = "Roger Federer whose phoe number is (123) 456 789 retired from professional tennis yesterday."

def extract_phone_number(nlp_doc): 
    pattern = [
        {'ORTH': '('}, {'SHAPE': 'ddd'},
        {'ORTH': ')'}, {'SHAPE': 'ddd'},
        {'ORTH': '-', 'OP': '?'},
        {'SHAPE': 'ddd'}
    ]
    matcher.add('PHONE_NUMBER', [pattern], on_match=None)
    matches = matcher(nlp_doc)
    for match_id, start, end in matches:
        span = nlp_doc[start:end]
        return span.text 

rf_doc = nlp(roger_fed_text)
extract_phone_number(rf_doc)

'(123) 456 789'

# Shallow Processing / Chunking

In [115]:
text = "AWS Annual Conference Re:Invent happens every year in Las Vegas around mid November."

doc = nlp(text)
for chunk in doc.noun_chunks:
    print(chunk)

Annual Conference
Invent
Las Vegas
mid November


In [116]:
text = '''This park has everything you could ask for: a big hill for kids to roll down, an off-leash dog area (which is always full of adorable puppies), tennis courts, and sprawling green lawns with great views of the CN Tower. I take my son to the playground near the west end and sunbathe on the grass nearby. It’s big enough that all aspects of society are accommodated comfortably; frat boys playing ball games on one side, families on another, hippies and drum circles tend to gather near the south end by Queen st. It’s a fantastic place to appreciate Toronto’s cultural diversity.'''
doc = nlp(text)
for chunk in doc.noun_chunks:
    print(chunk)

This park
everything
you
a big hill
kids
-leash
which
adorable puppies
tennis courts
green lawns
great views
the CN Tower
I
my son
the playground
the west end
the grass
It
all aspects
society
frat boys
ball games
one side
families
another
hippies
drum circles
the south end
Queen st
It
a fantastic place
Toronto’s cultural diversity


# Verb Phrase Detection

In [117]:
%pip install textacy

You should consider upgrading via the '/Users/sauravbhattacharyya/Desktop/DataScience2022/venv/bin/python -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [118]:
import textacy
text = '''This park has everything you could ask for: a big hill for kids to roll down, an off-leash dog area (which is always full of adorable puppies), tennis courts, and sprawling green lawns with great views of the CN Tower. I take my son to the playground near the west end and sunbathe on the grass nearby. It’s big enough that all aspects of society are accommodated comfortably; frat boys playing ball games on one side, families on another, hippies and drum circles tend to gather near the south end by Queen st. It’s a fantastic place to appreciate Toronto’s cultural diversity.'''


pattern = r'(<VERB>+)'
doc = textacy.make_spacy_doc(text, "en_core_web_sm")

# TBD...
# for chunk in textacy.extract.regex_matches(doc, pattern):
#     print(chunk.text)

# for chunk in doc.noun_chunks:
#     print(chunk)



# NER

In [119]:
text = '''Roger Federer, James Brooks, Rafael Nadal and Venus Williams will be reckoned as the torch bearer of tennis in the early 21st century. Maria Sharapova belongs to the earlier generation.'''

doc = nlp(text)
for ent in doc.ents:
    print(f"{ent.text:<30} {ent.start_char} {ent.end_char} {ent.label_:<30} {spacy.explain(ent.label_)}")

Roger Federer                  0 13 PERSON                         People, including fictional
James Brooks                   15 27 PERSON                         People, including fictional
Rafael Nadal                   29 41 PERSON                         People, including fictional
Venus Williams                 46 60 PERSON                         People, including fictional
the early 21st century         111 133 DATE                           Absolute or relative dates or periods
Maria Sharapova                135 150 PERSON                         People, including fictional


In [120]:
displacy.render(doc, style='ent')

# REDACTING

In [121]:
from dataclasses import replace


text = '''Roger Federer, James Brooks, Rafael Nadal and Venus Williams will be reckoned as the torch bearer of tennis in the early 21st century. Maria Sharapova belongs to the earlier generation.'''

def replace_person_names(token):
    if token.ent_iob != 0 and token.ent_type_ == 'PERSON':
        return '[REDACTED]'
    return token.text 

def redact_names(nlp_doc):
    doc = nlp_doc 
    with doc.retokenize() as retokenizer:
        for ent in doc.ents:
            retokenizer.merge(ent)
    tokens = map(replace_person_names, nlp_doc)
    return ' '.join(tokens)

doc = nlp(text)
redact_names(doc)

'[REDACTED] , [REDACTED] , [REDACTED] and [REDACTED] will be reckoned as the torch bearer of tennis in the early 21st century . [REDACTED] belongs to the earlier generation .'

# SEMANTIC SIMILARITY & WORD VECTORS

In [122]:
nlp = spacy.load("en_core_web_md")

doc1 = nlp("What a bright morning")
doc2 = nlp("What a bright morning to start with")

doc1.similarity(doc2)

0.8325487248355983

# PIPELINE

In [123]:
nlp = spacy.load("en_core_web_sm")
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [124]:
from spacy.language import Language

@Language.component("my_custom_component")
def custom_component(doc):
    print(f"Ther are {len(doc)} tokens in the text.")
    return doc


In [125]:
nlp.add_pipe("my_custom_component")

nlp.pipe_names

['tok2vec',
 'tagger',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner',
 'my_custom_component']

In [126]:
doc = nlp("Where the mind is without fear and the head is held high")

Ther are 12 tokens in the text.


In [131]:
from spacy.language import Language
from spacy.tokens import Span
from spacy.util import filter_spans

nlp = spacy.load("en_core_web_sm")

@Language.component("titled_person")
def titled_person(doc):
    pattern = [ 
        {"IS_ALPHA" : True, "IS_TITLE": True},
        {"IS_STOP": True},
        {"IS_ALPHA" : True, "IS_TITLE": True}
    ]
    matcher = Matcher(nlp.vocab)
    matcher.add("TITLED PERSON", [pattern])

    matches = matcher(doc)
    matched_spans = [Span(doc, start, end, label="PERSON") for _, start, end in matches]

    filtered_matches = filter_spans(list(doc.ents) + matched_spans)
    doc.ents = filtered_matches

    return doc 

nlp.add_pipe("titled_person")

<function __main__.titled_person(doc)>

In [135]:
text = '''Roger Federer, James Brooks, Rafael Nadal & Venus Williams will be reckoned as the torch bearer of tennis in the early 21st century. Maria Sharapova belongs to the earlier generation.'''

doc = nlp(text)

In [136]:
doc.ents

(Roger Federer,
 James Brooks,
 Rafael Nadal & Venus Williams,
 the early 21st century,
 Maria Sharapova)

In [134]:
nlp.pipe_names

['tok2vec',
 'tagger',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner',
 'titled_person']