In [1]:
!pip install spacy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
!python -m spacy download en_core_web_sm


2023-05-03 16:52:40.354736: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m68.0 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [3]:

import spacy

nlp = spacy.load("en_core_web_sm")

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
with open ("/content/drive/MyDrive/Dataset/wiki_us.txt", "r") as f:
  text = f.read()

In [6]:
print(text)

The United States of America (U.S.A. or USA), commonly known as the United States (U.S. or US) or America, is a country primarily located in North America. It consists of 50 states, a federal district, five major unincorporated territories, 326 Indian reservations, and some minor possessions.[j] At 3.8 million square miles (9.8 million square kilometers), it is the world's third- or fourth-largest country by total area.[d] The United States shares significant land borders with Canada to the north and Mexico to the south, as well as limited maritime borders with the Bahamas, Cuba, and Russia.[22] With a population of more than 331 million people, it is the third most populous country in the world. The national capital is Washington, D.C., and the most populous city is New York.

Paleo-Indians migrated from Siberia to the North American mainland at least 12,000 years ago, and European colonization began in the 16th century. The United States emerged from the thirteen British colonies est

In [12]:
doc = nlp(text)

In [43]:
print(len(text))
print(len(doc))

3521
654


In [44]:
for token in text[0:10]:
  print(token)

T
h
e
 
U
n
i
t
e
d


In [45]:
for token in doc[0:10]:
  print(token)

The
United
States
of
America
(
U.S.A.
or
USA
)


In [None]:
#SpaCy automatically detects characters with period like U.S.A.
#SpaCy removed the punctuation mark like the parenthesis (U.S.A. or USA) 
#and treated them as individual characters

In [None]:
#If we split tokens by white space separation, it would look something like this

In [46]:
for token in text.split()[:10]:
  print (token)

The
United
States
of
America
(U.S.A.
or
USA),
commonly
known


In [None]:
# Sentence Boundary Detection (SBD) in NLP 
# is identification of sentences ina text.
# In english, abbreviation of word is also
#demarcated with period punctuation.
# So we can't just split and demarcate a sentence
# just separated by period.

In [47]:
for sent in doc.sents:
  print(sent)

The United States of America (U.S.A. or USA), commonly known as the United States (U.S. or US) or America, is a country primarily located in North America.
It consists of 50 states, a federal district, five major unincorporated territories, 326 Indian reservations, and some minor possessions.[j]
At 3.8 million square miles (9.8 million square kilometers), it is the world's third- or fourth-largest country by total area.[d]
The United States shares significant land borders with Canada to the north and Mexico to the south, as well as limited maritime borders with the Bahamas, Cuba, and Russia.[22]
With a population of more than 331 million people, it is the third most populous country in the world.
The national capital is Washington, D.C., and the most populous city is New York.


Paleo-Indians migrated from Siberia to the North American mainland at least 12,000 years ago, and European colonization began in the 16th century.
The United States emerged from the thirteen British colonies es

In [48]:
sentence1 = list(doc.sents)[0]
print(sentence1)

The United States of America (U.S.A. or USA), commonly known as the United States (U.S. or US) or America, is a country primarily located in North America.


In [49]:
token2 = sentence1[2]
print (token2)

States


In [50]:
#To extract text from token and not the token object
token2.text

'States'

In [51]:
#token2 object has attributes
#To get the left most token corresponding to token2
token2.left_edge

The

In [52]:
token2.right_edge

America

In [53]:
token2.ent_type

384

In [54]:
#The string corresponding to 384 is
token2.ent_type_

'GPE'

In [None]:
#GPE means Geo Political Entity

In [55]:
#iob is a specific kind of Named Entity code
#'I' tells that the word States is inside of a large entity.
token2.ent_iob_

'I'

In [56]:
#Lemma form or root form of the word
token2.lemma_

'States'

In [57]:
print(sentence1)

The United States of America (U.S.A. or USA), commonly known as the United States (U.S. or US) or America, is a country primarily located in North America.


In [24]:
print(sentence1[12])

known


In [25]:
#Lemma form is uninflected verb form of known
sentence1[12].lemma_

'know'

In [26]:
#morphological form of token2 is Singular
token2.morph

Number=Sing

In [27]:
#pos is part of speech and PROPN means proper noun
token2.pos_

'PROPN'

In [28]:
#what role it plays in sentence or dependency relation, here noun subject
token2.dep_

'nsubj'

In [29]:
#Language from doc object
token2.lang_

'en'

In [30]:
#Morphological form of sentence1 is Perfect Past Participle
sentence1[12].morph

Aspect=Perf|Tense=Past|VerbForm=Part

In [9]:
#Parts of speech in SpaCy and dependency parser
#Basic semantics of SpaCy
txt = "Mike enjoys playing football."
doc2 = nlp(txt)
print (doc2)

Mike enjoys playing football.


In [32]:
for token in doc2:
  print(token.text, token.pos_, token.dep_)

Mike PROPN nsubj
enjoys VERB ROOT
playing VERB xcomp
football NOUN dobj
. PUNCT punct


In [10]:
#To visually see the structure of doc2
from spacy import displacy
displacy.render(doc2, style="dep")

'<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xml:lang="en" id="5662cad356c147d9936bc4dd910acd8e-0" class="displacy" width="750" height="224.5" direction="ltr" style="max-width: none; height: 224.5px; color: #000000; background: #ffffff; font-family: Arial; direction: ltr">\n<text class="displacy-token" fill="currentColor" text-anchor="middle" y="134.5">\n    <tspan class="displacy-word" fill="currentColor" x="50">Mike</tspan>\n    <tspan class="displacy-tag" dy="2em" fill="currentColor" x="50">PROPN</tspan>\n</text>\n\n<text class="displacy-token" fill="currentColor" text-anchor="middle" y="134.5">\n    <tspan class="displacy-word" fill="currentColor" x="225">enjoys</tspan>\n    <tspan class="displacy-tag" dy="2em" fill="currentColor" x="225">VERB</tspan>\n</text>\n\n<text class="displacy-token" fill="currentColor" text-anchor="middle" y="134.5">\n    <tspan class="displacy-word" fill="currentColor" x="400">playing</tspan>\n    <tspan class="displ

In [13]:
#Named Entity Recognition or NER
#How to iterate over doc object
for ent in doc.ents:
  print(ent.text, ent.label_)

The United States of America GPE
U.S.A. GPE
USA GPE
the United States GPE
U.S. GPE
US GPE
America GPE
North America LOC
50 CARDINAL
five CARDINAL
326 CARDINAL
Indian NORP
3.8 million square miles QUANTITY
9.8 million square kilometers QUANTITY
fourth ORDINAL
The United States GPE
Canada GPE
Mexico GPE
Bahamas GPE
Cuba GPE
more than 331 million CARDINAL
third ORDINAL
Washington GPE
D.C. GPE
New York GPE
Paleo-Indians NORP
Siberia LOC
North American NORP
at least 12,000 years ago DATE
European NORP
the 16th century DATE
The United States GPE
thirteen CARDINAL
British NORP
the East Coast LOC
Great Britain GPE
the American Revolutionary War ORG
the late 18th century DATE
U.S. GPE
North America LOC
Native Americans NORP
1848 DATE
the United States GPE
United States GPE
the second half of the 19th century DATE
the American Civil War ORG
Spanish NORP
World War EVENT
U.S. GPE
World War II EVENT
the Cold War EVENT
the United States GPE
the Korean War EVENT
the Vietnam War EVENT
the Soviet Union

In [2]:
#SpaCy, Python and ML
import spacy

In [3]:
#Installing spacy medium model as it has word vectors stored within it
!python -m spacy download en_core_web_md

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-md==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.5.0/en_core_web_md-3.5.0-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.5.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [4]:
#Word vectors or word embeddings are numerical representations of words in multidimensional space through matrices
nlp = spacy.load("en_core_web_md")

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
with open ("/content/drive/MyDrive/Dataset/wiki_us.txt", "r") as f:
  text = f.read()

In [8]:
#creating doc object
doc = nlp(text)
sentence1 = list(doc.sents)[0]
print(sentence1)

The United States of America (U.S.A. or USA), commonly known as the United States (U.S. or US) or America, is a country primarily located in North America.


In [9]:
#How to use word vectors with spacy
#Say how the word "country" is similar with other words in our model

import numpy as np
your_word = "country"

ms = nlp.vocab.vectors.most_similar(
    np.asarray([nlp.vocab.vectors[nlp.vocab.strings[your_word]]]), n=10)
words = [nlp.vocab.strings[w] for w in ms[0][0]]
distances = ms[2]
print(words)


['country—0,467', 'nationâ\x80\x99s', 'countries-', 'continente', 'Carnations', 'pastille', 'бесплатно', 'Argents', 'Tywysogion', 'Teeters']


In [10]:
#Checking similarity between two sentences
doc1 = nlp("I like cheeseburgers and fries.")
doc2 = nlp("Fast food tastes very good.")

print (doc1, "<->", doc2, doc1.similarity(doc2))

I like cheeseburgers and fries. <-> Fast food tastes very good. 0.6667528705068858


In [20]:
cheeseburgers = doc1[2]
fries = doc1[4]

print(cheeseburgers, "<->", fries, cheeseburgers.similarity(fries))

cheeseburgers <-> fries 0.734847903251648


In [14]:
#Because of word embedding, apple and banana are in similar cluster of fruit

doc3 = nlp("I like oranges.")
doc4 = nlp("I like apples.")

print (doc3, "<->", doc4, doc3.similarity(doc4))

I like oranges. <-> I like apples. 0.9787321635750837


In [None]:
#Sample Spacy Pipeline for NER

#Input Sentence -> Entity Ruler -> Entity Linker -> Output (Sentence with Entities Annotated)

In [2]:
#We will make a blank spacy pipeline
#Creating blank spacy pipeline
import spacy
nlp = spacy.blank("en")

In [3]:
#Sentencizer lets you implement a simpler, rule-based strategy that doesn’t require a statistical model to be loaded
nlp.add_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x7f4a2caeb0c0>

In [4]:
#analyzing pipeline
nlp.analyze_pipes()

{'summary': {'sentencizer': {'assigns': ['token.is_sent_start', 'doc.sents'],
   'requires': [],
   'scores': ['sents_f', 'sents_p', 'sents_r'],
   'retokenizes': False}},
 'problems': {'sentencizer': []},
 'attrs': {'token.is_sent_start': {'assigns': ['sentencizer'], 'requires': []},
  'doc.sents': {'assigns': ['sentencizer'], 'requires': []}}}

In [7]:
#We will now create a more robust pipeline
nlp2 = spacy.load("en_core_web_sm")

In [8]:
nlp2.analyze_pipes()

{'summary': {'tok2vec': {'assigns': ['doc.tensor'],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'tagger': {'assigns': ['token.tag'],
   'requires': [],
   'scores': ['tag_acc'],
   'retokenizes': False},
  'parser': {'assigns': ['token.dep',
    'token.head',
    'token.is_sent_start',
    'doc.sents'],
   'requires': [],
   'scores': ['dep_uas',
    'dep_las',
    'dep_las_per_type',
    'sents_p',
    'sents_r',
    'sents_f'],
   'retokenizes': False},
  'attribute_ruler': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'lemmatizer': {'assigns': ['token.lemma'],
   'requires': [],
   'scores': ['lemma_acc'],
   'retokenizes': False},
  'ner': {'assigns': ['doc.ents', 'token.ent_iob', 'token.ent_type'],
   'requires': [],
   'scores': ['ents_f', 'ents_p', 'ents_r', 'ents_per_type'],
   'retokenizes': False}},
 'problems': {'tok2vec': [],
  'tagger': [],
  'parser': [],
  'attribute_ruler': [],
  'lemmatizer': [],
  'ner': []},
 'att

In [None]:
#Rule based spacy
#2 ways to add custom features to spacy language pipeline
#1- Rule based approach 2- Machine Learning Approach


In [1]:
import spacy
nlp = spacy.load("en_core_web_sm")
text = "West Chestertenfieldville was referenced in Mr. Deeds."

In [2]:
doc = nlp(text)

In [3]:
#Finding the label - We expect the label to be wrong so that we can use ruler to fix it
for ent in doc.ents:
  print (ent.text, ent.label_)

West Chestertenfieldville GPE
Deeds PERSON


In [4]:
ruler = nlp.add_pipe("entity_ruler")

In [5]:
#entity_ruler is added
nlp.analyze_pipes()

{'summary': {'tok2vec': {'assigns': ['doc.tensor'],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'tagger': {'assigns': ['token.tag'],
   'requires': [],
   'scores': ['tag_acc'],
   'retokenizes': False},
  'parser': {'assigns': ['token.dep',
    'token.head',
    'token.is_sent_start',
    'doc.sents'],
   'requires': [],
   'scores': ['dep_uas',
    'dep_las',
    'dep_las_per_type',
    'sents_p',
    'sents_r',
    'sents_f'],
   'retokenizes': False},
  'attribute_ruler': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'lemmatizer': {'assigns': ['token.lemma'],
   'requires': [],
   'scores': ['lemma_acc'],
   'retokenizes': False},
  'ner': {'assigns': ['doc.ents', 'token.ent_iob', 'token.ent_type'],
   'requires': [],
   'scores': ['ents_f', 'ents_p', 'ents_r', 'ents_per_type'],
   'retokenizes': False},
  'entity_ruler': {'assigns': ['doc.ents', 'token.ent_type', 'token.ent_iob'],
   'requires': [],
   'scores': ['ents_f', 'ent

In [12]:
#we will add pattern to the pipeline
patterns = [
    {"label": "TV Show", "pattern": "Mr. Deeds"}
]

In [13]:
ruler.add_patterns(patterns)

In [14]:
#ner comes before entity_ruler in pipeline. Thats why the label is not changed
doc2 = nlp(text)
for ent in doc2.ents:
  print(ent.text, ent.label_)

West Chestertenfieldville GPE
Deeds PERSON


In [15]:
#We have to put our ruler before ner pipeling to reflect the change
nlp2 = spacy.load("en_core_web_sm")

ruler = nlp2.add_pipe("entity_ruler", before="ner")

ruler.add_patterns(patterns)

In [16]:
doc = nlp2(text)

for ent in doc.ents:
  print (ent.text, ent.label_)

West Chestertenfieldville GPE
Mr. Deeds TV Show


In [18]:
#We can see Mr. Deeds is now correctly labelled to TV Show
#Lets look at nlp2 pipe
nlp2.analyze_pipes()

{'summary': {'tok2vec': {'assigns': ['doc.tensor'],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'tagger': {'assigns': ['token.tag'],
   'requires': [],
   'scores': ['tag_acc'],
   'retokenizes': False},
  'parser': {'assigns': ['token.dep',
    'token.head',
    'token.is_sent_start',
    'doc.sents'],
   'requires': [],
   'scores': ['dep_uas',
    'dep_las',
    'dep_las_per_type',
    'sents_p',
    'sents_r',
    'sents_f'],
   'retokenizes': False},
  'attribute_ruler': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'lemmatizer': {'assigns': ['token.lemma'],
   'requires': [],
   'scores': ['lemma_acc'],
   'retokenizes': False},
  'entity_ruler': {'assigns': ['doc.ents', 'token.ent_type', 'token.ent_iob'],
   'requires': [],
   'scores': ['ents_f', 'ents_p', 'ents_r', 'ents_per_type'],
   'retokenizes': False},
  'ner': {'assigns': ['doc.ents', 'token.ent_iob', 'token.ent_type'],
   'requires': [],
   'scores': ['ents_f', 'ent

In [None]:
#Now we see that our entity_ruler is set before ner moodel

In [None]:
#Spacy Matcher

#The key difference between entity ruler and a matcher is how data is extracted

#Entity ruler is used when the thing we try to extract is something that has a label corresponds to it within the entity

#We will use matcher when something is not an entity type but something that is a structure
#within a text that will help us extract information




In [7]:
import spacy
from spacy.matcher import Matcher

In [8]:
#Load small english model
nlp = spacy.load("en_core_web_sm")

In [4]:
#We will try to find email address from text
#Say we want to extract everything looking like an email
matcher = Matcher(nlp.vocab)
pattern = [{"LIKE_EMAIL":True}]
matcher.add("EMAIL_ADDRESS", [pattern])

In [5]:
doc = nlp("This is an email address: sarhal@yahoo.com")
matches = matcher(doc)

In [6]:
#We get a list containing set of tuples with 3 indices
#Index 0 is Lexeme, then start token, then end token
print(matches)

[(16571425990740197027, 6, 7)]


In [7]:
#We will go to NLP vocab where the integer lies and find what it corresponds to
#Label we have given is added to nlp vocab with unique lexeme
print(nlp.vocab[matches[0][0]].text)

EMAIL_ADDRESS


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
with open ("/content/drive/MyDrive/Dataset/wiki_mlk.txt", "r") as f:
  text = f.read()

In [3]:
print(text)

Martin Luther King Jr. (born Michael King Jr.; January 15, 1929 – April 4, 1968) was an American Baptist minister and activist who was one of the most prominent leaders in the civil rights movement from 1955 until his assassination in 1968. A Black church leader and a son of early civil rights activist and minister Martin Luther King Sr., King advanced civil rights for people of color in the United States through nonviolence and civil disobedience. Inspired by his Christian beliefs and the nonviolent activism of Mahatma Gandhi, he led targeted, nonviolent resistance against Jim Crow laws and other forms of discrimination in the United States.

King participated in and led marches for the right to vote, desegregation, labor rights, and other civil rights.[1] He oversaw the 1955 Montgomery bus boycott and later became the first president of the Southern Christian Leadership Conference (SCLC). As president of the SCLC, he led the unsuccessful Albany Movement in Albany, Georgia, and helped

In [9]:
#We will create a new model and grab all proper nouns
import spacy
nlp = spacy.load("en_core_web_sm")

In [10]:
matcher = Matcher(nlp.vocab)
pattern = [{"POS": "PROPN"}]
matcher.add("PROPER_NOUN", [pattern])
doc = nlp(text)
matches = matcher(doc)
print(len(matches))
for match in matches[:10]:
  print(match, doc[match[1]:match[2] ])

#The lexxim with token extracted is shown

111
(451313080118390996, 0, 1) Martin
(451313080118390996, 1, 2) Luther
(451313080118390996, 2, 3) King
(451313080118390996, 3, 4) Jr.
(451313080118390996, 6, 7) Michael
(451313080118390996, 7, 8) King
(451313080118390996, 8, 9) Jr.
(451313080118390996, 10, 11) January
(451313080118390996, 15, 16) April
(451313080118390996, 23, 24) Baptist


In [11]:
#We can make the token multi word
matcher = Matcher(nlp.vocab)
pattern = [{"POS": "PROPN", "OP": "+"}]
matcher.add("PROPER_NOUN", [pattern], greedy="LONGEST")
doc = nlp(text)
matches = matcher(doc)
print(len(matches))
for match in matches[:10]:
  print(match, doc[match[1]:match[2] ])

66
(451313080118390996, 62, 67) Martin Luther King Sr.
(451313080118390996, 511, 516) Martin Luther King Jr. Day
(451313080118390996, 566, 571) Martin Luther King Jr. Memorial
(451313080118390996, 0, 4) Martin Luther King Jr.
(451313080118390996, 157, 161) Southern Christian Leadership Conference
(451313080118390996, 6, 9) Michael King Jr.
(451313080118390996, 238, 241) Civil Rights Act
(451313080118390996, 244, 247) Voting Rights Act
(451313080118390996, 252, 255) Fair Housing Act
(451313080118390996, 314, 317) J. Edgar Hoover


In [1]:
#Spacy Custom Components - changes the doc object along the spacy pipeline
import spacy

nlp = spacy.load("en_core_web_sm")

doc = nlp("Britain is a place. Mary is a doctor.")

In [2]:
for ent in doc.ents:
  print (ent.text, ent.label_)

Britain GPE
Mary PERSON


In [3]:
#Say we want LOC label instead of the GPE label

from spacy.language import Language

@Language.component("remove_gpe")

def remove_gpe(doc):
  original_ents = list(doc.ents)
  for ent in doc.ents:
    if ent.label_ == "GPE":
      original_ents.remove(ent)
  doc.ents = original_ents
  return (doc)

In [4]:
nlp.add_pipe("remove_gpe")

<function __main__.remove_gpe(doc)>

In [5]:
nlp.analyze_pipes() #remove_gpe pipe would be added

{'summary': {'tok2vec': {'assigns': ['doc.tensor'],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'tagger': {'assigns': ['token.tag'],
   'requires': [],
   'scores': ['tag_acc'],
   'retokenizes': False},
  'parser': {'assigns': ['token.dep',
    'token.head',
    'token.is_sent_start',
    'doc.sents'],
   'requires': [],
   'scores': ['dep_uas',
    'dep_las',
    'dep_las_per_type',
    'sents_p',
    'sents_r',
    'sents_f'],
   'retokenizes': False},
  'attribute_ruler': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False},
  'lemmatizer': {'assigns': ['token.lemma'],
   'requires': [],
   'scores': ['lemma_acc'],
   'retokenizes': False},
  'ner': {'assigns': ['doc.ents', 'token.ent_iob', 'token.ent_type'],
   'requires': [],
   'scores': ['ents_f', 'ents_p', 'ents_r', 'ents_per_type'],
   'retokenizes': False},
  'remove_gpe': {'assigns': [],
   'requires': [],
   'scores': [],
   'retokenizes': False}},
 'problems': {'tok2vec': [],
  

In [6]:
#Testing the change in pipeline

doc = nlp("Britain is a place. Mary is a doctor.")

for ent in doc.ents:
  print(ent.text, ent.label_)

Mary PERSON
