## NER & De-Identification Using SPACY
### By RUTUJA SHINDE 

In [117]:
import spacy

In [118]:
import en_core_web_sm

In [119]:
nlp = en_core_web_sm.load()

In [120]:
from spacy import displacy

## WebScarpping of News Data

In [121]:
import bs4 as bs                                                           # BeautifulSoup
import urllib.request
import re

In [122]:
def _scrape_webpage(url):
       
    scraped_textdata = urllib.request.urlopen(url)
    textdata = scraped_textdata.read()
    parsed_textdata = bs.BeautifulSoup(textdata,'lxml')
    paragraphs = parsed_textdata.find_all('p')
    formated_text = ""

    for para in paragraphs:
        formated_text += para.text
    
    return formated_text

In [123]:
mytext = _scrape_webpage('https://www.nytimes.com/2020/03/31/world/coronavirus-live-news-updates.html?action=click&module=Spotlight&pgtype=Homepage')

In [124]:
len(mytext)                                                                #length of the web scrapped data

49391

## NER Using SPACY

In [125]:
webtext=nlp(mytext)
print(mytext[:1500])                                                #display the first 1500 words of document

AdvertisementSupported byThe scientists leading the administration’s fight estimated the virus could kill between 100,000 and 240,000 Americans. New data suggests many as 25 percent of infected people may not show symptoms.This briefing has ended. Read our global live coverage on the coronavirus pandemic here.The top government scientists battling the coronavirus estimated Tuesday that the deadly pathogen could kill between 100,000 and 240,000 Americans, in spite of the social distancing measures that have closed schools, banned large gatherings, limited travel and forced people to stay in their homes.Dr. Anthony S. Fauci, the nation’s leading infectious disease expert, and Dr. Deborah L. Birx, who is coordinating the coronavirus response, displayed that grim projection at the White House on Tuesday, calling it “our real number” but pledging to do everything possible to reduce those numbers even further.The conclusions generally match those from similar models by public health research

In [126]:
len(webtext.ents)                             #number of entities in the article

592

In [127]:
for ent in webtext.ents: 
    print(ent.text, ent.start_char, ent.end_char, ent.label_) 

between 100,000 and 240,000 106 133 CARDINAL
Americans 134 143 NORP
25 percent 171 181 PERCENT
Tuesday 376 383 DATE
between 100,000 and 240,000 420 447 CARDINAL
Americans 448 457 NORP
Anthony S. Fauci 613 629 PERSON
Deborah L. Birx 687 702 PERSON
the White House 784 799 FAC
Tuesday 803 810 DATE
Fauci 1059 1064 PERSON
Birx 1073 1077 PERSON
Americans 1128 1137 NORP
Trump 1275 1280 PERSON
Sunday 1289 1295 DATE
30 days 1309 1316 DATE
Fauci 1431 1436 PERSON
Birx 1445 1449 PERSON
Americans 1597 1606 NORP
Tuesday 1628 1635 DATE
first 1644 1649 ORDINAL
Trump 1664 1669 PERSON
the past several weeks 1823 1845 DATE
Birx 1851 1855 PERSON
Fauci 1864 1869 PERSON
Tuesday afternoon 2030 2047 TIME
at least 173,741 2049 2065 CARDINAL
Washington 2098 2108 GPE
D.C. 2110 2114 GPE
four 2120 2124 CARDINAL
U.S. 2125 2129 GPE
New York Times 2194 2208 ORG
At least 3,433 2219 2233 CARDINAL
American 2292 2300 NORP
two weeks 2389 2398 DATE
two weeks 2654 2663 DATE
Italy 2996 3001 GPE
New York 3125 3133 GPE
the nex

In [128]:
dict([(str(x),x.label_)for x in webtext.ents])          #all named entities of document and put in dictioanry

{'between 100,000 and 240,000': 'CARDINAL',
 'Americans': 'NORP',
 '25 percent': 'PERCENT',
 'Tuesday': 'DATE',
 'Anthony S. Fauci': 'PERSON',
 'Deborah L. Birx': 'PERSON',
 'the White House': 'ORG',
 'Fauci': 'PERSON',
 'Birx': 'PERSON',
 'Trump': 'PERSON',
 'Sunday': 'DATE',
 '30 days': 'DATE',
 'first': 'ORDINAL',
 'the past several weeks': 'DATE',
 'Tuesday afternoon': 'TIME',
 'at least 173,741': 'CARDINAL',
 'Washington': 'GPE',
 'D.C.': 'GPE',
 'four': 'CARDINAL',
 'U.S.': 'GPE',
 'New York Times': 'ORG',
 'At least 3,433': 'CARDINAL',
 'American': 'NORP',
 'two weeks': 'DATE',
 'Italy': 'GPE',
 'New York': 'GPE',
 'the next several days to a week or so': 'DATE',
 '100,000 to 200,000': 'CARDINAL',
 '100 percent': 'PERCENT',
 'daily': 'DATE',
 'White House': 'ORG',
 'few weeks': 'DATE',
 'three': 'CARDINAL',
 'another month': 'DATE',
 'the United States': 'GPE',
 'larger than 10': 'CARDINAL',
 'Anthony Fauci': 'PERSON',
 'the National Institute of Allergy and Infectious Diseases'

In [129]:
from collections import Counter

In [130]:
labels = [x.label_ for x in webtext.ents]                               #count each named entity in the document
Counter(labels)

Counter({'CARDINAL': 75,
         'NORP': 44,
         'PERCENT': 7,
         'DATE': 86,
         'PERSON': 135,
         'FAC': 2,
         'ORDINAL': 11,
         'TIME': 18,
         'GPE': 130,
         'ORG': 56,
         'QUANTITY': 5,
         'MONEY': 8,
         'LOC': 10,
         'LAW': 3,
         'WORK_OF_ART': 2})

In [131]:
items=[x.text for x in webtext.ents]         # 30 most frequent named entities in the document
Counter(items).most_common(30)

[('Trump', 22),
 ('Tuesday', 18),
 ('China', 13),
 ('Americans', 10),
 ('American', 10),
 ('the United States', 9),
 ('first', 7),
 ('one', 7),
 ('Iran', 7),
 ('Europe', 6),
 ('U.S.', 5),
 ('two weeks', 5),
 ('European', 5),
 ('Chris', 5),
 ('Fauci', 4),
 ('four', 4),
 ('Italy', 4),
 ('Germany', 4),
 ('Birx', 3),
 ('daily', 3),
 ('three', 3),
 ('weeks', 3),
 ('America', 3),
 ('Spain', 3),
 ('Igualada', 3),
 ('Las Vegas', 3),
 ('Congress', 3),
 ('Singapore', 3),
 ('N95', 3),
 ('Afghanistan', 3)]

In [133]:
sentences=[x for x in webtext.sents]                                    #sentence tokenization
len_sent=len(sentences)                                                 #number of total sentence tokens
import random                                                           #generate random number k
k=round(random.random()*len_sent)
k
                                                                    #print 3 consecutive sentences
first_sent=sentences[k]
print(first_sent)
second_sent=sentences[k+1]
print(second_sent)
third_sent=sentences[k+2]
print(third_sent)


In Europe, Poland is using an app to track the movements of an estimated 10,000 people who are under home quarantine because they either tested positive for coronavirus or recently returned from abroad.
Users are required to upload selfies several times a day to prove that they are following the rules of self-isolation, and any lapse in compliance results in an alert being sent to the police.
In Croatia, rights groups are pushing back against proposed legislation to monitor cellphones, saying that it would be “an unnecessary violation of human rights.


In [134]:
d=str(first_sent)+ str(second_sent)+str(third_sent)                     #store all 3 consecutive snetences in one variable
d                                                                       #after converting them to string

'In Europe, Poland is using an app to track the movements of an estimated 10,000 people who are under home quarantine because they either tested positive for coronavirus or recently returned from abroad.Users are required to upload selfies several times a day to prove that they are following the rules of self-isolation, and any lapse in compliance results in an alert being sent to the police.In Croatia, rights groups are pushing back against proposed legislation to monitor cellphones, saying that it would be “an unnecessary violation of human rights.'

In [135]:
doc = nlp(d)                                                #apply spacy function to all 3 sentences and store in variable
doc

In Europe, Poland is using an app to track the movements of an estimated 10,000 people who are under home quarantine because they either tested positive for coronavirus or recently returned from abroad.Users are required to upload selfies several times a day to prove that they are following the rules of self-isolation, and any lapse in compliance results in an alert being sent to the police.In Croatia, rights groups are pushing back against proposed legislation to monitor cellphones, saying that it would be “an unnecessary violation of human rights.

In [136]:
[(x.orth_ , x.pos_, x.lemma_)for x in [y                                #POS and Lemmatization of 3 consecutive sentences
                                      for y 
                                      in doc
                                      if not y.is_stop and y.pos_ != 'PUNCT']]

[('Europe', 'PROPN', 'Europe'),
 ('Poland', 'PROPN', 'Poland'),
 ('app', 'NOUN', 'app'),
 ('track', 'VERB', 'track'),
 ('movements', 'NOUN', 'movement'),
 ('estimated', 'VERB', 'estimate'),
 ('10,000', 'NUM', '10,000'),
 ('people', 'NOUN', 'people'),
 ('home', 'NOUN', 'home'),
 ('quarantine', 'NOUN', 'quarantine'),
 ('tested', 'VERB', 'test'),
 ('positive', 'ADJ', 'positive'),
 ('coronavirus', 'PROPN', 'coronavirus'),
 ('recently', 'ADV', 'recently'),
 ('returned', 'VERB', 'return'),
 ('abroad', 'ADV', 'abroad'),
 ('Users', 'NOUN', 'user'),
 ('required', 'VERB', 'require'),
 ('upload', 'VERB', 'upload'),
 ('selfies', 'NOUN', 'selfie'),
 ('times', 'NOUN', 'time'),
 ('day', 'NOUN', 'day'),
 ('prove', 'VERB', 'prove'),
 ('following', 'VERB', 'follow'),
 ('rules', 'NOUN', 'rule'),
 ('self', 'NOUN', 'self'),
 ('isolation', 'NOUN', 'isolation'),
 ('lapse', 'NOUN', 'lapse'),
 ('compliance', 'NOUN', 'compliance'),
 ('results', 'NOUN', 'result'),
 ('alert', 'NOUN', 'alert'),
 ('sent', 'VERB', '

In [137]:
from pprint import pprint

In [139]:
kth_sent=nlp(str(first_sent))
pprint([(x.text,x.label_)for x in kth_sent.ents])                                  #named entities for Kth(first) sentence 

[('Europe', 'LOC'), ('Poland', 'GPE'), ('an estimated 10,000', 'CARDINAL')]


In [140]:
displacy.render(kth_sent,jupyter=True , style='ent')                            #visualize entities for Kth(first) sentence

In [141]:
displacy.render(kth_sent,jupyter=True , style='dep', options={'distance': 100})     #visualize dependencies for Kth sentence

In [142]:
displacy.render(webtext,jupyter=True , style='ent')                         # all entities in document

## De-Identification ##

In [143]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc1 = nlp(mytext)

### Functions to replace the PERSON entity with the word [REDACTED] in the document

In [144]:
def replace_per_name(token):
    if token.ent_iob !=0 and token.ent_type_=='PERSON':
        return '[REDACTED]'
    return token.string

In [145]:
def redact_name(nlp_doc):
    for ent in nlp_doc.ents:
        ent.merge()
        tokens=map(replace_per_name, nlp_doc)
        return ''.join(tokens)

In [146]:
redact= redact_name(doc1)
print(redact)

AdvertisementSupported byThe scientists leading the administration’s fight estimated the virus could kill between 100,000 and 240,000 Americans. New data suggests many as 25 percent of infected people may not show symptoms.This briefing has ended. Read our global live coverage on the coronavirus pandemic here.The top government scientists battling the coronavirus estimated Tuesday that the deadly pathogen could kill between 100,000 and 240,000 Americans, in spite of the social distancing measures that have closed schools, banned large gatherings, limited travel and forced people to stay in their homes.Dr. [REDACTED][REDACTED][REDACTED], the nation’s leading infectious disease expert, and Dr. [REDACTED][REDACTED][REDACTED], who is coordinating the coronavirus response, displayed that grim projection at the White House on Tuesday, calling it “our real number” but pledging to do everything possible to reduce those numbers even further.The conclusions generally match those from similar mod

In [147]:
displacy.render(nlp(redact),jupyter=True , style='ent')                     #replace PERSON entity with REDACTED