# Named Entity Recognition
## Importing Spacy in English.

In [2]:
# run the next line only once if needed
#!python -m spacy download en_core_web_lg 
import spacy
nlp = spacy.load("en_core_web_lg")


## Getting data from website

In [3]:
from bs4 import BeautifulSoup
import html5lib
import requests
import re
def url_to_string(url):
    res = requests.get(url)
    html = res.text
    soup = BeautifulSoup(html, 'html5lib')
    for script in soup(["script", "style", 'aside']):
        script.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))
link1 = url_to_string('https://medium.com/wise-well/the-brain-science-behind-aging-and-forgetting-1954c5c094ab')
link2 = url_to_string('https://medium.com/@timberners_lee/marking-the-webs-35th-birthday-an-open-letter-ebb410cc7d42')
article1 = nlp(link1)
article2 = nlp(link2)
len(article1.ents)
len(article2.ents)


40

## Visualizing NERs

In [6]:
from spacy import displacy

displacy.render(article1, style='ent', jupyter=True)

In [7]:
displacy.render(article2, style='ent', jupyter=True)

## Count of each NER

In [34]:
from collections import Counter
labels1 = [x.label_ for x in article1.ents]
Counter(labels1)

Counter({'PERSON': 7,
         'ORG': 3,
         'WORK_OF_ART': 3,
         'DATE': 3,
         'LAW': 1,
         'NORP': 1})

In [33]:
labels2 = [x.label_ for x in article2.ents]
Counter(labels2)

Counter({'DATE': 11,
         'ORG': 9,
         'PERSON': 8,
         'ORDINAL': 5,
         'CARDINAL': 2,
         'GPE': 2,
         'LAW': 2,
         'NORP': 1})

In [12]:
items = [x.text for x in article1.ents]
Counter(items).most_common(5)

[('The Brain Science Behind Aging and Forgetting |', 1),
 ('Kathleen Murphy', 1),
 ('Brain Science Behind Aging', 1),
 ('ForgettingAre', 1),
 ('youKathleen Murphy·FollowPublished inWise & Well·5 min', 1)]

In [11]:
items = [x.text for x in article2.ents]
Counter(items).most_common(5)

[('35th', 3),
 ('Flanders', 2),
 ('Tim Berners-Lee', 1),
 ('Mar, 2024 | MediumOpen', 1),
 ('read·Mar 12', 1)]

In [15]:
sentences1 = [x for x in article1.sents]
print(sentences1)

[The Brain Science Behind Aging and Forgetting | by Kathleen Murphy | Wise & Well | Mar, 2024 | MediumOpen in appSign upSign inWriteSign upSign inMastodonMember-only storyThe Brain Science Behind Aging and ForgettingAre younger people smarter?, Are older people wiser?, Living longer affects the brain, but exactly how may surprise youKathleen Murphy·FollowPublished inWise & Well·5 min read·Mar 7, 2024--60SharePurchased iStock image — DrAfter123Solomon Shereshevsky, a Russian journalist in the 1920s, was known as “The Man Who Could Not Forget.”, He could effortlessly recall long lists of numbers or nonsensical information, books of poetry in languages he didn’t know, and complex scientific formulas he never learned., But his superpower came at a price., He was burdened by irrelevant data and struggled to prioritize, filter, and forget what he no longer needed., In his later years, desperate to purge his cluttered mind, Shereshevsky drank himself to death., His story serves as a cautionar

In [16]:
sentences2 = [x for x in article2.sents]
print(sentences2)

[Marking the Web’s 35th Birthday: An Open Letter | by Tim Berners-Lee | Mar, 2024 | MediumOpen in appSign upSign inWriteSign upSign inMarking the Web’s 35th Birthday: An Open LetterTim Berners-Lee·Follow5 min read·Mar 12, 2024--151ListenShareOriginal HopeThree and a half decades ago, when I invented the web, its trajectory was impossible to imagine., There was no roadmap to predict the course of its evolution, it was a captivating odyssey filled with unforeseen opportunities and challenges., Underlying its whole infrastructure was the intention to allow for collaboration, foster compassion and generate creativity — what I term the 3 C’s., It was to be a tool to empower humanity., The first decade of the web fulfilled that promise — the web was decentralised with a long-tail of content and options, it created small, more localised communities, provided individual empowerment and fostered huge value., Yet in the past decade, instead of embodying these values, the web has instead played a

In [17]:
displacy.render(nlp(str(sentences1)), jupyter=True, style='ent')

In [18]:
displacy.render(nlp(str(sentences2)), jupyter=True, style='ent')

## Type of words in the sentence

In [19]:
[(x.orth_,x.pos_, x.lemma_) for x in [y 
                                      for y
                                      in nlp(str(sentences1)) 
                                      if not y.is_stop and y.pos_ != 'PUNCT']]


[('Brain', 'PROPN', 'Brain'),
 ('Science', 'PROPN', 'Science'),
 ('Aging', 'NOUN', 'aging'),
 ('Forgetting', 'VERB', 'forget'),
 ('|', 'VERB', '|'),
 ('Kathleen', 'PROPN', 'Kathleen'),
 ('Murphy', 'PROPN', 'Murphy'),
 ('|', 'VERB', '|'),
 ('Wise', 'PROPN', 'Wise'),
 ('&', 'CCONJ', '&'),
 ('Mar', 'PROPN', 'Mar'),
 ('2024', 'NUM', '2024'),
 ('|', 'SYM', '|'),
 ('MediumOpen', 'PROPN', 'MediumOpen'),
 ('appSign', 'NOUN', 'appsign'),
 ('upSign', 'ADJ', 'upsign'),
 ('inWriteSign', 'PROPN', 'inWriteSign'),
 ('upSign', 'ADJ', 'upsign'),
 ('inMastodonMember', 'X', 'inmastodonmember'),
 ('storyThe', 'DET', 'storythe'),
 ('Brain', 'PROPN', 'Brain'),
 ('Science', 'PROPN', 'Science'),
 ('Aging', 'NOUN', 'aging'),
 ('ForgettingAre', 'PROPN', 'ForgettingAre'),
 ('younger', 'ADJ', 'young'),
 ('people', 'NOUN', 'people'),
 ('smarter', 'ADJ', 'smart'),
 ('older', 'ADJ', 'old'),
 ('people', 'NOUN', 'people'),
 ('wiser', 'ADJ', 'wise'),
 ('Living', 'NOUN', 'living'),
 ('longer', 'ADV', 'long'),
 ('affects

In [20]:
[(x.orth_,x.pos_, x.lemma_) for x in [y 
                                      for y
                                      in nlp(str(sentences2)) 
                                      if not y.is_stop and y.pos_ != 'PUNCT']]


[('[', 'X', '['),
 ('Marking', 'VERB', 'mark'),
 ('Web', 'NOUN', 'web'),
 ('35th', 'ADJ', '35th'),
 ('Birthday', 'NOUN', 'birthday'),
 ('Open', 'ADJ', 'open'),
 ('Letter', 'PROPN', 'Letter'),
 ('|', 'VERB', '|'),
 ('Tim', 'PROPN', 'Tim'),
 ('Berners', 'PROPN', 'Berners'),
 ('Lee', 'PROPN', 'Lee'),
 ('|', 'VERB', '|'),
 ('Mar', 'PROPN', 'Mar'),
 ('2024', 'NUM', '2024'),
 ('|', 'SYM', '|'),
 ('MediumOpen', 'PROPN', 'MediumOpen'),
 ('appSign', 'NOUN', 'appsign'),
 ('upSign', 'ADJ', 'upsign'),
 ('inWriteSign', 'PROPN', 'inWriteSign'),
 ('upSign', 'ADJ', 'upsign'),
 ('inMarking', 'VERB', 'inmarke'),
 ('Web', 'NOUN', 'web'),
 ('35th', 'ADJ', '35th'),
 ('Birthday', 'NOUN', 'birthday'),
 ('Open', 'ADJ', 'open'),
 ('LetterTim', 'NOUN', 'lettertim'),
 ('Berners', 'PROPN', 'Berners'),
 ('Lee·Follow5', 'PROPN', 'Lee·Follow5'),
 ('min', 'PROPN', 'min'),
 ('read·Mar', 'PROPN', 'read·Mar'),
 ('12', 'NUM', '12'),
 ('2024', 'NUM', '2024'),
 ('-151ListenShareOriginal', 'ADJ', '-151listenshareoriginal'),

## Sentence dependency tree

In [24]:
displacy.render(nlp(str(sentences1[10])), style='dep', jupyter = True, options = {'distance': 120})

In [22]:
displacy.render(nlp(str(sentences2[20])), style='dep', jupyter = True, options = {'distance': 120})

# NER in french

In [42]:
! python -m spacy download fr_core_news_sm

Collecting fr-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.7.0/fr_core_news_sm-3.7.0-py3-none-any.whl (16.3 MB)
     ---------------------------------------- 0.0/16.3 MB ? eta -:--:--
     ---------------------------------------- 0.0/16.3 MB ? eta -:--:--
      --------------------------------------- 0.2/16.3 MB 2.9 MB/s eta 0:00:06
     - -------------------------------------- 0.7/16.3 MB 5.4 MB/s eta 0:00:03
     --- ------------------------------------ 1.2/16.3 MB 7.2 MB/s eta 0:00:03
     ---- ----------------------------------- 1.9/16.3 MB 8.7 MB/s eta 0:00:02
     ------ --------------------------------- 2.7/16.3 MB 10.3 MB/s eta 0:00:02
     --------- ------------------------------ 3.7/16.3 MB 12.0 MB/s eta 0:00:02
     ----------- ---------------------------- 4.7/16.3 MB 13.2 MB/s eta 0:00:01
     -------------- ------------------------- 6.0/16.3 MB 14.8 MB/s eta 0:00:01
     ------------------ -----------------

In [43]:
nlp = spacy.load("fr_core_news_sm")

In [44]:
link3 = url_to_string('https://www.lefigaro.fr/')
article3 = nlp(link3)
len(article3.ents)

551

In [45]:
displacy.render(article3, style='ent', jupyter=True)

In [46]:
labels3 = [x.label_ for x in article3.ents]
Counter(labels3)

Counter({'LOC': 163, 'MISC': 145, 'ORG': 125, 'PER': 118})

In [47]:
items = [x.text for x in article3.ents]
Counter(items).most_common(5)

[('l’', 34), ('s’', 9), ('d’', 7), ('Ukraine', 6), ('Le Figaro', 6)]

In [48]:
sentences3 = [x for x in article3.sents]
print(sentences3)

[                                                                                                                                                                                                                         , Le Figaro, - Actualité en direct et informations en continu                                                                            , Aller au contenu                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              , Menu, VENTE, FLASH, 4,49€/mois, pendant 12 moisRechercherNouveauLe Figaro Cuisine50

In [49]:
displacy.render(nlp(str(sentences3)), jupyter=True, style='ent')

In [50]:
[(x.orth_,x.pos_, x.lemma_) for x in [y 
                                      for y
                                      in nlp(str(sentences3)) 
                                      if not y.is_stop and y.pos_ != 'PUNCT']]

[('                                                                                                                                                                                                                        ',
  'SPACE',
  '                                                                                                                                                                                                                        '),
 ('Figaro', 'PROPN', 'Figaro'),
 ('Actualité', 'NOUN', 'actualité'),
 ('direct', 'NOUN', 'direct'),
 ('informations', 'NOUN', 'information'),
 ('continu', 'NOUN', 'continu'),
 ('                                                                           ',
  'SPACE',
  '                                                                           '),
 ('Aller', 'VERB', 'aller'),
 ('contenu', 'NOUN', 'contenu'),
 ('                                                                                                                                  

In [51]:
displacy.render(nlp(str(sentences3[10])), style='dep', jupyter = True, options = {'distance': 120})