## CLASSWORK - 8
## NAMED ENTITY RECOGNITION

### LOAD SPACY MODEL

spaCy is an open-source library for Natural Language Processing (NLP) in Python. It offers pre-trained models along with efficient tokenization, part-of-speech tagging, named entity recognition, and dependency parsing, making it a popular choice for NLP tasks.

In [2]:
# Import spaCy and load the large English model ("en_core_web_lg").
# NOTE: spacy.load() expects the model package to be installed already,
# e.g. via `python -m spacy download en_core_web_lg`.

import spacy
nlp = spacy.load("en_core_web_lg")

### TOKENIZE TEXT

In [3]:
# Tokenize the text and print each token

text = "My best friend Ryan Peters likes fancy adventure games."
doc = nlp(text)
# Iterating a parsed document yields its tokens one by one; write each
# token's text on the same output line, followed by the separator.
for token in doc:
    print(token.text, end=" | ")

My | best | friend | Ryan | Peters | likes | fancy | adventure | games | . | 

### DISPLAY NLP TOKENS

NLP tokens are fundamental units of text used for processing and analysis, representing individual words or subword units within a linguistic context. They serve as building blocks for various natural language processing tasks, enabling algorithms to understand and manipulate human language effectively.

In [4]:
# Function to generate a DataFrame for visualization of spaCy tokens

import pandas as pd

def display_nlp(doc, include_punct=False):
    """Build a DataFrame with one row per spaCy token for visual inspection.

    Args:
        doc: Iterable of token-like objects (e.g. a spaCy ``Doc`` or ``Span``)
            exposing ``text``, ``lemma_``, ``is_stop``, ``is_alpha``,
            ``is_punct``, ``pos_``, ``dep_``, ``ent_type_`` and ``ent_iob_``.
        include_punct: When true, punctuation tokens are kept; by default
            they are skipped.

    Returns:
        A pandas DataFrame indexed by each token's position in ``doc`` (so
        skipped punctuation leaves gaps in the index) with the columns
        ``text``, ``lemma_``, ``is_stop``, ``is_alpha``, ``pos_``, ``dep_``,
        ``ent_type_`` and ``ent_iob_``. If no token is retained, the frame is
        empty but still carries those columns.
    """
    columns = ['text', 'lemma_', 'is_stop', 'is_alpha',
               'pos_', 'dep_', 'ent_type_', 'ent_iob_']
    positions = []
    rows = []
    for i, t in enumerate(doc):
        if include_punct or not t.is_punct:
            positions.append(i)
            rows.append({name: getattr(t, name) for name in columns})

    # Passing index and columns explicitly (instead of building a 'token'
    # column and calling set_index on it) keeps empty input from raising
    # KeyError and leaves the index unnamed.
    return pd.DataFrame(rows, index=positions, columns=columns)
# Show the token table for the current `doc`; punctuation rows are left out
# because include_punct defaults to False.
display_nlp(doc)

Unnamed: 0,text,lemma_,is_stop,is_alpha,pos_,dep_,ent_type_,ent_iob_
0,My,my,True,True,PRON,poss,,O
1,best,good,False,True,ADJ,amod,,O
2,friend,friend,False,True,NOUN,nsubj,,O
3,Ryan,Ryan,False,True,PROPN,compound,PERSON,B
4,Peters,Peters,False,True,PROPN,appos,PERSON,I
5,likes,like,False,True,VERB,ROOT,,O
6,fancy,fancy,False,True,ADJ,amod,,O
7,adventure,adventure,False,True,NOUN,compound,,O
8,games,game,False,True,NOUN,dobj,,O


### FILTERING STOPWORDS AND PUNCTUATION

In [5]:
# Removing stopwords and punctuation from the given text using spaCy.

text = "Dear Ryan, we need to sit down and talk. Regards, Pete"
doc = nlp(text)

# Keep a token only if it is flagged neither as a stop word nor as punctuation.
non_stop = [tok for tok in doc if not (tok.is_stop or tok.is_punct)]
print(non_stop)

[Dear, Ryan, need, sit, talk, Regards, Pete]


### EXTRACT NOUNS FROM TEXT

In [6]:
# Extracts nouns and proper nouns from a given text and prints them.

text = "My best friend Ryan Peters likes fancy adventure games."
doc = nlp(text)

# Select tokens whose coarse part-of-speech tag is NOUN or PROPN.
nouns = [tok for tok in doc if tok.pos_ in {'NOUN', 'PROPN'}]
print(nouns)

[friend, Ryan, Peters, adventure, games]


### IDENTIFY ENTITIES IN TEXT

In [7]:
# Print identified entities along with their labels.

text = "My best friend Ryan Peters likes fancy adventure games."
doc = nlp(text)

# doc.ents yields one span per recognized entity; label_ is its type as a string.
for ent in doc.ents:
    # end=" " keeps all "(text, LABEL)" pairs on a single output line.
    print(f"({ent.text}, {ent.label_})", end=" ")

(Ryan Peters, PERSON) 

### IDENTIFY MULTIPLE ENTITY TYPES IN TEXT

In [8]:
# Print identified entities along with their labels.

text = "James O'Neill, chairman of World Cargo Inc, lives in San Francisco." 
doc = nlp(text)

# Same loop as for the previous sentence; this text contains several entities,
# so several "(text, LABEL)" pairs are printed on one line.
for ent in doc.ents:
    print(f"({ent.text}, {ent.label_})", end=" ")

(James O'Neill, PERSON) (World Cargo Inc, ORG) (San Francisco, GPE) 

### VISUALIZE ENTITIES IN TEXT

In [9]:
# Render a visualization of the identified entities in the text.
# `doc` is whatever was parsed most recently -- in cell order, the
# "James O'Neill ..." sentence. style='ent' selects the entity view and
# jupyter=True requests notebook rendering.

from spacy import displacy

displacy.render(doc, style='ent', jupyter=True)

### CONVERT URL TO TEXT AND COUNT ENTITIES

In [10]:
# Convert the content of a given URL into text and count the identified entities.

from bs4 import BeautifulSoup
import requests
import re
def url_to_string(url, timeout=10):
    """Download a web page and return its visible text as one string.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        The page text with ``<script>``, ``<style>`` and ``<aside>`` elements
        removed and every run of newlines/tabs replaced by a single space.
        Runs of ordinary spaces are left as they are.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with an HTTP error status.
    """
    # Without a timeout, requests may wait indefinitely on a stalled server.
    res = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx rather than analysing the text of an error page.
    res.raise_for_status()

    soup = BeautifulSoup(res.text, 'html.parser')
    # Remove non-content elements so their text is not part of the result.
    for tag in soup(["script", "style", 'aside']):
        tag.extract()
    # Equivalent to splitting on newline/tab runs and re-joining with spaces.
    return re.sub(r'[\n\t]+', ' ', soup.get_text())
# Fetch the article text.
# NOTE(review): the name `ny_bb` does not match the CNN URL below -- presumably
# a leftover from another example; consider renaming once all uses are checked.
ny_bb = url_to_string('https://www.cnn.com/2024/03/21/us/mississippi-officers-sentencing-goon-squad-thursday/index.html')
# Parse the article text with the loaded model.
article = nlp(ny_bb)
# Number of entity spans recognized in the article.
len(article.ents)

328

### VISUALIZE ENTITIES IN ARTICLE TEXT

In [11]:
# Render a visualization of the identified entities in the extracted article text.
# `article` is the parsed page from the cell above, so every entity found in
# it is highlighted.

displacy.render(article, style='ent', jupyter=True)

### COUNT ENTITY LABELS

In [12]:
# Count the occurrence of each entity label in the extracted article text.

from collections import Counter

# One label string per entity span, then a tally per label.
labels = [ent.label_ for ent in article.ents]
Counter(labels)

Counter({'PERSON': 120,
         'ORG': 64,
         'DATE': 47,
         'GPE': 45,
         'CARDINAL': 24,
         'LOC': 8,
         'WORK_OF_ART': 5,
         'TIME': 5,
         'EVENT': 3,
         'FAC': 2,
         'NORP': 2,
         'PRODUCT': 1,
         'ORDINAL': 1,
         'MONEY': 1})

### COUNT MOST COMMON ENTITIES

In [13]:
# Count the most common entities in the extracted article text.

# Tally entity surface strings and show the five most frequent.
items = [ent.text for ent in article.ents]
Counter(items).most_common(5)

[('CNN', 32), ('Jenkins', 15), ('Parker', 14), ('McAlpin', 12), ('Opdyke', 12)]

### PRINT FIRST SENTENCE

In [14]:
# Print the first sentence from the extracted article text.

# Materialize the sentence spans so they can be indexed.
sentences = list(article.sents)
print(sentences[0])

  Rankin County, Mississippi: All 6 rogue Mississippi cops got long prison sentences in ‘Goon Squad’ torture of 2 Black men | CNN CNN values your feedback                                                         1.


### VISUALIZE ENTITIES IN FIRST SENTENCE

In [15]:
# Visualize entities in the first sentence of the extracted article text.
# The sentence is converted back to a string and parsed again as a standalone
# document, so the entities shown come from that fresh parse.

displacy.render(nlp(str(sentences[0])), jupyter=True, style='ent')

### EXTRACT WORDS WITH PARTS OF SPEECH AND LEMMAS

In [16]:
# Collect (surface form, coarse POS tag, lemma) for each token of the first
# sentence, skipping stop words and tokens tagged PUNCT. The filter relies on
# the predicted POS tag, so whitespace tokens (SPACE) and symbols that receive
# another tag (e.g. "|") are still listed.

[(tok.orth_, tok.pos_, tok.lemma_)
 for tok in nlp(str(sentences[0]))
 if not tok.is_stop and tok.pos_ != 'PUNCT']

[('  ', 'SPACE', '  '),
 ('Rankin', 'PROPN', 'Rankin'),
 ('County', 'PROPN', 'County'),
 ('Mississippi', 'PROPN', 'Mississippi'),
 ('6', 'NUM', '6'),
 ('rogue', 'NOUN', 'rogue'),
 ('Mississippi', 'PROPN', 'Mississippi'),
 ('cops', 'NOUN', 'cop'),
 ('got', 'VERB', 'get'),
 ('long', 'ADJ', 'long'),
 ('prison', 'NOUN', 'prison'),
 ('sentences', 'NOUN', 'sentence'),
 ('Goon', 'PROPN', 'Goon'),
 ('Squad', 'PROPN', 'Squad'),
 ('torture', 'NOUN', 'torture'),
 ('2', 'NUM', '2'),
 ('Black', 'ADJ', 'black'),
 ('men', 'NOUN', 'man'),
 ('|', 'VERB', '|'),
 ('CNN', 'PROPN', 'CNN'),
 ('CNN', 'PROPN', 'CNN'),
 ('values', 'VERB', 'value'),
 ('feedback', 'NOUN', 'feedback'),
 ('                                                        ',
  'SPACE',
  '                                                        '),
 ('1', 'NUM', '1')]

### VISUALIZE DEPENDENCY PARSING

Dependency parsing is a natural language processing technique that aims to analyze the grammatical structure of a sentence by identifying relationships between words. These relationships are typically represented as directed edges between words, where one word is the dependent of another.

In [17]:
# Render the dependency parse of the first sentence of the extracted article text.
# The sentence text is parsed again as a standalone document; the `distance`
# option (spacing between words) is set to 120.

displacy.render(nlp(str(sentences[0])), style='dep', jupyter = True, options = {'distance': 120})