In [None]:
!pip install spacy nltk requests
!python -m spacy download en_core_web_sm




In [2]:
import os
import random

import requests

def fetch_random_news_article(api_key, timeout=10):
    """Return the ``content`` text of a random top headline from News API.

    One source is picked at random from a fixed list, its top headlines are
    requested, and the ``content`` field of one randomly chosen article is
    returned.

    Args:
        api_key: News API key, sent as the ``apiKey`` query parameter.
        timeout: Seconds to wait for the HTTP response. Without a timeout
            ``requests.get`` may block indefinitely on a stalled connection.

    Returns:
        The ``content`` string of a randomly chosen article, or ``None`` when
        the response status is not 200 or no returned article has non-empty
        content.

    Raises:
        requests.RequestException: Propagated unchanged from ``requests.get``
            on network failures, including timeouts.
    """
    sources = ['bbc-news', 'cnn', 'the-verge', 'techcrunch']
    source = random.choice(sources)
    # Let requests build and URL-encode the query string instead of
    # interpolating the values into the URL by hand.
    response = requests.get(
        'https://newsapi.org/v2/top-headlines',
        params={'sources': source, 'apiKey': api_key},
        timeout=timeout,
    )
    if response.status_code != 200:
        return None

    articles = response.json().get('articles') or []
    # Skip articles whose content is missing/empty so that a usable text is
    # returned whenever at least one article has one.
    contents = [item.get('content') for item in articles if item.get('content')]
    return random.choice(contents) if contents else None

# Read the News API key from the NEWS_API_KEY environment variable instead of
# hardcoding it: a key committed in a notebook is visible to everyone who can
# read the file. NOTE(review): the key that was previously embedded here
# should be treated as leaked and rotated.
api_key = os.environ.get('NEWS_API_KEY')
if not api_key:
    raise RuntimeError(
        "Set the NEWS_API_KEY environment variable to your News API key "
        "before running this cell."
    )
article = fetch_random_news_article(api_key)
print("Article:\n", article)

Article:
 Monzo has raised another £150 million ($190 million), as the challenger bank looks to expand its presence internationally particularly in the U.S.
The new round comes just two months after Monzo rai… [+1960 chars]


In [3]:
import spacy

# Loaded spaCy pipelines keyed by model name, so each model is read from disk
# at most once per kernel session.
_SPACY_MODELS = {}


def _get_spacy_model(model_name):
    """Return the spaCy pipeline for ``model_name``, loading it on first use."""
    if model_name not in _SPACY_MODELS:
        _SPACY_MODELS[model_name] = spacy.load(model_name)
    return _SPACY_MODELS[model_name]


def extract_entities_spacy(text, model_name="en_core_web_sm"):
    """Extract named entities from ``text`` with spaCy.

    Args:
        text: The text to analyse. ``None`` or an empty string yields an
            empty result without loading the model.
        model_name: Name of the spaCy pipeline passed to ``spacy.load``.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples, one per entry of
        ``doc.ents``, in document order.
    """
    # The article fetcher can return None; treat "no text" as "no entities"
    # rather than failing inside the pipeline.
    if not text:
        return []
    doc = _get_spacy_model(model_name)(text)
    return [(ent.text, ent.label_) for ent in doc.ents]

# Run the spaCy extractor on the fetched article and show what it found.
spacy_entities = extract_entities_spacy(text=article)
print("Entities from spaCy:\n", spacy_entities)

Entities from spaCy:
 [('Monzo', 'PERSON'), ('another £150 million', 'MONEY'), ('$190 million', 'MONEY'), ('U.S.', 'GPE'), ('just two months', 'DATE'), ('Monzo', 'PERSON')]


In [4]:
import nltk
from nltk import word_tokenize, pos_tag, ne_chunk

# (resource path looked up with nltk.data.find, package name for nltk.download)
_NLTK_RESOURCES = (
    ('tokenizers/punkt', 'punkt'),
    ('chunkers/maxent_ne_chunker', 'maxent_ne_chunker'),
    ('corpora/words', 'words'),
    ('taggers/averaged_perceptron_tagger', 'averaged_perceptron_tagger'),
)


def _ensure_nltk_resources():
    """Download each required NLTK resource only if it is not already installed."""
    for resource_path, package in _NLTK_RESOURCES:
        try:
            nltk.data.find(resource_path)
        except LookupError:
            nltk.download(package)


def extract_entities_nltk(text):
    """Extract named entities from ``text`` with NLTK's ``ne_chunk``.

    The text is split into sentences, each sentence is tokenized and
    POS-tagged, and every subtree produced by the chunker is reported as one
    entity.

    Args:
        text: The text to analyse. ``None`` or an empty string yields an
            empty result without touching any NLTK resource.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples, where
        ``entity_text`` is the chunk's words joined by single spaces and
        ``entity_label`` is the subtree label.
    """
    # The article fetcher can return None; treat "no text" as "no entities".
    if not text:
        return []

    # Avoid re-downloading (and re-logging) the same packages on every call.
    _ensure_nltk_resources()

    entities = []
    for sentence in nltk.sent_tokenize(text):
        tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
        for node in nltk.ne_chunk(tagged, binary=False):
            # Plain (word, tag) tuples are tokens outside any entity; only
            # Tree nodes represent recognised entity chunks.
            if isinstance(node, nltk.Tree):
                words = " ".join(word for word, _ in node.leaves())
                entities.append((words, node.label()))
    return entities

# Run the NLTK extractor on the same article and show what it found.
nltk_entities = extract_entities_nltk(text=article)
print("Entities from NLTK:\n", nltk_entities)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


Entities from NLTK:
 [('Monzo', 'GPE'), ('Monzo', 'PERSON')]


In [5]:
def compare_entities(spacy_entities, nltk_entities):
    """Print the overlap and the differences between two entity lists.

    Both arguments are iterables of hashable items (here ``(text, label)``
    tuples). They are converted to sets, so duplicates are ignored and two
    entries only match when the whole tuple is equal.

    Prints three sections: the entities found by both extractors, those only
    in the spaCy list, and those only in the NLTK list. Returns ``None``.
    """
    from_spacy = set(spacy_entities)
    from_nltk = set(nltk_entities)

    report = (
        ("Common Entities:\n", from_spacy.intersection(from_nltk)),
        ("\nEntities unique to spaCy:\n", from_spacy.difference(from_nltk)),
        ("\nEntities unique to NLTK:\n", from_nltk.difference(from_spacy)),
    )
    for heading, members in report:
        print(heading, members)

# Report where the two extractors agree and where they differ.
compare_entities(spacy_entities=spacy_entities, nltk_entities=nltk_entities)

Common Entities:
 {('Monzo', 'PERSON')}

Entities unique to spaCy:
 {('another £150 million', 'MONEY'), ('just two months', 'DATE'), ('$190 million', 'MONEY'), ('U.S.', 'GPE')}

Entities unique to NLTK:
 {('Monzo', 'GPE')}
