In [25]:
import numpy as np
import re
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

In [6]:
# Load the spaCy pipeline "en_core_web_sm"; later cells parse their sample text through this `nlp` object.
nlp = spacy.load("en_core_web_sm")

In [None]:
# "[0-9]+[.,]?[0-9]*"gm
# "[A-Z][a-z]+(\s[A-Z][a-z]+)*"gm

## ex1:

In [7]:
# Sample news sentence for exercise 1 (includes a possessive, names/titles, a comma and a final period).
text_1 = 'Nepal’s Home Minister Ramesh Lekhak resigned Monday following the violence, Communications Minister Prithvi Subba confirmed to reporters late Monday.'

# Parse the sentence with the spaCy pipeline; doc_1 is iterated token by token in the next cell.
doc_1 = nlp(text_1)

In [8]:
# Show the same sentence under successive preprocessing views so the effect of each filter is visible.
print("Text:", text_1)
# Every token exactly as spaCy split it.
print("Raw token:", [token.text for token in doc_1])
# Tokens that are not flagged as stop words (punctuation is still present here).
print("No stopwords:", [token.text for token in doc_1 if not token.is_stop])
# Tokens that are not flagged as punctuation (stop words are still present here).
print("No punctuation:", [token.text for token in doc_1 if not token.is_punct])
# Lemma of every token, without any filtering.
print("Lemmas:", [token.lemma_ for token in doc_1])
# Combined cleanup: lower-cased lemmas with both stop words and punctuation removed.
print("Clean tokens:", [token.lemma_.lower() for token in doc_1 if not token.is_stop and not token.is_punct])

Text: Nepal’s Home Minister Ramesh Lekhak resigned Monday following the violence, Communications Minister Prithvi Subba confirmed to reporters late Monday.
Raw token: ['Nepal', '’s', 'Home', 'Minister', 'Ramesh', 'Lekhak', 'resigned', 'Monday', 'following', 'the', 'violence', ',', 'Communications', 'Minister', 'Prithvi', 'Subba', 'confirmed', 'to', 'reporters', 'late', 'Monday', '.']
No stopwwords: ['Nepal', 'Home', 'Minister', 'Ramesh', 'Lekhak', 'resigned', 'Monday', 'following', 'violence', ',', 'Communications', 'Minister', 'Prithvi', 'Subba', 'confirmed', 'reporters', 'late', 'Monday', '.']
No punctuation: ['Nepal', '’s', 'Home', 'Minister', 'Ramesh', 'Lekhak', 'resigned', 'Monday', 'following', 'the', 'violence', 'Communications', 'Minister', 'Prithvi', 'Subba', 'confirmed', 'to', 'reporters', 'late', 'Monday']
Lemmas: ['Nepal', '’s', 'Home', 'Minister', 'Ramesh', 'Lekhak', 'resign', 'Monday', 'follow', 'the', 'violence', ',', 'Communications', 'Minister', 'Prithvi', 'Subba', 'co

In [28]:
# Regex-only tokenization for comparison with spaCy: each match is either a run of word
# characters (\w+) or one single character that is neither a word character nor whitespace.
print(re.findall(r"\w+|[^\w\s]", text_1))

['Nepal', '’', 's', 'Home', 'Minister', 'Ramesh', 'Lekhak', 'resigned', 'Monday', 'following', 'the', 'violence', ',', 'Communications', 'Minister', 'Prithvi', 'Subba', 'confirmed', 'to', 'reporters', 'late', 'Monday', '.']


## ex2:

In [None]:
# Sample sentence for exercise 2 (the same sentence that exercise 1 uses as text_1).
text_2 = 'Nepal’s Home Minister Ramesh Lekhak resigned Monday following the violence, Communications Minister Prithvi Subba confirmed to reporters late Monday.'

# Parse it with the spaCy pipeline so the extractor below can walk its tokens.
doc_2 = nlp(text_2)

In [23]:
def extract_capitalized_entities(doc):
    """Group consecutive capitalized tokens into space-joined phrases.

    Walks ``doc`` in order and accumulates every non-punctuation token whose
    text starts with an uppercase character. A run is closed by a punctuation
    token, by a token that does not start with an uppercase character, or by
    the end of ``doc``.

    Args:
        doc: Iterable of token-like objects exposing ``text`` (str) and
            ``is_punct`` (bool), e.g. a spaCy ``Doc``.

    Returns:
        list[str]: One string per run, its tokens joined by single spaces, in
        order of appearance. Empty when no token is capitalized.
    """
    entities = []
    current_entity = []

    for token in doc:
        # text[:1] instead of text[0]: an empty token text yields "" (which is
        # not uppercase) rather than raising IndexError.
        if not token.is_punct and token.text[:1].isupper():
            current_entity.append(token.text)
        elif current_entity:
            # Punctuation or a non-capitalized token closes the current run.
            entities.append(" ".join(current_entity))
            current_entity = []

    # Flush a run that is still open when the document ends.
    if current_entity:
        entities.append(" ".join(current_entity))

    return entities

In [24]:
# Apply the capitalized-phrase extractor to the exercise-2 document; the cell displays the returned list.
extract_capitalized_entities(doc_2)

['Nepal',
 'Home Minister Ramesh Lekhak',
 'Monday',
 'Communications Minister Prithvi Subba',
 'Monday']

## ex3:

In [None]:
# Sample text for exercise 3: two sentences containing capitalized words and a number.
# NOTE(review): the quotation opens with a straight quote (") but closes with a curly one (”) —
# presumably copied as-is from the source article; confirm before normalizing.
text_3 = '"The police are firing indiscriminately,” one protester told Indian news agency ANI. At least 17 people were killed in Kathmandu and two more in the eastern city of Itahari, according to hospital officials.'

# Parse it with the spaCy pipeline so the next cell can filter its tokens.
doc_3 = nlp(text_3)

In [18]:
# Keep the tokens whose first character is an uppercase letter or a digit
# (capitalized words and tokens that begin with a number).
[token for token in doc_3 if token.text[0].isupper() or token.text[0].isdigit()]

[The, Indian, ANI, At, 17, Kathmandu, Itahari]