## AIT526 - Individual Lab 3

In [1]:
from bs4 import BeautifulSoup
import requests

In [2]:
import spacy
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
from spacy import displacy

In [3]:
# 1.1 - Web Scraping Technique
def get_content_from_page(url):
    
    final_page_text = ""
    
    page_response = requests.get(url)
    soup_response = BeautifulSoup(page_response.content, "lxml")
    final_content = soup_response.find(id="content")
    pars = final_content.find_all("p")
    
    for p in pars:
        final_page_text += p.text
        
    return final_page_text.lower()

URL = 'https://en.wikipedia.org/wiki/Russian_invasion_of_Ukraine'
content = get_content_from_page(URL)
content



'\n\nnortheastern ukraine campaign\neastern ukraine campaign\n\nsouthern ukraine campaign\nother regions\nnaval operations\nspillover and cross-border incidents\nresistance\npossibly related\nattacks on civilians\non 24 february 2022, russia invaded and occupied parts of ukraine in a major escalation of the russo-ukrainian war, which had begun in 2014. the invasion has resulted in tens of thousands of deaths on both sides, and instigated europe\'s largest refugee crisis since world war ii. about 8 million ukrainians were displaced within their country by june, and more than 8.1 million had fled the country by march 2023.\nfor months before the invasion, russian troops massed around ukraine\'s borders while russian officials repeatedly denied plans to attack ukraine. on 24 february 2022, russian president vladimir putin announced a "special military operation" to support the russian-controlled breakaway republics of donetsk and luhansk, whose military forces had been fighting ukraine in

In [4]:
nlp = spacy.load('en_core_web_sm')

article_txt = nlp(content)

# 1.2.1 - Count all the named entities in the document  
print(len(article_txt.ents))

# The various entities present in the article are :
entity_labels = [x.label_ for x in article_txt.ents]
Counter(entity_labels)

1436


Counter({'DATE': 291,
         'GPE': 275,
         'CARDINAL': 184,
         'LOC': 16,
         'EVENT': 3,
         'MONEY': 10,
         'NORP': 278,
         'PERSON': 138,
         'TIME': 14,
         'ORG': 149,
         'ORDINAL': 36,
         'LANGUAGE': 1,
         'FAC': 3,
         'PERCENT': 6,
         'QUANTITY': 27,
         'LAW': 1,
         'PRODUCT': 3,
         'WORK_OF_ART': 1})

In [5]:
# 1.2.2 - Count the most frequent tokens for the entire document  

regular_exp = re.compile('^-?\\d*(\\.\\d+)?$')

article_tokens = []

for c in article_txt:
    #print(c.text)
    if regular_exp.match(c.text) is None: # should not match the given regular expression
        if not(nlp.vocab[c.text].is_stop): # should not be a stop word
            if not(nlp.vocab[c.text].is_punct): # should not be punctuation
                article_tokens.append(c.text)
                
Counter(article_tokens).most_common(10)

[('russian', 263),
 ('ukrainian', 159),
 ('ukraine', 129),
 ('russia', 126),
 ('forces', 88),
 ('military', 60),
 ('reported', 58),
 ('invasion', 57),
 ('march', 53),
 ('said', 53)]

In [6]:
# 1.2.3
# Printing K+3 sentences

import random  # to pick a random number

no_of_sentences = [s for s in article_txt.sents]
len(no_of_sentences)

max_sen_num = len(no_of_sentences) - 3 #cannot use last 2 numbers to print 3 sentences
chosen_sentences = []

K = random.randrange(max_sen_num)
print('The chosen random number is: ', K, '\n\n')
print('The sentences are: ', '\n')
for num in range(K, K+3):
    print('Sentence number', num, 'is: \n', no_of_sentences[num],'\n')
    chosen_sentences.append (no_of_sentences[num])

The chosen random number is:  1 


The sentences are:  

Sentence number 1 is: 
 the invasion has resulted in tens of thousands of deaths on both sides, and instigated europe's largest refugee crisis since world war ii. 

Sentence number 2 is: 
 about 8 million ukrainians were displaced within their country by june, and more than 8.1 million had fled the country by march 2023.
 

Sentence number 3 is: 
 for months before the invasion, russian troops massed around ukraine's borders while russian officials repeatedly denied plans to attack ukraine. 



In [7]:
# 1.2.4 - POS and Lemmatization of the above chosen sentences
for each_sent in chosen_sentences:
    for each_token in each_sent:
        if each_token.pos_ != 'PUNCT' and not each_token.is_stop:
            print(each_token.orth_, each_token.pos_, each_token.lemma_)

invasion NOUN invasion
resulted VERB result
tens NOUN ten
thousands NOUN thousand
deaths NOUN death
sides NOUN side
instigated VERB instigate
europe PROPN europe
largest ADJ large
refugee NOUN refugee
crisis NOUN crisis
world PROPN world
war PROPN war
ii PROPN ii
8 NUM 8
million NUM million
ukrainians NOUN ukrainian
displaced VERB displace
country NOUN country
june PROPN june
8.1 NUM 8.1
million NUM million
fled VERB flee
country NOUN country
march PROPN march
2023 NUM 2023

 SPACE 

months NOUN month
invasion NOUN invasion
russian ADJ russian
troops NOUN troop
massed VERB mass
ukraine PROPN ukraine
borders NOUN border
russian ADJ russian
officials NOUN official
repeatedly ADV repeatedly
denied VERB deny
plans NOUN plan
attack VERB attack
ukraine PROPN ukraine


In [8]:
# 1.2.5 - Entity Notation 
for each_sent in chosen_sentences:
    for entity_notation in each_sent.ents:
        print(entity_notation.text, '-->', entity_notation.label_)

tens of thousands --> CARDINAL
europe --> LOC
world war ii --> EVENT
about 8 million --> CARDINAL
june --> DATE
more than 8.1 million --> MONEY
march 2023 --> DATE
months --> DATE
russian --> NORP
ukraine --> GPE
russian --> NORP


In [9]:
# 1.2.6 - Visualizing the entities and dependencies of the Kth sentence
displacy.render(chosen_sentences[2], style="ent")

In [10]:
# Dependencies
displacy.render(chosen_sentences[2], style="dep")

In [11]:
# 1.2.7 - Visualizing all the entities in the document
displacy.render(article_txt, style='ent')

In [12]:
# 2.1 Part I - Deidentification

# Deidentifying the PERSON label
def deidentify_person(content):
    deidentification_res = []
    for x in nlp(content):
        if x.ent_type_ == 'PERSON':
            deidentification_res.append("[REDACTED]")
        else:
            deidentification_res.append(x.text)
    
    final_res = " ".join(deidentification_res)
    return final_res   

In [13]:
res_without_person = deidentify_person(content)
print(res_without_person)



 northeastern ukraine campaign 
 eastern ukraine campaign 

 southern ukraine campaign 
 other regions 
 naval operations 
 spillover and cross - border incidents 
 resistance 
 possibly related 
 attacks on civilians 
 on 24 february 2022 , russia invaded and occupied parts of ukraine in a major escalation of the russo - ukrainian war , which had begun in 2014 . the invasion has resulted in tens of thousands of deaths on both sides , and instigated europe 's largest refugee crisis since world war ii . about 8 million ukrainians were displaced within their country by june , and more than 8.1 million had fled the country by march 2023 . 
 for months before the invasion , russian troops massed around ukraine 's borders while russian officials repeatedly denied plans to attack ukraine . on 24 february 2022 , russian president [REDACTED] [REDACTED] announced a " special military operation " to support the russian - controlled breakaway republics of donetsk and luhansk , whose military fo

In [14]:
# 2.1 part II - Visualizing the result from 2.1


displacy.render(nlp(res_without_person), style='ent')