In [37]:
import requests
import nltk
import regex as re
import numpy as np
import spacy
import spacy.cli
import requests

from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer

from bs4 import BeautifulSoup

from xmltodict import parse
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA

# HW3

## Text Processing

### Q1

1. Modify the code I wrote in lecture 8 with what you have learnt in lecture 9 and correctly tokenize the text both on the word and sentence level, and by removing the stopwords. Rewrite the `getSummary` function and all the other functions that it depends by maing these corrections.

2. Rewrite the code I wrote for `getKeywords` function making the same corrections.

3. Test your code from parts 1 and 2 on random articles from the Guardian.

4. Rewrite the `getSubjectGuardian` function for another newspaper in English, and test your code from part 1 and 2 on random articles from this new newspaper.

Setting the stop words in english

In [6]:
swEN = stopwords.words('english')

# Q1.1 & Q1.2
## Modifying the Codes
get_matrix function vectorizes the text allowing us to find the number of sentences and unique words. 

In [7]:
def get_matrix(sentences):
    vectorizer = CountVectorizer()
    return vectorizer.fit_transform(sentences)

process_text function takes the text as input and basically modifies it. **sent_tokenize** and **word_tokenize** belong to the nltk package and tokenizers' main usage is dividing strings into lists of substrings. I split the whole text into parts of sentences and words so it gets easier working on them. 

At the end I want to obtain a summary of the text, so it would look tidy if I updated the words and sentences, making them lower case and getting rid of punctuations such as .,!?.  **re.sub** helps me achieve that

process_text returns *omats: a dictionary consisting of words and sentences that have been updated* 


In [8]:
def process_text(text):
    omats = {'sentences': sent_tokenize(text),
         'words': word_tokenize(text)}
    omats.update({'cleanedSentences': [re.sub(r'[^\p{Letter}\s]','',sentence.lower()) for sentence in omats['sentences']],
              'cleanedWords': [re.sub(r'[^\p{Letter}]','',word.lower()) for word in omats['words']]})
    return omats

get_summary is the function that summarizes the text by taking the text and k=how many sentences you want in the summary as inputs

The function starts by processing the text using the process_text function explained above. Then the stemmer operation gets activated which helped me find the root of the words in the text, that we updated. After stemming the words I also get rid of stop-words thanks to swEN

Now that my sentences and words are modified it is time to summarize the text. The method here is to find the k number of sentences in the text that have the hightest weights. Those sentences form the summary we are looking for.


In [9]:
def get_summary(text,k):
    omats=process_text(text)
    stemmer = SnowballStemmer('english')
    stemmed = [stemmer.stem(word) for word in omats['cleanedWords']]
    reduced = [word for word in stemmed if word not in swEN]

    matrix = get_matrix(omats['cleanedSentences'])
    projection = PCA(n_components=1)
    weights = projection.fit_transform(matrix.toarray())
    res = list(zip(weights.transpose()[0],range(112),omats['cleanedSentences']))
    tmp = sorted(res,key=lambda x: x[0],reverse=True)[:k]
    return sorted(tmp, key=lambda x: x[1])

I also created an url_to_text function that makes the summarization process easier. After accessing the text from the url I have noticed all the news that belong to the same newspaper have the same number of items, for The Guardian that number N is 40. Instead of giving a certain number to pick a link of an article I decided to do that randomly so the function returns different articles each time it runs.

The function returns the text that can be used later on summarization or obtaining the key words.

In [10]:
def url_to_text(url):
    with requests.get(url) as link:
        raw = parse(link.text)
        
    subject=raw['rss']['channel']['item'] 
    N=len(subject)
    n = np.random.randint(N)
    randomurl=subject[n]['link'] 

    with requests.get(randomurl) as link:
        raw = BeautifulSoup(link.content,'html.parser')
        text=' '.join([x.text for x in raw.find_all('p')])
    return(text)
    

get_key_words function takes the text, language of the stop words and k=number of key words as inputs. The whole method is similar to summarization, get_summary function. k number of words that have the highest weights are the key words so they get returned with the weight written next to them

In [11]:
def get_key_words(text,sw,k):
    omats = process_text(text)
    
    vectorizer = CountVectorizer(stop_words=sw)
    matrix = vectorizer.fit_transform(omats['sentences'])
    words = vectorizer.get_feature_names()
    
    projection = PCA(n_components=1)
    tmp = projection.fit_transform(matrix.transpose().toarray())
    weights = tmp.transpose()[0]
    
    return sorted(zip(weights,words),key=lambda x: x[0], reverse=True)[:k]

# Q1.3
## Testing
Testing the modified functions on random articles from The Guardian newsfeed. We obtain the summary and key words.

In [12]:
urlfilm='https://www.theguardian.com/film/rss'
get_summary(url_to_text(urlfilm),5)

[(5.48423739087811,
  0,
  'to celebrate her oscar nomination for the power of the dog here are dunsts star turns from gameforadventure girlsnextdoor to enigmatic and melancholy brides and widows a rare unsympathetic outing for kirsten dunst in this muchloved truelife biographical drama about the african american female mathematicians at nasa in the s whose work was vital in getting the us into space'),
 (2.2452256655000964,
  21,
  'the wholesome one was gillian armstrongs spirited and engaging version of the louisa may alcott classic in which dunst played the twinklyeyed younger amy march who was one day to grow up and marry laurie'),
 (2.6968280194923815,
  36,
  'dunsts capacity for poignancy pathos and victimhood is explored in this interesting drama based on a truecrime scandal directed by andrew jarecki who made the documentary capturing the friedmans and went on to made a tv series exploring the real case this was based on'),
 (2.1522137061397917,
  39,
  'dunst was the first a

In [13]:
get_key_words(url_to_text(urlfilm),swEN,15)

[(0.9385739551194806, 'actor'),
 (0.7102122922922125, 'appearance'),
 (0.710212292292212, 'baton'),
 (0.710212292292212, 'covid'),
 (0.710212292292212, 'desperate'),
 (0.710212292292212, 'dignified'),
 (0.710212292292212, 'eventually'),
 (0.710212292292212, 'giving'),
 (0.710212292292212, 'hand'),
 (0.710212292292212, 'interesting'),
 (0.710212292292212, 'jason'),
 (0.710212292292212, 'lucky'),
 (0.710212292292212, 'mentioned'),
 (0.710212292292212, 'momoa'),
 (0.710212292292212, 'naked')]

In [14]:
urltech='https://www.theguardian.com/technology/rss'
get_summary(url_to_text(urltech),5)

[(3.5674603390884867,
  0,
  'tech giants revenues grew at a sluggish  in the first quarter to bn as shoppers switch back to bricks and mortar amazon announced its first loss since  on thursday as sales slowed costs rose and its investment in electric vehicle company rivian wiped out profits'),
 (1.174553481463236,
  2,
  'the tech giants revenues grew at a sluggish  in the first quarter to bn amazons slowest growth rate in nearly two decades'),
 (1.8680661107683896,
  6,
  'for the current quarter amazon expects operating income between a loss of bn and a gain of bn compared with bn in second quarter of '),
 (3.5485306360447586,
  11,
  'amazon was one of the biggest winners of the pandemic recording huge jumps in sales as consumers moved to shopping online and companies turned to amazon web services aws its cloud computing unit to run their businesses'),
 (2.442792965314307,
  16,
  'the share of retail purchases made online has dropped from  in the second quarter of  to  during the 

In [15]:
get_key_words(url_to_text(urltech),swEN,15)

[(7.127470936301366, 'ipad'),
 (3.9513562167386196, 'tablet'),
 (3.759951238848723, 'apple'),
 (3.723413429832697, 'review'),
 (2.506756901637175, 'pro'),
 (2.1732536153297284, '10'),
 (1.9577540992967113, 'costs'),
 (1.680208906249131, 'stunning'),
 (1.3522747549766037, '2021'),
 (1.2623211483627148, 'fire'),
 (1.2623211483627148, 'hd'),
 (1.182930018237057, 'm1'),
 (1.0953109754554753, 'mini'),
 (0.9817256274457742, 'upgraded'),
 (0.9815055814936636, 'top')]

# Q1.4
## Extracting Text From UN News
To work on another newsfeed I decided to go with UN News. The get_subject_un function takes the name of a subject such as women, health, science... and pastes that subject into to structured url. The news with several information (title, link, describtion...) gets returned in the format of an ordered dictionary.

In [16]:
def get_subject_un(subject):
    with requests.get(f'https://news.un.org/feed/subscribe/en/news/topic/{subject}/feed/rss.xml') as link:
        raw = parse(link.text)
    return raw['rss']['channel']['item']


In [17]:
get_subject_un('women')

[OrderedDict([('title', 'Tech needs girls, and girls need tech'),
              ('link',
               'https://news.un.org/feed/view/en/story/2022/04/1117162'),
              ('description',
               'More and more young women and girls are highlighting the importance of access and safety in the digital world. Marking International Girls in ICT Day, on Thursday, UN agencies have issued a call to action to ensure equal access to digital learning opportunities.'),
              ('enclosure',
               OrderedDict([('@url',
                             'https://global.unitednations.entermediadb.net/assets/mediadb/services/module/asset/downloads/preset/Libraries/Production+Library/28-04-2022-UNICEF-UN0551726-ICT-Day.jpg/image770x420cropped.jpg'),
                            ('@length', '56418'),
                            ('@type', 'image/jpeg')])),
              ('guid',
               OrderedDict([('@isPermaLink', 'true'),
                            ('#text',
             

In [18]:
get_subject_un('health')

[OrderedDict([('title',
               'Extreme heat impacting millions across India and Pakistan'),
              ('link',
               'https://news.un.org/feed/view/en/story/2022/04/1117272'),
              ('description',
               'With extreme heat gripping large parts of India and Pakistan, the two countries are working to roll out life-saving health action plans to combat the heatwave, the World Meteorological Organization (WMO) said on Friday.'),
              ('enclosure',
               OrderedDict([('@url',
                             'https://global.unitednations.entermediadb.net/assets/mediadb/services/module/asset/downloads/preset/Libraries/Production+Library/29-04-2022-UNICEF-UN0380351-elderlywoman-India-heat.jpg/image770x420cropped.jpg'),
                            ('@length', '63510'),
                            ('@type', 'image/jpeg')])),
              ('guid',
               OrderedDict([('@isPermaLink', 'true'),
                            ('#text',
     

In order to get the summary and key words from an UN News article, I pick a random link from the ordered dictionary that will take me to the article on that particular subject. Then I transform it into a text. Sending the text to the get_summary and get_key_words functions I get returned the results with no problem.

I completed my test on women and health subjects

In [19]:
subject=get_subject_un('women')
N=len(subject)
n = np.random.randint(N)
randomurl=subject[n]['link'] 
with requests.get(randomurl) as link:
    raw = BeautifulSoup(link.content,'html.parser')
    text=' '.join([x.text for x in raw.find_all('p')])

In [20]:
get_summary(text,5)

[(1.5012413776560838,
  2,
  'the denial of education violates the human rights of women and girls\xa0 un human rights chief\n although high schools were set to open their doors to girls nationwide taliban authorities reportedly reversed the move early on wednesday pending a ruling made on the uniforms they must wear'),
 (1.826642436623655,
  10,
  'the women told her that they have information solutions and the capability to help chart a way out of this economic humanitarian and human rights crisis in afghanistan they insisted upon the equal right to quality education at the primary secondary and tertiary levels and were hopefully awaiting the reopening of schools today'),
 (4.232498168659646,
  13,
  'statement by unicefchief httpstcocflqbq as afghan citizens suffer the impacts of multiple intersecting crises the senior un official described the decision as being of grave concern disempowering half of afghanistans population is counterproductive and unjust ms bachelet said adding tha

In [21]:
get_key_words(text,swEN,15)

[(4.795481843686726, 'girls'),
 (3.86192741428134, 'education'),
 (1.347043583874491, 'rights'),
 (1.1596870829931816, 'women'),
 (1.0709204456112458, 'authorities'),
 (0.9820401035093853, 'adolescent'),
 (0.9508450648400528, 'schools'),
 (0.9323357224155904, 'ms'),
 (0.7205837901071626, 'human'),
 (0.7201263064658585, 'country'),
 (0.7164987667413226, 'denial'),
 (0.7164987667413226, 'violates'),
 (0.7159934255811642, 'de'),
 (0.7159934255811642, 'facto'),
 (0.7057797856315318, 'decision')]

In [22]:
subject=get_subject_un('health')
N=len(subject)
n = np.random.randint(N)
randomurl=subject[n]['link'] 
with requests.get(randomurl) as link:
    raw = BeautifulSoup(link.content,'html.parser')
    text=' '.join([x.text for x in raw.find_all('p')])

In [23]:
get_summary(text,5)

[(1.4647298049021389,
  0,
  'subscribe audio hub an\xa0ebola\xa0vaccination campaign has begun in northwest democratic republic of the congo drc to halt the spread of a deadly new outbreak in the country where the disease is endemic'),
 (3.392381578696105,
  2,
  'drc  is kicking off ebola vaccination todaythe vaccination is taking place in mbandaka capital city of equateur province in the northwest to halt the spread of the virus following an outbreak which has claimed two lives since  april httpstcovyrlrdk pictwittercomncdssksmy it follows the deaths of two people from ebola since  april'),
 (1.0098786558061352,
  4,
  'with effective vaccines at hand and the experience of drc health workers in ebola response we can quickly change the course of this outbreak for the better said dr matshidiso moeti the un agencys regional director for africa'),
 (6.042367832178028,
  11,
  'national health authorities are also crucial to the effort including the national institute for biomedical rese

In [24]:
get_key_words(text,swEN,15)

[(2.505923889152669, 'ebola'),
 (1.9461584765817714, 'since'),
 (1.832298644178452, 'vaccination'),
 (1.5427081582019126, '21'),
 (1.5427081582019126, 'april'),
 (1.5427081582019126, 'two'),
 (1.1236444276261104, 'mbandaka'),
 (1.0274339079053798, 'city'),
 (0.9982952379896448, 'outbreak'),
 (0.9698325722201069, 'equateur'),
 (0.9698325722201069, 'province'),
 (0.8785577646298885, 'drc'),
 (0.8734209888405879, 'lives'),
 (0.8420779504411995, 'north'),
 (0.8253452568476091, 'halt')]

### Q2

Write a function that returns all named entities (proper names, country names, corporation names only) from a URL. Function should take the URL as the input and must return the list of named entities from that URL. Test your code on random articles from the Guardian. Don't use the NLTK's NER that I demonstrated during the lecture. Use the SpaCY's NER function.

I load the spacy English tokenizer, tagger, parser and NER

In [25]:
NER = spacy.load("en_core_web_sm")

The get_names function first takes the url in rss form and picks a random article's link. Through that link the text form of the article gets created. 

Essentially, res is the article we want to work with. By putting it in calling Named Entity Recognition (NER), forming res=NER(article) we can access the name entities of the article. This would have been enough but I wanted to filter the entities that gets printed out. I only wanted to access proper names, country names, corporation names so by checking the website cited below I completed the filtering.

> https://newscatcherapi.com/blog/named-entity-recognition-with-spacy

In [26]:
def get_names(url):
    with requests.get(url) as link:
        raw = parse(link.text)
        
    subject=raw['rss']['channel']['item'] 
    N=len(subject)
    n = np.random.randint(N)
    randomurl=subject[n]['link'] 

    with requests.get(randomurl) as link:
        raw = BeautifulSoup(link.content,'html.parser')
        article=' '.join([x.text for x in raw.find_all('p')])

    
    res=NER(article)

    
    for ent in res.ents:
        if ent.label_ in ['PERSON','GPE','LOC','EVENT','FAC','LANGUAGE','LAW','NORP','WORK_OF_ART']:
            print(ent.text)
            

I tested the get_names function on The Guardian's technology, fashion and film segments.

In [28]:
urltech='https://www.theguardian.com/technology/rss'
get_names(urltech)

The Legend of Zelda, Battlefield 2042
Metal Gear Solid
Video Games Live
Los Angeles
Beijing
Sydney
London
The Legend of Zelda
David Pickard
Robert Ames
Ames
Hildur Guðnadóttir
Oscar
Joker
Sam Slater
Guðnadóttir
Ames
Yoko Shimomura
Brahms
Mahler


In [29]:
urlfashion='https://www.theguardian.com/fashion/rss'
get_names(urlfashion)

Baguette
American
British
Wrapuette
Baguette
Gucci
UK
Italian
Savile Row
Natalie Hartley
Baguette
Hartley
Baguette
Baguette
Silvia Venturini Fendi
Laird Borrelli-Persson
French
Chillie London
Wrapuette
Hartley
Gen
Jane Collins


In [31]:
urlfilm='https://www.theguardian.com/film/rss'
get_names(urlfilm)

Ivan Reitman
Hollywood
Ernst Lubitsch
Gregory La Cava
Bill Murray
Dan Aykroyd
Harold Ramis
Ivan Reitman
London
Jason
Reitman
Canadian
Meatballs
Bill Murray
Reitman
Ghostbusters and Ghostbusters II
Hollywood
Arnold Schwarzenegger
Kindergarten Cop
Arnie


### Q3

1. Write a function that returns the most positive and the most negative sentences from a text. The function must take the text as the input and must return a 2-tuple: the first element as the most positive and the second as the most negative sentence with their polarity scores.

2. Test your function on random articles from the Guardian.

# Q3.1
## Positive-Negative Function

I reused the url_to_text function created earlier to gain access to the text from its url

In [32]:
def url_to_text(url):
    with requests.get(url) as link:
        raw = parse(link.text)
        
    subject=raw['rss']['channel']['item'] 
    N=len(subject)
    n = np.random.randint(N)
    randomurl=subject[n]['link'] 

    with requests.get(randomurl) as link:
        raw = BeautifulSoup(link.content,'html.parser')
        text=' '.join([x.text for x in raw.find_all('p')])
    return(text)
    

The pos_neg function takes the text as input. Then set my analyzer as the SentimentIntensityAnalyzer from the Natural Language Toolkit (NLTK). This analyzer has a function that calculates the polarity scores.

The sent_tokenize function helps me split the test into sententes, does the tokenization. Then, the polarity score of each of those sentences get calculated. The polarity scores are positive, neutral, negative.

> The Compound score is a metric that calculates the sum of all the lexicon ratings which have been normalized between -1(most extreme negative) and +1 (most extreme positive). 
https://www.geeksforgeeks.org/python-sentiment-analysis-using-vader/

The for loops look for the sentence that has the highest value of positive or negative polarity score, then keeps the sentence and the number i, which represents how manyth sentence it is, in order to print all the polarity scores of that sentence.

Putting the sentence along with it's polarity scores into a tuple, the final tuple with the most positive and negative sentence with it's scores gets returned.


In [33]:
def pos_neg(text):
    analyzer = SentimentIntensityAnalyzer()
    
    sentences = sent_tokenize(text)
    a=[(x,analyzer.polarity_scores(x)) for x in sentences]
    
    maxneg=a[0][1]['neg']
    maxpos=a[0][1]['pos']
    for i in range(len(a)):
        if (a[i][1]['neg']>=maxneg):
            maxneg=a[i][1]['neg']
            negsentence=a[i][0]
            negvalue=i
            
    for i in range(len(a)):
        if (a[i][1]['pos']>=maxpos):
            maxneg=a[i][1]['pos']
            possentence=a[i][0]
            posvalue=i

    tuple1=(negsentence,a[negvalue][1])
    tuple2=(possentence,a[posvalue][1])
    tuple3=(tuple1,tuple2)
    return(tuple3)

# Q3.2 
## Testing
I tested my function on random articles from The Guardian's film, economy and technology segments. I obtained the sentences with the most negative and the most positive polarity scores.

In [34]:
urlfilm='https://www.theguardian.com/film/rss'
text=url_to_text(urlfilm)
pos_neg(text)

(('It’s a painful probing of a psychological wound in her parents’ lives: the death of Quirijns’s elder sister in a drowning accident.',
  {'neg': 0.382, 'neu': 0.618, 'pos': 0.0, 'compound': -0.872}),
 ('There are points of similarity in their respective cases, and the film duly touches on them, but it sounds glib and it’s impossible not to wonder if Quirijns would not have been better off concentrating on the one story that is centrally important to her, her own, and digging deeper into that.',
  {'neg': 0.055, 'neu': 0.872, 'pos': 0.073, 'compound': -0.0023}))

In [35]:
urleco='https://www.theguardian.com/economy/rss'
text=url_to_text(urleco)
pos_neg(text)

(('Rightwing free speech absolutists celebrated the acquisition, while many on the left sounded alarm bells about the implications of one wealthy man taking control of such an influential communication platform.',
  {'neg': 0.06, 'neu': 0.628, 'pos': 0.312, 'compound': 0.875}),
 ('But we increasingly need our readers to fund our work.',
  {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}))

In [36]:
urltech='https://www.theguardian.com/technology/rss'
text=url_to_text(urltech)
pos_neg(text)

(('“For Instagram, one of the most alarming things we observed was that its algorithm is actually recommending accounts that are offering fake sanitary or vaccination services, so if you are a person who is following a couple of accounts with Covid disinformation or anti-vax content, your algorithm will recommend more accounts offering fake passes.',
  {'neg': 0.137, 'neu': 0.821, 'pos': 0.043, 'compound': -0.6697}),
 ('Exclusive: fake passes often promoted on mainstream social media platforms, study shows Anti-vaxxers in France are buying fake vaccine passes online to get around the country’s Covid restrictions, which are often promoted on mainstream social media platforms, research has revealed.',
  {'neg': 0.128, 'neu': 0.725, 'pos': 0.147, 'compound': -0.0258}))