# Summarization

In [1]:
import requests
from bs4 import BeautifulSoup
import os.path
from dateutil import parser
import pandas as pd
import numpy as np

In [2]:
def download_article(url):
    # check if article already there
    filename = url.split("/")[-1] + ".html"
    filename = f"./" + filename
    if not os.path.isfile(filename):
        r = requests.get(url)
        with open(filename, "w+") as f:
            f.write(r.text)
    return filename

In [3]:
def parse_article(article_file):
    with open(article_file, "r") as f:
        html = f.read()
    r = {}
    soup = BeautifulSoup(html, 'html.parser')
    #r['id'] = soup.select_one("div.StandardArticle_inner-container")['id']
    r['url'] = soup.find("link", {'rel': 'canonical'})['href']
    r['headline'] = soup.h1.text
    #r['section'] = soup.select_one("div.ArticleHeader_channel a").text
    
    r['text'] = soup.select_one("div.ArticleBodyWrapper").text
    r['authors'] = [a.text 
                    for a in soup.select("div.BylineBar_first-container.ArticleHeader_byline-bar\
                                          div.BylineBar_byline span")]
    r['time'] = soup.find("meta", { 'property': "og:article:published_time"})['content']
    return r

In [4]:
import reprlib
r = reprlib.Repr()
r.maxstring = 800

url1 = "https://www.reuters.com/article/us-qualcomm-m-a-broadcom-5g/what-is-5g-and-who-are-the-major-players-idUSKCN1GR1IN"
article_name1 = download_article(url1)
article1 = parse_article(article_name1)
print ('Article Published on', r.repr(article1['time']))
print (r.repr(article1['text']))

Article Published on '2018-03-15T11:37:01Z'
"By Eric Auchard, Stephen Nellis4 Min ReadLONDON/SAN FRANCISCO (Reuters) - U.S. President Donald Trump has blocked microchip maker Broadcom Ltd's AVGO.O $117 billion takeover of rival Qualcomm QCOM.O amid concerns that it would give China the upper hand in the next generation of mobile communications, or 5G.A 5G sign is seen at the Mobile World Congress in Barcelona, Spain February 28, 2018. REU...d 4G wireless and looks set to top the list of patent holders heading into the 5G cycle.Huawei, Nokia, Ericsson and others are also vying to amass 5G patents, which has helped spur complex cross-licensing agreements like the deal struck late last year Nokia and Huawei around handsets.Editing by Kim Miyoung in Singapore and Jason Neely in LondonOur Standards: The Thomson Reuters Trust Principles."


## Identifying Important Words with TF-IDF Values

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import tokenize

sentences = tokenize.sent_tokenize(article1['text'])
tfidfVectorizer = TfidfVectorizer()
words_tfidf = tfidfVectorizer.fit_transform(sentences)

In [6]:
# Parameter to specify number of summary sentences required
num_summary_sentence = 3

# Sort the sentences in descending order by the sum of TF-IDF values
sent_sum = words_tfidf.sum(axis=1)
important_sent = np.argsort(sent_sum, axis=0)[::-1]

# Print three most important sentences in the order they appear in the article
for i in range(0, len(sentences)):
    if i in important_sent[:num_summary_sentence]:
        print (sentences[i])

REUTERS/Yves HermanBelow are some facts about 5G and major players.WHAT IS 5G?5G networks, now in the final testing stage, will rely on denser arrays of small antennas and the cloud to offer data speeds up to 50 or 100 times faster than current 4G networks and serve as critical infrastructure for a range of industries.Deals to start building mass-market 5G networks are still largely a year away, but by 2025, 1.2 billion people are set to have access to 5G networks - a third of them in China, according to the global wireless trade group GSMA.Moving to new networks promises to enable new mobile services and even whole new business models, but could pose challenges for countries and industries unprepared to invest in the transition.Unlike the upgrades of cellular standards 2G in the early 1990s, 3G around the millennium and 4G in 2010, 5G standards will deliver not just faster phone and computer data but also help connect up cars, machines, cargo and crop equipment.WHY IS THE U.S.
WORRIED

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import tokenize

def tfidf_summary(text, num_summary_sentence):
    summary_sentence = []
    sentences = tokenize.sent_tokenize(text)
    tfidfVectorizer = TfidfVectorizer()
    words_tfidf = tfidfVectorizer.fit_transform(sentences)
    sentence_sum = words_tfidf.sum(axis=1)
    important_sentences = np.argsort(sentence_sum, axis=0)[::-1]
    for i in range(0, len(sentences)):
        if i in important_sentences[:num_summary_sentence]:
            summary_sentence.append(sentences[i])
    return summary_sentence

## LSA

In [8]:

from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

from sumy.summarizers.lsa import LsaSummarizer

LANGUAGE = "english"
stemmer = Stemmer(LANGUAGE)

parser = PlaintextParser.from_string(article1['text'], Tokenizer(LANGUAGE))
summarizer = LsaSummarizer(stemmer)
summarizer.stop_words = get_stop_words(LANGUAGE)

for sentence in summarizer(parser.document, num_summary_sentence):
    print (str(sentence))

By Eric Auchard, Stephen Nellis4 Min ReadLONDON/SAN FRANCISCO (Reuters) - U.S. President Donald Trump has blocked microchip maker Broadcom Ltd's AVGO.O $117 billion takeover of rival Qualcomm QCOM.O amid concerns that it would give China the upper hand in the next generation of mobile communications, or 5G.A 5G sign is seen at the Mobile World Congress in Barcelona, Spain February 28, 2018.
WORRIED?The Committee on Foreign Investment in the United States (CFIUS), which vets acquisitions of U.S. corporations by foreign companies, said the Broadcom takeover risked weakening Qualcomm, which would boost China over the United States in the 5G race.Slideshow ( 2 images )Acquiring Qualcomm would represent the jewel in the crown of Broadcom’s portfolio of communications chips, which supply wi-fi, power management, video and other features in smartphones alongside Qualcomm’s core baseband chips - radio modems that wirelessly connect phones to networks.The concern is that a takeover by Singapore

In [9]:
def lsa_summary(text, num_summary_sentence):
    summary_sentence = []
    LANGUAGE = "english"
    stemmer = Stemmer(LANGUAGE)
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    summarizer = LsaSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    for sentence in summarizer(parser.document, num_summary_sentence):
        summary_sentence.append(str(sentence))
    return summary_sentence

In [10]:
r.maxstring = 800
url2 = "https://www.reuters.com/article/us-usa-economy-watchlist-graphic/predicting-the-next-u-s-recession-idUSKCN1V31JE"
article_name2 = download_article(url2)
article2 = parse_article(article_name2)
print ('Article Published', r.repr(article1['time']))
print (r.repr(article2['text']))

Article Published '2018-03-15T11:37:01Z'
'By Saqib Iqbal Ahmed7 Min ReadNEW YORK A protracted trade war between China and the United States, the world’s largest economies, and a deteriorating global growth outlook has left investors apprehensive about the end to the longest expansion in American history.FILE PHOTO: Ships and shipping containers are pictured at the port of Long Beach in Long Beach, California, U.S., January 30, 2019.   ...roughton wrote in the June Cass Freight Index report.12. MISERY INDEXThe so-called Misery Index adds together the unemployment rate and the inflation rate. It typically rises during recessions and sometimes prior to downturns. It has slipped lower in 2019 and does not look very miserable.Reporting by Saqib Iqbal Ahmed; Editing by Chizu NomiyamaOur Standards: The Thomson Reuters Trust Principles.'


In [11]:

summary_sentence = tfidf_summary(article2['text'], num_summary_sentence)
for sentence in summary_sentence:
    print (sentence)

REUTERS/Mike BlakeThe recent rise in U.S.-China trade war tensions has brought forward the next U.S. recession, according to a majority of economists polled by Reuters who now expect the Federal Reserve to cut rates again in September and once more next year.Trade tensions have pulled corporate confidence and global growth to multi-year lows and U.S. President Donald Trump’s announcement of more tariffs have raised downside risks significantly, Morgan Stanley analysts said in a recent note.Morgan Stanley forecast that if the U.S. lifts tariffs on all imports from China to 25 percent for 4-6 months and China takes countermeasures, the U.S. would be in recession in three quarters.Goldman Sachs Group said on Sunday that fears of the U.S.-China trade war leading to a recession are increasing and that Goldman no longer expects a trade deal between the world’s two largest economies before the 2020 U.S. presidential election.Global markets remain on edge with trade-related headlines spurring 

In [12]:
summary_sentence = lsa_summary(article2['text'], num_summary_sentence)
for sentence in summary_sentence:
    print (sentence)

By Saqib Iqbal Ahmed7 Min ReadNEW YORK A protracted trade war between China and the United States, the world’s largest economies, and a deteriorating global growth outlook has left investors apprehensive about the end to the longest expansion in American history.FILE PHOTO: Ships and shipping containers are pictured at the port of Long Beach in Long Beach, California, U.S., January 30, 2019.
REUTERS/Mike BlakeThe recent rise in U.S.-China trade war tensions has brought forward the next U.S. recession, according to a majority of economists polled by Reuters who now expect the Federal Reserve to cut rates again in September and once more next year.Trade tensions have pulled corporate confidence and global growth to multi-year lows and U.S. President Donald Trump’s announcement of more tariffs have raised downside risks significantly, Morgan Stanley analysts said in a recent note.Morgan Stanley forecast that if the U.S. lifts tariffs on all imports from China to 25 percent for 4-6 months 

In [13]:
## Text Rank

In [14]:
from sumy.summarizers.text_rank import TextRankSummarizer

parser = PlaintextParser.from_string(article2['text'], Tokenizer(LANGUAGE))
summarizer = TextRankSummarizer(stemmer)
summarizer.stop_words = get_stop_words(LANGUAGE)

for sentence in summarizer(parser.document, num_summary_sentence):
    print (str(sentence))

REUTERS/Mike BlakeThe recent rise in U.S.-China trade war tensions has brought forward the next U.S. recession, according to a majority of economists polled by Reuters who now expect the Federal Reserve to cut rates again in September and once more next year.Trade tensions have pulled corporate confidence and global growth to multi-year lows and U.S. President Donald Trump’s announcement of more tariffs have raised downside risks significantly, Morgan Stanley analysts said in a recent note.Morgan Stanley forecast that if the U.S. lifts tariffs on all imports from China to 25 percent for 4-6 months and China takes countermeasures, the U.S. would be in recession in three quarters.Goldman Sachs Group said on Sunday that fears of the U.S.-China trade war leading to a recession are increasing and that Goldman no longer expects a trade deal between the world’s two largest economies before the 2020 U.S. presidential election.Global markets remain on edge with trade-related headlines spurring 

In [15]:
def textrank_summary(text, num_summary_sentence):
    summary_sentence = []
    LANGUAGE = "english"
    stemmer = Stemmer(LANGUAGE)
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    summarizer = TextRankSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    for sentence in summarizer(parser.document, num_summary_sentence):
        summary_sentence.append(str(sentence))
    return summary_sentence

In [16]:
parser = PlaintextParser.from_string(article1['text'], Tokenizer(LANGUAGE))
summarizer = TextRankSummarizer(stemmer)
summarizer.stop_words = get_stop_words(LANGUAGE)

for sentence in summarizer(parser.document, num_summary_sentence):
    print (str(sentence))

REUTERS/Yves HermanBelow are some facts about 5G and major players.WHAT IS 5G?5G networks, now in the final testing stage, will rely on denser arrays of small antennas and the cloud to offer data speeds up to 50 or 100 times faster than current 4G networks and serve as critical infrastructure for a range of industries.Deals to start building mass-market 5G networks are still largely a year away, but by 2025, 1.2 billion people are set to have access to 5G networks - a third of them in China, according to the global wireless trade group GSMA.Moving to new networks promises to enable new mobile services and even whole new business models, but could pose challenges for countries and industries unprepared to invest in the transition.Unlike the upgrades of cellular standards 2G in the early 1990s, 3G around the millennium and 4G in 2010, 5G standards will deliver not just faster phone and computer data but also help connect up cars, machines, cargo and crop equipment.WHY IS THE U.S.
WORRIED

### use a longer page from wikipedia
https://en.wikipedia.org/wiki/Mongol_invasion_of_Europe

In [17]:
import wikipediaapi

wiki_wiki = wikipediaapi.Wikipedia(
        language='en',
        extract_format=wikipediaapi.ExtractFormat.WIKI
)

In [18]:
r.maxstring = 500

In [19]:
# https://en.wikipedia.org/wiki/Mongol_invasion_of_Europe
p_wiki = wiki_wiki.page('Mongol_invasion_of_Europe')
print (r.repr(p_wiki.text))
r.maxstring = 200

'From the 1220s into the 1240s, the Mongols conquered Volga Bulgaria, Cumania, Alania, and the Kievan Rus\' federation. They launched a two-pronged invasion of fragmented Poland, culminating in the Battle of Legnica (9 April 1241), and the Kingdom ...Citations\nSources\nSverdrup, Carl (2010). "Numbers in Mongol Warfare". Journal of Medieval Military History. Boydell Press. 8: 109–17 [p. 115]. ISBN 978-1-84383-596-7.\n\nFurther reading\nExternal links\nThe Islamic World to 1600: The Golden Horde'


In [20]:
num_summary_sentence = 5
summary_sentence = textrank_summary(p_wiki.text, num_summary_sentence)

for sentence in summary_sentence:
    print (sentence)

European tactics against Mongols The traditional European method of warfare of melee combat between knights ended in catastrophe when it was deployed against the Mongol forces as the Mongols were able to keep a distance and advance with superior numbers.
Austrian knights under Duke Frederick also fared better in fighting the Mongol invasion in Vienna.King Béla IV hired the help of the Knights of St. John, as well as training his own better-armed local knights, in preparation for the Second Mongol invasion of Hungary.
After the division of the Mongol Empire into four fragments, when the Golden Horde attempted the next invasion of Hungary, Hungary had increased their proportion of knights (led by Ladislaus IV of Hungary) and they quickly defeated the main Golden Horde Army in the hills of western Transylvania.By this time as well, many Eastern and Central European countries had ended their hostilities with one another and united to finally drive out the remnants of the Golden Horde.
An a

In [21]:
summary_sentence = lsa_summary(p_wiki.text, num_summary_sentence)
for sentence in summary_sentence:
    print (sentence)

But while the Mongols claimed control of Hungary, they could not occupy fortified cities such as Fehérvár, Veszprém, Tihany, Győr, Pannonhalma, Moson, Sopron, Vasvár, Újhely, Zala, Léka, Pozsony , Nyitra, Komárom, Fülek and Abaújvár.
Using similar tactics during their campaigns in previous Eastern and Central European countries, the Mongols first launched small squadrons to attack isolated settlements in the outskirts of Vienna in an attempt to instill fear and panic among the populace.
John Andrew Boyle asserts, based on the orthography, that Rashid Al-Din's account of the withdrawal from central Europe was taken verbatim from Mongolian records.Another theory is that weather data preserved in tree rings points to a series of warm, dry summers in the region until 1242.
In addition to calling a council to depose the Holy Roman Emperor, Pope Gregory IX and his successor Innocent IV excommunicated Frederick four times and labeled him the Antichrist.In the 1240s the efforts of Christendom 

## Keyphrase Extraction
http://bdewilde.github.io/blog/2014/09/23/intro-to-automatic-keyphrase-extraction/

In [22]:
def extract_candidate_words(text, good_tags=set(['JJ','JJR','JJS','NN','NNP','NNS','NNPS'])):
    import itertools, nltk, string

    # exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # tokenize and POS-tag words
    tagged_words = itertools.chain.from_iterable(nltk.pos_tag_sents(nltk.word_tokenize(sent)
                                                                    for sent in nltk.sent_tokenize(text)))
    # filter on certain POS tags and lowercase all words
    candidates = [word.lower() for word, tag in tagged_words
                  if tag in good_tags and word.lower() not in stop_words
                  and not all(char in punct for char in word)]

    return candidates

In [23]:
def score_keyphrases_by_textrank(text, percentKeywords=0.05, maxKeywords=-1):
    from itertools import takewhile, tee
    from nltk.stem import WordNetLemmatizer
  
    import networkx, nltk
    
    lemmatizer = WordNetLemmatizer()
    
    # tokenize for all words, and extract *candidate* words
    words = [lemmatizer.lemmatize(word.lower())
             for sent in nltk.sent_tokenize(text)
             for word in nltk.word_tokenize(sent)]
    candidates = extract_candidate_words(text)
    # build graph, each node is a unique candidate
    graph = networkx.Graph()
    graph.add_nodes_from(set(candidates))
    # iterate over word-pairs, add unweighted edges into graph
    def pairwise(iterable):
        """s -> (s0,s1), (s1,s2), (s2, s3), ..."""
        a, b = tee(iterable)
        next(b, None)
        return zip(a, b)
    for w1, w2 in pairwise(candidates):
        if w2:
            graph.add_edge(*sorted([w1, w2]))
    # score nodes using default pagerank algorithm, sort by score, keep top percentKeywords
    ranks = networkx.pagerank(graph)
    if 0 < percentKeywords < 1:
        percentKeywordsMaxIdx = int(round(len(candidates) * percentKeywords))
    else:
        percentKeywordsMaxIdx = int(round(len(candidates) * 0.05))
    if (maxKeywords > 0):
        percentKeywordsMaxIdx = int(min(maxKeywords,percentKeywordsMaxIdx))

    word_ranks = {word_rank[0]: word_rank[1]
                  for word_rank in sorted(ranks.items(), key=lambda x: x[1], reverse=True)[:percentKeywordsMaxIdx]}
    keywords = set(word_ranks.keys())
    # merge keywords into keyphrases
    keyphrases = {}
    j = 0
    for i, word in enumerate(words):
        if i < j:
            continue
        if word in keywords:
            kp_words = list(takewhile(lambda x: x in keywords, words[i:i+10]))
            avg_pagerank = sum(word_ranks[w] for w in kp_words) / float(len(kp_words))
            keyphrases[' '.join(kp_words)] = avg_pagerank
            # counter as hackish way to ensure merged keyphrases are non-overlapping
            j = i + len(kp_words)

    return sorted(keyphrases.items(), key=lambda x: x[1], reverse=True)

In [24]:
keyphrases = score_keyphrases_by_textrank(p_wiki.text)

In [25]:
for keyphrase in keyphrases:
    print (keyphrase[0], keyphrase[1])

mongol 0.016150487885470466
mongol invasion 0.013752261605560345
hungary 0.011659983337687415
mongol army 0.011578439619307185
invasion 0.011354035325650224
mongol horde 0.01000066561653279
second mongol invasion 0.009940985266502436
initial mongol invasion 0.009744002777358514
mongol cavalry 0.009677683828152324
many mongol 0.009568505518781568
mongol force 0.009560873509507416
mongol empire 0.009538399833783965
mongol attack 0.009511712236370103
mongol siege 0.009348026217491924
main mongol 0.009122350233135406
mongol warfare 0.009062384765429999
mongol withdrawal 0.008974719162112607
mongol raid 0.008822035866017446
europe 0.00840321496285209
mongolian invasion 0.007171855243815183
invasion force 0.0071626472295972955
khan 0.0070175102858553474
army 0.0070063913531439054
nomadic mongol cavalry 0.006949971917667504
invasion north 0.006877310900002988
second invasion 0.00683623395701842
northern hungary 0.006554301411010627
batu khan 0.006549424011681147
initial invasion 0.00654076022