# Text Summarization

In [2]:
# imports
import sys
import os
import numpy as np
import pandas as pd
import sqlite3
import json
import datetime

from gensim.summarization import summarize

## Model

TextRank Summarizer:
Summarization is based on ranking the sentences of a text with a variation of the TextRank algorithm. The output summary consists of the most representative (highest-ranked) sentences of the input text.

## Dataset

For this experiment, let's pull in the news articles from the last few days.

In [3]:
# Open the SQLite datastore (path is relative to the notebook's working directory)
database_url = "../datastore/app_data.db"
database = sqlite3.connect(database_url)

# Read every row of the `articles` table into a DataFrame
sql = "select * from articles"
source_data = pd.read_sql_query(sql=sql, con=database)

# Sanity check: dimensions, then the first few rows as the cell's rich output
print(f"Shape: {source_data.shape}")
source_data.head()

Shape: (166, 7)


Unnamed: 0,id,source,article_link,article_date,article_title,article_content,article_dts
0,7c66bfc6f7b115ac9ea1c443d64d9f662a3c7257d06d2a...,npr,https://www.npr.org/2019/12/24/791102803/trump...,"December 24, 2019",Trump Downplays Threat Of 'Gift' From North Ko...,President Trump did not seem concerned Tuesday...,1577146000.0
1,d5e50fa5f13830087bedc86232317ea1790d2417d4d729...,npr,https://www.npr.org/2019/12/23/790747698/newly...,"December 23, 2019",Ukraine Emails Fuel Democrats' Call For Impeac...,Party leaders in Congress continued to spar Mo...,1577059000.0
2,2b767e199bd897158dd7f8b999bef7aa592b82fd4548eb...,nymag,http://nymag.com/intelligencer/2019/12/matt-sh...,"Dec. 24, 2019",GOP Lawmaker Plotted Insurrections to Establis...,Shea’s rebellion. Photo: Ted S Warren/AP/Shutt...,1577146000.0
3,139c45cf3296a8e4f8bf50d3525b808be1620b3b670778...,nymag,http://nymag.com/intelligencer/2020/01/iran-ge...,"Jan. 2, 2020",U.S. Kills Iranian General Qasem Suleimani in ...,A Shiite Muslim pilgrim walks with a bag adorn...,1577923000.0
4,f99ee44a2f210564c2eb3bc91a781f36dd6de44142047a...,nymag,http://nymag.com/intelligencer/2019/12/a-gloss...,"Dec. 31, 2019","A Glossary of Everyday Things, According to Trump",Donald Trump. Photo: Chip Somodevilla/Getty Im...,1577750000.0


In [4]:
# Number of whitespace-separated tokens in each article body
source_data['word_count'] = source_data['article_content'].str.split().str.len()

# View some metrics of data
# Series.min()/.max() are used instead of the builtins: they are vectorised and
# skip NaN, whereas builtin min()/max() iterate element by element and return
# an order-dependent result as soon as one value is NaN.
print("Number of Article:",f'{source_data.shape[0]:,}')
# article_dts is passed to fromtimestamp(), i.e. it is interpreted as a POSIX
# timestamp in seconds and rendered in the local timezone.
print("Minimum Article Date:",datetime.datetime.fromtimestamp(source_data['article_dts'].min()).strftime("%b %d %Y"))
print("Maximum Article Date:",datetime.datetime.fromtimestamp(source_data['article_dts'].max()).strftime("%b %d %Y"))
print("Minimum Word Count:",source_data['word_count'].min())
print("Maximum Word Count:",f'{source_data["word_count"].max():,}')

Number of Article: 166
Minimum Article Date: Dec 23 2019
Maximum Article Date: Jan 05 2020
Minimum Word Count: 111
Maximum Word Count: 5,195


In [16]:
# Links of the long-form articles (more than 2000 words)
is_long_article = source_data["word_count"] > 2000
url = source_data.loc[is_long_article, "article_link"].tolist()
url

['https://www.npr.org/2019/12/31/792350952/the-top-moments-from-a-decade-that-reshaped-american-politics',
 'https://edition.cnn.com/2013/07/04/us/donald-trump-fast-facts/index.html']

## Data Preprocessing

The data preprocessing steps that we will follow in order to feed the data to the model are:
- Combine the article title with the article content
- Remove line breaks

In [17]:
# Merge title with content.
# Missing values are replaced with "" first: str + NaN propagates NaN, and the
# string conversion below would then turn the whole row into the literal text
# "nan", discarding whichever half of the article was actually present.
source_data['text'] = (
    source_data['article_title'].fillna("")
    + " "
    + source_data["article_content"].fillna("")
)

# Convert column to str (guards against any remaining non-string values)
source_data['text'] = source_data['text'].astype(str)

# Replace line breaks; regex=False makes the literal match explicit so the
# result does not depend on the pandas version's default for `regex`
article_text = source_data['text'].str.replace("\n", " ", regex=False)

source_data['text'] = article_text

In [18]:
# Dimensions and first rows after preprocessing
print(f"Shape: {source_data.shape}")
source_data.head()

Shape: (166, 9)


Unnamed: 0,id,source,article_link,article_date,article_title,article_content,article_dts,word_count,text
0,7c66bfc6f7b115ac9ea1c443d64d9f662a3c7257d06d2a...,npr,https://www.npr.org/2019/12/24/791102803/trump...,"December 24, 2019",Trump Downplays Threat Of 'Gift' From North Ko...,President Trump did not seem concerned Tuesday...,1577146000.0,276,Trump Downplays Threat Of 'Gift' From North Ko...
1,d5e50fa5f13830087bedc86232317ea1790d2417d4d729...,npr,https://www.npr.org/2019/12/23/790747698/newly...,"December 23, 2019",Ukraine Emails Fuel Democrats' Call For Impeac...,Party leaders in Congress continued to spar Mo...,1577059000.0,603,Ukraine Emails Fuel Democrats' Call For Impeac...
2,2b767e199bd897158dd7f8b999bef7aa592b82fd4548eb...,nymag,http://nymag.com/intelligencer/2019/12/matt-sh...,"Dec. 24, 2019",GOP Lawmaker Plotted Insurrections to Establis...,Shea’s rebellion. Photo: Ted S Warren/AP/Shutt...,1577146000.0,806,GOP Lawmaker Plotted Insurrections to Establis...
3,139c45cf3296a8e4f8bf50d3525b808be1620b3b670778...,nymag,http://nymag.com/intelligencer/2020/01/iran-ge...,"Jan. 2, 2020",U.S. Kills Iranian General Qasem Suleimani in ...,A Shiite Muslim pilgrim walks with a bag adorn...,1577923000.0,1059,U.S. Kills Iranian General Qasem Suleimani in ...
4,f99ee44a2f210564c2eb3bc91a781f36dd6de44142047a...,nymag,http://nymag.com/intelligencer/2019/12/a-gloss...,"Dec. 31, 2019","A Glossary of Everyday Things, According to Trump",Donald Trump. Photo: Chip Somodevilla/Getty Im...,1577750000.0,578,"A Glossary of Everyday Things, According to Tr..."


### Generate Summary

In [24]:
# Default fraction of the original sentences to keep in a summary
summary_ratio = 0.2


def generate_summary(text, ratio=None):
    """Return an extractive TextRank summary of ``text``.

    Parameters
    ----------
    text : str
        Text to summarize.
    ratio : float, optional
        Fraction of the sentences of ``text`` to keep. When omitted, the
        module-level ``summary_ratio`` is read at call time, so existing
        ``generate_summary(text)`` calls behave as before.

    Returns
    -------
    str
        The value returned by ``summarize``. If ``summarize`` raises
        ``ValueError``, ``text`` is returned unchanged.
    """
    if ratio is None:
        ratio = summary_ratio
    try:
        return summarize(text, ratio=ratio)
    except ValueError:
        # gensim's summarizer raises ValueError when it finds only a single
        # sentence. One short article should not abort a whole
        # DataFrame.apply run, and a lone sentence is already its own summary.
        return text

# Word count of the merged text that is fed to the summarizer
text_tokens = source_data['text'].str.split()
source_data['text_word_count'] = text_tokens.str.len()

# Summarize every article, then count the words of each summary
source_data['summary_text'] = source_data['text'].apply(generate_summary)
summary_tokens = source_data['summary_text'].str.split()
source_data['summary_word_count'] = summary_tokens.str.len()

In [25]:
# Dimensions, then the text/summary columns side by side with their word counts
print(f"Shape: {source_data.shape}")
summary_columns = ["text","text_word_count","summary_text","summary_word_count"]
source_data[summary_columns].head()

Shape: (166, 12)


Unnamed: 0,text,text_word_count,summary_text,summary_word_count
0,Trump Downplays Threat Of 'Gift' From North Ko...,289,Trump Downplays Threat Of 'Gift' From North Ko...,70
1,Ukraine Emails Fuel Democrats' Call For Impeac...,612,Ukraine Emails Fuel Democrats' Call For Impeac...,151
2,GOP Lawmaker Plotted Insurrections to Establis...,814,"Photo: Ted S Warren/AP/Shutterstock Last year,...",296
3,U.S. Kills Iranian General Qasem Suleimani in ...,1070,U.S. Kills Iranian General Qasem Suleimani in ...,333
4,"A Glossary of Everyday Things, According to Tr...",586,Photo: Chip Somodevilla/Getty Images It’s no s...,139


## Model Evaluation

We will inspect a few of the articles by eye to see how the model performed.

In [28]:
# View some results: full text, a divider, then the generated summary
article = source_data.loc[0]
print(article['text'])
print("---------------------------------------")
print(article['summary_text'])

Trump Downplays Threat Of 'Gift' From North Korea: Maybe It's A 'Beautiful Vase' President Trump did not seem concerned Tuesday when asked about the threat of a "Christmas present" from North Korea if the U.S. doesn't roll back economic sanctions on the country by the end of the year. "Maybe it's a nice present," Trump told reporters at an event at his Mar-a-Lago resort in Florida. "Maybe it's a present where he sends me a beautiful vase, as opposed to a missile test." Pyongyang imposed an end-of-year deadline for concessions from the U.S. earlier this month, but the Trump administration has given no sign that it plans to give in to the pressure campaign. The U.S. wants North Korea to give up its entire nuclear arsenal before removing the sanctions. Dealing with North Korea's nuclear ambitions has been one of Trump's top foreign policy priorities, and he has held a series of meetings with North Korea's Kim Jong Un to try to negotiate a solution. World North Korea Promises A Christmas S

In [30]:
# View some results: full text, a divider, then the generated summary
article = source_data.loc[3]
print(article['text'])
print("---------------------------------------")
print(article['summary_text'])

---------------------------------------
U.S. Kills Iranian General Qasem Suleimani in Airstrike at Baghdad Airport A Shiite Muslim pilgrim walks with a bag adorned with a portrait of Qasem Suleimani, Iran’s Islamic Revolutionary Guard Corps (IRGC), Major General and commander of the Quds Force, who was killed on Friday morning in Baghdad.
The attack on Suleimani, considered Iran’s most important military leader, was carried out “at the direction of the president.” Suleimani was the head of Iran’s Quds Force, a unit of the Islamic Revolutionary Guard Corps responsible for clandestine operations abroad and for providing material support to groups like Hezbollah, Hamas, and the Popular Mobilization Forces, an umbrella organization for Iraqi militias, whose deputy leader Abu Mahdi al-Muhandis was also reportedly killed in the attack, along with five others.
The game has changed.” Shortly after the death of Suleimani, reports also emerged that U.S. marines captured two leaders of pro-Irania

In [31]:
# View some results: full text, a divider, then the generated summary
article = source_data.loc[6]
print(article['text'])
print("---------------------------------------")
print(article['summary_text'])

Please Stop Telling Miners to Learn to Code Photo: Spencer Platt/Getty Images Good news, coal miners: Joe Biden has a brilliant idea for your future. “Anybody who can go down 3,000 feet in a mine can sure as hell learn to program as well … Anybody who can throw coal into a furnace can learn how to program, for God’s sake!” the former vice-president said at a New Hampshire rally on Monday. Biden riffing on how Obama put him in charge of judging the "jobs of the future" suggests re-training miners as coders."Anybody who can go down 3000 feet in a mine can sure as hell learn to program as well."This sort of "just transition" stuff was murder on Clinton in 2016...— Dave Weigel (@daveweigel) December 30, 2019 God only knows where Biden got the idea that coal mining consists of throwing the stuff into a furnace. That’s not how it works, but I digress. Biden’s recommendation is stale stuff. It’s the kind of rhetoric that will only sway voters whose ideal president is a machine that spits out 