# Text Summarization

In [20]:
# imports
import datetime
import json
import os
import sqlite3
import sys

import numpy as np
import pandas as pd
import spacy
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
# English stop-word list from NLTK, used by `remove_stopwords` below.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be downloaded
# beforehand (nltk.download('stopwords')) — confirm for fresh environments.
stop_words = stopwords.words('english')

## Model

Word Frequencies based summarization:
A simple and robust method that counts the number of times each word is used in the text. Words with a higher count are assumed to be more important. Sentences containing important words are selected for the final extractive summary.

## Dataset

For this experiment, let's pull in the news articles for the last few days.

In [6]:
# Open the SQLite datastore that holds the articles
database_url = "../datastore/app_data.db"
database = sqlite3.connect(database_url)

# Read a small sample of rows from the `articles` table into a DataFrame
sql = "select * from articles limit 2"
source_data = pd.read_sql_query(sql, con=database)

# Report (rows, columns), then show the first rows as the cell output
print("Shape:", source_data.shape)
source_data.head()

Shape: (2, 7)


Unnamed: 0,id,source,article_link,article_date,article_title,article_content,article_dts
0,7c66bfc6f7b115ac9ea1c443d64d9f662a3c7257d06d2a...,npr,https://www.npr.org/2019/12/24/791102803/trump...,"December 24, 2019",Trump Downplays Threat Of 'Gift' From North Ko...,President Trump did not seem concerned Tuesday...,1577145600
1,d5e50fa5f13830087bedc86232317ea1790d2417d4d729...,npr,https://www.npr.org/2019/12/23/790747698/newly...,"December 23, 2019",Ukraine Emails Fuel Democrats' Call For Impeac...,Party leaders in Congress continued to spar Mo...,1577059200


In [7]:
# Per-article word count (whitespace-delimited tokens of the article body)
source_data['word_count'] = source_data['article_content'].str.split().str.len()

# `article_dts` is an epoch-seconds value that falls on midnight UTC of the
# article date (e.g. 1577145600 <-> "December 24, 2019"). Convert explicitly in
# UTC: a naive fromtimestamp() uses the machine's local time zone and would
# report the previous day on any machine west of UTC.
earliest_dts = source_data['article_dts'].min()
latest_dts = source_data['article_dts'].max()
utc = datetime.timezone.utc

# View some metrics of data
print("Number of Article:", f'{source_data.shape[0]:,}')
print("Minimum Article Date:", datetime.datetime.fromtimestamp(earliest_dts, tz=utc).strftime("%b %d %Y"))
print("Maximum Article Date:", datetime.datetime.fromtimestamp(latest_dts, tz=utc).strftime("%b %d %Y"))
print("Minimum Word Count:", source_data['word_count'].min())
print("Maximum Word Count:", f'{source_data["word_count"].max():,}')

Number of Article: 2
Minimum Article Date: Dec 23 2019
Maximum Article Date: Dec 24 2019
Minimum Word Count: 276
Maximum Word Count: 603


## Data Preprocessing

The data preprocessing steps that we will follow in order to feed the data to the model are:
- Combine Title with Blog Content
- Remove line breaks

In [8]:
# Load the project-specific stop words from a header-less text file
custom_stopwords_file = '../datastore/custom_stopwords.txt'
custom_stopwords_df = pd.read_csv(custom_stopwords_file, header=None)
print("Shape:", custom_stopwords_df.shape)

# The words live in the first (and, per the shape printed above, only) column
first_column = custom_stopwords_df[0]
custom_stopwords = first_column.tolist()

Shape: (6, 1)


In [25]:
# Utilities to perform data cleaning and preparation

# spaCy English pipeline, loaded with the 'parser' and 'ner' components disabled.
# NOTE(review): the 'en' shortcut name is presumably only resolvable on older
# spaCy releases — confirm against the installed version.
nlp = spacy.load('en', disable=['parser', 'ner'])

# Strip the standard stop words from a tokenized text
def remove_stopwords(rev):
    """Return the tokens of ``rev`` that are not in ``stop_words``, joined by spaces.

    Args:
        rev: Iterable of tokens (e.g. the result of ``text.split()``).

    Returns:
        A single string with the surviving tokens separated by one space;
        an empty string when nothing survives.
    """
    kept_tokens = []
    for token in rev:
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

def remove_custom_stopwords(rev):
    """Return the tokens of ``rev`` that are not in ``custom_stopwords``, joined by spaces.

    Args:
        rev: Iterable of tokens (e.g. the result of ``text.split()``).

    Returns:
        A single string with the surviving tokens separated by one space;
        an empty string when nothing survives.
    """
    surviving = filter(lambda token: token not in custom_stopwords, rev)
    return " ".join(surviving)

def lemmatization(texts, tags=['NOUN', 'ADJ']):
    """Lemmatize tokenized texts, keeping only tokens with the requested POS tags.

    Each entry of ``texts`` is joined with spaces, run through the module-level
    ``nlp`` pipeline, and reduced to the lemmas of the tokens whose ``pos_`` is
    in ``tags``.

    Args:
        texts: Iterable of token sequences (one sequence per text).
        tags: Part-of-speech tags to keep.

    Returns:
        A list with one list of lemmas per input text, in input order.
    """
    def _lemmas_for(tokens):
        # Re-assemble the tokens into one string, which is what `nlp` is called with
        parsed = nlp(" ".join(tokens))
        return [tok.lemma_ for tok in parsed if tok.pos_ in tags]

    return [_lemmas_for(tokens) for tokens in texts]

In [26]:
# Merge title with content. Missing values become empty strings first so they
# do not end up as the literal text "nan" after string conversion.
source_data['text'] = (
    source_data['article_title'].fillna("").astype(str)
    + " "
    + source_data["article_content"].fillna("").astype(str)
)

# Clean the raw text. `regex=` is passed explicitly on every call: pandas 2.0
# changed the default of Series.str.replace to regex=False, under which the
# character-class pattern below would be treated as a literal and match nothing.
article_text = (
    source_data['text']
    # Replace line breaks
    .str.replace("\n", " ", regex=False)
    # Remove unwanted characters and symbols (keep letters, '#' and '.')
    .str.replace(r"[^a-zA-Z#.]", " ", regex=True)
    # Collapse runs of spaces of any length into a single space
    .str.replace(r" {2,}", " ", regex=True)
    # Make entire text lowercase (stop-word lists are lowercase)
    .str.lower()
)

# Remove stopwords from the text
article_text = [remove_stopwords(doc.split()) for doc in article_text]

# Remove custom stopwords
article_text = [remove_custom_stopwords(doc.split()) for doc in article_text]

# NOTE: lemmatization (see `lemmatization`) is currently not applied here; the
# cleaned output is a list with one space-separated string per article.
article_text

['trump downplays threat gift north korea maybe beautiful vase president trump seem concerned tuesday asked threat christmas present north korea u.s. roll back economic sanctions country end year. maybe nice present trump told reporters event mar lago resort florida. maybe present sends beautiful vase opposed missile test. pyongyang imposed end year deadline concessions u.s. earlier month trump administration given sign plans give pressure campaign. u.s. wants north korea give entire nuclear arsenal removing sanctions. dealing north korea nuclear ambitions one trump top foreign policy priorities held series meetings north korea kim jong un try negotiate solution. world north korea promises christmas surprise. options unclear exactly christmas gift north korea threatening npr geoff brumfiel laid options monday including launching rocket payload space conducting underground nuclear test testing long range missile capable reaching united states territories. tuesday trump elaborate u.s. wo

In [None]:
def _create_dictionary_table(text_string) -> dict:
    """Build a word-frequency table (stem -> occurrence count) for a text.

    The text is tokenized with ``word_tokenize``, English stop words are
    dropped, and every remaining token is reduced to its Porter stem before
    being counted.

    Args:
        text_string: The text to analyse.

    Returns:
        A dict mapping each stem to the number of times it occurs in
        ``text_string``. Tokens that are stop words contribute nothing.
    """
    # A set gives constant-time membership tests inside the loop
    stop_words = set(stopwords.words("english"))

    # Reduces words to their root form
    stem = PorterStemmer()

    frequency_table = dict()
    for wd in word_tokenize(text_string):
        # Check the surface form first: stemming mangles many stop words
        # (e.g. "this" -> "thi", "was" -> "wa"), so testing only the stem
        # lets them leak into the table.
        if wd.lower() in stop_words:
            continue
        wd = stem.stem(wd)
        # Also drop tokens whose stem is itself a stop word, as before
        if wd in stop_words:
            continue
        frequency_table[wd] = frequency_table.get(wd, 0) + 1

    # NOTE(review): tokens produced by the tokenizer that are not words (such as
    # punctuation) are presumably counted too — confirm whether callers expect that.
    return frequency_table