## ref : www.kaggle.com/sandeepbhogaraju/text-summarization-with-seq2seq-model
## ref : https://wikidocs.net/72820
## ref : https://lovit.github.io/nlp/2019/04/30/textrank/


In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import re

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


# Function for checking dataframe
def show_table(df, sample_num=2):
    print('>>> shape :', df.shape)
    print('>>> number of NA :', df.isna().sum().sum())
    
    if df.shape[0] <= sample_num*2:
        display(df)
    else:
        display(pd.concat([df.head(sample_num), df.tail(sample_num)]))

/kaggle/input/news-summary/news_summary_more.csv
/kaggle/input/news-summary/news_summary.csv


## Prepare Data

In [2]:
raw_df = pd.read_csv('../input/news-summary/news_summary_more.csv', encoding='iso-8859-1')
show_table(raw_df)

>>> shape : (98401, 2)
>>> number of NA : 0


Unnamed: 0,headlines,text
0,upGrad learner switches to career in ML & Al w...,"Saurav Kant, an alumnus of upGrad and IIIT-B's..."
1,Delhi techie wins free food from Swiggy for on...,Kunal Shah's credit card bill payment platform...
98399,Snoop Dogg aims gun at clown dressed as Trump ...,A new music video shows rapper Snoop Dogg aimi...
98400,Madhesi Morcha withdraws support to Nepalese g...,"Madhesi Morcha, an alliance of seven political..."


In [3]:
summary_df = pd.read_csv('../input/news-summary/news_summary.csv', encoding='iso-8859-1')
summary_df['text_concat'] = summary_df['author'].str.cat(summary_df['date'], sep=' ').str.cat(summary_df['read_more'], sep=' ').str.cat(summary_df['text'], sep=' ').str.cat(summary_df['ctext'], sep=' ')
show_table(summary_df)

>>> shape : (4514, 7)
>>> number of NA : 236


Unnamed: 0,author,date,headlines,read_more,text,ctext,text_concat
0,Chhavi Tyagi,"03 Aug 2017,Thursday",Daman & Diu revokes mandatory Rakshabandhan in...,http://www.hindustantimes.com/india-news/raksh...,The Administration of Union Territory Daman an...,The Daman and Diu administration on Wednesday ...,"Chhavi Tyagi 03 Aug 2017,Thursday http://www.h..."
1,Daisy Mowke,"03 Aug 2017,Thursday",Malaika slams user who trolled her for 'divorc...,http://www.hindustantimes.com/bollywood/malaik...,Malaika Arora slammed an Instagram user who tr...,"From her special numbers to TV?appearances, Bo...","Daisy Mowke 03 Aug 2017,Thursday http://www.hi..."
4512,Pragya Swastik,"07 Dec 2016,Wednesday","Asha Bhosle gets ?53,000 power bill for unused...",http://indiatoday.intoday.in/story/singer-asha...,The Maharashtra government has initiated an in...,Maharahstra Power Minister Chandrashekhar Bawa...,"Pragya Swastik 07 Dec 2016,Wednesday http://in..."
4513,Chhavi Tyagi,"03 Aug 2017,Thursday",More than half of India's languages may die in...,http://indiatoday.intoday.in/story/indian-lang...,At least 400 languages or more than half langu...,More than half of the languages spoken by Indi...,"Chhavi Tyagi 03 Aug 2017,Thursday http://india..."


In [4]:
pre_df = pd.DataFrame({
    'text':pd.concat([raw_df['text'], summary_df['text_concat']], ignore_index=True),
    'summary':pd.concat([raw_df['headlines'], summary_df['headlines']], ignore_index=True)
})
show_table(pre_df)

>>> shape : (102915, 2)
>>> number of NA : 118


Unnamed: 0,text,summary
0,"Saurav Kant, an alumnus of upGrad and IIIT-B's...",upGrad learner switches to career in ML & Al w...
1,Kunal Shah's credit card bill payment platform...,Delhi techie wins free food from Swiggy for on...
102913,"Pragya Swastik 07 Dec 2016,Wednesday http://in...","Asha Bhosle gets ?53,000 power bill for unused..."
102914,"Chhavi Tyagi 03 Aug 2017,Thursday http://india...",More than half of India's languages may die in...


## Perform Data Cleaning

In [5]:
# Function for removing non-alphabetic characters from string - order of regex is very important
def text_strip(input_str):
    # remove escape characters
    res_str = re.sub('(\\t)', ' ', str(input_str)).lower()
    res_str = re.sub('(\\r)', ' ', res_str)
    res_str = re.sub('(\\n)', ' ', res_str)
    
    # remove ( _ - ~ + . ) if it occurs more than one time consequently 
    res_str = re.sub('(__+)', ' ', res_str)
    res_str = re.sub('(--+)', ' ', res_str)
    res_str = re.sub('(\+\++)', ' ', res_str)
    res_str = re.sub('(\,\,+)', ' ', res_str)
    # res_str = re.sub('(~~+)', ' ', res_str)
    
    # remove certain characters
    res_str = re.sub(r'[<>()|&©ø\[\]\'\",;?~*!]', ' ', res_str)
    res_str = re.sub('(mailto:)', ' ', res_str)
    
    res_str = re.sub(r'(\\x9\d)', ' ', res_str)  # remove \x9#
    
    res_str = re.sub('([Ii][Nn][Cc]\d+)', 'INC_NUM', res_str)  # replace INC nums with INC_NUM
    res_str = re.sub('([Cc][Mm]\d+)|([Cc][Hh][Gg]\d+)', 'CM_NUM', res_str)  # replace CM# and CHG# with CM_NUM
    
    # remove ( . - : ) at end of words (not between)
    res_str = re.sub('(\.\s+)', ' ', res_str)
    res_str = re.sub('(\-\s+)', ' ', res_str)
    res_str = re.sub('(\:\s+)', ' ', res_str)
    
    res_str = re.sub('(\s+.\s)', ' ', res_str)  # remove any single characters hanging between 2 spaces
    
    # replace any url such as https://abc.xyz.net/browse/sdf-5327 ==> abc.xyz.net
    try:
        url = re.search(r'((https*:\/*)([^\/\s]+))(.[^\s]+)', res_str)
        repl_url = url.group(3)
        res_str = re.sub(r'((https*:\/*)([^\/\s]+))(.[^\s]+)', repl_url, res_str)
    except:
        pass  # there might be emails without url in them
    
    res_str = re.sub('(\s+)', ' ', res_str)  # remove multiple spaces
    res_str = re.sub('(\s+.\s+)', ' ', res_str)  # remove any single characters hanging between 2 spaces (Should always be last)
    
    return res_str

In [6]:
%%time

brief_cleaning1 = pre_df['text'].apply(text_strip)
brief_cleaning2 = pre_df['summary'].apply(text_strip)

show_table(pd.concat([pre_df['text'], brief_cleaning1], axis=1))

>>> shape : (102915, 2)
>>> number of NA : 118


Unnamed: 0,text,text.1
0,"Saurav Kant, an alumnus of upGrad and IIIT-B's...",saurav kant an alumnus of upgrad and iiit-b pg...
1,Kunal Shah's credit card bill payment platform...,kunal shah credit card bill payment platform c...
102913,"Pragya Swastik 07 Dec 2016,Wednesday http://in...",pragya swastik 07 dec 2016 wednesday indiatoda...
102914,"Chhavi Tyagi 03 Aug 2017,Thursday http://india...",chhavi tyagi 03 aug 2017 thursday indiatoday.i...


CPU times: user 30.9 s, sys: 57.9 ms, total: 31 s
Wall time: 31 s
