<h1>Text Summarization using TextRank Algorithm</h1>

In [1]:
#Loading required packages
import pandas as pd
import numpy as np
import nltk
import re
import string

from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx

In [2]:
data = pd.read_pickle("contents.pkl")
data.head()

Unnamed: 0_level_0,title,original_content
number,Unnamed: 1_level_1,Unnamed: 2_level_1
Article 1,Pier 1 appoints interim CFO amid growing finan...,\n\nPier 1 on Wednesday reported that fourth q...
Article 2,Family Dollar to close nearly 400 stores,\nDollar Tree on Wednesday announced that up t...
Article 3,Having to share personal data turns consumers ...,"\nAccording to a new Harris Poll survey, 71% o..."
Article 4,Walgreens taps Narvar for online pickup return...,\n\np.p1 {margin: 0.0px 0.0px 35.0px 0.0px; li...
Article 5,TechStyle claims more than 5M active members,"\n\nTechStyle Fashion Group, which operates Sh..."


In [3]:
#Tokenize the sentences using NLTK
from nltk.tokenize import sent_tokenize
token = []
for sent in data['original_content']:
    token.append(sent_tokenize(sent))
        
data['sentence_tokens'] = token

In [4]:
'''
Data Cleaning - Round 1
#Remove tags, whitespaces and special characters
'''
def clean_text1(doc):
    token1 = []
    token2 = []
    
    for i in doc:
        token1 = re.sub(r'\{[^)]*\}', '', i) #Removes tags
        token1 = re.sub(r'\s+', ' ', token1) #Removes whitespaces
        token1 = re.sub(r'^.p', '', token1) #Removes extra first character 'p' in specific rows
        token1 = re.sub(r'^.p1', '', token1) #Removes extra first character 'p' in specific rows
        token1 = token1.lstrip()
        token2.append(token1)
    return token2

data['sentence_tokens'] = data['sentence_tokens'].apply(lambda x: clean_text1(x))

In [5]:
data['sentence_tokens'][23]

['Ann Taylor, LOFT, Lou & Grey, Ann Taylor Factory and LOFT Outlet — which together make up Ascena’s premium apparel group — Tuesday launched ALL Rewards, a loyalty program without a membership fee that for the first time enables clients to earn and redeem perks across all five brands.',
 'The program includes customers without the retailers\' credit cards, although card members earn five points for every dollar spent, while non-credit card members earn two points per dollar spent "on qualifying purchases across all five brands," according to a company press release.',
 'All loyalty members get $5 for every 500 points they earn, which they can use at any Ann Taylor, LOFT, Lou & Grey, Ann Taylor Factory or LOFT Outlet store or website in the U.S. and Puerto Rico.',
 'Other ALL Rewards include events and promotions, choose-your-own bonus points days, a birthday gift and other exclusive offers.',
 'Credit card holders also get additional benefits like "free shipping with a $75 online qual

In [6]:
'''
Data Cleaning - Round 2
#Remove whitespaces, punctuations and digits
'''
def clean_text2(doc):
    token1 = []
    token2 = []
    
    for i in doc:
        token1 = re.sub(r'\w*\d\w*', '', i) #Remove numbers
        token1 = re.sub(r'[%s]'% re.escape(string.punctuation), '', token1)#Removes punctuations
        token1 = re.sub(r'\s+', ' ', token1) #Removes whitespaces
        token1 = token1.lower()
        token2.append(token1)
    return token2

data['clean_sentence_tokens'] = data['sentence_tokens'].apply(lambda x: clean_text2(x))
data.head()

Unnamed: 0_level_0,title,original_content,sentence_tokens,clean_sentence_tokens
number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Article 1,Pier 1 appoints interim CFO amid growing finan...,\n\nPier 1 on Wednesday reported that fourth q...,[Pier 1 on Wednesday reported that fourth quar...,[pier on wednesday reported that fourth quarte...
Article 2,Family Dollar to close nearly 400 stores,\nDollar Tree on Wednesday announced that up t...,[Dollar Tree on Wednesday announced that up to...,[dollar tree on wednesday announced that up to...
Article 3,Having to share personal data turns consumers ...,"\nAccording to a new Harris Poll survey, 71% o...","[According to a new Harris Poll survey, 71% of...",[according to a new harris poll survey of amer...
Article 4,Walgreens taps Narvar for online pickup return...,\n\np.p1 {margin: 0.0px 0.0px 35.0px 0.0px; li...,[Customer experience platform Narvar and Walgr...,[customer experience platform narvar and walgr...
Article 5,TechStyle claims more than 5M active members,"\n\nTechStyle Fashion Group, which operates Sh...","[TechStyle Fashion Group, which operates ShoeD...",[techstyle fashion group which operates shoeda...


In [7]:
data['clean_sentence_tokens'][0]

['pier on wednesday reported that fourth quarter net sales fell to million from million in the yearago quarter',
 'comp store sales in the quarter decreased which the company attributes partially to the shift in holiday selling days not included in the fourth quarter',
 'for fiscal year the company reported net sales fell to billion from billion year over year',
 'the company’s longterm debt stands at million up from million in the yearago period\u200b the company also reported a net loss of million which includes a million transformation cost related to professional fees and severance costs from a net income of million in the yearago period according to a company press release',
 'gross profit for the quarter totaled million or of net sales from million or of net sales in the yearago quarter',
 'pier also appointed deborah riegerpaganis as interim cfo effective immediately replacing nancy walsh who also served as evp and principal financial officer according to a separate press releas

In [8]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

# function to remove stopwords
def remove_stopwords(sen):
    sen_new = " ".join([i for i in sen if i not in stop_words])
    return sen_new

# remove stopwords from the sentences
data['clean_sentence_tokens'] = data['clean_sentence_tokens'].apply(lambda x: [remove_stopwords(r.split()) for r in x])

In [9]:
data['clean_sentence_tokens'][1]

['dollar tree wednesday announced family dollar stores close',
 'company closed family dollar stores fourth quarter',
 'retailer also reported consolidated net sales week fourth quarter fell billion billion previous year weeks',
 'consolidated net sales week fiscal increased billion billion week fiscal year',
 'enterprise samestore sales quarter rose samestore sales dollar tree banner rose constant currency basis samestore sales family dollar banner rose according company press release',
 'additionally dollar tree reported billion gross profit compared billion prior years week quarter',
 'gross margin decreased compared year year',
 'company stated decline due higher markdowns including million sku rationalization markdown family dollar along shrink domestic freight costs distribution costs occupancy costs']

### Vector Representation of Sentences

In [10]:
# Extract word vectors
word_embeddings = {}
f = open('GloVe/glove.6B.100d.txt', encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    word_embeddings[word] = coefs
f.close()

In [11]:
def sentence_vector(clean_sentences):
    sentence_vectors = []
    for i in clean_sentences:
        if len(i) != 0:
            v = sum([word_embeddings.get(w, np.zeros((100,))) for w in i.split()])/(len(i.split())+0.001)
        else:
            v = np.zeros((100,))
        sentence_vectors.append(v)
    return sentence_vectors
    
data['sentence_vectors'] = [sentence_vector(x) for x in data['clean_sentence_tokens']]

### Similarity Matrix Preparation

In [12]:
# similarity matrix
def similarity_matrix(sentences, sentence_vectors):
    sim_mat = np.zeros([len(sentences), len(sentences)])
    for i in range(len(sentences)):
        for j in range(len(sentences)):
            if i != j:
                sim_mat[i][j] = cosine_similarity(sentence_vectors[i].reshape(1,100), sentence_vectors[j].reshape(1,100))[0,0]
    return sim_mat        

data['sim_matrix'] = [similarity_matrix(x,y) for x,y in zip(data['sentence_tokens'], data['sentence_vectors'])]

### Applying TextRank Algorithm

In [13]:
def text_summarize(similarity_matrix, sentences):
    nx_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(nx_graph)
    
    summary = []
    ranked_sentences = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse = True)
    for i in range(2):
        s = ranked_sentences[i][1]
        summary.append(s)
    return summary

data['summary'] = [text_summarize(x,y) for x,y in zip(data['sim_matrix'], data['sentence_tokens'])]
data['summary'] = data['summary'].apply(' '.join)

In [14]:
data['summary'][14]

'Net income for the quarter increased 3.1% to $214.7 million, compared to $208.2 million in the prior year. erating income rose 10.5% to $281.2 million, or 13.2% of net sales, compared to $254.4 million in the year-ago quarter.'

In [15]:
data = data[['title', 'original_content', 'sentence_tokens', 'summary']]
data.head()

Unnamed: 0_level_0,title,original_content,sentence_tokens,summary
number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Article 1,Pier 1 appoints interim CFO amid growing finan...,\n\nPier 1 on Wednesday reported that fourth q...,[Pier 1 on Wednesday reported that fourth quar...,"For fiscal year 2019, the company reported net..."
Article 2,Family Dollar to close nearly 400 stores,\nDollar Tree on Wednesday announced that up t...,[Dollar Tree on Wednesday announced that up to...,"Additionally, Dollar Tree reported $1.9 billio..."
Article 3,Having to share personal data turns consumers ...,"\nAccording to a new Harris Poll survey, 71% o...","[According to a new Harris Poll survey, 71% of...",The survey also found 58% of Americans are les...
Article 4,Walgreens taps Narvar for online pickup return...,\n\np.p1 {margin: 0.0px 0.0px 35.0px 0.0px; li...,[Customer experience platform Narvar and Walgr...,Those customers can select a Narvar Concierge ...
Article 5,TechStyle claims more than 5M active members,"\n\nTechStyle Fashion Group, which operates Sh...","[TechStyle Fashion Group, which operates ShoeD...",Program members are provided access to discoun...
