### Importing Libraries

In [92]:
import networkx
import rouge
import nltk
import numpy as np
import pandas as pd
from sklearn import preprocessing
import os
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
os.chdir('../Preprocessing')
from normalization import normalize_corpus, parse_content

### Reading the Data

In [93]:
df = pd.read_csv('../Data_collection/dataset.csv')
df.head()

Unnamed: 0,URL,CATEGORY,CONTENT,SUMMARY
0,http://www.livemint.com/Politics/H2EvwJSK2VE6O...,business,Paris/London/Atlanta: Federal Reserve Bank of ...,Paris/London/Atlanta: Federal Reserve Bank of ...
1,http://www.moneynews.com/Economy/federal-reser...,business,Severe winter weather likely affected U.S. job...,Severe winter weather likely affected U.S. job...
2,http://www.marketwatch.com/story/feds-plosser-...,business,PARISn — The Federal Reserve may have to accel...,“We must back away from increasing the degree ...
3,http://www.fxstreet.com/news/forex-news/articl...,business,FXStreet (Łódź) - Philadelphia Fed President C...,FXStreet (Łódź) - Philadelphia Fed President C...
4,http://www.iii.co.uk/news-opinion/reuters/news...,business,The value of international investments may be ...,The value of international investments may be ...


In [94]:
df['FILTERED_CONTENT'] = df['CONTENT'].apply(parse_content)
df['length'] =df['FILTERED_CONTENT'].apply(len)
df = df.loc[df['length']>15]
df.drop('length',axis =1 ,inplace =True)

In [95]:
#Taking a subset of the data:
df=df.iloc[1:1000]

df.head()


Unnamed: 0,URL,CATEGORY,CONTENT,SUMMARY,FILTERED_CONTENT
5,http://in.reuters.com/article/2014/03/10/us-ec...,business,BANGALORE (Reuters) - The European Central Ban...,The euro sign landmark is seen at the headquar...,[BANGALORE (Reuters) - The European Central Ba...
6,http://blogs.reuters.com/hugo-dixon/2014/03/10...,business,The European Union’s half-baked banking union ...,The European Union’s half-baked banking union ...,[The European Union’s half-baked banking union...
7,http://in.reuters.com/article/2014/03/10/eu-ba...,business,* Countries grapple for deal to prevent embarr...,Policymakers agreed last year that the Europea...,[* Countries grapple for deal to prevent embar...
8,http://in.reuters.com/article/2014/03/10/ecb-p...,business,"FRANKFURT, March 10 (Reuters) - The European C...","FRANKFURT, March 10 (Reuters) - The European C...","[FRANKFURT, March 10 (Reuters) - The European ..."
11,http://www.fxstreet.com/analysis/strategic-cur...,business,Outlook\n\nAttention is focused on China’s tra...,"As everyone notes repeatedly, you can’t trust ...","[Outlook, Attention is focused on China’s trad..."


In [96]:
df['FILTERED_CONTENT'] = df['FILTERED_CONTENT'].apply(normalize_corpus)
df.head()


Unnamed: 0,URL,CATEGORY,CONTENT,SUMMARY,FILTERED_CONTENT
5,http://in.reuters.com/article/2014/03/10/us-ec...,business,BANGALORE (Reuters) - The European Central Ban...,The euro sign landmark is seen at the headquar...,[bangalore reuters the european central bank s...
6,http://blogs.reuters.com/hugo-dixon/2014/03/10...,business,The European Union’s half-baked banking union ...,The European Union’s half-baked banking union ...,[the european union banking union could make w...
7,http://in.reuters.com/article/2014/03/10/eu-ba...,business,* Countries grapple for deal to prevent embarr...,Policymakers agreed last year that the Europea...,[countries grapple deal prevent embarrassing d...
8,http://in.reuters.com/article/2014/03/10/ecb-p...,business,"FRANKFURT, March 10 (Reuters) - The European C...","FRANKFURT, March 10 (Reuters) - The European C...",[frankfurt march reuters the european central ...
11,http://www.fxstreet.com/analysis/strategic-cur...,business,Outlook\n\nAttention is focused on China’s tra...,"As everyone notes repeatedly, you can’t trust ...","[outlook, attention focus china trade deficit ..."


### Text Rank Algorithm

In [97]:
def textrank_text_summarizer(data):
    
    text =data[4]
    sentences = [sent for sent in nltk.sent_tokenize(data[2])]
    summary = [sent for sent in nltk.sent_tokenize(data[3])]
    num_sentences = len(summary)
    bow_matrix = CountVectorizer().fit_transform(text)

    dt_matrix = TfidfTransformer().fit_transform(bow_matrix)

    similarity_matrix = (dt_matrix * dt_matrix.T)
    similarity_graph = networkx.from_scipy_sparse_matrix(similarity_matrix)
    
    scores = networkx.pagerank(similarity_graph)   
    ranked_sentences = sorted(((score, index) 
                                for index, score 
                                in scores.items()), 
                              reverse=True)
    try:
        top_sentence_indices = [ranked_sentences[index][1] for index in range(num_sentences)]
        top_sentence_indices.sort()
        top_sentences = [sentences[index] for index in top_sentence_indices]
        summary =''.join(top_sentences)
        return summary
    except IndexError:
        pass
    


In [98]:
df['Generated_Summary']= df.apply(textrank_text_summarizer,axis =1)
df.dropna(subset = ['Generated_Summary'],inplace=True)
df.head(10)


Unnamed: 0,URL,CATEGORY,CONTENT,SUMMARY,FILTERED_CONTENT,Generated_Summary
5,http://in.reuters.com/article/2014/03/10/us-ec...,business,BANGALORE (Reuters) - The European Central Ban...,The euro sign landmark is seen at the headquar...,[bangalore reuters the european central bank s...,BANGALORE (Reuters) - The European Central Ban...
6,http://blogs.reuters.com/hugo-dixon/2014/03/10...,business,The European Union’s half-baked banking union ...,The European Union’s half-baked banking union ...,[the european union banking union could make w...,The European Union’s half-baked banking union ...
7,http://in.reuters.com/article/2014/03/10/eu-ba...,business,* Countries grapple for deal to prevent embarr...,Policymakers agreed last year that the Europea...,[countries grapple deal prevent embarrassing d...,Failure to do so would delay the law by at lea...
8,http://in.reuters.com/article/2014/03/10/ecb-p...,business,"FRANKFURT, March 10 (Reuters) - The European C...","FRANKFURT, March 10 (Reuters) - The European C...",[frankfurt march reuters the european central ...,"FRANKFURT, March 10 (Reuters) - The European C..."
11,http://www.fxstreet.com/analysis/strategic-cur...,business,Outlook\n\nAttention is focused on China’s tra...,"As everyone notes repeatedly, you can’t trust ...","[outlook, attention focus china trade deficit ...","The PBOC target is 3.5%, by the way.Suppliers ..."
12,http://www.businessinsider.com/opening-bell-mo...,business,Good morning. Here's what you need to know.\n\...,Here's what you need to know.\nImports rose 10...,"[good morning, here need know, chinese export ...",Chinese exports fell 18.1% from a year earlier...
15,http://in.reuters.com/article/2014/03/10/euroz...,business,* Impairments and ‘hard to value’ assets key a...,As well as the initial review on whether banks...,"[impairments hard value asset key area, loan p...",As well as the initial review on whether banks...
16,http://www.rte.ie/news/business/2014/0310/6012...,business,The European Central Bank's stance on how bad ...,This is according to three sources with knowle...,[the european central bank stance bad loan def...,The European Central Bank's stance on how bad ...
18,http://in.reuters.com/article/2014/03/10/ecb-n...,business,* Noyer sees “permanent and deep forces” weigh...,"Noyer, the governor of the Bank of France, war...",[noyer see permanent deep force weigh inflatio...,* Noyer sees “permanent and deep forces” weigh...
20,http://www.marketwatch.com/story/ecbs-noyer-lo...,business,PARIS--The slow pace of price increases and no...,"""The euro area needs such adjustments in order...",[paris the slow pace price increase nominal wa...,PARIS--The slow pace of price increases and no...


#### Results

In [100]:
hypothesis = df['Generated_Summary'].tolist()
reference = df['SUMMARY'].tolist()

def prepare_results(p, r, f):
    return '\t{}:\t{}: {:5.2f}\t{}: {:5.2f}\t{}: {:5.2f}'.format(metric, 'P', 100.0 * p, 'R', 100.0 * r, 'F1', 100.0 * f)


for aggregator in ['Avg','Best']:
    print('Evaluation with {}'.format(aggregator))
    apply_avg = aggregator == 'Avg'
    apply_best = aggregator == 'Best'

    evaluator = rouge.Rouge(metrics=['rouge-n'],
                           max_n=4,
                           limit_length=True,
                           length_limit=100,
                           length_limit_type='words',
                           apply_avg=apply_avg,
                           apply_best=apply_best,
                           alpha=0.5, # Default F1_score
                           weight_factor=1.2,
                           stemming=True)

    scores = evaluator.get_scores(hypothesis, reference)
    for metric, results in sorted(scores.items(), key=lambda x: x[0]):
        print(prepare_results(results['p'], results['r'], results['f']))
    print()
    


Evaluation with Avg
	rouge-1:	P: 52.49	R: 51.07	F1: 51.13
	rouge-2:	P: 35.90	R: 35.15	F1: 35.17
	rouge-3:	P: 32.63	R: 31.95	F1: 31.97
	rouge-4:	P: 31.11	R: 30.48	F1: 30.49

Evaluation with Best
	rouge-1:	P: 52.49	R: 51.07	F1: 51.13
	rouge-2:	P: 35.90	R: 35.15	F1: 35.17
	rouge-3:	P: 32.63	R: 31.95	F1: 31.97
	rouge-4:	P: 31.11	R: 30.48	F1: 30.49

