### Importing Libraries

In [1]:
import networkx
import rouge
import nltk
import numpy as np
import pandas as pd
from sklearn import preprocessing
import os
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
os.chdir('../Preprocessing')
from normalization import normalize_corpus, parse_content

### Reading the Data

In [18]:
df = pd.read_csv('../Data_collection/dataset.csv')
df.head()

Unnamed: 0,CATEGORY,CONTENT,SUMMARY
0,business,The Federal Reserve approved Ally Financial In...,The Federal Reserve approved Ally Financial In...
1,business,— Major shareholders of Duke Energy Corp. have...,— Major shareholders of Duke Energy Corp. have...
2,business,Photos taken earlier this month show that Nort...,Photos taken earlier this month show that Nort...
3,business,Thanks to dogged reporting by the Associated P...,Thanks to dogged reporting by the Associated P...
4,business,The energy giant says it is committed to clean...,The energy giant says it is committed to clean...


In [19]:
df['FILTERED_CONTENT'] = df['CONTENT'].apply(parse_content)
df['length'] =df['FILTERED_CONTENT'].apply(len)
df = df[df['length']>15]
df.drop('length',axis =1 ,inplace =True)

In [20]:
#Taking a subset of the data:
df=df.iloc[1:1000]

df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 999 entries, 3 to 1992
Data columns (total 4 columns):
CATEGORY            999 non-null object
CONTENT             999 non-null object
SUMMARY             999 non-null object
FILTERED_CONTENT    999 non-null object
dtypes: object(4)
memory usage: 39.0+ KB


In [23]:
df['FILTERED_CONTENT'] = df['FILTERED_CONTENT'].apply(normalize_corpus)
df.head()


Unnamed: 0,CATEGORY,CONTENT,SUMMARY,FILTERED_CONTENT
3,business,Thanks to dogged reporting by the Associated P...,Thanks to dogged reporting by the Associated P...,[thanks dog report associated press know activ...
6,business,"RALEIGH, N.C., March 26 (Reuters) - Duke Energ...",In a letter to the state’s utilities commissio...,[raleigh march reuters duke energy corp say we...
9,business,"CHARLOTTE, N.C., March 26, 2014 /PRNewswire/ -...","Throughout the past few decades, we have dedic...",[charlotte march duke energy nyse duk today is...
11,business,By Suttinee Yuvejwattana and Michael Sin\n\nMa...,The Japanese satellite detected about a dozen ...,"[by suttinee yuvejwattana michael sin, march b..."
12,business,"PERTH, Australia (AP) � Planes and ships searc...","PERTH, Australia (AP) � Planes and ships searc...",[perth australia ap plane ship search debris s...


### Text Rank Algorithm

In [27]:
def textrank_text_summarizer(data):
    
    text =data[3]
    sentences = [sent for sent in nltk.sent_tokenize(data[1])]
    summary = [sent for sent in nltk.sent_tokenize(data[2])]
    num_sentences = len(summary)
    print(len(text),"----",num_sentences)
    bow_matrix = CountVectorizer().fit_transform(text)

    dt_matrix = TfidfTransformer().fit_transform(bow_matrix)

    similarity_matrix = (dt_matrix * dt_matrix.T)
    similarity_graph = networkx.from_scipy_sparse_matrix(similarity_matrix)
    
    scores = networkx.pagerank(similarity_graph)   
    ranked_sentences = sorted(((score, index) 
                                for index, score 
                                in scores.items()), 
                              reverse=True)
    try:
        top_sentence_indices = [ranked_sentences[index][1] for index in range(num_sentences)]
        top_sentence_indices.sort()
        top_sentences = [sentences[index] for index in top_sentence_indices]
        summary =''.join(top_sentences)
        return summary
    except IndexError:
        pass
    


In [None]:
df['Generated_Summary']= df.apply(textrank_text_summarizer,axis =1)
df.dropna(subset = ['Generated_Summary'],inplace=True)
df.head(10)


18 ---- 5
17 ---- 5
39 ---- 5
19 ---- 5
19 ---- 5
23 ---- 5
31 ---- 5
27 ---- 5
24 ---- 5
23 ---- 5
29 ---- 5
19 ---- 5
43 ---- 5
45 ---- 5
23 ---- 5
18 ---- 5
21 ---- 5
17 ---- 5
36 ---- 5
27 ---- 5
16 ---- 5
23 ---- 5
32 ---- 5
22 ---- 5
42 ---- 5
21 ---- 5
16 ---- 5
42 ---- 5
48 ---- 5
33 ---- 5
45 ---- 5
50 ---- 5
37 ---- 5
18 ---- 5
24 ---- 5
30 ---- 5
25 ---- 5
44 ---- 5
30 ---- 5
18 ---- 5
19 ---- 5
23 ---- 5
34 ---- 5
35 ---- 5
37 ---- 5
41 ---- 5
27 ---- 5
47 ---- 5
20 ---- 5
43 ---- 5
50 ---- 5
44 ---- 5
22 ---- 5
29 ---- 5
16 ---- 5
19 ---- 5
16 ---- 5
52 ---- 5
18 ---- 5
23 ---- 5
16 ---- 5
38 ---- 5
16 ---- 5
24 ---- 5
32 ---- 5
26 ---- 5
33 ---- 5
27 ---- 5
18 ---- 5
17 ---- 5
54 ---- 5
48 ---- 5
21 ---- 5
17 ---- 5
58 ---- 5
26 ---- 5
42 ---- 5
27 ---- 5
36 ---- 5
28 ---- 5
35 ---- 5
19 ---- 5
19 ---- 5
41 ---- 5
16 ---- 5
41 ---- 5
17 ---- 5
36 ---- 5
26 ---- 5
42 ---- 5
17 ---- 5
26 ---- 5
26 ---- 5
25 ---- 5
18 ---- 5
19 ---- 5
46 ---- 5
20 ---- 5
24 ---- 5
23 ---- 5


#### Results

In [100]:
hypothesis = df['Generated_Summary'].tolist()
reference = df['SUMMARY'].tolist()

def prepare_results(p, r, f):
    return '\t{}:\t{}: {:5.2f}\t{}: {:5.2f}\t{}: {:5.2f}'.format(metric, 'P', 100.0 * p, 'R', 100.0 * r, 'F1', 100.0 * f)


for aggregator in ['Avg','Best']:
    print('Evaluation with {}'.format(aggregator))
    apply_avg = aggregator == 'Avg'
    apply_best = aggregator == 'Best'

    evaluator = rouge.Rouge(metrics=['rouge-n'],
                           max_n=4,
                           limit_length=True,
                           length_limit=100,
                           length_limit_type='words',
                           apply_avg=apply_avg,
                           apply_best=apply_best,
                           alpha=0.5, # Default F1_score
                           weight_factor=1.2,
                           stemming=True)

    scores = evaluator.get_scores(hypothesis, reference)
    for metric, results in sorted(scores.items(), key=lambda x: x[0]):
        print(prepare_results(results['p'], results['r'], results['f']))
    print()
    


Evaluation with Avg
	rouge-1:	P: 52.49	R: 51.07	F1: 51.13
	rouge-2:	P: 35.90	R: 35.15	F1: 35.17
	rouge-3:	P: 32.63	R: 31.95	F1: 31.97
	rouge-4:	P: 31.11	R: 30.48	F1: 30.49

Evaluation with Best
	rouge-1:	P: 52.49	R: 51.07	F1: 51.13
	rouge-2:	P: 35.90	R: 35.15	F1: 35.17
	rouge-3:	P: 32.63	R: 31.95	F1: 31.97
	rouge-4:	P: 31.11	R: 30.48	F1: 30.49

