### Importing Libraries

In [138]:
import networkx
import rouge
import nltk
import numpy as np
import pandas as pd
from sklearn import preprocessing
import os
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
os.chdir('../Preprocessing')
from normalization import normalize_corpus, parse_content

### Reading the Data

In [139]:
df = pd.read_csv('../Data_collection/dataset.csv')
df.head()

Unnamed: 0,CATEGORY,CONTENT,SUMMARY
0,business,The Federal Reserve approved Ally Financial In...,The Federal Reserve approved Ally Financial In...
1,business,— Major shareholders of Duke Energy Corp. have...,— Major shareholders of Duke Energy Corp. have...
2,business,Photos taken earlier this month show that Nort...,Photos taken earlier this month show that Nort...
3,business,Thanks to dogged reporting by the Associated P...,Thanks to dogged reporting by the Associated P...
4,business,The energy giant says it is committed to clean...,The energy giant says it is committed to clean...


In [55]:
df['FILTERED_CONTENT'] = df['CONTENT'].apply(parse_content)
df['length'] =df['FILTERED_CONTENT'].apply(len)
df = df[df['length']>15]
df.drop('length',axis =1 ,inplace =True)

In [56]:
df_copy = df

In [96]:
#Taking a subset of the data:
df=df_copy.iloc[1:10]

df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 9 entries, 3 to 21
Data columns (total 4 columns):
CATEGORY            9 non-null object
CONTENT             9 non-null object
SUMMARY             9 non-null object
FILTERED_CONTENT    9 non-null object
dtypes: object(4)
memory usage: 360.0+ bytes


In [97]:
df['FILTERED_CONTENT'] = df['FILTERED_CONTENT'].apply(normalize_corpus)
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,CATEGORY,CONTENT,SUMMARY,FILTERED_CONTENT
3,business,Thanks to dogged reporting by the Associated P...,Thanks to dogged reporting by the Associated P...,[thanks dog report associate press know active...
6,business,"RALEIGH, N.C., March 26 (Reuters) - Duke Energ...",In a letter to the state’s utilities commissio...,[raleigh march reuters duke energy corp say we...
9,business,"CHARLOTTE, N.C., March 26, 2014 /PRNewswire/ -...","Throughout the past few decades, we have dedic...",[charlotte march duke energy nyse duk today is...
11,business,By Suttinee Yuvejwattana and Michael Sin\n\nMa...,The Japanese satellite detected about a dozen ...,"[suttinee yuvejwattana michael sin, march bloo..."
12,business,"PERTH, Australia (AP) � Planes and ships searc...","PERTH, Australia (AP) � Planes and ships searc...",[perth australia ap plane ship search debris s...


### Text Rank Algorithm

In [76]:
def textrank_text_summarizer(data,num_sentences=5):
    
    text =data[3]
    sentences = [sent for sent in nltk.sent_tokenize(data[1])]
    num_sentences = 5
    tfidvectorizer = TfidfVectorizer(
                                 ngram_range=(1,5),
                                 smooth_idf=True,
                                 use_idf=True)
    dt_matrix = tfidvectorizer.fit_transform(text)

    similarity_matrix = (dt_matrix * dt_matrix.T)
    similarity_graph = networkx.from_scipy_sparse_matrix(similarity_matrix)
    
    scores = networkx.pagerank(similarity_graph)   
    ranked_sentences = sorted(((score, index) 
                                for index, score 
                                in scores.items()), 
                              reverse=True)
    try:
        top_sentence_indices = [ranked_sentences[index][1] for index in range(num_sentences)]
        top_sentence_indices.sort()
        top_sentences = [sentences[index] for index in top_sentence_indices]
        summary =''.join(top_sentences)
        return summary
    except IndexError:
        pass
    


In [77]:
df['Generated_Summary']= df.apply(textrank_text_summarizer,axis =1)
df.dropna(subset = ['Generated_Summary'],inplace=True)
df.head(10)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,CATEGORY,CONTENT,SUMMARY,FILTERED_CONTENT,Generated_Summary
3,business,Thanks to dogged reporting by the Associated P...,Thanks to dogged reporting by the Associated P...,[thanks dog report associate press know active...,Thanks to dogged reporting by the Associated P...
6,business,"RALEIGH, N.C., March 26 (Reuters) - Duke Energ...",In a letter to the state’s utilities commissio...,[raleigh march reuters duke energy corp say we...,"RALEIGH, N.C., March 26 (Reuters) - Duke Energ..."
9,business,"CHARLOTTE, N.C., March 26, 2014 /PRNewswire/ -...","Throughout the past few decades, we have dedic...",[charlotte march duke energy nyse duk today is...,We are committed to working with the state of ...
12,business,"PERTH, Australia (AP) � Planes and ships searc...","PERTH, Australia (AP) � Planes and ships searc...",[perth australia ap plane ship search debris s...,"PERTH, Australia (AP) � Planes and ships searc..."
13,business,"PERTH, Australia: Thunderstorms and gale-force...","PERTH, Australia: Thunderstorms and gale-force...",[perth australia thunderstorm wind ground inte...,"PERTH, Australia: Thunderstorms and gale-force..."
19,business,He said the information had been given to Mala...,RAAF ground crew stand on the apron after an A...,"[say information give malaysia, raaf ground cr...","Meanwhile, planes searching for the missing Ma..."
20,business,Hints about the lost Malaysian jetliner piled ...,Bad weather cut short the hunt for possible de...,[hint lose malaysian jetliner pile thursday pr...,Hints about the lost Malaysian jetliner piled ...
21,business,Bangkok/Tokyo/Canberra: Over 300 new objects w...,"According to a report from Tokyo, a Japanese s...",[new object spotted satellite thailand japan n...,Bangkok/Tokyo/Canberra: Over 300 new objects w...
25,business,Australian officials coordinating internationa...,Australian officials coordinating internationa...,[australian official coordinate international ...,Australian officials coordinating internationa...
26,business,The Thailand Earth Observation Satellite (Thai...,The Thailand Earth Observation Satellite (Thai...,[thailand earth observation satellite thaichot...,The Thailand Earth Observation Satellite (Thai...


In [11]:
print("System Generated Summary:\n")
print(df.iloc[2]['SUMMARY'])
print("\nSummary\n")
print(df.iloc[2]['Generated_Summary'])

System Generated Summary:

The Japanese satellite detected about a dozen pieces of possible debris in a March 26 image, Kyodo News Service said.
Satellite sightings have provided a new focus in the multination search to find the Malaysian Airline System Bhd.
AMSA initially said that ships were also leaving the search zone, before saying they would try to continue the hunt.
The Airbus scans showed objects as long as 23 meters scattered over a 400-square kilometer area of the ocean, Malaysian officials said yesterday.
“We cannot tell whether the potential objects are from MH370,” Malaysia’s Acting Transport Minister Hishammuddin Hussein said in Kuala Lumpur yesterday.

Summary

The Thai photos from March 24 show objects spanning 2 meters to 15 meters floating about 2,700 kilometers (1,680 miles) southwest of Perth, said Anond Snidvongs, executive director of the Geo-Informatics & Space Technology Development Agency.The Japanese satellite detected about a dozen pieces of possible debris i

#### Results

In [137]:
hypothesis = df['Generated_Summary'].tolist()
reference = df['SUMMARY'].tolist()

def prepare_results(p, r, f):
    return '\t{}:\t{}: {:5.2f}\t{}: {:5.2f}\t{}: {:5.2f}'.format(metric, 'P', 100.0 * p, 'R', 100.0 * r, 'F1', 100.0 * f)


for aggregator in ['Avg','Best']:
    print('Evaluation with {}'.format(aggregator))
    apply_avg = aggregator == 'Avg'
    apply_best = aggregator == 'Best'

    evaluator = rouge.Rouge(metrics=['rouge-n'],
                           max_n=4,
                           limit_length=True,
                           length_limit=100,
                           length_limit_type='words',
                           apply_avg=apply_avg,
                           apply_best=apply_best,
                           alpha=0.5, # Default F1_score
                           weight_factor=1.2,
                           stemming=True)

    scores = evaluator.get_scores(hypothesis, reference)
    for metric, results in sorted(scores.items(), key=lambda x: x[0]):
        print(prepare_results(results['p'], results['r'], results['f']))
    print()
    


KeyError: 'Generated_Summary'