## Created By : Shreya Dubey
## Extractive headline predictor

In [20]:
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx
import nltk

In [21]:
def read_article(article):
    article = nltk.sent_tokenize(article)
    sentences = []

    for sentence in article:
        #print(sentence)
        sentences.append(sentence.replace("[^a-zA-Z]", " ").split(" "))
    sentences.pop() 
    
    return sentences

In [22]:
def sentence_similarity(sent1, sent2, stopwords=None):
    if stopwords is None:
        stopwords = []
 
    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]
 
    all_words = list(set(sent1 + sent2))
 
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
 
    # build the vector for the first sentence
    for w in sent1:
        if w in stopwords:
            continue
        vector1[all_words.index(w)] += 1
 
    # build the vector for the second sentence
    for w in sent2:
        if w in stopwords:
            continue
        vector2[all_words.index(w)] += 1
 
    return 1 - cosine_distance(vector1, vector2)

In [23]:
def build_similarity_matrix(sentences, stop_words):
    # Create an empty similarity matrix
    similarity_matrix = np.zeros((len(sentences), len(sentences)))
 
    for idx1 in range(len(sentences)):
        for idx2 in range(len(sentences)):
            if idx1 == idx2: #ignore if both are same sentences
                continue 
            similarity_matrix[idx1][idx2] = sentence_similarity(sentences[idx1], sentences[idx2], stop_words)

    return similarity_matrix

In [24]:
def generate_summary(name, top_n=1):
    stop_words = stopwords.words('english')
    summarize_text = []

    # Step 1 - Read text anc split it
    sentences =  read_article(name)

    # Step 2 - Generate Similary Martix across sentences
    sentence_similarity_martix = build_similarity_matrix(sentences, stop_words)

    # Step 3 - Rank sentences in similarity martix
    sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_martix)
    scores = nx.pagerank(sentence_similarity_graph)

    # Step 4 - Sort the rank and pick top sentences
    ranked_sentence = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)    
    #print("Indexes of top ranked_sentence order are ", ranked_sentence)    

    for i in range(top_n):
        if(len(ranked_sentence)!=0):
            summarize_text.append(" ".join(ranked_sentence[i][1]))

    # Step 5 - Offcourse, output the summarize text
    summary = " ".join(summarize_text)
    return summary
    


In [36]:
# Loading data
import csv

def load_data(filename):
    with open(filename) as csvfile:
        readCSV = csv.reader(csvfile, delimiter=',')
        headlines = []
        summaries = []
        articles = []
        for row in readCSV:
            headline = row[0]
            summary = row[1]
            article = row[2]

            headlines.append(headline)
            summaries.append(summary)
            articles.append(article)

    headlines.remove(headlines[0])
    summaries.remove(summaries[0])
    articles.remove(articles[0])
    
    return headlines,articles,summaries

# loading data from news_summary_2.csv
headlines, articles, summaries = load_data('news_summary_2.csv')

In [37]:
print(len(headlines))
print(len(articles))

4516
4516


In [38]:
clean_article = []
clean_head = []

for i in range(len(articles)):
    if(articles[i]!=''and headlines[i]!=''):
        clean_article.append(articles[i])
        clean_head.append(headlines[i])


In [39]:
print(len(clean_article))
print(len(clean_head))

4397
4397


In [41]:
import pandas as pd
df=pd.DataFrame({'text':clean_article,'headline':clean_head})
export_csv = df.to_csv ('cleaned_input.csv', index = None, header=True)

In [42]:
# let's begin
predicted = []
original = []

for i in range(10):
    ans = generate_summary(clean_article[i],1)
    if(ans!=''):
        predicted.append(ans)
        original.append(clean_head[i])
        #print("Summarize Text: \n",ans)
        #print('\n')


In [43]:
#!pip install rouge
from rouge import Rouge

evaluator = Rouge()

metric = []
for i in range(len(predicted)):
    metric.append(evaluator.get_scores(predicted[i],original[i]))

avg_ro_1_r = 0
for i in range(len(metric)):
    avg_ro_1_r = avg_ro_1_r + metric[i][0]['rouge-1']['r']

(avg_ro_1_r/len(metric))*100

25.19079685746352

In [64]:
from nltk.translate.bleu_score import sentence_bleu
reference = original[0]
candidate = predicted[0]
score = evaluator.get_scores(candidate,reference)
print(score[0]['rouge-1']['r']*100)
print(score[0]['rouge-1']['p']*100)
#print(reference)
#print(candidate)

44.44444444444444
8.16326530612245
