<h2>Import Dependencies</h2>

In [9]:
import urllib
from bs4 import BeautifulSoup as bs
import nltk
import pandas  as pd
import numpy as np
from nltk.stem.snowball import SnowballStemmer
from nltk import ngrams
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd


<h2> Parse and clean data </h2>

In [10]:
def clean_data(html,baseurl):
    soup = bs(html, 'html.parser')
    if baseurl == 'i':
        entries = soup.find_all('div',{'class':'description'})
    elif baseurl == 't':
        entries = soup.find_all('div',{'itemprop':'articleBody'})
    elif baseurl == 'h':
        entries = soup.find_all('div',{'itemprop':'articlebody'})
    for each in entries:
            if each.figure:
                each.figure.decompose()
    content = []
    for e in entries:
         content.extend(e.find_all("p"))
    
    text = ""
    for each in content:
        text = text + each.get_text() +" "

    text = text.encode('utf-8').decode("unicode_escape").encode('ascii','ignore')
    text = nltk.sent_tokenize(text)    
    return text

<h2> Extraction </h2>

In [11]:
def word_scores(text):
    # tokenize the article
    tokenizer = RegexpTokenizer(r'\w+')
    word_list=[]
    for t in text:
        word_list.extend(tokenizer.tokenize(t))


    # remove stop words from the article
    filtered_words = [word for word in word_list if word.lower() not in stopwords.words('english')]
    total_words = len(filtered_words)
    words = filtered_words

    # find n-gram probability for filtered words
    filtered_words  = []
    for each in Counter(words).items():
        filtered_words.append([each[0] , float(each[1])/float(total_words)])

    # identify important words
    important_words = []
    for item in filtered_words:
        if item[1]>0.003:
            important_words.append(item[0])

    while '. ' in text:
        text = text.replace('. ','\n') 

    while "\n" in text:
        text = text.split("\n")
        
    vect = TfidfVectorizer(stop_words='english')
    dtm = vect.fit_transform(text)
    features = vect.get_feature_names()
    
    scores = zip(vect.get_feature_names(),
                 np.asarray(dtm.sum(axis=0)).ravel())
    sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)
    sorted_scores = dict(sorted_scores)
    return sorted_scores

<h2> Sentence Score </h2>

In [12]:
def sentence_score(text,sorted_scores):
    sentence_score = []
    for each in text:
        score = 0.0
    #     each = each.
        for word in each.split(" "):
            #print word
            word = word.lower()
            if word in sorted_scores:
    #             print "Hello"
                score += sorted_scores[word]
    #             print word_scores[word]
        sentence_score.append([each,score])    
    return sentence_score

<h2> Calculate Rouge </h2>

In [13]:
def rougeN(gold_summary,machine_summary,n):
    stemmer = SnowballStemmer("english")

    gold_summary_slammed = ""
    for word in gold_summary.split():
        gold_summary_slammed = gold_summary_slammed+stemmer.stem(word)+" "


    machine_summary_slammed = ""
    for word in machine_summary.split():
        machine_summary_slammed = machine_summary_slammed+stemmer.stem(word)+" "

    n_gold = ngrams(gold_summary_slammed.split(" "),n)
    n_machine = ngrams(machine_summary_slammed.split(" "),n)

    gold_list = []

    for gram in n_gold:
        gold_list.append(gram)

    machine_list = []
    for gram in n_machine:
        machine_list.append(gram)

    return float(len(list(set(gold_list).intersection(machine_list))))/float(len(list(set(gold_list))))

<h2> Find Summary </h2>

In [14]:
# def summary(SUMMARY_COMP_FACT,sentences):
#     scores = []
#     top_sentences = sorted(dict(sentences).items(), key=lambda x: x[1], reverse=True)[:(len(sentences)/SUMMARY_COMP_FACT)]
#     machine_summary = ""
#     for each in top_sentences:
#         machine_summary = machine_summary + each[0]
#     scores.append([SUMMARY_COMP_FACT, len(top_sentences), rougeN(gold_summary,machine_summary,2), rougeN(gold_summary,machine_summary,1)])

In [15]:
def summarize(sentences):
    paired_sens = {}
    num_sen = len(sentences)/6

    for pair in enumerate(sentences):
         paired_sens[pair[0]]=pair[1][1]

    temp = sorted((paired_sens).items(), key=lambda x: x[1], reverse=True)[:int(num_sen)]
    temp = sorted(dict(temp).items(), key=lambda x: x[0], reverse=False)[:int(num_sen)]
    machine_summary = ""
    for i in temp:
#         print sentences[i[0]][0],"\n"
        machine_summary = machine_summary +  sentences[i[0]][0] + "\n"
    return machine_summary

<h2> Main </h2>

In [38]:
df = pd.read_csv('news_summary.csv')
result = []
for i in range(0,1):
    print i
    try:
        url = df['read_more'][i]
        gold_summary = df['text'][i]
        if "indiatoday" in url or "intoday" in url:
            baseurl = "i"
        elif "hindustantimes" in url:
            baseurl = "h"
        elif "theguardian" in url:
            baseurl = "t"

        file = urllib.urlopen(url)
        html = file.read()

        text = clean_data(html,baseurl)
        print text
        ws = word_scores(text)

        ss = sentence_score(text,ws)

        machine_summary = summarize(ss)
        print machine_summary
        result.append([i, rougeN(gold_summary,machine_summary,2),rougeN(gold_summary,machine_summary,1)])
    except UnicodeDecodeError:
        print "UnicodeDecodeError"
        

0
['The Daman and Diu administration on Wednesday withdrew a circular that asked women staff to tie rakhis on male colleagues after the order triggered a backlash from employees and was ripped apart on social media.', 'The union territorys administration was forced to retreat within 24 hours of issuing the circular that made it compulsory for its staff to celebrate Rakshabandhan at workplace.', 'It has been decided to celebrate the festival of Rakshabandhan on August 7.', 'In this connection, all offices/ departments shall remain open and celebrate the festival collectively at a suitable time wherein all the lady staff shall tie rakhis to their colleagues, the order, issued on August 1 by Gurpreet Singh, deputy secretary (personnel), had said.', 'To ensure that no one skipped office, an attendance report was to be sent to the government the next evening.', 'The circular was withdrawn through a one-line order issued late in the evening by the UTs department of personnel and administrati

In [36]:
r = result
np.mean(np.array(r), axis=0)[1]

0.22033898305084745

In [51]:
print gold_summary
machine_summary = 'The Daman and Diu administration on Wednesday withdrew a circular that asked women staff to tie rakhis on male colleagues after the order triggered a backlash from employees and was ripped apart on social media. The union territorys administration was forced to retreat within 24 hours of issuing the circular that made it compulsory for its staff to celebrate Rakshabandhan at workplace. It has been decided to celebrate the festival of Rakshabandhan on August 7. '
machine_summary1='The Daman and Diu administration on Wednesday withdrew a circular that asked women staff to tie rakhis on male colleagues after the order triggered a backlash from employees and was ripped apart on social media. Rakshabandhan, a celebration of the bond between brothers and sisters, is one of several Hindu festivities and rituals that are no longer confined of private, family affairs but have become tools to push politic al ideologies. In 2014, the year BJP stormed to power at the Centre, Rashtriya Swayamsevak Sangh (RSS) chief Mohan Bhagwat said the festival had national significance and should be celebrated widely to protect Hindu culture and live by the values enshrined in it. '
rougeN(gold_summary,machine_summary,2)

The Administration of Union Territory Daman and Diu has revoked its order that made it compulsory for women to tie rakhis to their male colleagues on the occasion of Rakshabandhan on August 7. The administration was forced to withdraw the decision within 24 hours of issuing the circular after it received flak from employees and was slammed on social media.


0.4745762711864407