In [1]:
import nltk

In [12]:
import math
from collections import defaultdict
from heapq import nlargest
from itertools import product, count
from string import punctuation

from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

In [5]:
# Build the stop list through the fully qualified corpus path. This assignment
# rebinds the name `stopwords` from the NLTK corpus reader to a plain set, so
# calling `stopwords.words(...)` here would raise AttributeError the second
# time the cell runs (a set has no `.words`).
stopwords = set(nltk.corpus.stopwords.words('english') + list(punctuation))

# Normalised-frequency cut-offs read by compute_frequencies: words scoring at
# or above max_cut, or at or below min_cut, are discarded.
max_cut = 0.9
min_cut = 0.1

In [6]:
def compute_frequencies(word_sent, stop_words=None, low=None, high=None):
    """Return normalised frequencies of the non-stopword tokens in ``word_sent``.

    Every token's count is divided by the count of the most frequent token.
    Tokens whose normalised frequency is ``>= high`` or ``<= low`` are then
    dropped from the result.

    Args:
        word_sent: iterable of sentences, each an iterable of tokens.
        stop_words: container of tokens to ignore. Defaults to the
            module-level ``stopwords``.
        low: lower cut-off. Defaults to the module-level ``min_cut``.
        high: upper cut-off. Defaults to the module-level ``max_cut``.

    Returns:
        A ``defaultdict(int)`` mapping token -> normalised frequency. It is
        empty when no token survives the stop-word filter.
    """
    # Resolve the defaults at call time so later changes to the module-level
    # configuration are picked up.
    if stop_words is None:
        stop_words = stopwords
    if low is None:
        low = min_cut
    if high is None:
        high = max_cut

    freq = defaultdict(int)
    for sentence in word_sent:
        for word in sentence:
            if word not in stop_words:
                freq[word] += 1

    # max() raises ValueError on an empty sequence, so bail out early when
    # there is nothing to normalise.
    if not freq:
        return freq

    peak = float(max(freq.values()))

    # Iterate over a snapshot of the keys because entries are deleted inside
    # the loop.
    for word in list(freq):
        freq[word] /= peak
        if freq[word] >= high or freq[word] <= low:
            del freq[word]

    return freq

In [7]:
def summarize(text, n):
    """Return the ``n`` top-ranked sentences of ``text`` by word frequency.

    The text is split into sentences, each sentence is lower-cased and
    tokenised, and every sentence is scored by summing the normalised
    frequencies (from ``compute_frequencies``) of the tokens it contains.
    Sentences are returned in ranking order, best first.

    Args:
        text: the document to summarise.
        n: number of sentences to return; must not exceed the number of
            sentences found in ``text``.

    Returns:
        A list of the selected original sentences.

    Raises:
        AssertionError: if ``n`` is larger than the number of sentences.
    """
    sents = sent_tokenize(text)
    assert n <= len(sents)

    tokenized = [word_tokenize(sentence.lower()) for sentence in sents]
    freq = compute_frequencies(tokenized)

    # Only sentences containing at least one retained word get an entry.
    ranking = defaultdict(int)
    for position, tokens in enumerate(tokenized):
        for token in tokens:
            if token in freq:
                ranking[position] += freq[token]

    top_positions = rank(ranking, n)
    return [sents[position] for position in top_positions]

In [8]:
def rank(ranking, n):
    """Return the ``n`` keys of ``ranking`` with the largest values, best first.

    Args:
        ranking: mapping of key -> score.
        n: maximum number of keys to return.

    Returns:
        A list of keys ordered by descending score.
    """
    score_of = ranking.get
    best_keys = nlargest(n, ranking, key=score_of)
    return best_keys

In [9]:
# Summarise the article with the frequency-based summariser.
with open("news.txt", "r") as myfile:
    # Replace line breaks with a space rather than the empty string, so the
    # last word of one line is not fused with the first word of the next.
    text = myfile.read().replace('\n', ' ')
res = summarize(text, 2)
for sentence in res:
    print(sentence)

"Modern life is dramatically different to even 30 years ago," Prof Gray told Radio 4's Today programme, "people now drive to work and sit at work."
"The How Are You Quiz will help anyone who wants to take a few minutes to take stock and find out quickly where they can take a little action to make a big difference to their health."


In [13]:
# Go through nltk.corpus rather than the bare name `stopwords`: an earlier
# cell rebinds that name to a set, and a set has no `.words` method, so
# `stopwords.words(...)` fails here on a top-to-bottom run.
stopwords = set(nltk.corpus.stopwords.words('english') + list(punctuation))

In [15]:
def calculate_similarity(sen1, sen2):
    """Score the word overlap between two tokenised sentences.

    The score is the number of tokens of ``sen1`` (duplicates included) that
    also occur in ``sen2``, divided by ``log(len(sen1)) + log(len(sen2))``.

    Args:
        sen1: sequence of tokens for the first sentence.
        sen2: sequence of tokens for the second sentence.

    Returns:
        The similarity as a float. ``0.0`` is returned when the formula is
        undefined: either sentence is empty (``log(0)``) or both contain a
        single token (the denominator is ``0``).
    """
    # log(0) is a math domain error; an empty sentence shares nothing anyway.
    if not sen1 or not sen2:
        return 0.0

    denominator = math.log(len(sen1)) + math.log(len(sen2))
    # log(1) + log(1) == 0 would otherwise divide by zero.
    if denominator == 0:
        return 0.0

    shared = sum(1 for word in sen1 if word in sen2)
    return shared / denominator

In [16]:
def create_graph(word_sent):
    """Build the pairwise sentence-similarity matrix.

    Args:
        word_sent: list of tokenised sentences.

    Returns:
        A square list-of-lists where entry ``[row][col]`` holds
        ``calculate_similarity(word_sent[row], word_sent[col])`` and every
        diagonal entry is ``0.0``.
    """
    size = len(word_sent)
    return [
        [
            calculate_similarity(word_sent[row], word_sent[col]) if row != col else 0.0
            for col in range(size)
        ]
        for row in range(size)
    ]

In [17]:
def weighted_pagerank(weight_graph):
    """Iterate node scores over a weighted graph until they stop changing.

    Every node starts at 0.5. Each sweep recomputes the scores one node at a
    time with ``calculate_score``, writing into the same list, so later nodes
    in a sweep already see the updated values of earlier ones. Iteration ends
    when ``different`` reports no change between consecutive sweeps.

    Args:
        weight_graph: square list-of-lists of edge weights.

    Returns:
        A list with one score per node.
    """
    node_count = len(weight_graph)
    scores = [0.5] * node_count
    previous = [0.0] * node_count

    while different(scores, previous):
        # Snapshot the current scores before this sweep overwrites them.
        previous[:] = scores
        for node in range(node_count):
            scores[node] = calculate_score(weight_graph, scores, node)

    return scores

def different(scores, old_scores):
    """Tell whether any score moved by at least 0.0001 since the last sweep.

    Args:
        scores: current scores.
        old_scores: scores from the previous sweep, indexed like ``scores``.

    Returns:
        ``True`` if at least one position differs by 0.0001 or more in
        absolute value, ``False`` otherwise (including for empty input).
    """
    return any(
        math.fabs(scores[idx] - old_scores[idx]) >= 0.0001
        for idx in range(len(scores))
    )

def calculate_score(weight_graph, scores, i, d=0.85):
    """Compute the updated score of node ``i`` for one PageRank step.

    Each node ``j`` contributes ``weight_graph[j][i] * scores[j]`` divided by
    the total weight of row ``j``. The summed contributions are blended with
    the damping factor as ``(1 - d) + d * total``.

    Args:
        weight_graph: square list-of-lists of edge weights.
        scores: current score of every node.
        i: index of the node being updated.
        d: damping factor; defaults to 0.85.

    Returns:
        The new score of node ``i`` as a float.
    """
    added_score = 0.0

    for j, row in enumerate(weight_graph):
        out_weight = sum(row)
        # A node whose row sums to zero has no edges to distribute its score
        # over; skipping it avoids a ZeroDivisionError.
        if out_weight == 0:
            continue
        added_score += row[i] * scores[j] / out_weight

    return (1 - d) + d * added_score

In [19]:
def Summarize(text,n):
    """Return up to ``n`` top-scoring sentences of ``text`` via a similarity graph.

    Sentences are lower-cased, tokenised and stripped of stop words. A
    pairwise similarity matrix is built with ``create_graph`` and scored with
    ``weighted_pagerank``; the highest-scoring sentences are returned, best
    first.

    Args:
        text: the document to summarise.
        n: maximum number of sentences to return.

    Returns:
        A list of the selected original sentences. It is shorter than ``n``
        when the text has fewer than ``n`` sentences.
    """
    sents = sent_tokenize(text)

    # Build filtered token lists instead of calling list.remove() while
    # iterating: removing during iteration skips the element that follows
    # each removal, so adjacent stop words would survive.
    word_sent = [
        [word for word in word_tokenize(s.lower()) if word not in stopwords]
        for s in sents
    ]

    similarity_graph = create_graph(word_sent)
    scores = weighted_pagerank(similarity_graph)

    # Pair each score with its sentence index so the index travels with it.
    sent_selected = nlargest(n, zip(scores, count()))

    # Iterate over what nlargest actually returned rather than range(n),
    # which raises IndexError when n exceeds the number of sentences.
    return [sents[index] for _, index in sent_selected]

In [20]:
# Summarise the article with the graph-based summariser.
with open("news.txt", "r") as myfile:
    # Replace line breaks with a space rather than the empty string, so the
    # last word of one line is not fused with the first word of the next.
    text = myfile.read().replace('\n', ' ')

print(Summarize(text, 2))

['The PHE website and app has a quiz that gives users a health score based on their lifestyle habits by asking questions such as, "Which snacks do you eat in a normal day?"', 'The campaign\'s clinical adviser, Prof Muir Gray, said it was about trying to make people have a different attitude to an "environmental problem".']
