In [1]:
import networkx as nx
import numpy as np

from nltk.tokenize.punkt import PunktSentenceTokenizer
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer


def textrank(document, threshold_offset=0.2):
    """Extract the highest-ranked sentences of ``document`` using TextRank.

    The document is split into sentences, each sentence is turned into a
    TF-IDF vector, and the sentence-by-sentence product of those vectors is
    used as a weighted similarity graph on which PageRank is run. PageRank
    scores are min-max normalised to ``[0, 1]`` and every sentence whose
    normalised score is strictly greater than ``mean + threshold_offset``
    is kept.

    Args:
        document: Text to summarise.
        threshold_offset: Amount added to the mean normalised score to form
            the selection threshold. Defaults to ``0.2``.

    Returns:
        list[str]: The selected sentences in their original document order.
        The list is empty when ``document`` is empty/blank, or when every
        sentence receives the same score (all normalised scores are 0, so
        none exceeds the default threshold) -- e.g. a one-sentence document.
    """
    # Nothing to rank. Returning early also avoids CountVectorizer's
    # "empty vocabulary" ValueError on an empty sentence list.
    if not document or not document.strip():
        return []

    sentences = PunktSentenceTokenizer().tokenize(document)
    if not sentences:
        return []

    # Bag-of-words counts -> TF-IDF weights, one row per sentence.
    bow_matrix = CountVectorizer().fit_transform(sentences)
    normalized = TfidfTransformer().fit_transform(bow_matrix)

    # Sentence-by-sentence similarity (sparse matrix product). `@` is matrix
    # multiplication for both scipy sparse matrices and sparse arrays.
    similarity_graph = normalized @ normalized.T

    # networkx >= 2.7 provides from_scipy_sparse_array; networkx 3.0 removed
    # from_scipy_sparse_matrix. Prefer the new name, fall back to the old one.
    build_graph = getattr(nx, "from_scipy_sparse_array", None) or nx.from_scipy_sparse_matrix
    scores = nx.pagerank(build_graph(similarity_graph))

    # Node i of the graph corresponds to sentences[i].
    ranks = [scores[i] for i in range(len(sentences))]
    fmin, fmax = min(ranks), max(ranks)
    spread = fmax - fmin

    # Min-max normalisation; when all scores are equal every value is 0.
    if spread == 0:
        normalized_ranks = [0] * len(ranks)
    else:
        normalized_ranks = [(rank - fmin) / spread for rank in ranks]

    # Keep only sentences ranked clearly above average.
    threshold = (sum(normalized_ranks) / len(normalized_ranks)) + threshold_offset

    # Selection is by sentence text, so a repeated sentence is returned at
    # every position it occurs if any occurrence is selected.
    selected = {
        sentence
        for sentence, rank in zip(sentences, normalized_ranks)
        if rank > threshold
    }
    return [sentence for sentence in sentences if sentence in selected]

In [2]:
pip install sumy

In [3]:
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.lsa import LsaSummarizer
import pandas as pd

# Number of sentences requested from each sumy summarizer.
N_SUMMARY_SENTENCES = 3


def _join_sentences(sentences):
    """Return the given sentences as one space-separated, stripped string."""
    return ' '.join(str(s) for s in sentences).strip()


summaries1 = []  # TextRank summaries, one string per article
summaries2 = []  # LexRank summaries, one string per article
summaries3 = []  # LSA summaries, one string per article
summarizer = LexRankSummarizer()  # LexRank summarizer object
sumlsa = LsaSummarizer()  # LSA summarizer object

df = pd.read_csv('../input/bbcfull/BBC News Train.csv')
for data in df['Text']:
    # TextRank approach. textrank() returns a list of sentences; join it so
    # Summary1 is plain text like the other two summary columns instead of
    # a Python list whose repr would end up in the CSV.
    summaries1.append(_join_sentences(textrank(data)))

    # Both sumy summarizers work on the same parsed document.
    parser = PlaintextParser.from_string(data, Tokenizer("english"))

    # LexRank approach
    summaries2.append(_join_sentences(summarizer(parser.document, N_SUMMARY_SENTENCES)))

    # LSA approach
    summaries3.append(_join_sentences(sumlsa(parser.document, N_SUMMARY_SENTENCES)))

# Each insert goes to position 2, pushing the previous one right, so the
# resulting column order is Summary3, Summary2, Summary1.
df.insert(2, 'Summary1', summaries1)
df.insert(2, 'Summary2', summaries2)
df.insert(2, 'Summary3', summaries3)






In [12]:
# Write the summarised DataFrame to a CSV file, omitting the index column.
df.to_csv(path_or_buf=r'./output_summary3.csv', index=False)

In [4]:
pip install rouge_score

In [5]:
import pandas as pd
from rouge_score import rouge_scorer

# Use ROUGE-1 and ROUGE-L scores.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)

# Only the first 10 rows are evaluated.
df = pd.read_csv('../input/ddptext/output_summary3.csv')
df = df.head(10)

# (display name, DataFrame column) for each summarisation method.
_METHODS = (('TextRank', 'Summary1'), ('LexRank', 'Summary2'), ('LSA', 'Summary3'))
# (display name, key used by the scorer) for each ROUGE variant.
_ROUGE_TYPES = (('Rouge-1', 'rouge1'), ('Rouge-L', 'rougeL'))
_STATS = ('precision', 'recall', 'fmeasure')


def _as_text(value):
    """Return a summary cell as text; an empty CSV cell (read as NaN) becomes ''."""
    return '' if pd.isna(value) else str(value)


# Running sums of every statistic, keyed by (summary column, ROUGE key).
totals = {
    (column, rouge_key): dict.fromkeys(_STATS, 0.0)
    for _, column in _METHODS
    for _, rouge_key in _ROUGE_TYPES
}

for _, row in df.iterrows():
    # Both reference summaries are concatenated into a single target text.
    reference = row['REF1'] + ' ' + row['REF2']

    for _, column in _METHODS:
        scores = scorer.score(reference, _as_text(row[column]))
        for _, rouge_key in _ROUGE_TYPES:
            for stat in _STATS:
                totals[(column, rouge_key)][stat] += getattr(scores[rouge_key], stat)

# Average over the rows actually scored rather than a hard-coded 10, so the
# means stay correct when the CSV holds fewer rows.
n_docs = len(df)

for method, column in _METHODS:
    for rouge_label, rouge_key in _ROUGE_TYPES:
        sums = totals[(column, rouge_key)]
        means = {stat: (sums[stat] / n_docs if n_docs else 0.0) for stat in _STATS}
        # Left-pad the label to the width of the longest one so rows align.
        print('{:<18} [ precision'.format('{} ({})'.format(rouge_label, method)),
              "{:.2f}".format(means['precision']),
              ', recall', "{:.2f}".format(means['recall']),
              ', fmeasure', "{:.2f}".format(means['fmeasure']), "]")
