In [2]:
!pip install datasets

In [None]:
# Imported here as well because this cell runs before the main import cell;
# without it a fresh kernel fails with NameError on `nltk`.
import nltk

# 'punkt' / 'punkt_tab' provide the sentence tokenizer models ('punkt_tab' is
# the resource newer NLTK releases look for); 'stopwords' provides the
# English stop-word list.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

In [4]:
!pip install rouge

Collecting rouge
  Obtaining dependency information for rouge from https://files.pythonhosted.org/packages/32/7c/650ae86f92460e9e8ef969cc5008b24798dcf56a9a8947d04c78f550b3f5/rouge-1.0.1-py3-none-any.whl.metadata
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [15]:
import math
import re
from collections import Counter

import networkx as nx
import nltk
import numpy as np
from datasets import load_dataset
from nltk.cluster.util import cosine_distance
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from rouge import Rouge

# Function to split text into sentences
def read_article(text):
    """Split ``text`` into sentences with NLTK's ``sent_tokenize``.

    The sentences are returned verbatim (punctuation and casing untouched).

    The previous implementation also called
    ``sentence.replace("[^a-zA-Z0-9]", " ")`` on every sentence, but
    ``str.replace`` matches its first argument as literal text rather than as
    a regular expression, so that call never cleaned anything. It has been
    removed so the function's real behaviour is explicit.

    Args:
        text: The document to split.

    Returns:
        A new list of sentence strings, in document order.
    """
    return list(sent_tokenize(text))

# Calculate cosine similarity between two sentences
def sentence_similarity(sent1, sent2, stopwords=None):
    """Return the cosine similarity of the word-count vectors of two sentences.

    Args:
        sent1: A sentence string, or an already tokenised sequence of words.
        sent2: Same as ``sent1``.
        stopwords: Optional iterable of words to ignore. Matching is
            case-insensitive. ``None`` (the default) ignores nothing.

    Returns:
        A float between 0.0 and 1.0. 0.0 is returned when either sentence has
        no words left after stop-word removal (instead of the NaN that a
        division by a zero-length vector would produce).
    """
    stop_set = {word.lower() for word in stopwords} if stopwords else set()

    def _bag_of_words(sentence):
        # Iterating a str yields single characters, so a string sentence must
        # be split into words first; otherwise the score is character-level.
        if isinstance(sentence, str):
            tokens = re.findall(r"[a-z0-9]+", sentence.lower())
        else:
            tokens = (token.lower() for token in sentence)
        # Lower-case before the stop-word test so that "The" is filtered too.
        return Counter(token for token in tokens if token not in stop_set)

    counts1 = _bag_of_words(sent1)
    counts2 = _bag_of_words(sent2)
    if not counts1 or not counts2:
        return 0.0

    # Counter lookups return 0 for missing words, so only shared words add
    # to the dot product.
    dot = sum(freq * counts2[word] for word, freq in counts1.items())
    squared1 = sum(freq * freq for freq in counts1.values())
    squared2 = sum(freq * freq for freq in counts2.values())
    return dot / math.sqrt(squared1 * squared2)

# Build similarity matrix for sentences
def build_similarity_matrix(sentences, stop_words):
    """Fill a square matrix with pairwise sentence similarities.

    Args:
        sentences: Sequence of sentences to compare with each other.
        stop_words: Stop words forwarded unchanged to ``sentence_similarity``.

    Returns:
        A ``len(sentences) x len(sentences)`` NumPy array whose ``[i][j]``
        entry is ``sentence_similarity(sentences[i], sentences[j], stop_words)``
        for ``i != j``. The diagonal is left at zero.
    """
    size = len(sentences)
    similarity = np.zeros((size, size))

    for row, left in enumerate(sentences):
        for col, right in enumerate(sentences):
            if row == col:
                # A sentence is never scored against itself.
                continue
            similarity[row, col] = sentence_similarity(left, right, stop_words)

    return similarity

# Generate summary based on top-ranked sentences
def generate_summary(text, top_n):
    """Build an extractive summary from the highest-ranked sentences.

    The text is split with ``read_article``, a pairwise similarity matrix is
    built with ``build_similarity_matrix`` (using NLTK's English stop words),
    the matrix is turned into a graph, and PageRank scores that graph.

    Args:
        text: The document to summarise.
        top_n: Maximum number of sentences to keep.

    Returns:
        A ``(summary, sentence_count)`` tuple. ``summary`` is the kept
        sentences joined by single spaces in descending score order, and
        ``sentence_count`` is the number of sentences found in ``text``.
    """
    english_stop_words = stopwords.words('english')
    sentences = read_article(text)

    graph = nx.from_numpy_array(
        build_similarity_matrix(sentences, english_stop_words)
    )
    rank_by_index = nx.pagerank(graph)

    # Pair each sentence with its score; sorting the tuples in reverse puts
    # the best-scoring sentence first (ties fall back to the sentence text).
    ranked = [(rank_by_index[pos], sentence) for pos, sentence in enumerate(sentences)]
    ranked.sort(reverse=True)

    keep = min(top_n, len(ranked))
    chosen = [sentence for _, sentence in ranked[:keep]] if keep > 0 else []
    return " ".join(chosen), len(sentences)

# Evaluate summaries using ROUGE
def evaluate_summary(generated_summary, reference_summary):
    """Score a generated summary against a reference with ``Rouge.get_scores``.

    Args:
        generated_summary: The summary to evaluate (passed as first argument).
        reference_summary: The reference summary (passed as second argument).

    Returns:
        Whatever ``Rouge().get_scores`` returns for the pair, unmodified.
    """
    return Rouge().get_scores(generated_summary, reference_summary)

ModuleNotFoundError: No module named 'datasets'

In [None]:
# Load the "percins/IN-ABS" dataset via `load_dataset` and keep its 'train'
# split; later cells iterate over `articles` and read each record's 'text'
# and 'summary' fields.
dataset = load_dataset("percins/IN-ABS")
articles = dataset['train']

In [None]:
# Running totals of recall (r), precision (p) and F-score (f) for each ROUGE
# variant; all start at zero.
rouge_1_r, rouge_1_p, rouge_1_f = 0, 0, 0
rouge_2_r, rouge_2_p, rouge_2_f = 0, 0, 0
rouge_l_r, rouge_l_p, rouge_l_f = 0, 0, 0

In [None]:
# Reset the running totals in the same cell as the loop, so that re-running
# this cell never adds a second pass on top of totals left by an earlier run.
rouge_1_r = rouge_1_p = rouge_1_f = 0
rouge_2_r = rouge_2_p = rouge_2_f = 0
rouge_l_r = rouge_l_p = rouge_l_f = 0

# Summarise every article (keeping at most 7 sentences), score the summary
# against the article's reference summary, and add the scores to the totals.
for idx, article in enumerate(articles, start=1):
    text = article['text']
    ground_truth = article['summary']
    summary, _ = generate_summary(text, top_n=7)
    scores = evaluate_summary(summary, ground_truth)

    # Only the first entry of the returned scores is used.
    rouge_1 = scores[0]['rouge-1']
    rouge_2 = scores[0]['rouge-2']
    rouge_l = scores[0]['rouge-l']

    rouge_1_r += rouge_1['r']
    rouge_1_p += rouge_1['p']
    rouge_1_f += rouge_1['f']

    rouge_2_r += rouge_2['r']
    rouge_2_p += rouge_2['p']
    rouge_2_f += rouge_2['f']

    rouge_l_r += rouge_l['r']
    rouge_l_p += rouge_l['p']
    rouge_l_f += rouge_l['f']

    # Lightweight progress report every 100 articles.
    if idx % 100 == 0:
        print(f"{idx} rows complete")

In [None]:
# Number of records in `articles`; the reporting cell divides the ROUGE
# totals by this value to obtain averages.
num_articles = len(articles)
print(num_articles)

In [None]:
# Print the mean recall / precision / F-score of each ROUGE variant, i.e. the
# accumulated totals divided by the number of articles.
for metric_label, (recall_sum, precision_sum, f_sum) in (
    ("Average ROUGE-1:", (rouge_1_r, rouge_1_p, rouge_1_f)),
    ("Average ROUGE-2:", (rouge_2_r, rouge_2_p, rouge_2_f)),
    ("Average ROUGE-L:", (rouge_l_r, rouge_l_p, rouge_l_f)),
):
    print(metric_label, {
        'r': recall_sum / num_articles,
        'p': precision_sum / num_articles,
        'f': f_sum / num_articles,
    })
