In [2]:
from collections import Counter

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from rouge_score import rouge_scorer

# Download required resources
# Fetch the NLTK data packages this script relies on: the punkt tokenizer
# models, the stopword lists, and WordNet (needed for lemmatization).
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

def _content_lemmas(text, stopwords_set, lemmatizer):
    """Return the lower-cased, lemmatized content words of *text*, in order.

    A token is kept only when it is purely alphanumeric and its lower-cased
    form is not in ``stopwords_set``.
    """
    lemmas = []
    for token in word_tokenize(text):
        lowered = token.lower()
        if lowered not in stopwords_set and token.isalnum():
            lemmas.append(lemmatizer.lemmatize(lowered))
    return lemmas


def solve(text, threshold=1.2):
    """Build an extractive summary of *text* from its highest-scoring sentences.

    Each content word (alphanumeric, non-stopword) is lemmatized and weighted
    by how many times its lemma occurs in the whole text. A sentence's score
    is the sum of the weights of the distinct lemmas it contains. Sentences
    whose score is strictly greater than ``threshold`` times the mean score
    (truncated to an integer) are kept, in their original order.

    Args:
        text: The document to summarize.
        threshold: Multiplier applied to the mean sentence score; larger
            values keep fewer sentences.

    Returns:
        The selected sentences joined by single spaces, or an empty string
        when no sentence qualifies (which includes empty input).
    """
    stopwords_set = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()

    # Counter builds the frequency table in one pass instead of calling
    # list.count() once per word.
    freq_table = Counter(_content_lemmas(text, stopwords_set, lemmatizer))

    sentences = sent_tokenize(text)

    # Scores are stored by position rather than keyed by sentence text, so a
    # sentence that appears twice is scored once per occurrence instead of
    # accumulating both passes into a single shared entry.
    scores = []
    for sentence in sentences:
        # Match on whole lemmatized tokens, not substrings of the raw
        # sentence: a lemma such as "art" must not score inside "start", and
        # an inflected form such as "securities" must count for "security".
        sentence_lemmas = set(_content_lemmas(sentence, stopwords_set, lemmatizer))
        scores.append(sum(freq_table[lemma] for lemma in sentence_lemmas))

    # Only sentences containing at least one content word take part in the
    # mean; the mean is truncated to an int before the threshold is applied.
    scored = [score for score in scores if score > 0]
    average = int(sum(scored) / len(scored)) if scored else 0

    return ' '.join(
        sentence
        for sentence, score in zip(sentences, scores)
        if score > threshold * average
    )

# Function to compute ROUGE scores
def compute_rouge_scores(reference, summary):
    """Score *summary* against *reference* with ROUGE-1, ROUGE-2 and ROUGE-L.

    The scorer is created with stemming enabled. The result is whatever
    ``RougeScorer.score`` returns: a mapping keyed by 'rouge1', 'rouge2' and
    'rougeL' whose values expose precision, recall and fmeasure.
    """
    metrics = ['rouge1', 'rouge2', 'rougeL']
    return rouge_scorer.RougeScorer(metrics, use_stemmer=True).score(reference, summary)

# Function to evaluate the model on the text with ROUGE metrics
def evaluate_rouge(text, reference_summary, model_function, threshold=1.2):
    """Summarize *text* with *model_function* and report its ROUGE scores.

    ``model_function`` is invoked as ``model_function(text, threshold)`` and
    its result is scored against ``reference_summary``. The generated summary
    and the ROUGE-1 / ROUGE-2 / ROUGE-L F-measures (four decimals) are
    printed, and the full score mapping is returned.
    """
    generated_summary = model_function(text, threshold)
    rouge_scores = compute_rouge_scores(reference_summary, generated_summary)

    # Report the summary first, then one F-measure line per metric
    print(f"Generated Summary:\n{generated_summary}\n")
    for label, metric in (("ROUGE-1", 'rouge1'), ("ROUGE-2", 'rouge2'), ("ROUGE-L", 'rougeL')):
        print(f"{label}: {rouge_scores[metric].fmeasure:.4f}")
    return rouge_scores

# Example usage
# Sample passage to summarize. The triple-quoted literal begins and ends with
# a newline, which is why printing it below shows blank lines around the text.
text = '''
Samsung was founded by Lee Byung-chul in 1938 as a trading company. Over the next three decades, the group diversified into areas including food processing, textiles, insurance, securities, and retail. Samsung entered the electronics industry in the late 1960s and the construction and shipbuilding industries in the mid-1970s; these areas would drive its subsequent growth. Following Lee's death in 1987, Samsung was separated into five business groups – Samsung Group, Shinsegae Group, CJ Group, Hansol Group, and JoongAng Group.
'''

# Assume this is the reference summary provided
# (the text the generated summary is scored against)
reference_summary = 'Samsung was founded in 1938 as a trading company. Samsung entered electronics in the 1960s and shipbuilding in the 1970s. Following Lee\'s death, Samsung was separated into five business groups.'

# Evaluate model and print ROUGE scores
print('Original Text:')
print(text)

print("\nSummary of Text:\n")
# Kept as a bare trailing expression: evaluate_rouge prints the report itself,
# and when run as the last statement of a notebook cell the returned score
# mapping is additionally echoed as the cell's output.
evaluate_rouge(text, reference_summary, solve, threshold=1.2)


Original Text:

Samsung was founded by Lee Byung-chul in 1938 as a trading company. Over the next three decades, the group diversified into areas including food processing, textiles, insurance, securities, and retail. Samsung entered the electronics industry in the late 1960s and the construction and shipbuilding industries in the mid-1970s; these areas would drive its subsequent growth. Following Lee's death in 1987, Samsung was separated into five business groups – Samsung Group, Shinsegae Group, CJ Group, Hansol Group, and JoongAng Group.


Summary of Text:

Generated Summary:
Following Lee's death in 1987, Samsung was separated into five business groups – Samsung Group, Shinsegae Group, CJ Group, Hansol Group, and JoongAng Group.

ROUGE-1: 0.5091
ROUGE-2: 0.3396
ROUGE-L: 0.4000


[nltk_data] Downloading package punkt to /home/sameer/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/sameer/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/sameer/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


{'rouge1': Score(precision=0.5833333333333334, recall=0.45161290322580644, fmeasure=0.509090909090909),
 'rouge2': Score(precision=0.391304347826087, recall=0.3, fmeasure=0.33962264150943394),
 'rougeL': Score(precision=0.4583333333333333, recall=0.3548387096774194, fmeasure=0.39999999999999997)}