In [None]:
import nltk
import pandas as pd
from nltk import pos_tag
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM 
from rouge import Rouge

# Fetch every NLTK resource the cells below rely on:
#   * nltk.word_tokenize needs the Punkt sentence-tokenizer data,
#   * pos_tag needs the averaged-perceptron tagger model.
# Both the classic resource names and the newer "_tab" / "_eng" names are
# requested so the notebook also runs from a clean environment; nltk.download
# reports an unknown or unreachable package without raising by default, so a
# name that a given NLTK version does not know is simply skipped.
for _nltk_resource in (
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'punkt',
    'punkt_tab',
):
    nltk.download(_nltk_resource)

# Shared ROUGE scorer used by the summarisation loop
rouge = Rouge()
# Pre-process the input text to extract part of speech information
def get_pos_tags(text):
    """Tokenize ``text`` and return the tagger output for its tokens.

    The text is split with ``nltk.word_tokenize`` and the resulting tokens
    are handed to ``pos_tag``; its result is returned unchanged.
    """
    return pos_tag(nltk.word_tokenize(text))
# Add the part of speech information as additional input to the model
def add_pos_tags_to_input(input_text, pos_tags):
    """Return the text with a marker appended after selected tokens.

    Args:
        input_text: The raw text. It is only tokenized here when ``pos_tags``
            does not already carry the words (see below).
        pos_tags: Either a sequence of ``(word, tag)`` pairs, as returned by
            ``get_pos_tags``, or a sequence of bare tag strings aligned with
            the tokens of ``input_text``.

    Returns:
        A string in which every token is followed by a space, and tokens
        tagged exactly ``NN``, ``JJ`` or ``VB`` are additionally followed by
        ``<NN>``, ``<JJ>`` or ``<VB>`` and another space. The result keeps a
        trailing space, and is empty when ``pos_tags`` is empty.
    """
    pos_tag_map = {"NN": "<NN>", "JJ": "<JJ>", "VB": "<VB>"}
    pos_tags = list(pos_tags)

    # get_pos_tags yields (word, tag) pairs. Comparing the whole pair against
    # the string keys of pos_tag_map can never match, so the tag has to be
    # unpacked before the lookup.
    carries_words = all(
        isinstance(entry, (tuple, list)) and len(entry) == 2
        for entry in pos_tags
    )
    if carries_words:
        word_tag_pairs = pos_tags
    else:
        # Bare tag strings: pair them with freshly tokenized words.
        word_tag_pairs = zip(nltk.word_tokenize(input_text), pos_tags)

    pieces = []
    for word, tag in word_tag_pairs:
        marker = pos_tag_map.get(tag)
        if marker is None:
            pieces.append(word + " ")
        else:
            pieces.append(word + " " + marker + " ")
    return "".join(pieces)

# Load the tokenizer and the seq2seq model from the same pretrained checkpoint
tokenizer = AutoTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")
model = AutoModelForSeq2SeqLM.from_pretrained("sshleifer/distilbart-cnn-12-6")

# Read the input CSV; the loop below uses its 'Text' and 'Original Summary' columns
df = pd.read_csv('new_summaries.csv')

# For every row: annotate the text, generate a summary, store it, and score it
for index, row in df.iterrows():
    text = row['Text']

    # Build the POS-annotated text and encode it, truncated to 1024 tokens
    pos_tags = get_pos_tags(text)
    pos_tagged_text = add_pos_tags_to_input(text, pos_tags)
    input_tokenized = tokenizer.encode(
        pos_tagged_text,
        return_tensors='pt',
        max_length=1024,
        truncation=True,
    )

    # Beam-search generation with the notebook's fixed decoding settings
    summary_ids = model.generate(
        input_tokenized,
        num_beams=9,
        no_repeat_ngram_size=3,
        length_penalty=2.0,
        min_length=150,
        max_length=500,
        early_stopping=True,
    )

    # Only the first generated sequence is kept as the prediction
    summary = tokenizer.decode(
        summary_ids[0],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    df.at[index, 'Predicted Summary'] = summary

    # Score the prediction against the reference and keep the F-measures
    original_summary = row['Original Summary']
    scores = rouge.get_scores(summary, original_summary)
    for column, metric in (('R1', 'rouge-1'), ('R2', 'rouge-2'), ('Rl', 'rouge-l')):
        df.at[index, column] = scores[0][metric]['f']

# NOTE(review): the file name says "Pegasus" although the checkpoint loaded
# above is distilbart-cnn-12-6; later cells read this exact path.
df.to_csv('Output_Pegasus_With_POS_TAG_100doc.csv', index=False)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Pranav\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

# ROUGE Score

In [None]:

# Column-wise means of the per-row ROUGE F-measures stored in df
r1_avg, r2_avg, rl_avg = (df[column].mean() for column in ('R1', 'R2', 'Rl'))

# Report the three averages
print(f"Average R1 Score: {r1_avg}")
print(f"Average R2 Score: {r2_avg}")
print(f"Average Rl Score: {rl_avg}")

# Write the frame back to the same output file
df.to_csv("Output_Pegasus_With_POS_TAG_100doc.csv", index=False)

# Precision, Recall and F1 Score

In [None]:
import pandas as pd
import nltk
from nltk import word_tokenize
from nltk.metrics import precision, recall, f_measure

# Reload the scored summaries written by the earlier cells
df = pd.read_csv('Output_Pegasus_With_POS_TAG_100doc.csv')

for index, row in df.iterrows():
    # Compare the two summaries as sets of distinct tokens
    predicted_tokens = set(word_tokenize(row['Predicted Summary']))
    original_tokens = set(word_tokenize(row['Original Summary']))

    # The reference set is passed first and the predicted set second
    token_metrics = (
        ('Precision', precision(original_tokens, predicted_tokens)),
        ('Recall', recall(original_tokens, predicted_tokens)),
        ('F1 Score', f_measure(original_tokens, predicted_tokens)),
    )
    for column, value in token_metrics:
        df.at[index, column] = value

# Persist the three new columns alongside the existing ones
df.to_csv('Output_Pegasus_With_POS_TAG_100doc.csv', index=False)


In [None]:
import pandas as pd

# Reload the scored CSV and average the token-overlap columns
df = pd.read_csv('Output_Pegasus_With_POS_TAG_100doc.csv')

precision_mean, recall_mean, f1_score_mean = (
    df[column].mean() for column in ('Precision', 'Recall', 'F1 Score')
)

# Report the three means
print("Precision mean:", precision_mean)
print("Recall mean:", recall_mean)
print("F1 Score mean:", f1_score_mean)


# BLEU Score

In [None]:
import pandas as pd
import nltk
from nltk.translate.bleu_score import sentence_bleu

# Reload the scored summaries written by the earlier cells
df = pd.read_csv('Output_Pegasus_With_POS_TAG_100doc.csv')

for index, row in df.iterrows():
    predicted_summary = row['Predicted Summary']
    original_summary = row['Original Summary']

    # Tokenize through the nltk module imported in this cell, so the cell
    # does not depend on a `word_tokenize` name imported by another cell.
    predicted_tokens = nltk.word_tokenize(predicted_summary)
    original_tokens = nltk.word_tokenize(original_summary)

    # Sentence-level BLEU of the prediction against a single reference
    bleu_score = sentence_bleu([original_tokens], predicted_tokens)

    df.at[index, 'BLEU Score'] = bleu_score

# Persist the new 'BLEU Score' column alongside the existing ones
df.to_csv('Output_Pegasus_With_POS_TAG_100doc.csv', index=False)


import pandas as pd

# Reload the scored CSV and report the average of the 'BLEU Score' column
df = pd.read_csv('Output_Pegasus_With_POS_TAG_100doc.csv')
bleu_score_mean = df['BLEU Score'].mean()
print("BLEU Score mean:", bleu_score_mean)
