In [6]:
# Install required libraries
!pip install nltk rouge-score pandas




In [7]:
import nltk
# Fetch the 'punkt' tokenizer package into the local NLTK data directory.
# NOTE(review): the scoring functions in this notebook tokenize with
# str.split(), so confirm whether this download is still required.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [9]:
# Importing libraries after installation
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer  # Use the correct import for the ROUGE score
from collections import Counter

In [2]:
# Load the synthetic-review dataset into a DataFrame.
data = pd.read_csv(filepath_or_buffer='/content/synthetic_reviews.csv')

In [10]:
# Fail fast if either column used by the evaluation below is absent.
required_columns = ('synthetic_review', 'original_prompt')
if not all(column in data.columns for column in required_columns):
    raise ValueError("CSV must contain 'synthetic_review' and 'original_prompt' columns.")


In [11]:
# Pull both text columns out as plain Python lists; the original prompts are
# kept so they can serve as the reference texts for the metrics.
synthetic_reviews, original_prompts = (
    data[column].tolist() for column in ('synthetic_review', 'original_prompt')
)


In [12]:

# 1. BLEU Score Calculation
def calculate_bleu(synthetic_reviews, reference_reviews, smoothing_function=None):
    """Return the mean sentence-level BLEU of synthetic reviews against references.

    Each synthetic review is scored against the reference at the same position,
    using whitespace tokenization (``str.split``) on both sides.

    Args:
        synthetic_reviews: Iterable of candidate strings.
        reference_reviews: Iterable of reference strings, paired positionally
            with ``synthetic_reviews``. ``zip`` stops at the shorter input, so
            surplus items on either side are ignored.
        smoothing_function: Optional callable forwarded to ``sentence_bleu``
            (for example ``SmoothingFunction().method1``). With the default
            ``None`` the call behaves exactly as an unsmoothed ``sentence_bleu``,
            which warns and scores ~0 when any n-gram order has no overlap.

    Returns:
        The arithmetic mean of the per-pair BLEU scores, or 0 if there are no pairs.
    """
    bleu_scores = [
        # sentence_bleu expects a list of tokenized references and one tokenized candidate.
        sentence_bleu([ref.split()], synth.split(), smoothing_function=smoothing_function)
        for synth, ref in zip(synthetic_reviews, reference_reviews)
    ]
    return sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0

# 2. ROUGE Score Calculation
def calculate_rouge(synthetic_reviews, reference_reviews):
    """Return the mean ROUGE-1, ROUGE-2 and ROUGE-L F-measures over all pairs.

    Args:
        synthetic_reviews: Iterable of candidate strings.
        reference_reviews: Iterable of reference strings, paired positionally
            with ``synthetic_reviews``. ``zip`` stops at the shorter input.

    Returns:
        A dict keyed by ``'rouge1'``, ``'rouge2'`` and ``'rougeL'`` whose values
        are the average F-measure for that metric. When there are no pairs,
        every value is 0.0 instead of raising ``IndexError``.
    """
    rouge_types = ['rouge1', 'rouge2', 'rougeL']
    pairs = list(zip(reference_reviews, synthetic_reviews))
    if not pairs:
        # Mirror the other metrics, which report 0 for empty input.
        return {rouge_type: 0.0 for rouge_type in rouge_types}

    scorer = rouge_scorer.RougeScorer(rouge_types, use_stemmer=True)
    # RougeScorer.score takes the reference (target) first, then the prediction.
    scores = [scorer.score(ref, synth) for ref, synth in pairs]
    return {
        rouge_type: sum(score[rouge_type].fmeasure for score in scores) / len(scores)
        for rouge_type in rouge_types
    }

# 3. Diversity Measurement
def calculate_diversity(reviews, n=1):
    """Return the distinct-n ratio: unique n-grams divided by total n-grams.

    Reviews are tokenized on whitespace and n-grams are collected within each
    review, never across review boundaries. With the default ``n=1`` this is
    the ratio of unique tokens to total tokens over the whole collection.

    Args:
        reviews: Iterable of strings.
        n: Size of the n-grams to count; must be at least 1.

    Returns:
        ``distinct / total`` as a float, or 0 when no n-grams were found.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer.")

    n_grams = Counter()
    for review in reviews:
        tokens = review.split()
        # Zipping n successively shifted views of the token list yields every
        # contiguous n-gram as a tuple; reviews shorter than n contribute nothing.
        n_grams.update(zip(*(tokens[offset:] for offset in range(n))))

    distinct_n = len(n_grams)
    total_n = sum(n_grams.values())
    return distinct_n / total_n if total_n > 0 else 0


In [13]:
# Score the synthetic reviews; the original prompts act as the reference
# texts for both BLEU and ROUGE.
bleu_score = calculate_bleu(
    synthetic_reviews=synthetic_reviews,
    reference_reviews=original_prompts,
)
rouge_score = calculate_rouge(
    synthetic_reviews=synthetic_reviews,
    reference_reviews=original_prompts,
)
diversity_score = calculate_diversity(reviews=synthetic_reviews)


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [14]:
# Report the three metrics, one per line.
print(
    f"BLEU Score: {bleu_score:.4f}",
    f"ROUGE Score: {rouge_score}",
    f"Diversity Score: {diversity_score:.4f}",
    sep="\n",
)

BLEU Score: 0.4494
ROUGE Score: {'rouge1': 0.6236275041239976, 'rouge2': 0.6033162948343247, 'rougeL': 0.6236275041239976}
Diversity Score: 0.4255
