In [None]:
import pandas as pd
import spacy
# Load the "en_core_web_sm" spaCy pipeline and print the names of its components.
# NOTE(review): `nlp` is not referenced again in the visible cells — confirm it is still needed.
nlp = spacy.load("en_core_web_sm")
print(nlp.pipe_names)
from extractor import BNPExtractor
import unicodedata

### Exact Match Evaluation

In [None]:
def normalize_text(text):
    """Return a canonical form of ``text`` for case-insensitive comparison.

    The text is NFKC-normalized, lowercased and stripped of surrounding
    whitespace, in that order.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    # Normalize BEFORE lowercasing: NFKC can expand caseless compatibility
    # symbols into uppercase letters (e.g. "\u2122" -> "TM"), which would
    # survive if lower() had already run. Stripping last also removes any
    # whitespace that only appears after normalization.
    return unicodedata.normalize('NFKC', text).lower().strip()

def calculate_metrics_exact_match(labels, preds):
    """Score predicted aspect phrases against gold phrases by exact string match.

    Each element of ``labels`` and ``preds`` is a string of aspect phrases
    separated by ", "; a falsy value (e.g. '') means "no aspects". Phrases are
    normalized with ``normalize_text`` and compared as sets, row by row. Counts
    are accumulated over all rows before the metrics are computed.

    Args:
        labels: Iterable of gold (manually annotated) aspect strings.
        preds: Iterable of system-predicted aspect strings, paired with
            ``labels`` by position.

    Returns:
        dict with keys 'precision', 'recall' and 'f1', each a percentage
        rounded to two decimals. All three are 0 when nothing matches.

    Note:
        Rows are paired with zip(), so surplus rows in the longer input are
        ignored. Per-row diagnostics are printed.
    """
    def _aspect_set(cell):
        """Split one cell into a set of normalized, non-empty aspect phrases."""
        if not cell:
            return set()
        normalized = (normalize_text(aspect) for aspect in cell.split(", "))
        # Drop phrases that are empty after normalization (whitespace-only
        # cells, trailing separators) so they are neither counted as aspects
        # nor allowed to "match" each other.
        return {aspect for aspect in normalized if aspect}

    true_pos = 0
    gold_total = 0
    pred_total = 0
    precision, recall, f1 = 0, 0, 0

    # Loop over both manual labels (ground truth) and system predictions
    for golds, pred in zip(labels, preds):
        print(f'GOLDs: {golds}, PREDs: {pred}')

        golds_set = _aspect_set(golds)
        pred_set = _aspect_set(pred)

        gold_total += len(golds_set)
        pred_total += len(pred_set)

        # Count gold aspects found verbatim among the predictions; sorted() only
        # makes the diagnostic output order reproducible between runs.
        num_gold_in_pred = 0
        for aspect in sorted(golds_set):
            if aspect in pred_set:
                num_gold_in_pred += 1
                print(f"Matched gold aspect: {aspect}")
            else:
                print(f"Unmatched gold aspect: {aspect}")

        true_pos += num_gold_in_pred
        print(f'num_gold_in_pred: {num_gold_in_pred}, gold len: {len(golds_set)}, pred len: {len(pred_set)}, true pos: {true_pos}')

    # Calculate precision, recall, and F1. true_pos > 0 implies both totals are
    # positive, so the divisions are safe; the guards are kept for clarity.
    if true_pos:
        recall = 100 * (true_pos / gold_total)
        precision = 100 * (true_pos / pred_total) if pred_total > 0 else 0
        f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return {'precision': round(precision, 2), 'recall': round(recall, 2), 'f1': round(f1, 2)}

In [None]:
# Load the manually annotated labels and the system predictions from Excel.
# NOTE(review): both paths are absolute and machine-specific; the notebook will
# not run on another machine without editing them.
manual_file = "/Users/innerpiece92/Desktop/NLP_Workspace/AArec/datasets/extracting_atypical_aspects_of_items_from_reviews/restaurants/test/rcb_using_decomposed_review_classification_v3_gpt35_pr/restaurant_reviews_split_ata_classified.xlsx"
manual_df = pd.read_excel(manual_file)
system_file = "/Users/innerpiece92/Desktop/NLP_Workspace/AArec/datasets/extracting_atypical_aspects_of_items_from_reviews/restaurants/test/rcb_using_decomposed_review_classification_v3_gpt35_pr/system_abstractive_ata_restaurants_reviews_rag.xlsx"
system_df = pd.read_excel(system_file)

# Replace missing cells with '' so rows without aspects are treated as empty by
# the metric functions.
# NOTE(review): the two columns are paired by position via zip() inside the
# metric functions, which assumes both sheets hold the same rows in the same
# order and silently ignores surplus rows otherwise — confirm.
manual_labels = manual_df['manual_ata_extractive'].fillna('')
system_preds = system_df['atypical_aspects'].fillna('')

# Score the predictions with exact phrase matching and report the percentages.
metrics = calculate_metrics_exact_match(manual_labels, system_preds)
print(f"Precision: {metrics['precision']}%")
print(f"Recall: {metrics['recall']}%")
print(f"F1 Score: {metrics['f1']}%")

In [None]:
# Export the exact-match metrics as a one-row Excel sheet (one column per metric, no index column).
# NOTE(review): this writes to ".../rcb_using_decomposed_review_classification_v3/", not the
# "..._v3_gpt35_pr/" directory the inputs were read from — confirm that is intended.
export_file = "/Users/innerpiece92/Desktop/NLP_Workspace/AArec/datasets/extracting_atypical_aspects_of_items_from_reviews/restaurants/test/rcb_using_decomposed_review_classification_v3/exact_match_evaluation_metrics_rag.xlsx"
export_df = pd.DataFrame([metrics])
export_df.to_excel(export_file, index=False)

### Partial Match Evaluation

In [None]:
# Shared BNPExtractor instance; its tokenize() method is used by the
# partial-match metric below to split phrases into tokens.
extractor = BNPExtractor()

In [None]:
def getBestMatchedPhrase(phrase_1_tokens, phrase_list):
    """Return the phrase in ``phrase_list`` most similar to ``phrase_1_tokens``.

    Similarity is the Jaccard index of the two token sets
    (|intersection| / |union|).

    Args:
        phrase_1_tokens: Set of tokens for the phrase being matched.
        phrase_list: List of candidate token sets.

    Returns:
        The candidate (the same object held in ``phrase_list``) with the
        highest similarity; on ties, the earliest one. Returns '' when
        ``phrase_list`` is empty or no candidate shares a token.
    """
    best_phrase, best_sim = '', 0
    for phrase_2_tokens in phrase_list:
        union_size = len(phrase_1_tokens.union(phrase_2_tokens))
        if not union_size:
            # Both token sets are empty: similarity is undefined, treat as no overlap.
            continue
        jaccard_sim = len(phrase_1_tokens.intersection(phrase_2_tokens)) / union_size
        # Strict '>' keeps the first of several equally good candidates and
        # never selects a candidate with zero overlap.
        if jaccard_sim > best_sim:
            best_phrase, best_sim = phrase_2_tokens, jaccard_sim
    return best_phrase

# Function to calculate precision, recall, and F1 score with partial matching logic
def calculate_metrics_exact_match_with_partial(preds, labels, tokenize=False):
    """Score predictions against gold labels, giving partial credit for token overlap.

    For every row, the predicted and gold phrases (separated by ", ") are turned
    into sets of normalized tokens using the module-level ``extractor``. A gold
    phrase and a predicted phrase are paired only when each is the other's best
    Jaccard match (see ``getBestMatchedPhrase``). A pair contributes fractional
    counts: the share of predicted tokens found in the gold phrase counts toward
    precision, the share of gold tokens found in the prediction toward recall,
    and the remaining shares count as false positives / false negatives.
    Unpaired gold phrases add one false negative each, unpaired predictions one
    false positive each, and a row with neither gold nor predicted phrases adds
    one true positive.

    Args:
        preds: Iterable of predicted aspect strings. Note that predictions come
            first here, unlike ``calculate_metrics_exact_match``.
        labels: Iterable of gold aspect strings, paired with ``preds`` by position.
        tokenize: Unused; kept so existing calls keep working.

    Returns:
        dict with keys 'precision', 'recall' and 'f1', each a percentage
        rounded to two decimals. All three are 0 when there is no true positive.

    Note:
        Rows are paired with zip(), so surplus rows in the longer input are
        ignored. Per-row diagnostics and the final counts are printed.
    """
    def _token_sets(phrases):
        """Turn each phrase into a set of normalized tokens, skipping phrases that yield no tokens."""
        token_sets = []
        for phrase in phrases:
            tokens = {normalize_text(token.text) for token in extractor.tokenize(phrase)}
            # A phrase without tokens (e.g. produced by a trailing separator)
            # carries no information; keeping it would count a phantom phrase.
            if tokens:
                token_sets.append(tokens)
        return token_sets

    true_pos_e, true_pos_g = 0, 0
    false_pos = 0
    false_neg = 0
    precision, recall, f1 = 0, 0, 0

    # Loop over both manual labels (ground truth) and system predictions
    for golds, pred in zip(labels, preds):
        print(f"\nGOLDs: {golds} \nPREDs: {pred}")

        ep_list, gp_list = [], []
        if pred:
            ep_list = _token_sets(pred.strip().split(", "))
            print(f"Tokenized prediction phrases (ep_list): {ep_list}")

        if golds:
            # dict.fromkeys de-duplicates gold phrases while keeping their
            # original order. A plain set() of strings iterates in an order that
            # can change between interpreter runs, which made tie-breaking (and
            # therefore the metrics) non-reproducible.
            gp_list = _token_sets(dict.fromkeys(golds.split(", ")))
            print(f"Tokenized ground truth phrases (gp_list): {gp_list}")

        # Exact match case: no golds and no preds
        if not ep_list and not gp_list:
            true_pos_e += 1
            true_pos_g += 1

        # NOTE(review): a row with predictions but no gold phrases is charged 1
        # here AND len(ep_list) at the end of the loop body; likewise a row with
        # gold phrases but no predictions is charged 1 here plus 1 per gold
        # phrase below. Kept as in the original — confirm this is intended.
        if ep_list and not gp_list:
            false_pos += 1
        if not ep_list and gp_list:
            false_neg += 1

        # Best counterpart in each direction, keyed by frozenset so the lookup
        # depends only on the tokens, not on set iteration order.
        best_ep_for_gp, best_gp_for_ep = {}, {}
        if ep_list and gp_list:
            for gp_tokens in gp_list:
                best_ep = getBestMatchedPhrase(gp_tokens, ep_list)
                best_ep_for_gp[frozenset(gp_tokens)] = best_ep
                print(f"Best matched predicted phrase for gold '{gp_tokens}': {best_ep}")
            for ep_tokens in ep_list:
                best_gp = getBestMatchedPhrase(ep_tokens, gp_list)
                best_gp_for_ep[frozenset(ep_tokens)] = best_gp
                print(f"Best matched gold phrase for predicted '{ep_tokens}': {best_gp}")

        # Match gold to predicted, calculate partial match stats
        for gp_tokens in gp_list:
            if not ep_list:
                # No predictions (left) to pair this gold phrase with.
                false_neg += 1
                continue

            candidate = best_ep_for_gp[frozenset(gp_tokens)]
            # Accept only mutual best matches. The membership test guards
            # against a prediction already consumed by an earlier gold phrase
            # with identical tokens, which previously made remove() raise.
            is_mutual = bool(
                candidate
                and candidate in ep_list
                and best_gp_for_ep[frozenset(candidate)] == gp_tokens
            )

            if is_mutual:
                print(f"Partial match found between: {candidate} and {gp_tokens}")
                overlap = len(candidate.intersection(gp_tokens))
                true_pos_e += overlap / len(candidate)
                true_pos_g += overlap / len(gp_tokens)
                false_pos += len(candidate - gp_tokens) / len(candidate)
                false_neg += len(gp_tokens - candidate) / len(gp_tokens)
                # Each prediction can be paired with at most one gold phrase.
                ep_list.remove(candidate)
            else:
                print(f"No match found for gold phrase '{gp_tokens}'")
                false_neg += 1

        if ep_list:
            # Predictions that were not paired with any gold phrase
            print(f"Unmatched prediction phrases remaining: {ep_list}")
            false_pos += len(ep_list)

    # Calculate precision, recall, and F1
    if true_pos_e or true_pos_g:
        precision = 100 * true_pos_e / (true_pos_e + false_pos)
        recall = 100 * true_pos_g / (true_pos_g + false_neg)
        f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    print(f"\nFinal counts: true_pos_e = {true_pos_e}, true_pos_g = {true_pos_g}, false_pos = {false_pos}, false_neg = {false_neg}")
    return {'precision': round(precision, 2), 'recall': round(recall, 2), 'f1': round(f1, 2)}

# Score the same predictions with partial (token-overlap) matching.
# Argument order matters: predictions first, then labels — the reverse of
# calculate_metrics_exact_match.
metrics_partial_m = calculate_metrics_exact_match_with_partial(system_preds, manual_labels)

# Print the metrics
print(f"Precision: {metrics_partial_m['precision']}%")
print(f"Recall: {metrics_partial_m['recall']}%")
print(f"F1 Score: {metrics_partial_m['f1']}%")

In [None]:
# Export the partial-match metrics as a one-row Excel sheet (one column per metric, no index column).
# NOTE(review): `export_file` and `export_df` reuse the names from the exact-match export cell,
# and the target directory (".../rcb_using_decomposed_review_classification_v3/") differs from
# the "..._v3_gpt35_pr/" directory the inputs were read from — confirm both are intended.
export_file = "/Users/innerpiece92/Desktop/NLP_Workspace/AArec/datasets/extracting_atypical_aspects_of_items_from_reviews/restaurants/test/rcb_using_decomposed_review_classification_v3/partial_match_evaluation_metrics_rag.xlsx"
export_df = pd.DataFrame([metrics_partial_m])
export_df.to_excel(export_file, index=False)