In [1]:
# ============================================================================
# EVAL CELL 1: SETUP & LOAD SAVED PREDICTIONS
# ============================================================================
!pip install -q evaluate rouge_score bert_score sentence-transformers
import json
import re
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import torch
from sentence_transformers import SentenceTransformer, CrossEncoder
from sklearn.metrics.pairwise import cosine_similarity
import evaluate

# 1. Install/Update dependencies


# 2. Environment Setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"✅ Setup Complete | Device: {device}")

# 3. Load your JSONL file
# Note: Ensure the file is uploaded to your current working directory
PRED_FILE = "test_predictions.jsonl"
test_predictions = []

try:
    with open(PRED_FILE, 'r', encoding='utf-8') as f:
        for line in f:
            test_predictions.append(json.loads(line))
    print(f"✅ Loaded {len(test_predictions)} samples from {PRED_FILE}")
except FileNotFoundError:
    print(f"❌ Error: {PRED_FILE} not found. Please upload it to the current folder.")

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
✅ Setup Complete | Device: cuda
✅ Loaded 453 samples from test_predictions.jsonl


In [4]:
# ============================================================================
# EVAL CELL 2: THE ROBUST BASELINE PARSER & HELPERS
# ============================================================================
def extract_text_from_prompt(prompt_input):
    """Extracts social media text from the baseline prompt."""
    matches = list(re.finditer(r'Text:\s*(.+?)(?=\s*Output:|$)', prompt_input, re.DOTALL))
    return matches[-1].group(1).strip() if matches else "[Could not extract]"

def normalize_val(s):
    return re.sub(r'\s+', ' ', s.strip().lower())

def parse_7field_robust(text):
    """Parses output into a list of field dictionaries, ignoring truncation."""
    results = []
    if not text or not isinstance(text, str): return results

    entries = re.split(r'\s*;\s*', text.strip())
    for entry in entries:
        # We slice [:7] to handle any extra pipes the model might generate
        fields = [f.strip() for f in entry.split('|')][:7]

        # We only process if we have at least aspect and sentiment
        if len(fields) >= 2:
            sent = normalize_val(fields[1])
            if 'pos' in sent: sent = 'positive'
            elif 'neg' in sent: sent = 'negative'
            elif 'neu' in sent: sent = 'neutral'

            results.append({
                "aspect": normalize_val(fields[0]),
                "sentiment": sent,
                "field_count": len(fields),
                "fields": fields
            })
    return results

print("✅ Robust baseline parser initialized.")

✅ Robust baseline parser initialized.


In [5]:
# ============================================================================
# EVAL CELL 3: BASELINE STRUCTURAL & TASK METRICS
# ============================================================================
tp = fp = fn = tp_asp = fp_asp = fn_asp = 0
perfect_format_count = 0
all_field_counts = []

for item in test_predictions:
    gold_data = parse_7field_robust(item['gold_target'])
    pred_data = parse_7field_robust(item['gen_output'])

    # 1. Structural Check
    is_sample_perfect = True
    if not pred_data: is_sample_perfect = False
    for entry in pred_data:
        all_field_counts.append(entry['field_count'])
        if entry['field_count'] != 7: is_sample_perfect = False
    if is_sample_perfect: perfect_format_count += 1

    # 2. Classification Check (F1)
    gold_set = {(p['aspect'], p['sentiment']) for p in gold_data}
    pred_set = {(p['aspect'], p['sentiment']) for p in pred_data}
    gold_asp = {p['aspect'] for p in gold_data}
    pred_asp = {p['aspect'] for p in pred_data}

    tp += len(gold_set & pred_set)
    fp += len(pred_set - gold_set)
    fn += len(gold_set - pred_set)
    tp_asp += len(gold_asp & pred_asp)
    fp_asp += len(pred_asp - gold_asp)
    fn_asp += len(gold_asp - pred_asp)

def get_prf(t, f_p, f_n):
    p = t / (t + f_p) if (t + f_p) > 0 else 0
    r = t / (t + f_n) if (t + f_n) > 0 else 0
    f1 = 2 * p * r / (p + r) if (p + r) > 0 else 0
    return p, r, f1

p_tot, r_tot, f1_tot = get_prf(tp, fp, fn)
p_asp, r_asp, f1_asp = get_prf(tp_asp, fp_asp, fn_asp)

print("="*70)
print("BASELINE PERFORMANCE REPORT")
print("="*70)
print(f"Perfect 7-Field Ratio: {perfect_format_count/len(test_predictions):.2%}")
print(f"Avg Fields/Entry:      {np.mean(all_field_counts):.2f} / 7.00")
print("-" * 70)
print(f"Aspect + Sentiment F1: {f1_tot:.4f}")
print(f"Aspect-Only F1:        {f1_asp:.4f}")
print("="*70)

BASELINE PERFORMANCE REPORT
Perfect 7-Field Ratio: 61.15%
Avg Fields/Entry:      6.03 / 7.00
----------------------------------------------------------------------
Aspect + Sentiment F1: 0.0604
Aspect-Only F1:        0.0758


In [7]:
# ============================================================================
# EVAL CELL 4: INDEPENDENT QUALITY & LOGIC JUDGE
# ============================================================================
print("⚖️ Loading independent judges...")
rouge = evaluate.load('rouge')
bertscore = evaluate.load('bertscore')
logic_judge = CrossEncoder('cross-encoder/nli-distilroberta-base', device=device)
semantic_judge = SentenceTransformer('all-MiniLM-L6-v2', device=device)

halluc_count = 0
total_aspects = 0
faithfulness_scores = []
preds = [item['gen_output'] for item in test_predictions]
golds = [item['gold_target'] for item in test_predictions]

# 1. Linguistic Overlap
rouge_res = rouge.compute(predictions=preds, references=golds)
bs_res = bertscore.compute(predictions=preds, references=golds, lang="en", device=device)

# 2. Logic & Hallucination Analysis
for item in tqdm(test_predictions, desc="Judging Logic"):
    text = extract_text_from_prompt(item['input']).lower()
    pred_data = parse_7field_robust(item['gen_output'])
    gold_aspects = [g['aspect'] for g in parse_7field_robust(item['gold_target'])]

    for p in pred_data:
        total_aspects += 1
        # Logic Check
        if len(p['fields']) >= 4:
            nli_score = logic_judge.predict([(text, p['fields'][3])])
            faithfulness_scores.append(nli_score[0][1])

        # Semantic Hallucination Check
        if p['aspect'].replace('_', ' ') not in text:
            if gold_aspects:
                asp_emb = semantic_judge.encode([p['aspect']])
                gold_embs = semantic_judge.encode(gold_aspects)
                sim = np.max(cosine_similarity(asp_emb, gold_embs)[0])
                if sim < 0.65: halluc_count += 1
            else: halluc_count += 1

print("\n" + "="*70)
print("DEEP REASONING & QUALITY SUMMARY")
print("="*70)
print(f"BERTScore (F1):      {np.mean(bs_res['f1']):.4f}")
print(f"Logic Faithfulness:   {np.mean(faithfulness_scores):.4f}")
print(f"Hallucination Rate:   {halluc_count/total_aspects:.2%}")
print("="*70)

⚖️ Loading independent judges...


config.json:   0%|          | 0.00/701 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/328M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

README.md: 0.00B [00:00, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Judging Logic:   0%|          | 0/453 [00:00<?, ?it/s]


DEEP REASONING & QUALITY SUMMARY
BERTScore (F1):      0.8744
Logic Faithfulness:   0.1262
Hallucination Rate:   48.97%


In [8]:
# ============================================================================
# EVAL CELL 5: LINGUISTIC QUALITY (ROUGE & BERTSCORE)
# ============================================================================
rouge = evaluate.load('rouge')
bertscore = evaluate.load('bertscore')

preds = [item['gen_output'] for item in test_predictions]
golds = [item['gold_target'] for item in test_predictions]

rouge_results = rouge.compute(predictions=preds, references=golds)
bs_results = bertscore.compute(predictions=preds, references=golds, lang="en", device=device)

print("="*70)
print("GENERATION QUALITY SUMMARY")
print("="*70)
print(f"ROUGE-L Score:  {rouge_results['rougeL']:.4f}")
print(f"BERTScore (F1): {np.mean(bs_results['f1']):.4f}")
print("="*70)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GENERATION QUALITY SUMMARY
ROUGE-L Score:  0.3557
BERTScore (F1): 0.8744


In [9]:
# ============================================================================
# EVAL CELL 6: UNBIASED LOGIC & HALLUCINATION JUDGE
# ============================================================================
print("⚖️ Loading independent judges...")
logic_judge = CrossEncoder('cross-encoder/nli-distilroberta-base', device=device)
semantic_judge = SentenceTransformer('all-MiniLM-L6-v2', device=device)

halluc_count = 0
total_aspects = 0
faithfulness_scores = []

for item in tqdm(test_predictions, desc="Judging Logic"):
    text = extract_text_from_prompt(item['input']).lower()
    pred_data = parse_7field_robust(item['gen_output'])
    gold_data = parse_7field_robust(item['gold_target'])
    gold_aspects = [g['aspect'] for g in gold_data]

    for p in pred_data:
        total_aspects += 1

        # 1. Logic Check: Does input text support the rationale (Field 4)?
        if len(p['fields']) >= 4:
            rationale = p['fields'][3]
            nli_score = logic_judge.predict([(text, rationale)])
            faithfulness_scores.append(nli_score[0][1]) # Entailment score

        # 2. Semantic Hallucination: Is aspect in text or semantically close to gold?
        if p['aspect'].replace('_', ' ') not in text:
            if gold_aspects:
                asp_emb = semantic_judge.encode([p['aspect']])
                gold_embs = semantic_judge.encode(gold_aspects)
                sim = np.max(cosine_similarity(asp_emb, gold_embs)[0])
                if sim < 0.65: halluc_count += 1
            else:
                halluc_count += 1

print("\n" + "="*70)
print("REASONING & RELIABILITY REPORT")
print("="*70)
print(f"Rationale Faithfulness (NLI): {np.mean(faithfulness_scores):.4f}")
print(f"Semantic Hallucination Rate:  {halluc_count/total_aspects:.2%}")
print("="*70)

⚖️ Loading independent judges...


Judging Logic:   0%|          | 0/453 [00:00<?, ?it/s]


REASONING & RELIABILITY REPORT
Rationale Faithfulness (NLI): 0.1262
Semantic Hallucination Rate:  48.97%


In [10]:
# ============================================================================
# EVAL CELL 7: COMPREHENSIVE FINAL REPORT
# ============================================================================
print("\n" + "="*70)
print("ABSA FINAL EVALUATION SUMMARY")
print("="*70)
print(f"1. Combined F1:      {f1_tot:.4f}")
print(f"2. Aspect-Only F1:   {f1_asp:.4f}")
print(f"3. BERTScore:        {np.mean(bs_results['f1']):.4f}")
print(f"4. Hallucination:    {halluc_count/total_aspects:.2%}")
print(f"5. Reasoning Depth:  {np.mean(faithfulness_scores):.4f}")
print("="*70)


ABSA FINAL EVALUATION SUMMARY
1. Combined F1:      0.0604
2. Aspect-Only F1:   0.0758
3. BERTScore:        0.8744
4. Hallucination:    48.97%
5. Reasoning Depth:  0.1262
