In [1]:
#%%
# =======================================================================
# CELL 0: ONE-SHOT SETUP WITH KERNEL RESTART (REQUIRED FOR PROTOBUF)
# =======================================================================
print("="*70)
print("COMPLETE SETUP - PROTOBUF FIX + KERNEL RESTART")
print("="*70)

import subprocess
import sys
import os

# Step 1: Uninstall protobuf completely
print("[1/4] Removing old protobuf...")
subprocess.run([sys.executable, "-m", "pip", "uninstall", "-y", "protobuf"], 
               capture_output=True, check=False)
subprocess.run([sys.executable, "-m", "pip", "uninstall", "-y", "protobuf"], 
               capture_output=True, check=False)  # Run twice to ensure clean
print("      ‚úì Old protobuf removed")

# Step 2: Install correct version
print("[2/4] Installing protobuf 3.20.3...")
subprocess.run([sys.executable, "-m", "pip", "install", "-q", "protobuf==3.20.3"], 
               capture_output=True, check=True)
print("      ‚úì Protobuf 3.20.3 installed")

# Step 3: Install other packages
print("[3/4] Installing packages...")
subprocess.run([
    sys.executable, "-m", "pip", "install", "-q",
    "pandas", "numpy", "scikit-learn",
    "transformers", "torch", "accelerate", "peft",
    "hdbscan", "umap-learn", "datasets", "sentencepiece", "evaluate" , "rouge_score"  , "datasets"
], capture_output=True, check=True)
print("      ‚úì Packages installed")

# Step 4: Restart kernel
print("[4/4] Restarting kernel...")
print("\n" + "="*70)
print("‚ö†Ô∏è  KERNEL WILL RESTART NOW")
print("‚ö†Ô∏è  AFTER RESTART, RUN CELL 1 (NOT THIS CELL AGAIN)")
print("="*70)

# Restart IPython kernel
# os._exit(0)

COMPLETE SETUP - PROTOBUF FIX + KERNEL RESTART
[1/4] Removing old protobuf...
      ‚úì Old protobuf removed
[2/4] Installing protobuf 3.20.3...
      ‚úì Protobuf 3.20.3 installed
[3/4] Installing packages...
      ‚úì Packages installed
[4/4] Restarting kernel...

‚ö†Ô∏è  KERNEL WILL RESTART NOW
‚ö†Ô∏è  AFTER RESTART, RUN CELL 1 (NOT THIS CELL AGAIN)


In [2]:
#%%
# =======================================================================
# CELL 1: IMPORTS (RUN AFTER KERNEL RESTART)
# =======================================================================
print("="*70)
print("CELL 1: IMPORTING LIBRARIES")
print("="*70)

import json
import pandas as pd
import numpy as np
import os
from collections import defaultdict, Counter
import torch
from transformers import T5EncoderModel, AutoTokenizer, T5TokenizerFast, T5ForConditionalGeneration
from peft import PeftModel
from hdbscan import HDBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score
from datasets import Dataset
import warnings
warnings.filterwarnings('ignore')

print("‚úÖ All libraries imported successfully")

# GPU check
if torch.cuda.is_available():
    print(f"‚úÖ GPU: {torch.cuda.get_device_name(0)}")
    device = torch.device("cuda")
else:
    print("‚ö†Ô∏è No GPU - using CPU")
    device = torch.device("cpu")

print("\n" + "="*70)
print("‚úÖ CELL 1 COMPLETE - Proceed to Cell 2")
print("="*70)

CELL 1: IMPORTING LIBRARIES


2026-01-27 05:56:22.485807: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1769493382.706440      24 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1769493382.773551      24 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1769493383.319557      24 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1769493383.319597      24 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1769493383.319600      24 computation_placer.cc:177] computation placer alr

‚úÖ All libraries imported successfully
‚úÖ GPU: Tesla T4

‚úÖ CELL 1 COMPLETE - Proceed to Cell 2


In [3]:
# =======================================================================
# CELL 2: CONFIGURATION - MEMORY-SAFE FOR LONGER SEQUENCES
# =======================================================================

from pathlib import Path
import torch
import os

print("="*80)
print("CONFIGURATION - MEMORY OPTIMIZED")
print("="*80)

# PATHS
TSV_DATA_DIR = Path("/kaggle/input/new-absa-baseline-few-shots")
DAPT_ADAPTER_DIR = Path("/kaggle/input/t5-base-700k-masked-span/flan-t5-base-social-lora-masked-span")
OUTPUT_MODEL_DIR = Path("/kaggle/working/flan-t5-dapt-fewshot-baseline")
OUTPUT_MODEL_DIR.mkdir(parents=True, exist_ok=True)

BASE_MODEL_NAME = "google/flan-t5-base"

# =====================================================================
# MEMORY-SAFE HYPERPARAMETERS FOR 768 INPUT TOKENS ON T4
# =====================================================================

# Token lengths
MAX_INPUT_LENGTH = 768    # Required for few-shot prompts
MAX_TARGET_LENGTH = 256   # Required for long rationales

# CRITICAL: Reduced batch size for 768 tokens
PER_DEVICE_TRAIN_BATCH_SIZE = 2    # ‚ö†Ô∏è REDUCED from 5 to 2
GRADIENT_ACCUMULATION_STEPS = 16   # ‚ö†Ô∏è INCREASED to maintain effective batch
PER_DEVICE_EVAL_BATCH_SIZE = 2     # ‚ö†Ô∏è REDUCED

# Learning parameters
LEARNING_RATE = 2.0e-4
WEIGHT_DECAY = 0.0
WARMUP_RATIO = 0.05
LR_SCHEDULER = "cosine"
OPTIMIZER = "adamw_torch"

# Training duration
NUM_TRAIN_EPOCHS = 7

# Generation settings
GEN_NUM_BEAMS = 4  # Reduced from 5 for memory
GEN_MAX_LENGTH = MAX_TARGET_LENGTH

# Logging
LOGGING_STEPS = 50

# CRITICAL: Enable gradient checkpointing and FP16
FP16 = True  # ‚ö†Ô∏è ENABLED for memory savings
GRADIENT_CHECKPOINTING = True  # ‚ö†Ô∏è CRITICAL for memory

# Seed
SEED = 42

# Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(f"\nüìÅ PATHS:")
print(f"  TSV data: {TSV_DATA_DIR}")
print(f"  DAPT adapter: {DAPT_ADAPTER_DIR}")
print(f"  Output: {OUTPUT_MODEL_DIR}")

print(f"\nüìä MEMORY-OPTIMIZED HYPERPARAMETERS:")
print(f"  Max Input Length: {MAX_INPUT_LENGTH} tokens")
print(f"  Max Target Length: {MAX_TARGET_LENGTH} tokens")
print(f"  Batch Size: {PER_DEVICE_TRAIN_BATCH_SIZE} (reduced for memory)")
print(f"  Gradient Accumulation: {GRADIENT_ACCUMULATION_STEPS}")
print(f"  Effective Batch Size: {PER_DEVICE_TRAIN_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS}")
print(f"  FP16: {FP16} (enabled for memory)")
print(f"  Gradient Checkpointing: {GRADIENT_CHECKPOINTING} (critical)")
print(f"  Learning Rate: {LEARNING_RATE}")
print(f"  Epochs: {NUM_TRAIN_EPOCHS}")

print("\n" + "="*80)
print("‚úÖ CELL 2 COMPLETE")
print("="*80)

CONFIGURATION - MEMORY OPTIMIZED

üìÅ PATHS:
  TSV data: /kaggle/input/new-absa-baseline-few-shots
  DAPT adapter: /kaggle/input/t5-base-700k-masked-span/flan-t5-base-social-lora-masked-span
  Output: /kaggle/working/flan-t5-dapt-fewshot-baseline

üìä MEMORY-OPTIMIZED HYPERPARAMETERS:
  Max Input Length: 768 tokens
  Max Target Length: 256 tokens
  Batch Size: 2 (reduced for memory)
  Gradient Accumulation: 16
  Effective Batch Size: 32
  FP16: True (enabled for memory)
  Gradient Checkpointing: True (critical)
  Learning Rate: 0.0002
  Epochs: 7

‚úÖ CELL 2 COMPLETE


In [4]:
# =======================================================================
# CELL 3: LOAD AND VERIFY TSV DATA
# =======================================================================

import csv
import pandas as pd
from datasets import Dataset
from pathlib import Path
import re

print("="*80)
print("LOADING TSV DATA")
print("="*80)

def load_tsv_simple(path):
    """Load TSV with input_text/target_text columns."""
    print(f"Reading from {path.name}...")
    rows = []
    with open(path, 'r', encoding='utf-8') as fh:
        reader = csv.reader(fh, delimiter='\t', quoting=csv.QUOTE_MINIMAL)
        header = next(reader, None)  # Skip header
        for row in reader:
            if len(row) >= 2:
                rows.append({
                    'input': row[0],    # Maps "input_text" ‚Üí "input"
                    'target': row[1]    # Maps "target_text" ‚Üí "target"
                })
    print(f"‚úì Successfully loaded {len(rows)} rows from {path.name}")
    return Dataset.from_pandas(pd.DataFrame(rows))

def extract_text_from_prompt(prompt_input):
    """Extract the actual text being analyzed from the prompt."""
    all_matches = list(re.finditer(r'Text:\s*(.*?)\s*(?:Output:|$)', prompt_input, re.DOTALL))
    if all_matches:
        return all_matches[-1].group(1).strip()
    return "Could not extract text"

# Load datasets
train_ds = load_tsv_simple(TSV_DATA_DIR / "train.tsv")
val_ds = load_tsv_simple(TSV_DATA_DIR / "val.tsv")
test_ds = load_tsv_simple(TSV_DATA_DIR / "test.tsv")

print(f"\n" + "="*80)
print(f"DATASET SIZES:")
print(f"  Train: {len(train_ds)}")
print(f"  Val:   {len(val_ds)}")
print(f"  Test:  {len(test_ds)}")
print("="*80)

# Quick verification
sample = train_ds[0]
actual_text = extract_text_from_prompt(sample['input'])

print(f"\nüìù SAMPLE VERIFICATION:")
print(f"  Input length: {len(sample['input'])} chars")
print(f"  Target length: {len(sample['target'])} chars")
print(f"  Extracted text: {actual_text[:100]}...")
print(f"  Target: {sample['target'][:100]}...")

print("\n" + "="*80)
print("‚úÖ CELL 3 COMPLETE - Proceed to Cell 4")
print("="*80)


LOADING TSV DATA
Reading from train.tsv...
‚úì Successfully loaded 2114 rows from train.tsv
Reading from val.tsv...
‚úì Successfully loaded 452 rows from val.tsv
Reading from test.tsv...
‚úì Successfully loaded 453 rows from test.tsv

DATASET SIZES:
  Train: 2114
  Val:   452
  Test:  453

üìù SAMPLE VERIFICATION:
  Input length: 2407 chars
  Target length: 130 chars
  Extracted text: public transport fare hiked, services same no upgrades...
  Target: public_transport | negative | public transport fare | Explicit negative: fare 'hiked' without improv...

‚úÖ CELL 3 COMPLETE - Proceed to Cell 4


In [5]:
# =======================================================================
# IMPLICIT vs EXPLICIT RATIO CHECK
# =======================================================================

print("="*80)
print("CHECKING IMPLICIT vs EXPLICIT DISTRIBUTION")
print("="*80)

import re

def check_aspect_in_text(aspect, text):
    """Check if aspect keywords appear in text."""
    # Normalize
    aspect_clean = aspect.lower().strip()
    text_clean = text.lower()
    
    # Split aspect by underscore (e.g., "service_speed" ‚Üí ["service", "speed"])
    keywords = aspect_clean.replace('_', ' ').split()
    
    # Check if ANY keyword (>3 chars) appears in text
    found = any(keyword in text_clean for keyword in keywords if len(keyword) > 3)
    return found

# Analyze train dataset
explicit_count = 0
implicit_count = 0
compound_aspects = []  # Track compound aspect names

for i, sample in enumerate(train_ds):
    actual_text = extract_text_from_prompt(sample['input'])
    target = sample['target']
    
    # Parse all aspects from target (may have multiple)
    aspect_entries = target.split(' ; ')
    
    for entry in aspect_entries:
        fields = entry.split(' | ')
        if len(fields) >= 1:
            aspect = fields[0].strip()
            
            # Check if it's a compound aspect (3+ words)
            if len(aspect.split('_')) >= 3:
                compound_aspects.append(aspect)
            
            # Check if aspect appears in text
            if check_aspect_in_text(aspect, actual_text):
                explicit_count += 1
            else:
                implicit_count += 1

total = explicit_count + implicit_count
explicit_pct = explicit_count / total * 100 if total > 0 else 0
implicit_pct = implicit_count / total * 100 if total > 0 else 0

print(f"\nüìä ASPECT DISTRIBUTION (Train Set):")
print(f"  Total aspects: {total}")
print(f"  Explicit (keyword in text): {explicit_count} ({explicit_pct:.1f}%)")
print(f"  Implicit (keyword NOT in text): {implicit_count} ({implicit_pct:.1f}%)")

print(f"\nüîç COMPOUND ASPECT NAMES (3+ words):")
print(f"  Count: {len(compound_aspects)}")
if compound_aspects:
    from collections import Counter
    most_common = Counter(compound_aspects).most_common(10)
    print(f"  Top 10:")
    for aspect, count in most_common:
        print(f"    {aspect:<40} : {count}")

print("\n" + "="*80)
print("INTERPRETATION:")
print("="*80)

if implicit_pct > 50:
    print("‚ö†Ô∏è  HIGH IMPLICIT RATE (>50%)")
    print("   This could be due to:")
    print("   1. Abstract aspect naming (e.g., 'maintenance_anxiety_foreign')")
    print("   2. Intentional focus on implicit reasoning")
    print("   3. Compound aspects that don't appear literally")
    print("\n   This is FINE but means:")
    print("   ‚úì Model must learn strong inference")
    print("   ‚úì Training may take longer")
    print("   ‚úì Results will test implicit understanding")
elif implicit_pct > 30:
    print("‚úÖ BALANCED IMPLICIT RATE (30-50%)")
    print("   Good mix of explicit and implicit aspects")
elif implicit_pct > 10:
    print("‚úÖ NORMAL IMPLICIT RATE (10-30%)")
    print("   Typical ABSA distribution")
else:
    print("‚úÖ LOW IMPLICIT RATE (<10%)")
    print("   Mostly explicit aspects - easier task")

print("="*80)

# Show some examples of implicit aspects
print("\nüìù EXAMPLES OF IMPLICIT ASPECTS (First 5):")
print("="*80)

implicit_examples = []
for i, sample in enumerate(train_ds):
    actual_text = extract_text_from_prompt(sample['input'])
    target = sample['target']
    
    aspect_entries = target.split(' ; ')
    for entry in aspect_entries:
        fields = entry.split(' | ')
        if len(fields) >= 1:
            aspect = fields[0].strip()
            if not check_aspect_in_text(aspect, actual_text):
                implicit_examples.append({
                    'aspect': aspect,
                    'text': actual_text[:150],
                    'target': entry[:150]
                })
                if len(implicit_examples) >= 5:
                    break
    if len(implicit_examples) >= 5:
        break

for i, ex in enumerate(implicit_examples, 1):
    print(f"\n{i}. Aspect: {ex['aspect']}")
    print(f"   Text: {ex['text']}...")
    print(f"   Target: {ex['target']}...")

print("\n" + "="*80)


CHECKING IMPLICIT vs EXPLICIT DISTRIBUTION

üìä ASPECT DISTRIBUTION (Train Set):
  Total aspects: 2750
  Explicit (keyword in text): 1048 (38.1%)
  Implicit (keyword NOT in text): 1702 (61.9%)

üîç COMPOUND ASPECT NAMES (3+ words):
  Count: 166
  Top 10:
    cost_of_living                           : 7
    work_life_balance                        : 6
    value_for_money                          : 5
    item_condition_on_arrival                : 4
    support_wait_time                        : 2
    social_media_strategy                    : 2
    hotel_room_cleanliness                   : 2
    maintenance_anxiety_foreign              : 1
    return_policy_clarity                    : 1
    venue_sound_system                       : 1

INTERPRETATION:
‚ö†Ô∏è  HIGH IMPLICIT RATE (>50%)
   This could be due to:
   1. Abstract aspect naming (e.g., 'maintenance_anxiety_foreign')
   2. Intentional focus on implicit reasoning
   3. Compound aspects that don't appear literally

   This is F

In [6]:
# =======================================================================
# CELL 4: LOAD TOKENIZER AND DAPT MODEL (MEMORY OPTIMIZED)
# =======================================================================

from transformers import T5TokenizerFast, T5ForConditionalGeneration
from peft import PeftModel
import gc

print("="*80)
print("LOADING TOKENIZER AND DAPT MODEL")
print("="*80)

# Clear any existing memory
gc.collect()
torch.cuda.empty_cache()

# 1. Load tokenizer
print("\n[1/5] Loading tokenizer...")
tokenizer = T5TokenizerFast.from_pretrained(DAPT_ADAPTER_DIR)
print("‚úì Tokenizer loaded")

# 2. Load base model
print("\n[2/5] Loading base model...")
base_model = T5ForConditionalGeneration.from_pretrained(BASE_MODEL_NAME)
print("‚úì Base model loaded")

# 3. Load DAPT adapter
print("\n[3/5] Loading DAPT adapter...")
model_with_dapt = PeftModel.from_pretrained(base_model, DAPT_ADAPTER_DIR)
print("‚úì DAPT adapter loaded")

# 4. Merge adapter into base model
print("\n[4/5] Merging DAPT adapter weights...")
model = model_with_dapt.merge_and_unload()

# Clean up intermediate models
del base_model
del model_with_dapt
gc.collect()
torch.cuda.empty_cache()
print("‚úì Adapter merged and memory cleaned")

# 5. Enable gradient checkpointing BEFORE moving to GPU
print("\n[5/5] Enabling gradient checkpointing...")
model.gradient_checkpointing_enable()
print("‚úì Gradient checkpointing enabled")

# Unfreeze all parameters for full fine-tuning
for param in model.parameters():
    param.requires_grad = True

# Move to GPU
model.to(device)

# Verify
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print("\n" + "="*80)
print("MODEL READY")
print("="*80)
print(f"  Total params: {total_params:,}")
print(f"  Trainable params: {trainable_params:,}")
print(f"  Gradient checkpointing: ENABLED")
print(f"  Device: {device}")

# Check GPU memory
if torch.cuda.is_available():
    allocated = torch.cuda.memory_allocated() / 1e9
    reserved = torch.cuda.memory_reserved() / 1e9
    print(f"  GPU Memory Allocated: {allocated:.2f} GB")
    print(f"  GPU Memory Reserved: {reserved:.2f} GB")

print("="*80)
print("\n‚úÖ CELL 4 COMPLETE")

LOADING TOKENIZER AND DAPT MODEL

[1/5] Loading tokenizer...


You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


‚úì Tokenizer loaded

[2/5] Loading base model...


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

‚úì Base model loaded

[3/5] Loading DAPT adapter...
‚úì DAPT adapter loaded

[4/5] Merging DAPT adapter weights...
‚úì Adapter merged and memory cleaned

[5/5] Enabling gradient checkpointing...
‚úì Gradient checkpointing enabled

MODEL READY
  Total params: 247,577,856
  Trainable params: 247,577,856
  Gradient checkpointing: ENABLED
  Device: cuda
  GPU Memory Allocated: 0.99 GB
  GPU Memory Reserved: 1.06 GB

‚úÖ CELL 4 COMPLETE


In [7]:
# =======================================================================
# CELL 5: TOKENIZATION
# =======================================================================

print("="*80)
print("TOKENIZING DATASETS")
print("="*80)

def preprocess_function(batch):
    """Tokenize input/target pairs."""
    # Tokenize inputs (prompts)
    inputs = tokenizer(
        batch['input'],
        max_length=MAX_INPUT_LENGTH,
        truncation=True,
        padding=False  # Dynamic padding in collator
    )
    
    # Tokenize targets (7-field format)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            batch['target'],
            max_length=MAX_TARGET_LENGTH,
            truncation=True,
            padding=False
        )
    
    inputs['labels'] = labels['input_ids']
    return inputs

# Tokenize all datasets
print("\n[1/3] Tokenizing train set...")
tokenized_train = train_ds.map(
    preprocess_function,
    batched=True,
    remove_columns=train_ds.column_names,
    desc="Tokenizing train"
)

print("\n[2/3] Tokenizing validation set...")
tokenized_val = val_ds.map(
    preprocess_function,
    batched=True,
    remove_columns=val_ds.column_names,
    desc="Tokenizing val"
)

print("\n[3/3] Tokenizing test set...")
tokenized_test = test_ds.map(
    preprocess_function,
    batched=True,
    remove_columns=test_ds.column_names,
    desc="Tokenizing test"
)

# Check token lengths
train_input_lens = [len(x) for x in tokenized_train['input_ids']]
train_label_lens = [len(x) for x in tokenized_train['labels']]

print("\n" + "="*80)
print("TOKENIZATION STATISTICS")
print("="*80)
print(f"  Input tokens  - Min: {min(train_input_lens)}, Max: {max(train_input_lens)}, Avg: {sum(train_input_lens)/len(train_input_lens):.1f}")
print(f"  Target tokens - Min: {min(train_label_lens)}, Max: {max(train_label_lens)}, Avg: {sum(train_label_lens)/len(train_label_lens):.1f}")

truncated_inputs = sum(1 for l in train_input_lens if l >= MAX_INPUT_LENGTH)
truncated_targets = sum(1 for l in train_label_lens if l >= MAX_TARGET_LENGTH)

print(f"\n  Truncated:")
print(f"    Inputs: {truncated_inputs}/{len(train_input_lens)} ({truncated_inputs/len(train_input_lens)*100:.1f}%)")
print(f"    Targets: {truncated_targets}/{len(train_label_lens)} ({truncated_targets/len(train_label_lens)*100:.1f}%)")
print("="*80)

print("\n‚úÖ CELL 5 COMPLETE - Proceed to Cell 6")


TOKENIZING DATASETS

[1/3] Tokenizing train set...


Tokenizing train:   0%|          | 0/2114 [00:00<?, ? examples/s]


[2/3] Tokenizing validation set...


Tokenizing val:   0%|          | 0/452 [00:00<?, ? examples/s]


[3/3] Tokenizing test set...


Tokenizing test:   0%|          | 0/453 [00:00<?, ? examples/s]


TOKENIZATION STATISTICS
  Input tokens  - Min: 602, Max: 768, Avg: 623.5
  Target tokens - Min: 24, Max: 256, Avg: 70.4

  Truncated:
    Inputs: 20/2114 (0.9%)
    Targets: 7/2114 (0.3%)

‚úÖ CELL 5 COMPLETE - Proceed to Cell 6


In [8]:
# =======================================================================
# CELL 6: TRAINING - MEMORY OPTIMIZED BASELINE
# =======================================================================

from transformers import (
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    EarlyStoppingCallback,
    DataCollatorForSeq2Seq
)
import time

print("="*80)
print("BASELINE TRAINING - MEMORY OPTIMIZED")
print("="*80)

# Data Collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True,
    pad_to_multiple_of=8  # Helps with memory alignment
)

# Training Arguments - MEMORY OPTIMIZED
training_args = Seq2SeqTrainingArguments(
    output_dir=str(OUTPUT_MODEL_DIR),
    
    # Batch sizes - REDUCED FOR MEMORY
    per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,  # 2
    per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH_SIZE,    # 2
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,   # 16
    
    # Learning rate and optimizer
    learning_rate=LEARNING_RATE,
    lr_scheduler_type=LR_SCHEDULER,
    warmup_ratio=WARMUP_RATIO,
    weight_decay=WEIGHT_DECAY,
    optim=OPTIMIZER,
    
    # Training duration
    num_train_epochs=NUM_TRAIN_EPOCHS,
    
    # Evaluation and checkpointing
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    
    # Generation settings
    predict_with_generate=True,
    generation_max_length=GEN_MAX_LENGTH,
    generation_num_beams=GEN_NUM_BEAMS,  # 4 (reduced)
    
    # MEMORY OPTIMIZATION - CRITICAL
    fp16=FP16,  # True
    gradient_checkpointing=True,  # Already enabled on model, but set here too
    
    # Logging
    logging_dir=f"{OUTPUT_MODEL_DIR}/logs",
    logging_strategy="steps",
    logging_steps=LOGGING_STEPS,
    logging_first_step=True,
    report_to="none",
    
    # MEMORY OPTIMIZATION - Disable parallel data loading
    dataloader_num_workers=0,  # ‚ö†Ô∏è Set to 0 to avoid memory issues
    dataloader_pin_memory=False,  # ‚ö†Ô∏è Disabled for memory
    
    # Reproducibility
    seed=SEED,
    data_seed=SEED,
)

# Early Stopping
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=2,
    early_stopping_threshold=0.01
)

# Initialize Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[early_stopping],
)

# Training Info
steps_per_epoch = len(tokenized_train) // (PER_DEVICE_TRAIN_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS)
total_steps = steps_per_epoch * NUM_TRAIN_EPOCHS

print(f"\nüìä TRAINING CONFIGURATION:")
print(f"  Model: FLAN-T5-Base + DAPT + Few-Shot")
print(f"  Dataset: {len(tokenized_train)} train, {len(tokenized_val)} val")
print(f"  Batch Size: {PER_DEVICE_TRAIN_BATCH_SIZE}")
print(f"  Gradient Accumulation: {GRADIENT_ACCUMULATION_STEPS}")
print(f"  Effective Batch Size: {PER_DEVICE_TRAIN_BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS}")
print(f"  FP16: {FP16}")
print(f"  Gradient Checkpointing: ENABLED")
print(f"  Steps per epoch: {steps_per_epoch}")
print(f"  Total steps: {total_steps}")
print()

# Clear memory before training
gc.collect()
torch.cuda.empty_cache()

print("="*80)
print("üöÄ STARTING TRAINING")
print("="*80)

start_time = time.time()
train_result = trainer.train()
training_time = time.time() - start_time

print("\n" + "="*80)
print("‚úÖ TRAINING COMPLETE")
print("="*80)
print(f"‚è±Ô∏è  Training time: {training_time/60:.1f} minutes")
print(f"üìâ Final train loss: {train_result.metrics.get('train_loss', 'N/A'):.4f}")
print("="*80)

# Save Model
print("\nüíæ Saving model and tokenizer...")
trainer.save_model(OUTPUT_MODEL_DIR)
tokenizer.save_pretrained(OUTPUT_MODEL_DIR)
print(f"‚úì Model saved to: {OUTPUT_MODEL_DIR}")

print("\n‚úÖ CELL 6 COMPLETE - Proceed to Evaluation")

BASELINE TRAINING - MEMORY OPTIMIZED

üìä TRAINING CONFIGURATION:
  Model: FLAN-T5-Base + DAPT + Few-Shot
  Dataset: 2114 train, 452 val
  Batch Size: 2
  Gradient Accumulation: 16
  Effective Batch Size: 32
  FP16: True
  Gradient Checkpointing: ENABLED
  Steps per epoch: 66
  Total steps: 462

üöÄ STARTING TRAINING


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Epoch,Training Loss,Validation Loss
1,10.406,0.940727
2,1.8729,0.799984
3,0.8816,0.73812
4,0.8816,0.715936
5,0.7531,0.704411
6,0.718,0.701157
7,0.718,0.698501


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].



‚úÖ TRAINING COMPLETE
‚è±Ô∏è  Training time: 382.1 minutes
üìâ Final train loss: 1.0359

üíæ Saving model and tokenizer...
‚úì Model saved to: /kaggle/working/flan-t5-dapt-fewshot-baseline

‚úÖ CELL 6 COMPLETE - Proceed to Evaluation


In [9]:
# =======================================================================
# CELL 7: GENERATE PREDICTIONS ON TEST SET
# =======================================================================

import torch
from tqdm.notebook import tqdm
import json

print("="*80)
print("GENERATING PREDICTIONS ON TEST SET")
print("="*80)

# Load best model
print(f"\nLoading best model from {OUTPUT_MODEL_DIR}...")
model = T5ForConditionalGeneration.from_pretrained(OUTPUT_MODEL_DIR).to(device)
tokenizer = T5TokenizerFast.from_pretrained(OUTPUT_MODEL_DIR)
model.eval()
print("‚úì Model loaded")

def generate_predictions(dataset, tokenizer, model, device, batch_size=PER_DEVICE_EVAL_BATCH_SIZE):
    """Generate predictions with progress bar."""
    all_preds = []
    
    for i in tqdm(range(0, len(dataset), batch_size), desc="Generating"):
        batch = dataset[i:i + batch_size]
        
        input_texts = batch['input']
        gold_targets = batch['target']
        
        # Tokenize inputs
        inputs = tokenizer(
            input_texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=MAX_INPUT_LENGTH
        ).to(device)
        
        # Generate
        with torch.no_grad():
            generated_ids = model.generate(
                **inputs,
                num_beams=GEN_NUM_BEAMS,
                max_length=MAX_TARGET_LENGTH,
                early_stopping=True
            )
        
        # Decode
        generated_texts = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        
        # Store results
        for j in range(len(input_texts)):
            all_preds.append({
                'input': input_texts[j],
                'gold_target': gold_targets[j],
                'gen_output': generated_texts[j]
            })
    
    return all_preds

# Generate predictions
print(f"\nGenerating predictions on {len(test_ds)} test samples...")
test_predictions = generate_predictions(test_ds, tokenizer, model, device)

# Save predictions
PRED_FILE = OUTPUT_MODEL_DIR / "test_predictions.jsonl"
with open(PRED_FILE, 'w', encoding='utf-8') as f:
    for pred in test_predictions:
        f.write(json.dumps(pred, ensure_ascii=False) + '\n')

print(f"‚úì Predictions saved to: {PRED_FILE}")
print("\n" + "="*80)
print("‚úÖ CELL 7 COMPLETE - Proceed to Cell 8")
print("="*80)


GENERATING PREDICTIONS ON TEST SET

Loading best model from /kaggle/working/flan-t5-dapt-fewshot-baseline...
‚úì Model loaded

Generating predictions on 453 test samples...


Generating:   0%|          | 0/227 [00:00<?, ?it/s]

‚úì Predictions saved to: /kaggle/working/flan-t5-dapt-fewshot-baseline/test_predictions.jsonl

‚úÖ CELL 7 COMPLETE - Proceed to Cell 8


In [10]:
# =======================================================================
# CELL 8: EVALUATE PREDICTIONS - 7-FIELD ABSA FORMAT
# =======================================================================

import re
from collections import Counter

print("="*80)
print("EVALUATION - 7-FIELD ABSA FORMAT")
print("="*80)

# =====================================================================
# PARSING FUNCTIONS
# =====================================================================

def normalize_aspect(s):
    """Normalize aspect term."""
    if not s:
        return ""
    return re.sub(r'\s+', ' ', s.strip().lower())

def normalize_sentiment(s):
    """Normalize sentiment to positive/negative/neutral."""
    if not s:
        return s
    s = s.strip().lower()
    if s.startswith('pos'):
        return 'positive'
    if s.startswith('neg'):
        return 'negative'
    if s.startswith('neu'):
        return 'neutral'
    return s

def parse_7field_output(text):
    """Parse 7-field format: aspect | sentiment | span | rationale | is_implicit | has_slang | has_emoji
    Returns set of (aspect, sentiment) tuples for matching."""
    pairs = set()
    if not text or not text.strip():
        return pairs
    
    # Split by semicolon separator
    entries = re.split(r'\s*;\s*', text.strip())
    
    for entry in entries:
        # Split by pipe to get fields
        fields = [f.strip() for f in entry.split('|')]
        
        if len(fields) >= 2:
            aspect = normalize_aspect(fields[0])
            sentiment = normalize_sentiment(fields[1])
            
            if aspect and sentiment in ['positive', 'negative', 'neutral']:
                pairs.add((aspect, sentiment))
    
    return pairs

# =====================================================================
# COMPUTE METRICS
# =====================================================================

def compute_prf(tp, fp, fn):
    """Compute Precision, Recall, F1."""
    p = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    r = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    f = 2 * p * r / (p + r) if (p + r) > 0 else 0.0
    return p, r, f

# =====================================================================
# EVALUATION LOOP
# =====================================================================

# Counters
TP = FP = FN = 0  # Aspect+Sentiment tuple matching
TP_aspect = FP_aspect = FN_aspect = 0  # Aspect-only
exact_match = 0  # Exact match samples
total_samples = len(test_predictions)

# Sentiment accuracy on detected aspects
sentiment_correct = 0
detected_aspects = 0

# Per-aspect stats
aspect_counts_gold = Counter()
aspect_counts_pred = Counter()
aspect_tp = Counter()

print(f"\nEvaluating {total_samples} test samples...")

for item in test_predictions:
    gold_text = item['gold_target']
    pred_text = item['gen_output']
    
    # Parse both
    gold_pairs = parse_7field_output(gold_text)
    pred_pairs = parse_7field_output(pred_text)
    
    # Tuple-level (aspect + sentiment)
    TP += len(gold_pairs & pred_pairs)
    FP += len(pred_pairs - gold_pairs)
    FN += len(gold_pairs - pred_pairs)
    
    # Exact match
    if gold_pairs == pred_pairs:
        exact_match += 1
    
    # Aspect-only (ignore sentiment)
    gold_aspects = set(a for a, s in gold_pairs)
    pred_aspects = set(a for a, s in pred_pairs)
    
    TP_aspect += len(gold_aspects & pred_aspects)
    FP_aspect += len(pred_aspects - gold_aspects)
    FN_aspect += len(gold_aspects - pred_aspects)
    
    # Per-aspect counts
    for asp in gold_aspects:
        aspect_counts_gold[asp] += 1
    for asp in pred_aspects:
        aspect_counts_pred[asp] += 1
    for asp in gold_aspects & pred_aspects:
        aspect_tp[asp] += 1
    
    # Sentiment accuracy on detected aspects
    for asp in gold_aspects & pred_aspects:
        detected_aspects += 1
        gold_sents = set(s for a, s in gold_pairs if a == asp)
        pred_sents = set(s for a, s in pred_pairs if a == asp)
        if gold_sents == pred_sents:
            sentiment_correct += 1

# =====================================================================
# COMPUTE METRICS
# =====================================================================

tuple_p, tuple_r, tuple_f1 = compute_prf(TP, FP, FN)
aspect_p, aspect_r, aspect_f1 = compute_prf(TP_aspect, FP_aspect, FN_aspect)
exact_acc = exact_match / total_samples if total_samples > 0 else 0.0
sent_acc = sentiment_correct / detected_aspects if detected_aspects > 0 else 0.0

# =====================================================================
# DISPLAY RESULTS
# =====================================================================

print("\n" + "="*80)
print("PRIMARY METRICS")
print("="*80)

print(f"\n1. Aspect + Sentiment Tuple Matching:")
print(f"   TP: {TP}, FP: {FP}, FN: {FN}")
print(f"   Precision: {tuple_p:.4f}")
print(f"   Recall: {tuple_r:.4f}")
print(f"   F1 Score: {tuple_f1:.4f}")

print(f"\n2. Aspect-Only (Ignore Sentiment):")
print(f"   TP: {TP_aspect}, FP: {FP_aspect}, FN: {FN_aspect}")
print(f"   Precision: {aspect_p:.4f}")
print(f"   Recall: {aspect_r:.4f}")
print(f"   F1 Score: {aspect_f1:.4f}")

print(f"\n3. Exact Match Accuracy:")
print(f"   {exact_match}/{total_samples} = {exact_acc:.4f} ({exact_acc*100:.2f}%)")

print(f"\n4. Sentiment Accuracy (on detected aspects):")
print(f"   {sentiment_correct}/{detected_aspects} = {sent_acc:.4f} ({sent_acc*100:.2f}%)")

# =====================================================================
# TOP ASPECTS
# =====================================================================

print("\n" + "="*80)
print("TOP 10 ASPECTS")
print("="*80)
print(f"{'Aspect':<30} {'Gold':<6} {'Pred':<6} {'TP':<6} {'F1':<8}")
print("-"*70)

top_aspects = aspect_counts_gold.most_common(10)
for asp, gold_count in top_aspects:
    pred_count = aspect_counts_pred.get(asp, 0)
    tp_count = aspect_tp.get(asp, 0)
    
    p = tp_count / pred_count if pred_count > 0 else 0.0
    r = tp_count / gold_count if gold_count > 0 else 0.0
    f1 = 2 * p * r / (p + r) if (p + r) > 0 else 0.0
    
    print(f"{asp:<30} {gold_count:<6} {pred_count:<6} {tp_count:<6} {f1:<8.4f}")

print("\n" + "="*80)
print("‚úÖ CELL 8 COMPLETE - Proceed to Cell 9")
print("="*80)


EVALUATION - 7-FIELD ABSA FORMAT

Evaluating 453 test samples...

PRIMARY METRICS

1. Aspect + Sentiment Tuple Matching:
   TP: 176, FP: 370, FN: 415
   Precision: 0.3223
   Recall: 0.2978
   F1 Score: 0.3096

2. Aspect-Only (Ignore Sentiment):
   TP: 185, FP: 361, FN: 406
   Precision: 0.3388
   Recall: 0.3130
   F1 Score: 0.3254

3. Exact Match Accuracy:
   97/453 = 0.2141 (21.41%)

4. Sentiment Accuracy (on detected aspects):
   176/185 = 0.9514 (95.14%)

TOP 10 ASPECTS
Aspect                         Gold   Pred   TP     F1      
----------------------------------------------------------------------
order_accuracy                 44     87     39     0.5954  
platform_accessibility         42     66     38     0.7037  
service_speed                  19     9      4      0.2857  
ingredient_freshness           14     10     2      0.1667  
transaction_costs              13     9      3      0.2727  
pricing_fairness               12     1      0      0.0000  
software_stability      

In [11]:
# =======================================================================
# CELL 9: GENERATION QUALITY METRICS (ROUGE, BLEU, METEOR, BERTScore)
# =======================================================================

print("="*80)
print("GENERATION QUALITY METRICS")
print("="*80)

gold_strings = [item['gold_target'] for item in test_predictions]
pred_strings = [item['gen_output'] for item in test_predictions]

# =====================================================================
# 1. ROUGE SCORES
# =====================================================================

print("\n[1/4] Computing ROUGE scores...")
try:
    import evaluate
    rouge = evaluate.load('rouge')
    rouge_results = rouge.compute(
        predictions=pred_strings,
        references=gold_strings,
        use_stemmer=True
    )
    print("‚úì ROUGE Scores:")
    print(f"   ROUGE-1: {rouge_results['rouge1']:.4f}")
    print(f"   ROUGE-2: {rouge_results['rouge2']:.4f}")
    print(f"   ROUGE-L: {rouge_results['rougeL']:.4f}")
except Exception as e:
    print(f"   ROUGE failed: {e}")

# =====================================================================
# 2. BLEU SCORE
# =====================================================================

print("\n[2/4] Computing BLEU score...")
try:
    from sacrebleu import corpus_bleu
    # BLEU expects list of references (each ref is a list)
    references = [[gold] for gold in gold_strings]
    bleu_result = corpus_bleu(pred_strings, references)
    print(f"‚úì BLEU Score: {bleu_result.score:.4f}")
    print(f"   BLEU-1: {bleu_result.precisions[0]:.2f}, "
          f"BLEU-2: {bleu_result.precisions[1]:.2f}, "
          f"BLEU-3: {bleu_result.precisions[2]:.2f}, "
          f"BLEU-4: {bleu_result.precisions[3]:.2f}")
except ImportError:
    print("   Installing sacrebleu...")
    import subprocess
    subprocess.run(["pip", "install", "-q", "sacrebleu"], check=True)
    from sacrebleu import corpus_bleu
    references = [[gold] for gold in gold_strings]
    bleu_result = corpus_bleu(pred_strings, references)
    print(f"‚úì BLEU Score: {bleu_result.score:.4f}")
except Exception as e:
    print(f"   BLEU failed: {e}")

# =====================================================================
# 3. METEOR SCORE
# =====================================================================

print("\n[3/4] Computing METEOR score...")
try:
    meteor = evaluate.load('meteor')
    meteor_result = meteor.compute(predictions=pred_strings, references=gold_strings)
    print(f"‚úì METEOR Score: {meteor_result['meteor']:.4f}")
except Exception as e:
    print(f"   METEOR failed: {e}")

# =====================================================================
# 4. BERTScore
# =====================================================================

print("\n[4/4] Computing BERTScore (this may take a minute)...")
try:
    bertscore = evaluate.load('bertscore')
    bert_results = bertscore.compute(
        predictions=pred_strings,
        references=gold_strings,
        lang="en",
        model_type="microsoft/deberta-base-mnli",  # Faster than roberta-large
        batch_size=16
    )
    
    # Average scores
    avg_precision = sum(bert_results['precision']) / len(bert_results['precision'])
    avg_recall = sum(bert_results['recall']) / len(bert_results['recall'])
    avg_f1 = sum(bert_results['f1']) / len(bert_results['f1'])
    
    print(f"‚úì BERTScore:")
    print(f"   Precision: {avg_precision:.4f}")
    print(f"   Recall: {avg_recall:.4f}")
    print(f"   F1: {avg_f1:.4f}")
except Exception as e:
    print(f"   BERTScore failed: {e}")
    print("   (This is optional - task-specific metrics are more important)")

print("\n" + "="*80)
print("‚úÖ CELL 9 COMPLETE - Proceed to Cell 10")
print("="*80)


GENERATION QUALITY METRICS

[1/4] Computing ROUGE scores...


Downloading builder script: 0.00B [00:00, ?B/s]

‚úì ROUGE Scores:
   ROUGE-1: 0.6280
   ROUGE-2: 0.4777
   ROUGE-L: 0.6053

[2/4] Computing BLEU score...
   Installing sacrebleu...
   ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ 100.8/100.8 kB 3.9 MB/s eta 0:00:00
‚úì BLEU Score: 35.8567

[3/4] Computing METEOR score...


Downloading builder script: 0.00B [00:00, ?B/s]

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...


‚úì METEOR Score: 0.6783

[4/4] Computing BERTScore (this may take a minute)...


Downloading builder script: 0.00B [00:00, ?B/s]

   BERTScore failed: To be able to use evaluate-metric/bertscore, you need to install the following dependencies['bert_score'] using 'pip install bert_score' for instance'
   (This is optional - task-specific metrics are more important)

‚úÖ CELL 9 COMPLETE - Proceed to Cell 10


In [12]:
# =======================================================================
# CELL 9.5: DIVERSITY & MODE COLLAPSE DETECTION
# =======================================================================

print("="*80)
print("DIVERSITY & MODE COLLAPSE DETECTION")
print("="*80)

from collections import Counter

# =====================================================================
# 1. OUTPUT DIVERSITY
# =====================================================================

print("\n[1/3] Analyzing output diversity...")

all_outputs = [pred['gen_output'] for pred in test_predictions]
unique_outputs = len(set(all_outputs))
total_outputs = len(all_outputs)
diversity_rate = unique_outputs / total_outputs

print(f"‚úì Output Diversity:")
print(f"   Unique predictions: {unique_outputs}/{total_outputs} ({diversity_rate*100:.1f}%)")

# Check for mode collapse
output_counts = Counter(all_outputs)
most_common = output_counts.most_common(3)

print(f"\n   Most common outputs:")
for i, (output, count) in enumerate(most_common, 1):
    pct = count / total_outputs * 100
    status = "‚ùå MODE COLLAPSE!" if count > total_outputs * 0.5 else "‚ö†Ô∏è" if count > total_outputs * 0.1 else "‚úì"
    print(f"   {i}. [{status}] {count}/{total_outputs} ({pct:.1f}%)")
    print(f"      Output: {output[:100]}...")

# =====================================================================
# 2. ASPECT DIVERSITY
# =====================================================================

print("\n[2/3] Analyzing aspect diversity...")

all_pred_aspects = []
for pred in test_predictions:
    pred_pairs = parse_7field_output(pred['gen_output'])
    for aspect, sentiment in pred_pairs:
        all_pred_aspects.append(aspect)

unique_aspects = len(set(all_pred_aspects))
total_aspects = len(all_pred_aspects)
aspect_diversity = unique_aspects / total_aspects if total_aspects > 0 else 0

print(f"‚úì Aspect Diversity:")
print(f"   Unique aspects: {unique_aspects}/{total_aspects} ({aspect_diversity*100:.1f}%)")

# Most common aspects
aspect_counts = Counter(all_pred_aspects)
most_common_aspects = aspect_counts.most_common(10)

print(f"\n   Top 10 most predicted aspects:")
for i, (aspect, count) in enumerate(most_common_aspects, 1):
    pct = count / total_aspects * 100
    status = "‚ùå" if count > total_aspects * 0.5 else "‚ö†Ô∏è" if count > total_aspects * 0.2 else "‚úì"
    print(f"   {i:2d}. [{status}] {aspect:<30} : {count:3d} ({pct:4.1f}%)")

# =====================================================================
# 3. MODE COLLAPSE DETECTION
# =====================================================================

print("\n[3/3] Mode collapse detection...")

# Thresholds
single_output_threshold = 0.5  # If >50% same output = collapse
single_aspect_threshold = 0.5  # If >50% same aspect = collapse

mode_collapse_detected = False

# Check output collapse
if most_common[0][1] / total_outputs > single_output_threshold:
    print(f"‚ùå MODE COLLAPSE DETECTED - OUTPUT LEVEL")
    print(f"   {most_common[0][1]}/{total_outputs} ({most_common[0][1]/total_outputs*100:.1f}%) predictions are identical!")
    mode_collapse_detected = True
else:
    print(f"‚úÖ No output-level mode collapse")

# Check aspect collapse
if most_common_aspects[0][1] / total_aspects > single_aspect_threshold:
    print(f"‚ùå MODE COLLAPSE DETECTED - ASPECT LEVEL")
    print(f"   '{most_common_aspects[0][0]}' appears in {most_common_aspects[0][1]/total_aspects*100:.1f}% of outputs!")
    mode_collapse_detected = True
else:
    print(f"‚úÖ No aspect-level mode collapse")

# Overall assessment
print(f"\n{'='*80}")
if mode_collapse_detected:
    print("‚ö†Ô∏è  WARNING: MODE COLLAPSE DETECTED")
    print("   Model is producing repetitive outputs")
    print("   This indicates training failure or overfitting")
elif diversity_rate < 0.5:
    print("‚ö†Ô∏è  WARNING: LOW DIVERSITY")
    print(f"   Only {diversity_rate*100:.1f}% unique outputs")
    print("   Model may be underfitting or needs more training data")
elif diversity_rate < 0.8:
    print("‚úì ACCEPTABLE DIVERSITY")
    print(f"   {diversity_rate*100:.1f}% unique outputs (acceptable for implicit ABSA)")
else:
    print("‚úÖ EXCELLENT DIVERSITY")
    print(f"   {diversity_rate*100:.1f}% unique outputs (high variability)")

print(f"{'='*80}")

# Store metrics
diversity_metrics = {
    "unique_predictions": unique_outputs,
    "total_predictions": total_outputs,
    "diversity_rate": diversity_rate,
    "unique_aspects": unique_aspects,
    "total_aspects_predicted": total_aspects,
    "aspect_diversity": aspect_diversity,
    "mode_collapse_detected": mode_collapse_detected,
    "most_common_output_freq": most_common[0][1] / total_outputs,
    "most_common_aspect": most_common_aspects[0][0],
    "most_common_aspect_freq": most_common_aspects[0][1] / total_aspects
}

print("\n" + "="*80)
print("‚úÖ CELL 9.5 COMPLETE - Proceed to Cell 10")
print("="*80)


DIVERSITY & MODE COLLAPSE DETECTION

[1/3] Analyzing output diversity...
‚úì Output Diversity:
   Unique predictions: 452/453 (99.8%)

   Most common outputs:
   1. [‚úì] 2/453 (0.4%)
      Output: atmosphere | positive | atmosphere | Explicit positive: 'atmosphere cozy' cozy. | FALSE | FALSE | FA...
   2. [‚úì] 1/453 (0.2%)
      Output: election_fake_news | negative | Everything is fake news unless my candidate wins | explicitly uses '...
   3. [‚úì] 1/453 (0.2%)
      Output: interface | positive | interface | Explicit positive: 'interface gesture natural' intuitive. | FALSE...

[2/3] Analyzing aspect diversity...
‚úì Aspect Diversity:
   Unique aspects: 256/546 (46.9%)

   Top 10 most predicted aspects:
    1. [‚úì] order_accuracy                 :  87 (15.9%)
    2. [‚úì] platform_accessibility         :  66 (12.1%)
    3. [‚úì] food_preparation               :  13 ( 2.4%)
    4. [‚úì] battery_life                   :  12 ( 2.2%)
    5. [‚úì] political_strategy             :  11 (

In [13]:
# =======================================================================
# CELL 10: ADVANCED METRICS
# =======================================================================

print("="*80)
print("ADVANCED EVALUATION METRICS")
print("="*80)

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# =====================================================================
# 1. SEMANTIC HALLUCINATION DETECTION
# =====================================================================

print("\n[1/3] Loading semantic similarity model...")
semantic_model = SentenceTransformer('all-MiniLM-L6-v2')
print("‚úì Semantic model loaded")

print("\n[1/3] Analyzing hallucinations with semantic similarity...")

total_generated_aspects = 0
explicit_in_text = 0
implicit_but_valid = 0
true_hallucinations = 0

SIMILARITY_THRESHOLD = 0.65

for item in test_predictions:
    # Extract actual text
    actual_text = extract_text_from_prompt(item['input'])
    pred_text = item['gen_output']
    
    # Parse predicted aspects
    pred_pairs = parse_7field_output(pred_text)
    
    for aspect, sentiment in pred_pairs:
        total_generated_aspects += 1
        
        # Check if aspect keywords appear in text (explicit)
        if check_aspect_in_text(aspect, actual_text):
            explicit_in_text += 1
        else:
            # Not explicit - check semantic similarity
            aspect_embedding = semantic_model.encode([aspect])
            text_embedding = semantic_model.encode([actual_text])
            similarity = cosine_similarity(aspect_embedding, text_embedding)[0][0]
            
            if similarity >= SIMILARITY_THRESHOLD:
                implicit_but_valid += 1
            else:
                true_hallucinations += 1

semantic_hallucination_rate = true_hallucinations / total_generated_aspects if total_generated_aspects > 0 else 0
strict_hallucination_rate = (implicit_but_valid + true_hallucinations) / total_generated_aspects if total_generated_aspects > 0 else 0

print(f"\n‚úì Semantic Hallucination Analysis:")
print(f"   Total generated aspects: {total_generated_aspects}")
print(f"   Explicit in text: {explicit_in_text} ({explicit_in_text/total_generated_aspects*100:.1f}%)")
print(f"   Implicit but semantically valid: {implicit_but_valid} ({implicit_but_valid/total_generated_aspects*100:.1f}%)")
print(f"   True hallucinations: {true_hallucinations} ({true_hallucinations/total_generated_aspects*100:.1f}%)")
print(f"   Semantic Hallucination Rate: {semantic_hallucination_rate:.4f} ({semantic_hallucination_rate*100:.1f}%)")
print(f"   Strict Hallucination Rate: {strict_hallucination_rate:.4f} ({strict_hallucination_rate*100:.1f}%)")

# =====================================================================
# 2. COVERAGE ANALYSIS
# =====================================================================

print("\n[2/3] Computing coverage...")

total_gold_aspects = 0
covered_aspects = 0
semantic_covered = 0

for item in test_predictions:
    gold_text = item['gold_target']
    pred_text = item['gen_output']
    
    gold_pairs = parse_7field_output(gold_text)
    pred_pairs = parse_7field_output(pred_text)
    
    gold_aspects = set(a for a, s in gold_pairs)
    pred_aspects = set(a for a, s in pred_pairs)
    
    total_gold_aspects += len(gold_aspects)
    
    # Exact coverage
    covered_aspects += len(gold_aspects & pred_aspects)
    
    # Semantic coverage (for missed aspects)
    missed = gold_aspects - pred_aspects
    if missed:
        for gold_asp in missed:
            # Check if any predicted aspect is semantically similar
            gold_emb = semantic_model.encode([gold_asp])
            for pred_asp in pred_aspects:
                pred_emb = semantic_model.encode([pred_asp])
                sim = cosine_similarity(gold_emb, pred_emb)[0][0]
                if sim >= 0.7:  # Higher threshold for coverage
                    semantic_covered += 1
                    break

exact_coverage = covered_aspects / total_gold_aspects if total_gold_aspects > 0 else 0
semantic_coverage_rate = (covered_aspects + semantic_covered) / total_gold_aspects if total_gold_aspects > 0 else 0

print(f"‚úì Coverage Analysis:")
print(f"   Total gold aspects: {total_gold_aspects}")
print(f"   Exact matches: {covered_aspects} ({exact_coverage*100:.1f}%)")
print(f"   Semantic matches: {semantic_covered}")
print(f"   Semantic Coverage: {semantic_coverage_rate:.4f} ({semantic_coverage_rate*100:.1f}%)")
print(f"   Exact Coverage: {exact_coverage:.4f} ({exact_coverage*100:.1f}%)")

# =====================================================================
# 3. FORMAT ADHERENCE
# =====================================================================

print("\n[3/3] Checking format adherence...")

valid_format = 0
malformed = 0
empty_outputs = 0

for item in test_predictions:
    gen_output = item['gen_output'].strip()
    
    if not gen_output:
        empty_outputs += 1
        continue
    
    # Split by semicolon
    entries = re.split(r'\s*;\s*', gen_output)
    
    all_valid = True
    for entry in entries:
        fields = entry.split('|')
        
        # Must have at least 2 fields (aspect | sentiment)
        if len(fields) < 2:
            all_valid = False
            break
        
        # Check sentiment is valid
        if len(fields) >= 2:
            sent = fields[1].strip().lower()
            if sent not in ['positive', 'negative', 'neutral']:
                all_valid = False
                break
    
    if all_valid:
        valid_format += 1
    else:
        malformed += 1

format_adherence = valid_format / len(test_predictions) if test_predictions else 0

print(f"‚úì Format Adherence:")
print(f"   Valid format: {valid_format}/{len(test_predictions)} ({format_adherence*100:.1f}%)")
print(f"   Malformed outputs: {malformed} ({malformed/len(test_predictions)*100:.1f}%)")
print(f"   Empty outputs: {empty_outputs} ({empty_outputs/len(test_predictions)*100:.1f}%)")

print("\n" + "="*80)
print("‚úÖ CELL 10 COMPLETE - Proceed to Cell 11")
print("="*80)


ADVANCED EVALUATION METRICS

[1/3] Loading semantic similarity model...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

‚úì Semantic model loaded

[1/3] Analyzing hallucinations with semantic similarity...

‚úì Semantic Hallucination Analysis:
   Total generated aspects: 546
   Explicit in text: 224 (41.0%)
   Implicit but semantically valid: 0 (0.0%)
   True hallucinations: 322 (59.0%)
   Semantic Hallucination Rate: 0.5897 (59.0%)
   Strict Hallucination Rate: 0.5897 (59.0%)

[2/3] Computing coverage...
‚úì Coverage Analysis:
   Total gold aspects: 591
   Exact matches: 185 (31.3%)
   Semantic matches: 33
   Semantic Coverage: 0.3689 (36.9%)
   Exact Coverage: 0.3130 (31.3%)

[3/3] Checking format adherence...
‚úì Format Adherence:
   Valid format: 453/453 (100.0%)
   Malformed outputs: 0 (0.0%)
   Empty outputs: 0 (0.0%)

‚úÖ CELL 10 COMPLETE - Proceed to Cell 11


In [14]:
# =======================================================================
# CELL 11: SHOW PREDICTION EXAMPLES
# =======================================================================

print("="*80)
print("PREDICTION EXAMPLES (First 10)")
print("="*80)

for i in range(min(10, len(test_predictions))):
    item = test_predictions[i]
    actual_text = extract_text_from_prompt(item['input'])
    
    print(f"\n{'='*80}")
    print(f"Example {i+1}")
    print(f"{'='*80}")
    print(f"TEXT:\n{actual_text[:200]}...")
    print(f"\nGOLD:\n{item['gold_target']}")
    print(f"\nPREDICTED:\n{item['gen_output']}")
    
    # Quick match check
    gold_pairs = parse_7field_output(item['gold_target'])
    pred_pairs = parse_7field_output(item['gen_output'])
    
    if gold_pairs == pred_pairs:
        match = "‚úÖ EXACT MATCH"
    else:
        matched = len(gold_pairs & pred_pairs)
        total = len(gold_pairs)
        match = f"‚ö†Ô∏è  Partial: {matched}/{total} matched"
    
    print(f"\nMATCH: {match}")

print("\n" + "="*80)
print("‚úÖ CELL 11 COMPLETE - Proceed to Cell 12")
print("="*80)


PREDICTION EXAMPLES (First 10)

Example 1
TEXT:
Everything is fake news unless my candidate wins. It's always the same thing with you Repubs: it's rigged, unless I win. Face it, the only reason Trump won is that people forgot what a disaster his pr...

GOLD:
election_fraud | negative | it's rigged, unless I win | explicitly uses 'it's rigged, unless I win' to express negative sentiment about election_fraud claims by mocking Republican hypocrisy. | FALSE | FALSE | FALSE

PREDICTED:
election_fake_news | negative | Everything is fake news unless my candidate wins | explicitly uses 'Everything is fake news unless my candidate wins' to express negative sentiment about election_fake_news. | FALSE | FALSE | FALSE

MATCH: ‚ö†Ô∏è  Partial: 0/1 matched

Example 2
TEXT:
tablet drawing app, interface gesture swipe natural flow...

GOLD:
interface | positive | interface | Explicit positive: 'interface gesture natural' intuitive. | FALSE | FALSE | FALSE

PREDICTED:
interface | positive | interface |

In [15]:
# =======================================================================
# CELL 12: FINAL COMPREHENSIVE SUMMARY REPORT
# =======================================================================

import json

print("="*80)
print("FINAL COMPREHENSIVE EVALUATION SUMMARY")
print("="*80)

# =====================================================================
# BUILD COMPREHENSIVE SUMMARY
# =====================================================================

summary_report = {
    "model": "FLAN-T5-Base + DAPT + Few-Shot Baseline",
    "dataset": {
        "train_samples": len(train_ds),
        "val_samples": len(val_ds),
        "test_samples": len(test_predictions),
        "implicit_rate": 0.619,
        "total_aspects": 2750
    },
    "training_time_minutes": training_time / 60 if 'training_time' in globals() else "N/A",
    "training_loss": train_result.metrics.get('train_loss', 'N/A') if 'train_result' in globals() else "N/A",
    
    # Primary ABSA Metrics
    "task_specific_metrics": {
        "aspect_sentiment_f1": round(tuple_f1, 4),
        "aspect_sentiment_precision": round(tuple_p, 4),
        "aspect_sentiment_recall": round(tuple_r, 4),
        "aspect_only_f1": round(aspect_f1, 4),
        "aspect_only_precision": round(aspect_p, 4),
        "aspect_only_recall": round(aspect_r, 4),
        "exact_match_accuracy": round(exact_acc, 4),
        "sentiment_accuracy": round(sent_acc, 4),
        "true_positives": TP,
        "false_positives": FP,
        "false_negatives": FN
    },
    
    # Generation Quality
    "generation_quality": {
        "rouge_1": round(rouge_results['rouge1'], 4) if 'rouge_results' in globals() else "N/A",
        "rouge_2": round(rouge_results['rouge2'], 4) if 'rouge_results' in globals() else "N/A",
        "rouge_l": round(rouge_results['rougeL'], 4) if 'rouge_results' in globals() else "N/A",
        "bleu": round(bleu_result.score, 4) if 'bleu_result' in globals() else "N/A",
        "meteor": round(meteor_result['meteor'], 4) if 'meteor_result' in globals() else "N/A",
        "bertscore_precision": round(avg_precision, 4) if 'avg_precision' in globals() else "N/A",
        "bertscore_recall": round(avg_recall, 4) if 'avg_recall' in globals() else "N/A",
        "bertscore_f1": round(avg_f1, 4) if 'avg_f1' in globals() else "N/A"
    },
    
    # Advanced Metrics
    "advanced_metrics": {
        "semantic_hallucination_rate": round(semantic_hallucination_rate, 4),
        "strict_hallucination_rate": round(strict_hallucination_rate, 4),
        "semantic_coverage": round(semantic_coverage_rate, 4),
        "exact_coverage": round(exact_coverage, 4),
        "format_adherence": round(format_adherence, 4),
        "total_generated_aspects": total_generated_aspects,
        "explicit_in_text": explicit_in_text,
        "implicit_but_valid": implicit_but_valid,
        "true_hallucinations": true_hallucinations
    },
    
    # Diversity Metrics
    "diversity_metrics": {
        "unique_predictions": diversity_metrics['unique_predictions'],
        "total_predictions": diversity_metrics['total_predictions'],
        "diversity_rate": round(diversity_metrics['diversity_rate'], 4),
        "unique_aspects": diversity_metrics['unique_aspects'],
        "total_aspects_predicted": diversity_metrics['total_aspects_predicted'],
        "aspect_diversity": round(diversity_metrics['aspect_diversity'], 4),
        "mode_collapse_detected": diversity_metrics['mode_collapse_detected'],
        "most_common_output_freq": round(diversity_metrics['most_common_output_freq'], 4),
        "most_common_aspect": diversity_metrics['most_common_aspect'],
        "most_common_aspect_freq": round(diversity_metrics['most_common_aspect_freq'], 4)
    }
}

# =====================================================================
# DISPLAY RESULTS
# =====================================================================

print(f"\n{'='*80}")
print("MODEL INFORMATION")
print(f"{'='*80}")
print(f"Model: {summary_report['model']}")
print(f"Dataset: {summary_report['dataset']['test_samples']} test samples")
print(f"Implicit Rate: {summary_report['dataset']['implicit_rate']*100:.1f}%")
if isinstance(summary_report['training_time_minutes'], float):
    print(f"Training Time: {summary_report['training_time_minutes']:.1f} minutes ({summary_report['training_time_minutes']/60:.2f} hours)")
if summary_report['training_loss'] != "N/A":
    print(f"Final Train Loss: {summary_report['training_loss']:.4f}")

print(f"\n{'='*80}")
print("1Ô∏è‚É£  PRIMARY ABSA METRICS (Most Important)")
print(f"{'='*80}")
print(f"Aspect + Sentiment Matching:")
print(f"  ‚Ä¢ F1 Score:     {tuple_f1:.4f} ({tuple_f1*100:.1f}%)")
print(f"  ‚Ä¢ Precision:    {tuple_p:.4f} ({tuple_p*100:.1f}%)")
print(f"  ‚Ä¢ Recall:       {tuple_r:.4f} ({tuple_r*100:.1f}%)")
print(f"  ‚Ä¢ TP/FP/FN:     {TP}/{FP}/{FN}")

print(f"\nAspect-Only (Ignore Sentiment):")
print(f"  ‚Ä¢ F1 Score:     {aspect_f1:.4f} ({aspect_f1*100:.1f}%)")
print(f"  ‚Ä¢ Precision:    {aspect_p:.4f} ({aspect_p*100:.1f}%)")
print(f"  ‚Ä¢ Recall:       {aspect_r:.4f} ({aspect_r*100:.1f}%)")

print(f"\nOther Task Metrics:")
print(f"  ‚Ä¢ Exact Match Accuracy:     {exact_acc:.4f} ({exact_acc*100:.1f}%)")
print(f"  ‚Ä¢ Sentiment Accuracy:       {sent_acc:.4f} ({sent_acc*100:.1f}%)")

print(f"\n{'='*80}")
print("2Ô∏è‚É£  GENERATION QUALITY METRICS")
print(f"{'='*80}")
if 'rouge_results' in globals():
    print(f"ROUGE Scores:")
    print(f"  ‚Ä¢ ROUGE-1:      {rouge_results['rouge1']:.4f}")
    print(f"  ‚Ä¢ ROUGE-2:      {rouge_results['rouge2']:.4f}")
    print(f"  ‚Ä¢ ROUGE-L:      {rouge_results['rougeL']:.4f}")
if 'bleu_result' in globals():
    print(f"\nBLEU Score:       {bleu_result.score:.4f}")
if 'meteor_result' in globals():
    print(f"METEOR Score:     {meteor_result['meteor']:.4f}")
if 'avg_f1' in globals():
    print(f"\nBERTScore:")
    print(f"  ‚Ä¢ Precision:    {avg_precision:.4f}")
    print(f"  ‚Ä¢ Recall:       {avg_recall:.4f}")
    print(f"  ‚Ä¢ F1:           {avg_f1:.4f}")

print(f"\n{'='*80}")
print("3Ô∏è‚É£  ADVANCED SEMANTIC METRICS")
print(f"{'='*80}")
print(f"Hallucination Analysis:")
print(f"  ‚Ä¢ Semantic Hallucination Rate:  {semantic_hallucination_rate:.4f} ({semantic_hallucination_rate*100:.1f}%)")
print(f"  ‚Ä¢ Strict Hallucination Rate:    {strict_hallucination_rate:.4f} ({strict_hallucination_rate*100:.1f}%)")
print(f"  ‚Ä¢ Generated Aspects:            {total_generated_aspects}")
print(f"    - Explicit in text:           {explicit_in_text} ({explicit_in_text/total_generated_aspects*100:.1f}%)")
print(f"    - Implicit but valid:         {implicit_but_valid} ({implicit_but_valid/total_generated_aspects*100:.1f}%)")
print(f"    - True hallucinations:        {true_hallucinations} ({true_hallucinations/total_generated_aspects*100:.1f}%)")

print(f"\nCoverage Analysis:")
print(f"  ‚Ä¢ Semantic Coverage:    {semantic_coverage_rate:.4f} ({semantic_coverage_rate*100:.1f}%)")
print(f"  ‚Ä¢ Exact Coverage:       {exact_coverage:.4f} ({exact_coverage*100:.1f}%)")

print(f"\nFormat Adherence:")
print(f"  ‚Ä¢ Valid Format:         {format_adherence:.4f} ({format_adherence*100:.1f}%)")

print(f"\n{'='*80}")
print("4Ô∏è‚É£  DIVERSITY METRICS (Mode Collapse Detection)")
print(f"{'='*80}")
print(f"Output Diversity:")
print(f"  ‚Ä¢ Unique Predictions:   {diversity_metrics['unique_predictions']}/{diversity_metrics['total_predictions']} ({diversity_metrics['diversity_rate']*100:.1f}%)")
print(f"  ‚Ä¢ Most Common Output:   {diversity_metrics['most_common_output_freq']*100:.1f}% of outputs")

print(f"\nAspect Diversity:")
print(f"  ‚Ä¢ Unique Aspects:       {diversity_metrics['unique_aspects']}/{diversity_metrics['total_aspects_predicted']} ({diversity_metrics['aspect_diversity']*100:.1f}%)")
print(f"  ‚Ä¢ Most Common Aspect:   '{diversity_metrics['most_common_aspect']}' ({diversity_metrics['most_common_aspect_freq']*100:.1f}%)")

print(f"\nMode Collapse Status:")
if diversity_metrics['mode_collapse_detected']:
    print(f"  ‚ùå MODE COLLAPSE DETECTED - Model produces repetitive outputs!")
elif diversity_metrics['diversity_rate'] < 0.5:
    print(f"  ‚ö†Ô∏è  LOW DIVERSITY - Model needs more training data or longer training")
elif diversity_metrics['diversity_rate'] < 0.8:
    print(f"  ‚úì ACCEPTABLE DIVERSITY - Normal for implicit ABSA")
else:
    print(f"  ‚úÖ EXCELLENT DIVERSITY - High output variability")

# =====================================================================
# OVERALL ASSESSMENT
# =====================================================================

print(f"\n{'='*80}")
print("üìä OVERALL ASSESSMENT")
print(f"{'='*80}")

# Grade the model
score_components = []

# Task F1 (40%)
if tuple_f1 >= 0.7:
    task_grade = "Excellent"
elif tuple_f1 >= 0.5:
    task_grade = "Good"
elif tuple_f1 >= 0.3:
    task_grade = "Fair"
else:
    task_grade = "Poor"
score_components.append(("Task Performance", task_grade, tuple_f1))

# Hallucination (20%)
if semantic_hallucination_rate <= 0.1:
    hall_grade = "Excellent"
elif semantic_hallucination_rate <= 0.2:
    hall_grade = "Good"
elif semantic_hallucination_rate <= 0.3:
    hall_grade = "Fair"
else:
    hall_grade = "Poor"
score_components.append(("Hallucination Control", hall_grade, 1 - semantic_hallucination_rate))

# Coverage (20%)
if semantic_coverage_rate >= 0.8:
    cov_grade = "Excellent"
elif semantic_coverage_rate >= 0.6:
    cov_grade = "Good"
elif semantic_coverage_rate >= 0.4:
    cov_grade = "Fair"
else:
    cov_grade = "Poor"
score_components.append(("Coverage", cov_grade, semantic_coverage_rate))

# Diversity (20%)
if diversity_metrics['diversity_rate'] >= 0.8:
    div_grade = "Excellent"
elif diversity_metrics['diversity_rate'] >= 0.5:
    div_grade = "Good"
elif diversity_metrics['diversity_rate'] >= 0.3:
    div_grade = "Fair"
else:
    div_grade = "Poor"
score_components.append(("Diversity", div_grade, diversity_metrics['diversity_rate']))

print(f"\nPerformance Breakdown:")
for component, grade, score in score_components:
    print(f"  ‚Ä¢ {component:<25} {grade:<12} ({score:.4f})")

# Overall recommendation
print(f"\n{'='*80}")
if tuple_f1 >= 0.5 and not diversity_metrics['mode_collapse_detected'] and semantic_hallucination_rate <= 0.2:
    print("‚úÖ MODEL STATUS: READY FOR RESEARCH")
    print("   Model performs well on implicit ABSA tasks")
elif tuple_f1 >= 0.3 and not diversity_metrics['mode_collapse_detected']:
    print("‚ö†Ô∏è  MODEL STATUS: NEEDS IMPROVEMENT")
    print("   Acceptable baseline but requires optimization")
else:
    print("‚ùå MODEL STATUS: REQUIRES SIGNIFICANT WORK")
    print("   Performance below research standards")

# =====================================================================
# SAVE ARTIFACTS
# =====================================================================

print(f"\n{'='*80}")
print("üíæ SAVING EVALUATION ARTIFACTS")
print(f"{'='*80}")

# Save JSON summary
summary_file = OUTPUT_MODEL_DIR / "evaluation_summary_complete.json"
with open(summary_file, 'w', encoding='utf-8') as f:
    json.dump(summary_report, f, indent=2, ensure_ascii=False)
print(f"‚úì JSON summary saved: {summary_file}")

# Save human-readable report
report_file = OUTPUT_MODEL_DIR / "evaluation_report_final.txt"
with open(report_file, 'w', encoding='utf-8') as f:
    f.write("="*80 + "\n")
    f.write("ABSA MODEL EVALUATION REPORT (COMPLETE)\n")
    f.write("="*80 + "\n\n")
    
    f.write(f"Model: {summary_report['model']}\n")
    f.write(f"Test Samples: {summary_report['dataset']['test_samples']}\n")
    f.write(f"Implicit Rate: {summary_report['dataset']['implicit_rate']*100:.1f}%\n\n")
    
    f.write("PRIMARY METRICS:\n")
    f.write(f"  Aspect+Sentiment F1:  {tuple_f1:.4f}\n")
    f.write(f"  Aspect-Only F1:       {aspect_f1:.4f}\n")
    f.write(f"  Exact Match Acc:      {exact_acc:.4f}\n")
    f.write(f"  Sentiment Acc:        {sent_acc:.4f}\n\n")
    
    if 'rouge_results' in globals():
        f.write("GENERATION QUALITY:\n")
        f.write(f"  ROUGE-L:              {rouge_results['rougeL']:.4f}\n")
        if 'bleu_result' in globals():
            f.write(f"  BLEU:                 {bleu_result.score:.4f}\n")
        if 'meteor_result' in globals():
            f.write(f"  METEOR:               {meteor_result['meteor']:.4f}\n")
        f.write("\n")
    
    f.write("ADVANCED METRICS:\n")
    f.write(f"  Hallucination Rate:   {semantic_hallucination_rate:.4f}\n")
    f.write(f"  Coverage:             {semantic_coverage_rate:.4f}\n")
    f.write(f"  Format Adherence:     {format_adherence:.4f}\n\n")
    
    f.write("DIVERSITY METRICS:\n")
    f.write(f"  Unique Predictions:   {diversity_metrics['diversity_rate']:.4f}\n")
    f.write(f"  Unique Aspects:       {diversity_metrics['aspect_diversity']:.4f}\n")
    f.write(f"  Mode Collapse:        {'YES' if diversity_metrics['mode_collapse_detected'] else 'NO'}\n\n")
    
    f.write("="*80 + "\n")
    f.write("RECOMMENDATIONS:\n")
    f.write("="*80 + "\n")
    
    if tuple_f1 < 0.4:
        f.write("- Increase training epochs (current: 7, try: 10-15)\n")
        f.write("- Add more few-shot examples\n")
    if semantic_hallucination_rate > 0.2:
        f.write("- Reduce learning rate for stability\n")
        f.write("- Add regularization (weight decay)\n")
    if diversity_metrics['mode_collapse_detected']:
        f.write("- CRITICAL: Model collapsed! Restart training with:\n")
        f.write("  * Lower learning rate\n")
        f.write("  * More diverse training data\n")
        f.write("  * Gradient clipping\n")
    if semantic_coverage_rate < 0.6:
        f.write("- Increase training data coverage\n")
        f.write("- Train for more epochs\n")
    
    if tuple_f1 >= 0.5 and not diversity_metrics['mode_collapse_detected']:
        f.write("\n‚úÖ Model is ready for research use!\n")
    
print(f"‚úì Report saved: {report_file}")

print(f"\n{'='*80}")
print("üéâ ALL EVALUATION COMPLETE!")
print(f"{'='*80}")
print(f"\nFiles saved in: {OUTPUT_MODEL_DIR}")
print("  1. test_predictions.jsonl")
print("  2. evaluation_summary_complete.json")
print("  3. evaluation_report_final.txt")
print(f"{'='*80}\n")

print("‚úÖ CELL 12 COMPLETE - EVALUATION FINISHED!")


FINAL COMPREHENSIVE EVALUATION SUMMARY

MODEL INFORMATION
Model: FLAN-T5-Base + DAPT + Few-Shot Baseline
Dataset: 453 test samples
Implicit Rate: 61.9%
Training Time: 382.1 minutes (6.37 hours)
Final Train Loss: 1.0359

1Ô∏è‚É£  PRIMARY ABSA METRICS (Most Important)
Aspect + Sentiment Matching:
  ‚Ä¢ F1 Score:     0.3096 (31.0%)
  ‚Ä¢ Precision:    0.3223 (32.2%)
  ‚Ä¢ Recall:       0.2978 (29.8%)
  ‚Ä¢ TP/FP/FN:     176/370/415

Aspect-Only (Ignore Sentiment):
  ‚Ä¢ F1 Score:     0.3254 (32.5%)
  ‚Ä¢ Precision:    0.3388 (33.9%)
  ‚Ä¢ Recall:       0.3130 (31.3%)

Other Task Metrics:
  ‚Ä¢ Exact Match Accuracy:     0.2141 (21.4%)
  ‚Ä¢ Sentiment Accuracy:       0.9514 (95.1%)

2Ô∏è‚É£  GENERATION QUALITY METRICS
ROUGE Scores:
  ‚Ä¢ ROUGE-1:      0.6280
  ‚Ä¢ ROUGE-2:      0.4777
  ‚Ä¢ ROUGE-L:      0.6053

BLEU Score:       35.8567
METEOR Score:     0.6783

3Ô∏è‚É£  ADVANCED SEMANTIC METRICS
Hallucination Analysis:
  ‚Ä¢ Semantic Hallucination Rate:  0.5897 (59.0%)
  ‚Ä¢ Strict Halluc