## Title 

# Track A – Subtask 1 (English, Valence–Arousal Regression)

This notebook trains and evaluates Transformer-based regressors for
Subtask 1 on **both laptop and restaurant domains**, using local JSONL
files from the official DimABSA 2026 dataset.

The steps:
1. Load local JSONL files for a given domain
2. Convert to a unified DataFrame
3. Split the training data into train / held-out dev (90% / 10%) for model selection
4. Train several transformer models
5. Select the best model (lowest RMSE_VA)
6. Generate competition-format JSONL predictions for the official Subtask 1 dev file (`eng_{domain}_dev_task1.jsonl`)
7. Repeat for laptop and restaurant domains


## Imports & path setup

In [None]:
import os, sys, json
import pandas as pd
from sklearn.model_selection import train_test_split

import torch

# Put the sibling src/ directory on the import path (added once only,
# so re-running this cell does not create duplicate entries).
SRC_DIR = os.path.join("..", "src")
if sys.path.count(SRC_DIR) == 0:
    sys.path.append(SRC_DIR)

from data_loader import load_jsonl, jsonl_to_df
from train import train_model
from inference import generate_submission

## Config 

In [None]:
# Review domains processed by this notebook, in run order
DOMAINS = ["laptop", "restaurant"]

# Input and output locations, expressed relative to the notebook's directory
DATA_DIR = os.path.join("..", "data")
SUBMISSION_DIR = os.path.join("..", "submissions")

# Create the output directory up front so later writes cannot fail on it
os.makedirs(SUBMISSION_DIR, exist_ok=True)

# Candidate encoders; each entry is trained once per domain
MODELS_TO_TRY = [
    dict(
        name="bert-base-uncased",
        lr=3e-5,
        epochs=20,
        desc="BERT Base (uncased)",
    ),
    dict(
        name="j-hartmann/emotion-english-distilroberta-base",
        lr=3e-5,
        epochs=20,
        desc="Emotion-tuned DistilRoBERTa",
    ),
]

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Bare expression: echoes the chosen device as the notebook cell output
DEVICE


## Helper to run all models for one domain

In [None]:
from transformers import AutoTokenizer
from model import TransformerVARegressor
from utils import get_predictions, evaluate_predictions

def run_models_for_domain(domain: str):
    """Train every configured model on one domain and write dev predictions.

    Workflow:
      1. Load ``eng_{domain}_train_alltasks.jsonl`` and
         ``eng_{domain}_dev_task1.jsonl`` from ``DATA_DIR``.
      2. Hold out 10% of the training data (fixed seed 42) as a dev split.
      3. Train each entry of ``MODELS_TO_TRY`` with ``train_model``.
      4. Pick the run with the lowest ``RMSE_VA``.
      5. Reload that model's checkpoint and write predictions for the
         dev-task1 file into ``SUBMISSION_DIR``.

    Args:
        domain: Domain tag used in the data file names (e.g. ``"laptop"``).

    Returns:
        A ``(results_df, best_row, submission_path)`` tuple: the DataFrame
        with one row per trained model, the row of the best model, and the
        path of the JSONL predictions file that was written.

    Raises:
        ValueError: If ``MODELS_TO_TRY`` is empty, so there is nothing to
            train or select.
    """
    print("=" * 70)
    print(f"Running Subtask 1 – Domain: {domain.upper()}")
    print("=" * 70)

    # ---------- 1. Load local JSONL ----------
    train_path = os.path.join(DATA_DIR, f"eng_{domain}_train_alltasks.jsonl")
    dev_task1_path = os.path.join(DATA_DIR, f"eng_{domain}_dev_task1.jsonl")

    train_raw = load_jsonl(train_path)
    dev_raw = load_jsonl(dev_task1_path)

    train_df = jsonl_to_df(train_raw)
    dev_task1_df = jsonl_to_df(dev_raw)

    print(f"Train samples: {len(train_df)}, Dev-task1 samples: {len(dev_task1_df)}")

    # Small sanity check
    print("\nSample rows from train:")
    display(train_df[["Text", "Aspect", "Valence", "Arousal"]].head())

    # ---------- 2. Train/dev split ----------
    # Fixed random_state keeps the held-out split identical across runs, so
    # the metrics of the different models are comparable.
    train_split_df, dev_split_df = train_test_split(
        train_df,
        test_size=0.1,
        random_state=42
    )

    print(f"\nTrain split: {len(train_split_df)}, Dev split: {len(dev_split_df)}")

    # ---------- 3. Try multiple models ----------
    # Without this guard an empty config list surfaces much later as an
    # opaque KeyError on the missing "RMSE_VA" column.
    if not MODELS_TO_TRY:
        raise ValueError("MODELS_TO_TRY is empty; nothing to train.")

    results = []

    for cfg in MODELS_TO_TRY:
        name = cfg["name"]
        lr = cfg["lr"]
        epochs = cfg["epochs"]
        desc = cfg["desc"]

        print("\n" + "-" * 50)
        print(f"Training model: {desc}")
        print(f"Model name: {name}, lr={lr}, epochs={epochs}")

        metrics = train_model(
            model_name=name,
            train_df=train_split_df,
            dev_df=dev_split_df,
            epochs=epochs,
            lr=lr
        )

        # One summary row per run: the config values plus whatever metrics
        # train_model returned (RMSE_VA, PCC_V and PCC_A are read below).
        row = {
            "model_name": name,
            "desc": desc,
            "lr": lr,
            "epochs": epochs,
            **metrics
        }
        results.append(row)

        print(f"Done: {desc} → RMSE_VA={metrics['RMSE_VA']:.4f}, "
              f"PCC_V={metrics['PCC_V']:.4f}, PCC_A={metrics['PCC_A']:.4f}")

    results_df = pd.DataFrame(results)
    print("\nSummary of all runs:")
    display(results_df)

    # ---------- 4. Pick best model ----------
    # idxmin() returns an index *label*, so the row must be fetched with
    # .loc; .iloc only agrees while the index is the default 0..n-1 range.
    best_idx = results_df["RMSE_VA"].idxmin()
    best_row = results_df.loc[best_idx]

    best_model_name = best_row["model_name"]
    best_desc = best_row["desc"]
    # NOTE(review): assumes train_model saves its best weights under this
    # name in the current working directory — confirm against train.py.
    best_ckpt = f"{best_model_name.replace('/', '_')}_best.pth"

    print("\n" + "=" * 70)
    print(f"BEST MODEL for {domain}: {best_desc}")
    print(f"RMSE_VA={best_row['RMSE_VA']:.4f}, "
          f"PCC_V={best_row['PCC_V']:.4f}, "
          f"PCC_A={best_row['PCC_A']:.4f}")
    print(f"Checkpoint file: {best_ckpt}")
    print("=" * 70)

    # ---------- 5. Load best model & generate submission on dev_task1 ----------
    tokenizer = AutoTokenizer.from_pretrained(best_model_name)
    best_model = TransformerVARegressor(best_model_name).to(DEVICE)
    best_model.load_state_dict(torch.load(best_ckpt, map_location=DEVICE))

    submission_path = os.path.join(
        SUBMISSION_DIR,
        f"eng_{domain}_dev_task1_predictions.jsonl"
    )

    generate_submission(
        model=best_model,
        df=dev_task1_df,
        tokenizer=tokenizer,
        device=DEVICE,
        fname=submission_path
    )

    # Echo the first three records as a quick visual check of the output.
    # Explicit UTF-8 so decoding does not depend on the platform locale.
    print(f"\nSample predictions from {submission_path}:")
    with open(submission_path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            if i >= 3:
                break
            print(json.dumps(json.loads(line), indent=2))

    return results_df, best_row, submission_path


## Run for both domains

In [None]:
# Per-domain outcomes, keyed by domain name
all_results = {}

for domain in DOMAINS:
    # Full train / select / predict pipeline for this domain
    results_df, best_row, submission_path = run_models_for_domain(domain)
    all_results[domain] = dict(
        results=results_df,
        best=best_row,
        submission_path=submission_path,
    )

print("\nFinished all domains.")


## Save summary table 

In [None]:
import glob
import os
import pandas as pd

domains = ["laptop", "restaurant"]
summary_rows = []

# Per-domain outcomes stored by the "Run for both domains" cell. Looked up
# through globals() so this cell still runs (with placeholder values) in a
# kernel session where training has not been executed.
recorded_runs = globals().get("all_results", {})

for dom in domains:
    # each domain produced a file like:
    # eng_laptop_dev_task1_predictions.jsonl
    pred_path = os.path.join("..", "submissions", f"eng_{dom}_dev_task1_predictions.jsonl")

    print(f"Reading metrics for domain: {dom}")
    run = recorded_runs.get(dom)

    if run is None:
        # No recorded run for this domain: keep the placeholder row so the
        # table still lists every domain and its expected output file.
        summary_rows.append({
            "domain": dom,
            "model": "BEST_MODEL_FROM_LOGS",
            "RMSE_VA": "N/A",
            "PCC_V": "N/A",
            "PCC_A": "N/A",
            "submission_file": pred_path
        })
        continue

    # Row of the best model as selected by run_models_for_domain
    best = run["best"]
    summary_rows.append({
        "domain": dom,
        "model": best["model_name"],
        "RMSE_VA": best["RMSE_VA"],
        "PCC_V": best["PCC_V"],
        "PCC_A": best["PCC_A"],
        "submission_file": run["submission_path"]
    })

summary_df = pd.DataFrame(summary_rows)
display(summary_df)

summary_csv = os.path.join("..", "submissions", "subtask1_summary_results.csv")
summary_df.to_csv(summary_csv, index=False)
print("Saved:", summary_csv)
