# Baseline 1 Scores - No Thresholds (Raw Argmax Predictions)
Minimal code to get baseline 1 scores from inference_threshold_refine_mean_sd_categorywise.py

In [1]:
import re
from pathlib import Path
from typing import Dict, List, Tuple

import pandas as pd
import numpy as np
import torch
from tqdm import tqdm
from transformers import AutoModelForTokenClassification, AutoTokenizer

from eval_score_fixed import compute_competition_score
from utils import convert_tagged_to_aspect, stratified_kfold_split

In [2]:
# Configuration
MODEL_DIR = Path("/data/sahmed9/E25/multi_epoch_checkpoints/deberta-v3-base/lr_1_e_neg_4/o_weight_1/o_weight_1/fold0/checkpoint-875")  # Change this
DATA_PATH = Path("data/Tagged_Titles_Train.tsv")
FOLD = 0
NUM_FOLDS = 5
SEED = 42
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [3]:
# Load data and build labels
df0 = convert_tagged_to_aspect(str(DATA_PATH))
df_split = stratified_kfold_split(df0, n_splits=NUM_FOLDS, random_state=SEED)

aspects = sorted({asp for asp in df_split["Aspect Name"].unique() if asp})
label_list = ["O"] + [f"{prefix}-{aspect}" for aspect in aspects for prefix in ("B", "I")]
label2id = {label: idx for idx, label in enumerate(label_list)}
id2label = {idx: label for label, idx in label2id.items()}

valid_df = df_split[df_split["fold"] == FOLD]
print(f"Validation samples: {len(valid_df['Record Number'].unique())}")

Validation samples: 1000


In [4]:
# Load model
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, use_fast=True)
model = AutoModelForTokenClassification.from_pretrained(MODEL_DIR)
model.to(DEVICE)
model.eval()
print(f"Model loaded on {DEVICE}")

Model loaded on cuda


In [5]:
def extract_spans(seq: List[str], positions: List[int], offsets: List[Tuple[int, int]], text: str) -> List[Tuple[str, str]]:
    """Extract spans from token-level predictions."""
    spans = []
    idx = 0
    while idx < len(seq):
        label = seq[idx]
        if label.startswith("B-"):
            aspect = label[2:]
            start_pos = positions[idx]
            end_pos = start_pos
            idx += 1
            while idx < len(seq) and seq[idx] == f"I-{aspect}":
                end_pos = positions[idx]
                idx += 1
            start_char, _ = offsets[start_pos]
            _, end_char = offsets[end_pos]
            span_text = text[start_char:end_char].strip()
            spans.append((aspect, span_text))
        else:
            idx += 1
    return spans

In [6]:
# Build predictions (Baseline 1: raw argmax, no thresholds)
all_targets = []
all_preds = []

for rid in tqdm(valid_df["Record Number"].unique(), desc="Processing samples"):
    sample = valid_df[valid_df["Record Number"] == rid]
    text = sample["Title"].iat[0]
    category = str(sample["Category"].iat[0])
    gold = sample[["Category", "Aspect Name", "Aspect Value"]].values.tolist()
    
    # Tokenize
    encoded = tokenizer(text, return_tensors="pt", return_offsets_mapping=True, padding=True, truncation=True)
    offsets = encoded["offset_mapping"][0].tolist()
    
    # Create gold labels
    labels = [-100] * len(offsets)
    for cat, asp, val in gold:
        val_str = str(val)
        for match in re.finditer(re.escape(val_str), text):
            start_char, end_char = match.span()
            b_assigned = False
            for tidx, (start, end) in enumerate(offsets):
                if start >= end_char:
                    break
                if end <= start_char:
                    continue
                if not b_assigned:
                    labels[tidx] = label2id[f"B-{asp}"]
                    b_assigned = True
                else:
                    labels[tidx] = label2id[f"I-{asp}"]
    
    # Get predictions (raw argmax - no thresholds)
    encoded = {k: v.to(DEVICE) for k, v in encoded.items()}
    encoded.pop("offset_mapping")
    
    with torch.no_grad():
        logits = model(**encoded).logits
        preds = logits.argmax(dim=-1).cpu()[0].numpy()
    
    # Filter valid tokens (exclude -100)
    seq_p = []
    seq_l = []
    token_idxs = []
    for idx, l_id in enumerate(labels):
        if l_id == -100:
            continue
        seq_p.append(id2label[int(preds[idx])])
        seq_l.append(label_list[l_id])
        token_idxs.append(idx)
    
    # Extract spans
    for aspect, span in extract_spans(seq_l, token_idxs, offsets, text):
        if aspect != "O":
            all_targets.append({
                "record_id": str(rid),
                "category": category,
                "aspect_name": aspect,
                "span": span,
            })
    
    for aspect, span in extract_spans(seq_p, token_idxs, offsets, text):
        if aspect != "O":
            all_preds.append({
                "record_id": str(rid),
                "category": category,
                "aspect_name": aspect,
                "span": span,
            })

Processing samples:   0%|          | 0/1000 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Processing samples: 100%|██████████| 1000/1000 [00:15<00:00, 63.26it/s]


In [7]:
# Compute scores
comp = compute_competition_score(all_targets, all_preds, beta=0.2)

print("="*60)
print("Baseline 1: No thresholds (raw argmax predictions)")
print("="*60)
print(f"Overall score: {comp['overall_score']:.6f}")
print("\nPer-category scores:")
for cat, score in sorted(comp["per_category"].items()):
    print(f"  Category {cat}: {score:.6f}")

Baseline 1: No thresholds (raw argmax predictions)
Overall score: 0.896926

Per-category scores:
  Category 1: 0.892044
  Category 2: 0.901807


In [8]:
{'per_category': {'1': 0.892044481550705, '2': 0.9018074896156842},
 'overall_score': 0.8969259855831946}

{'per_category': {'1': 0.892044481550705, '2': 0.9018074896156842},
 'overall_score': 0.8969259855831946}

In [9]:
all_targets

[{'record_id': '2',
  'category': '1',
  'aspect_name': 'Hersteller',
  'span': 'ATE'},
 {'record_id': '2',
  'category': '1',
  'aspect_name': 'Modell',
  'span': 'Power Disc'},
 {'record_id': '2',
  'category': '1',
  'aspect_name': 'Produktart',
  'span': 'Bremsenset'},
 {'record_id': '2',
  'category': '1',
  'aspect_name': 'Kompatible_Fahrzeug_Marke',
  'span': 'Mercedes'},
 {'record_id': '2',
  'category': '1',
  'aspect_name': 'Kompatibles_Fahrzeug_Modell',
  'span': 'SLC'},
 {'record_id': '2',
  'category': '1',
  'aspect_name': 'Kompatibles_Fahrzeug_Modell',
  'span': 'SLK'},
 {'record_id': '2',
  'category': '1',
  'aspect_name': 'Einbauposition',
  'span': 'Vorne'},
 {'record_id': '2',
  'category': '1',
  'aspect_name': 'Bremsscheiben-Aussendurchmesser',
  'span': '295MM'},
 {'record_id': '2',
  'category': '1',
  'aspect_name': 'Einbauposition',
  'span': 'Hinten'},
 {'record_id': '2',
  'category': '1',
  'aspect_name': 'Bremsscheiben-Aussendurchmesser',
  'span': '300MM'

In [10]:
all_preds

[{'record_id': '2',
  'category': '1',
  'aspect_name': 'Hersteller',
  'span': 'ATE'},
 {'record_id': '2',
  'category': '1',
  'aspect_name': 'Modell',
  'span': 'Power Disc'},
 {'record_id': '2',
  'category': '1',
  'aspect_name': 'Produktart',
  'span': 'Bremsenset'},
 {'record_id': '2',
  'category': '1',
  'aspect_name': 'Kompatible_Fahrzeug_Marke',
  'span': 'Mercedes'},
 {'record_id': '2',
  'category': '1',
  'aspect_name': 'Kompatibles_Fahrzeug_Modell',
  'span': 'SLC'},
 {'record_id': '2',
  'category': '1',
  'aspect_name': 'Kompatibles_Fahrzeug_Modell',
  'span': 'SLK'},
 {'record_id': '2',
  'category': '1',
  'aspect_name': 'Einbauposition',
  'span': 'Vorne'},
 {'record_id': '2',
  'category': '1',
  'aspect_name': 'Bremsscheiben-Aussendurchmesser',
  'span': '295MM'},
 {'record_id': '2',
  'category': '1',
  'aspect_name': 'Einbauposition',
  'span': 'Hinten'},
 {'record_id': '2',
  'category': '1',
  'aspect_name': 'Bremsscheiben-Aussendurchmesser',
  'span': '300MM'

In [11]:
def calculate_fbeta(pred, target, beta=0.2):
    # pred, target are sets of tuples like {(r_id, span), ...}
    tp = pred & target  # true positives
    fp = pred - target  # false positives
    fn = target - tp    # false negatives

    beta2 = beta ** 2
    precision = len(tp) / (len(tp) + len(fp)) if (len(tp) + len(fp)) > 0 else 0.0
    recall = len(tp) / (len(tp) + len(fn)) if (len(tp) + len(fn)) > 0 else 0.0

    if precision == 0 and recall == 0:
        fbeta = 0.0
    else:
        fbeta = (1 + beta2) * precision * recall / (beta2 * precision + recall)

    return fbeta


In [12]:
def compute_fbeta_scores(all_targets, all_preds, beta=0.2):
    """
    Compute per-category and overall weighted F-beta scores.

    Args:
        all_targets (list[dict]): Ground truth records with columns like ['category', 'aspect_name', ...].
        all_preds (list[dict]): Prediction records with same structure as targets.
        beta (float): Beta value for F-beta computation. Default is 0.2 (precision-focused).

    Returns:
        dict: {
            'per_category': {'1': float, '2': float},
            'overall_score': float
        }
    """
    import pandas as pd

    # Convert to DataFrames
    df_target = pd.DataFrame(all_targets)
    df_pred = pd.DataFrame(all_preds)

    results = {'per_category': {}}
    category_scores = []

    # Iterate over both categories
    for cat in ['1', '2']:
        df_target_cat = df_target[df_target['category'] == cat]
        df_pred_cat = df_pred[df_pred['category'] == cat]

        cat_score = 0.0
        total = len(df_target_cat)

        # Iterate over all aspect names within this category
        for aspect_name in df_target_cat['aspect_name'].unique():
            target_rows = df_target_cat[df_target_cat['aspect_name'] == aspect_name]
            pred_rows = df_pred_cat[df_pred_cat['aspect_name'] == aspect_name]

            # Convert rows into sets of tuples (excluding metadata columns)
            target = set(target_rows.drop(columns=['category', 'aspect_name']).apply(tuple, axis=1))
            pred = set(pred_rows.drop(columns=['category', 'aspect_name']).apply(tuple, axis=1))

            # Weighted F-beta for this aspect
            weight = len(target_rows) / total
            cat_score += weight * calculate_fbeta(pred, target, beta=beta)

        results['per_category'][cat] = cat_score
        category_scores.append(cat_score)

    # Compute overall mean across categories
    overall = sum(category_scores) / len(category_scores)
    results['overall_score'] = overall

    return results


In [13]:
compute_fbeta_scores(all_targets, all_preds)

{'per_category': {'1': 0.892044481550705, '2': 0.9018074896156842},
 'overall_score': 0.8969259855831946}

In [14]:
df_target = pd.DataFrame(all_targets)
df_pred = pd.DataFrame(all_preds)

final_score = 0
for cat in ['1','2']:
    df_target_cat = df_target.groupby('category').get_group(cat)
    df_pred_cat = df_pred.groupby('category').get_group(cat)
    cat_score = 0
    total = len(df_target_cat)
    for aspect_name in df_target_cat.aspect_name.unique():
        target = df_target_cat.groupby('aspect_name').get_group(aspect_name)
        pred = df_pred_cat.groupby('aspect_name').get_group(aspect_name)
        target = set(target.drop(columns=['category', 'aspect_name']).apply(tuple, axis=1))
        pred = set(pred.drop(columns=['category', 'aspect_name']).apply(tuple, axis=1))
        # break
        cat_score += (len(target)/total) * calculate_fbeta(pred, target)
        
    final_score += cat_score
final_score /= 2
        

KeyError: 'Oe/Oem_Referenznummer(N)'

In [None]:
# Convert prediction and target lists into DataFrames
df_target = pd.DataFrame(all_targets)
df_pred = pd.DataFrame(all_preds)

final_score = 0.0  # overall mean F-beta score across all categories

# Iterate over each category (assumed categorical strings '1' and '2')
for cat in ['1', '2']:
    # Filter data for the current category
    df_target_cat = df_target[df_target['category'] == cat]
    df_pred_cat = df_pred[df_pred['category'] == cat]
    
    cat_score = 0.0  # cumulative weighted F-beta score for this category
    total = len(df_target_cat)  # number of samples in this category

    # Loop through each unique aspect name within the category
    for aspect_name in df_target_cat['aspect_name'].unique():
        # Extract target and prediction rows for this aspect
        target_rows = df_target_cat[df_target_cat['aspect_name'] == aspect_name]
        pred_rows = df_pred_cat[df_pred_cat['aspect_name'] == aspect_name]

        # Convert rows (excluding metadata) into sets of tuples
        # Each tuple uniquely identifies an entity span
        target = set(target_rows.drop(columns=['category', 'aspect_name']).apply(tuple, axis=1))
        pred = set(pred_rows.drop(columns=['category', 'aspect_name']).apply(tuple, axis=1))

        # Compute weighted F-beta score for this aspect
        # Weight by the relative number of target samples in this aspect
        aspect_weight = len(target_rows) / total
        cat_score += aspect_weight * calculate_fbeta(pred, target)

    # Add this category's contribution to the overall score
    final_score += cat_score

# Average the scores across categories
final_score /= 2


In [None]:
comp

{'per_category': {'1': 0.888177044279952, '2': 0.9007080715287676},
 'overall_score': 0.8944425579043598,
 'global_counts': {'tp': 6050, 'fp': 585, 'fn': 1489}}

In [None]:
final_score
{'per_category': {'1': 0.892044481550705, '2': 0.9018074896156842},
 'overall_score': 0.8969259855831946}

0.8969259855831946

In [None]:
len(pred & target)

407

In [None]:
set(target.drop(columns=['category', 'aspect_name']).apply(tuple, axis=1))


{('1006', 'BOSCH'),
 ('1048', 'CONTI'),
 ('1049', 'FEBI BILSTEIN'),
 ('1053', 'INA'),
 ('1056', 'SKF'),
 ('1075', 'GATES'),
 ('1079', 'AIC'),
 ('1088', 'CONTITECH'),
 ('1098', 'FEBI BILSTEIN'),
 ('1099', 'GATES'),
 ('1100', 'CONTI'),
 ('1112', 'GATES'),
 ('1119', 'CONTINENTAL CTAM'),
 ('1142', 'FEBI'),
 ('1147', 'INA'),
 ('1148', 'SNR'),
 ('1149', 'GATES'),
 ('1161', 'FAI AutoParts'),
 ('1193', 'INA'),
 ('1211', 'INA'),
 ('1234', 'BOSCH'),
 ('1245', 'CONTI'),
 ('1261', 'Gates'),
 ('1263', 'CONTITECH'),
 ('1291', 'BOSCH'),
 ('1308', 'INA'),
 ('1327', 'VAICO'),
 ('1331', 'CONTINENTAL CTAM'),
 ('1333', 'SWAG'),
 ('134', 'FAI AutoParts'),
 ('1341', 'INA'),
 ('1355', 'SKF'),
 ('1357', 'FEBI BILSTEIN'),
 ('1359', 'CONTINENTAL CTAM'),
 ('1372', 'FEBI'),
 ('1390', 'CONTI'),
 ('1416', 'CONTINENTAL'),
 ('1418', 'CONTITECH'),
 ('1422', 'CONTI'),
 ('1422', 'CONTITECH'),
 ('143', 'CONTI'),
 ('1449', 'CONTI'),
 ('1454', 'CONTI'),
 ('1466', 'INA'),
 ('1496', 'DAYCO'),
 ('1516', 'QH'),
 ('1540', 'SKF'

In [None]:
df_pred.aspect_name.unique()

array(['Hersteller', 'Modell', 'Produktart', 'Kompatible_Fahrzeug_Marke',
       'Kompatibles_Fahrzeug_Modell', 'Einbauposition',
       'Bremsscheiben-Aussendurchmesser', 'Im_Lieferumfang_Enthalten',
       'Herstellernummer', 'Kompatibles_Fahrzeug_Jahr', 'SAE_Viskosität',
       'Material', 'Bremsscheibenart', 'Stärke', 'Maßeinheit',
       'Produktlinie', 'Anzahl_Der_Einheiten', 'Menge', 'Besonderheiten',
       'Anwendung', 'Oe/Oem_Referenznummer(N)'], dtype=object)

In [None]:
df_target.aspect_name.unique()

array(['Hersteller', 'Modell', 'Produktart', 'Kompatible_Fahrzeug_Marke',
       'Kompatibles_Fahrzeug_Modell', 'Einbauposition',
       'Bremsscheiben-Aussendurchmesser', 'Im_Lieferumfang_Enthalten',
       'Herstellernummer', 'Kompatibles_Fahrzeug_Jahr',
       'Oe/Oem_Referenznummer(N)', 'Material', 'Größe',
       'Bremsscheibenart', 'Stärke', 'Maßeinheit', 'Produktlinie',
       'Anzahl_Der_Einheiten', 'Anwendung', 'Besonderheiten', 'Menge',
       'Oberflächenbeschaffenheit', 'Technologie', 'Farbe',
       'SAE_Viskosität'], dtype=object)

In [None]:
df_target.groupby('category').get_group('1')

Unnamed: 0,record_id,category,aspect_name,span
0,2,1,Hersteller,ATE
1,2,1,Modell,Power Disc
2,2,1,Produktart,Bremsenset
3,2,1,Kompatible_Fahrzeug_Marke,Mercedes
4,2,1,Kompatibles_Fahrzeug_Modell,SLC
...,...,...,...,...
7481,4972,1,Einbauposition,Vorne
7482,4972,1,Im_Lieferumfang_Enthalten,Beläge
7483,4972,1,Im_Lieferumfang_Enthalten,Scheiben
7484,4972,1,Kompatible_Fahrzeug_Marke,Skoda


In [None]:
df_target.groupby('category').get_group('2')

Unnamed: 0,record_id,category,aspect_name,span
10,17,2,Hersteller,CONTINENTAL CTAM
11,17,2,Im_Lieferumfang_Enthalten,Wasserpumpe
12,17,2,Produktart,Zahnriemensatz
13,17,2,Kompatible_Fahrzeug_Marke,OPEL
14,29,2,Hersteller,CONTITECH
...,...,...,...,...
7534,4995,2,Im_Lieferumfang_Enthalten,THERMOSTAT
7535,4998,2,Hersteller,FAI AutoParts
7536,4998,2,Herstellernummer,TCK62
7537,4998,2,Im_Lieferumfang_Enthalten,Timing Chain Kit
