In [47]:
import argparse
import pandas as pd
import torch, os
import numpy as np
from tqdm import tqdm
from utils import convert_tagged_to_aspect, stratified_kfold_split, merge_spans
from transformers import AutoTokenizer
from eval_score import compute_competition_score
from collections import Counter

def resolve_word_labels(tokens, labels, offset_mapping):
    """Collapse sub-token labels into one label per word.

    A token that starts with the '▁' marker opens a new word; any other
    token is attached to the word currently being built. Each word gets the
    first non-'O' label found among its tokens, or 'O' if there is none.

    Args:
        tokens: Token strings, special tokens included.
        labels: Label strings, aligned with ``tokens``.
        offset_mapping: (start, end) offset pairs, aligned with ``tokens``.

    Returns:
        A list with one dict per word, in order of appearance, holding
        'entity' (the resolved label), 'start' (start offset of the word's
        first token) and 'end' (end offset of the word's last token).
    """
    special_tokens = ('[CLS]', '[SEP]')

    def close_word(word_labels, word_spans):
        # The first label other than 'O' wins; fall back to 'O'.
        entity = 'O'
        for candidate in word_labels:
            if candidate != 'O':
                entity = candidate
                break
        return {
            'entity': entity,
            'start': word_spans[0][0],
            'end': word_spans[-1][1],
        }

    resolved = []
    pending_labels = []
    pending_spans = []

    for tok, label, (start, end) in zip(tokens, labels, offset_mapping):
        # Special tokens and tokens with a (0, 0) offset carry no text.
        if tok in special_tokens or (start == 0 and end == 0):
            continue

        # A '▁'-prefixed token closes the word collected so far, if any.
        if tok.startswith('▁') and pending_spans:
            resolved.append(close_word(pending_labels, pending_spans))
            pending_labels = []
            pending_spans = []

        pending_labels.append(label)
        pending_spans.append((start, end))

    # The last word is never followed by a '▁' token, so close it here.
    if pending_spans:
        resolved.append(close_word(pending_labels, pending_spans))

    return resolved

# Majority voting implementation
def majority_vote_predictions(all_pred_labels):
    """
    Apply majority voting across multiple prediction sets.

    For every token position the label predicted by the most models is
    kept. On a tie, the label that appears first in model order wins.

    Args:
        all_pred_labels: List of prediction lists from different
            models/seeds. The length of the first list determines how many
            token positions are voted on.

    Returns:
        final_pred_labels: List of final predictions based on majority vote.
        An empty list when no prediction sets are given.

    Raises:
        IndexError: If a later prediction list is shorter than the first.
    """
    # Nothing to vote on; avoids indexing into an empty collection below.
    if len(all_pred_labels) == 0:
        return []

    seq_length = len(all_pred_labels[0])

    return [
        # Count the votes of all models for this token and keep the winner.
        Counter(
            pred_labels[token_idx] for pred_labels in all_pred_labels
        ).most_common(1)[0][0]
        for token_idx in range(seq_length)
    ]

# Experiment configuration, input data and label vocabulary

model_id = "microsoft/deberta-v3-small"
save_dir = "relabeled"

# df0 holds the converted tagged titles; df1 is the same data after the
# 5-split stratified k-fold helper has been applied with a fixed seed.
df0 = convert_tagged_to_aspect("data/Tagged_Titles_Train.tsv")
df1 = stratified_kfold_split(df0, n_splits=5, random_state=42)

# Label vocabulary: "O" first, then a B-/I- pair for every aspect name,
# with aspect names in sorted order.
asps = sorted(df0["Aspect Name"].unique())
label_list = ["O"] + [tag for a in asps for tag in (f"B-{a}", f"I-{a}")]

# Forward and reverse lookups between label strings and integer ids.
label2id   = dict(zip(label_list, range(len(label_list))))
id2label   = {idx: name for name, idx in label2id.items()}


tokenizer = AutoTokenizer.from_pretrained(model_id)


def strip_invis(s):
    """Return ``s`` with invisible zero-width characters removed.

    The characters dropped are U+200B, U+200C, U+200D and U+FEFF.
    """
    # One translate pass deletes all four code points.
    return s.translate(str.maketrans("", "", "\u200b\u200c\u200d\ufeff"))

# Relabel every record with the majority vote of the saved per-model
# predictions, write the relabeled aspects to a TSV and score them against
# the gold annotations.
all_targets = []
all_preds = []
pred_df_data = []

for rid in tqdm(df1['Record Number'].unique()):
    sample = df1[df1['Record Number'] == rid]
    text   = sample['Title'].iat[0]
    text   = text.replace("\uFFFD", "Ü")
    text   = strip_invis(text)  # Strip invisible characters
    gold   = sample[['Category','Aspect Name','Aspect Value']].values.tolist()
    category = int(sample["Category"].iat[0])

    # Full tokenization with all information
    tokenized = tokenizer(text, return_tensors="pt", return_offsets_mapping=True)

    # NOTE(review): allow_pickle=True can execute code embedded in the file;
    # only load prediction files that were produced locally.
    all_pred_labels = np.load(f"{save_dir}/rid_{rid}/preds_labels.npy", allow_pickle=True).tolist()
    # The softmax scores are only checked for presence; labels come from voting.
    all_softmax = np.load(f"{save_dir}/rid_{rid}/softmax.npy", allow_pickle=True).tolist()

    assert len(all_pred_labels) == 7, f"Expected 7 sets of predictions, got {len(all_pred_labels)}"
    assert len(all_softmax) == 7, f"Expected 7 sets of softmax scores, got {len(all_softmax)}"

    # Every loaded prediction set gets one vote per token.
    pred_labels = majority_vote_predictions(all_pred_labels)

    tokens_text = tokenizer.convert_ids_to_tokens(tokenized.input_ids[0], skip_special_tokens=False)
    offset_mapping = tokenized['offset_mapping'][0].cpu()

    word_level_preds = resolve_word_labels(tokens_text, pred_labels, offset_mapping)

    # Validate word alignment. The count is checked up front so an empty
    # title or a surplus of predictions is reported as a mismatch instead of
    # failing on an undefined/stale loop index or an out-of-range lookup.
    words = text.split()
    assert len(word_level_preds) == len(words), (
        f"Word count mismatch: {len(word_level_preds)} predictions != {len(words)} words"
    )
    for i, (word, word_pred) in enumerate(zip(words, word_level_preds)):
        e_word = text[word_pred['start']:word_pred['end']].strip()
        assert word == e_word, f"Word mismatch: {word} != {e_word} at index {i}"

    spans = merge_spans(word_level_preds)

    pred = [
        (sample["Category"].iat[0],
         s["aspect_name"].strip(),
         text[s["start"]:s["end"]].strip())
        for s in spans
    ]

    # Keep a prediction as-is when it matches a gold triple; otherwise keep
    # its text but relabel it as "O".
    pred_new = []
    for p in pred:
        matches_gold = any(
            p[0] == g[0] and p[1] == g[1] and p[2].strip() == g[2].strip()
            for g in gold
        )
        pred_new.append(p if matches_gold else (p[0], "O", p[2].strip()))

    all_targets.extend(gold)
    all_preds.extend(pred_new)

    # Per-record frame of the relabeled triples (last one stays available
    # for inspection after the loop).
    pred_df = pd.DataFrame(pred_new, columns=["Category", "Aspect Name", "Aspect Value"])
    fold = sample["fold"].iat[0]
    pred_df_data.extend(
        {
            "Record Number": rid,
            "Category": cat,
            "Title": text,
            "Aspect Name": aspect_name,
            "Aspect Value": aspect_value,
            "fold": fold,
        }
        for cat, aspect_name, aspect_value in pred_new
    )

final_pred_df = pd.DataFrame(pred_df_data)
final_pred_df.to_csv(f"{save_dir}/relabeled_majority_vote.tsv", sep="\t", index=False)
scores = compute_competition_score(all_targets, all_preds, beta=0.2)
print("Per-category scores:", scores["per_category"])
print("Overall score:    ", scores["overall_score"])

100%|██████████| 5000/5000 [00:21<00:00, 233.06it/s]


Per-category scores: {'1': 0.9932040787854665, '2': 0.9904206108351544}
Overall score:     0.9918123448103104


In [41]:
sample

Unnamed: 0,Record Number,Category,Title,Aspect Name,Aspect Value,fold
0,1,2,MINI 1.6 W10B16A W11B16A R50 Steuerkettensatz ...,Kompatible_Fahrzeug_Marke,MINI,4
1,1,2,MINI 1.6 W10B16A W11B16A R50 Steuerkettensatz ...,Kompatibles_Fahrzeug_Modell,1.6,4
2,1,2,MINI 1.6 W10B16A W11B16A R50 Steuerkettensatz ...,Herstellernummer,W10B16A W11B16A R50,4
3,1,2,MINI 1.6 W10B16A W11B16A R50 Steuerkettensatz ...,Produktart,Steuerkettensatz,4
4,1,2,MINI 1.6 W10B16A W11B16A R50 Steuerkettensatz ...,Herstellernummer,11311485400,4
5,1,2,MINI 1.6 W10B16A W11B16A R50 Steuerkettensatz ...,Im_Lieferumfang_Enthalten,Steuerkette,4
6,1,2,MINI 1.6 W10B16A W11B16A R50 Steuerkettensatz ...,Hersteller,FEBI,4
7,1,2,MINI 1.6 W10B16A W11B16A R50 Steuerkettensatz ...,Herstellernummer,31803,4


In [42]:
gold_df

Unnamed: 0,Category,Aspect Name,Aspect Value
0,2,Kompatible_Fahrzeug_Marke,MINI
1,2,Kompatibles_Fahrzeug_Modell,1.6
2,2,Herstellernummer,W10B16A W11B16A R50
3,2,Produktart,Steuerkettensatz
4,2,Herstellernummer,11311485400
5,2,Im_Lieferumfang_Enthalten,Steuerkette
6,2,Hersteller,FEBI
7,2,Herstellernummer,31803


In [43]:
pred_df

Unnamed: 0,Category,Aspect Name,Aspect Value
0,2,Kompatible_Fahrzeug_Marke,MINI
1,2,O,1.6 W10B16A
2,2,O,W11B16A
3,2,O,R50
4,2,O,Steuerkettensatz
5,2,Herstellernummer,11311485400
6,2,Im_Lieferumfang_Enthalten,Steuerkette
7,2,Hersteller,FEBI
8,2,Herstellernummer,31803


In [44]:
df0

Unnamed: 0,Record Number,Category,Title,Aspect Name,Aspect Value
0,1,2,MINI 1.6 W10B16A W11B16A R50 Steuerkettensatz ...,Kompatible_Fahrzeug_Marke,MINI
1,1,2,MINI 1.6 W10B16A W11B16A R50 Steuerkettensatz ...,Kompatibles_Fahrzeug_Modell,1.6
2,1,2,MINI 1.6 W10B16A W11B16A R50 Steuerkettensatz ...,Herstellernummer,W10B16A W11B16A R50
3,1,2,MINI 1.6 W10B16A W11B16A R50 Steuerkettensatz ...,Produktart,Steuerkettensatz
4,1,2,MINI 1.6 W10B16A W11B16A R50 Steuerkettensatz ...,Herstellernummer,11311485400
...,...,...,...,...,...
47196,5000,2,BOSCH Zahnriemen + Rollensatz + MEYLE Wasserpu...,O,für
47197,5000,2,BOSCH Zahnriemen + Rollensatz + MEYLE Wasserpu...,Kompatible_Fahrzeug_Marke,Opel
47198,5000,2,BOSCH Zahnriemen + Rollensatz + MEYLE Wasserpu...,Kompatibles_Fahrzeug_Modell,Saab 1.8
47199,5000,2,BOSCH Zahnriemen + Rollensatz + MEYLE Wasserpu...,O,/


In [None]:
# Single run example output:
# Per-category scores: {'1': 0.9125085155235677, '2': 0.8913445070630057}
# Overall score:     0.9019265112932867

# Num average 3
# Per-category scores: {'1': 0.9132383997713068, '2': 0.8933765286563429}
# Overall score:     0.9033074642138248

# Num average 5
# Per-category scores: {'1': 0.9137532273150749, '2': 0.8945783405757166}
# Overall score:     0.9041657839453958

# Num average 7
# Per-category scores: {'1': 0.9146475727093677, '2': 0.8950353910277707}
# Overall score:     0.9048414818685693

# Majority voting
# Per-category scores: {'1': 0.9149438253586415, '2': 0.8967396044613878}
# Overall score:     0.9058417149100146

In [26]:
gold_vals

{('2', 'Hersteller', 'BOSCH'),
 ('2', 'Hersteller', 'MEYLE'),
 ('2', 'Im_Lieferumfang_Enthalten', 'Rollensatz'),
 ('2', 'Im_Lieferumfang_Enthalten', 'Wasserpumpe'),
 ('2', 'Kompatible_Fahrzeug_Marke', 'Opel'),
 ('2', 'Kompatibles_Fahrzeug_Modell', '1.8'),
 ('2', 'Kompatibles_Fahrzeug_Modell', 'Saab 1.8'),
 ('2', 'O', '+'),
 ('2', 'O', '/'),
 ('2', 'O', 'für'),
 ('2', 'Produktart', 'Zahnriemen')}

In [None]:
# Relabel every record using the union of the predictions of all saved
# models: a prediction is kept when any model produced a triple that matches
# gold, everything else is relabeled "O". Results are written to a TSV and
# scored against the gold annotations.
all_targets = []
all_preds = []
pred_df_data = []

for rid in tqdm(df1['Record Number'].unique()):
    sample = df1[df1['Record Number'] == rid]
    text   = sample['Title'].iat[0]
    text   = text.replace("\uFFFD", "Ü")
    text   = strip_invis(text)  # Strip invisible characters
    gold   = sample[['Category','Aspect Name','Aspect Value']].values.tolist()
    category = int(sample["Category"].iat[0])

    # Full tokenization with all information
    tokenized = tokenizer(text, return_tensors="pt", return_offsets_mapping=True)

    # NOTE(review): allow_pickle=True can execute code embedded in the file;
    # only load prediction files that were produced locally.
    all_pred_labels = np.load(f"{save_dir}/rid_{rid}/preds_labels.npy", allow_pickle=True).tolist()
    all_softmax = np.load(f"{save_dir}/rid_{rid}/softmax.npy", allow_pickle=True).tolist()

    assert len(all_pred_labels) == 7, f"Expected 7 sets of predictions, got {len(all_pred_labels)}"
    assert len(all_softmax) == 7, f"Expected 7 sets of softmax scores, got {len(all_softmax)}"

    tokens_text = tokenizer.convert_ids_to_tokens(tokenized.input_ids[0], skip_special_tokens=False)
    offset_mapping = tokenized['offset_mapping'][0].cpu()

    # Identical for every model, so computed once per record.
    words = text.split()
    record_category = sample["Category"].iat[0]

    # Get predictions from all models
    all_preds_for_record = []
    for pred_labels in all_pred_labels:
        word_level_preds = resolve_word_labels(tokens_text, pred_labels, offset_mapping)

        # Validate word alignment; the count check reports a surplus or
        # shortage of predictions instead of an out-of-range lookup.
        assert len(word_level_preds) == len(words), (
            f"Word count mismatch: {len(word_level_preds)} predictions != {len(words)} words"
        )
        for i, (word, word_pred) in enumerate(zip(words, word_level_preds)):
            e_word = text[word_pred['start']:word_pred['end']].strip()
            assert word == e_word, f"Word mismatch: {word} != {e_word} at index {i}"

        spans = merge_spans(word_level_preds)

        all_preds_for_record.extend(
            (record_category,
             s["aspect_name"].strip(),
             text[s["start"]:s["end"]].strip())
            for s in spans
        )

    # Collect all unique predictions across all models. dict.fromkeys keeps
    # first-seen order, so the output rows are reproducible between runs
    # (iterating a set of string tuples is not).
    all_unique_preds = list(dict.fromkeys(all_preds_for_record))

    # Keep predictions that match gold from ANY model; relabel the rest "O".
    relabeled = []
    for unique_pred in all_unique_preds:
        found_in_gold = any(
            unique_pred[0] == g[0]
            and unique_pred[1] == g[1]
            and unique_pred[2].strip() == g[2].strip()
            for g in gold
        )
        if found_in_gold:
            relabeled.append(unique_pred)
        else:
            relabeled.append((unique_pred[0], "O", unique_pred[2].strip()))

    # Different wrong aspect names for the same text all collapse to the
    # same "O" triple, so de-duplicate again after relabeling.
    pred_new = list(dict.fromkeys(relabeled))

    all_targets.extend(gold)
    all_preds.extend(pred_new)

    # Per-record frame of the relabeled triples (last one stays available
    # for inspection after the loop).
    pred_df = pd.DataFrame(pred_new, columns=["Category", "Aspect Name", "Aspect Value"])
    fold = sample["fold"].iat[0]
    pred_df_data.extend(
        {
            "Record Number": rid,
            "Category": cat,
            "Title": text,
            "Aspect Name": aspect_name,
            "Aspect Value": aspect_value,
            "fold": fold,
        }
        for cat, aspect_name, aspect_value in pred_new
    )

final_pred_df = pd.DataFrame(pred_df_data)
final_pred_df.to_csv(f"{save_dir}/relabeled_any_one.tsv", sep="\t", index=False)

scores = compute_competition_score(all_targets, all_preds, beta=0.2)
print("Per-category scores:", scores["per_category"])
print("Overall score:    ", scores["overall_score"])

 13%|█▎        | 629/5000 [00:03<00:25, 173.27it/s]

In [28]:
pred

[('2', 'Hersteller', 'BOSCH'),
 ('2', 'Produktart', 'Zahnriemen'),
 ('2', 'O', '+'),
 ('2', 'Produktart', 'Rollensatz'),
 ('2', 'O', '+'),
 ('2', 'Hersteller', 'MEYLE'),
 ('2', 'Im_Lieferumfang_Enthalten', 'Wasserpumpe'),
 ('2', 'O', 'für'),
 ('2', 'Kompatible_Fahrzeug_Marke', 'Opel'),
 ('2', 'Kompatible_Fahrzeug_Marke', 'Saab'),
 ('2', 'Kompatibles_Fahrzeug_Modell', '1.8'),
 ('2', 'O', '/'),
 ('2', 'Kompatibles_Fahrzeug_Modell', '1.8')]