In [None]:
import pandas as pd
import numpy as np

## 1. Load the dataset

In [None]:
# Location of the results workbook (relative to the notebook's working directory)
file_path = "results.xlsx"

# Read the "Tabelle1" worksheet into a DataFrame
df = pd.read_excel(io=file_path, sheet_name="Tabelle1")

# Show the first five rows as a quick sanity check
df.head(n=5)

## 2. Filter valid runs
We keep only the runs that cover all 200 test instances; runs with any other instance count are dropped.  

In [None]:
# Keep only the rows whose instance count is exactly 200, as an independent copy
df = df.loc[df["num_instances"].eq(200)].copy()
print(f"Valid runs after filtering: {len(df)}")

## 3. Define helper functions
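We implement:
- **Median–interpercentile scaling (10–90%)**: each metric is centred on its median and divided by the spread between its 10th and 90th percentiles, $(x - \mathrm{median}(x)) / (P_{90}(x) - P_{10}(x))$.  
- **AUTORANK computation with custom weights**: the scaled metrics are oriented so that higher is better, combined into a weighted average, and mapped linearly onto $[1, N]$, where 1 is the best run and $N$ the worst.  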

In [None]:
# Normalization
def median_interpercentile_scale(values, low=10, high=90):
    """Centre ``values`` on their median and scale by an inter-percentile range.

    Parameters
    ----------
    values : array-like of numbers
        Raw metric values. NaN entries are ignored when estimating the median
        and the percentiles, and stay NaN in the result.
    low, high : float
        Lower and upper percentiles (0-100) whose difference is the scale.

    Returns
    -------
    numpy.ndarray
        ``(values - median) / (p_high - p_low)`` as floats. When the
        percentile range is not positive (e.g. constant input) the divisor
        falls back to 1.0, so constant input maps to zeros. Empty or all-NaN
        input yields an array of the same shape filled with NaN.
    """
    values = np.asarray(values, dtype=float)

    # Estimate location and scale from the observed values only; a single NaN
    # would otherwise turn the median and both percentiles into NaN and wipe
    # out the whole column.
    observed = values[~np.isnan(values)]
    if observed.size == 0:
        return np.full(values.shape, np.nan)

    median = np.median(observed)
    p_low, p_high = np.percentile(observed, [low, high])
    denom = p_high - p_low if p_high > p_low else 1.0
    return (values - median) / denom

# AUTORANK with custom weights
def compute_autorank_weighted(data, weights, directions, low=10, high=90):
    """Compute weighted AUTORANK scores for the runs in ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per run, one numeric column per metric.
    weights : sequence of float
        One weight per metric column; normalised to sum to 1 internally.
    directions : sequence of bool
        One flag per metric column: truthy if higher is better, falsy if
        lower is better (such columns are negated after scaling).
    low, high : float
        Percentiles passed on to ``median_interpercentile_scale``.

    Returns
    -------
    (avg_score, rescaled, autorank) : tuple of numpy.ndarray
        ``avg_score`` is the weighted mean of the oriented, scaled metrics
        (higher = better); ``rescaled`` maps it linearly onto [1, N] with
        N = best; ``autorank`` reverses that so 1 = best and N = worst.
        N is the number of runs with a non-NaN ``avg_score``. Runs whose
        score is NaN (a metric value is missing) get NaN in ``rescaled`` and
        ``autorank`` instead of collapsing every run to rank 1.

    Raises
    ------
    ValueError
        If ``weights`` or ``directions`` do not have exactly one entry per
        metric column, or if the weights do not have a finite, non-zero sum.
    """
    n_metrics = data.shape[1]
    weights = np.asarray(weights, dtype=float)
    directions = list(directions)

    # A short ``directions`` list would silently leave metrics un-flipped, so
    # insist on one weight and one direction per column.
    if weights.shape != (n_metrics,) or len(directions) != n_metrics:
        raise ValueError(
            f"Expected {n_metrics} weights and directions (one per metric column), "
            f"got {weights.size} weights and {len(directions)} directions."
        )
    weight_total = weights.sum()
    if weight_total == 0 or not np.isfinite(weight_total):
        raise ValueError("weights must have a finite, non-zero sum.")

    # 1. Median-interpercentile scaling, column by column
    raw = data.to_numpy(dtype=float)
    scaled = np.empty_like(raw)
    for j in range(n_metrics):
        scaled[:, j] = median_interpercentile_scale(raw[:, j], low, high)

    # 2. Flip orientation if lower=better
    signs = np.where([bool(up) for up in directions], 1.0, -1.0)
    oriented = scaled * signs

    # 3. Weighted average
    weights = weights / weight_total
    avg_score = np.dot(oriented, weights)

    # Only runs with a usable score take part in the rank mapping.
    valid = ~np.isnan(avg_score)
    n_valid = int(valid.sum())
    rescaled = np.full(avg_score.shape, np.nan)
    autorank = np.full(avg_score.shape, np.nan)
    if n_valid == 0:
        return avg_score, rescaled, autorank

    # 4. Rescale to [1, N]
    scores = avg_score[valid]
    score_min, score_max = scores.min(), scores.max()
    if score_max > score_min:
        rescaled[valid] = 1 + (scores - score_min) * (n_valid - 1) / (score_max - score_min)
    else:
        # All runs tie (or there is a single run): everyone sits at 1.
        rescaled[valid] = 1.0

    # 5. Final AUTORANK mapping (1=best, N=worst)
    ranked = rescaled[valid]
    rank_min, rank_max = ranked.min(), ranked.max()
    if rank_max > rank_min:
        autorank[valid] = 1 + (rank_max - ranked) * (n_valid - 1) / (rank_max - rank_min)
    else:
        autorank[valid] = 1.0

    return avg_score, rescaled, autorank

## 4. Define metrics and weights
We evaluate with 3 metrics:
- **RMSE** (lower=better)  
- **MeaningBERT original–output** (higher=better)  
- **MeaningBERT reference–output** (higher=better)  

Weights:  
- RMSE = 0.50  
- MB-orig = 0.167  
- MB-ref = 0.333  

In [None]:
# One (column name, weight, higher-is-better) triple per metric, unzipped into
# the three parallel lists used by the scoring cells.
metrics, weights, directions = (
    list(field)
    for field in zip(
        ("rmse", 0.5, False),  # RMSE lower=better
        ("meaningbert-orig", 0.167, True),
        ("meaningbert-ref", 0.333, True),
    )
)

## 5. Compute AUTORANK scores

In [None]:
# Score only the metric columns, then attach the three result vectors to df
subdf = df.loc[:, metrics]
avg_score, rescaled, autorank = compute_autorank_weighted(
    data=subdf,
    weights=weights,
    directions=directions,
)

df["AvgScore"], df["Rescaled"], df["AUTORANK"] = avg_score, rescaled, autorank

## 6. Rankings
We generate:
1. **All runs ranking**  
2. **Best run per team ranking**  

In [None]:
# Ranking over every run, ordered by ascending AUTORANK
all_runs = df.sort_values(by="AUTORANK")

# For each team, pick the row with the smallest AUTORANK, then order the
# selected rows by ascending AUTORANK
best_runs = (
    df.loc[df.groupby("teamname")["AUTORANK"].idxmin()]
    .sort_values(by="AUTORANK")
)

(all_runs.head(), best_runs.head())

## 7. Export LaTeX tables

In [None]:
def to_latex_table(data, caption, label, metric_cols=None):
    """Render a ranking DataFrame as a LaTeX table string.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``teamname``, ``modelname``, the metric columns,
        ``AvgScore`` and ``AUTORANK``.
    caption, label : str
        Passed through to ``DataFrame.to_latex`` as the table caption/label.
    metric_cols : sequence of str, optional
        Metric column names to include, in order. Defaults to the
        notebook-level ``metrics`` list.

    Returns
    -------
    str
        The LaTeX source produced by ``DataFrame.to_latex``.
    """
    if metric_cols is None:
        metric_cols = metrics

    cols = ["teamname", "modelname"] + list(metric_cols) + ["AvgScore", "AUTORANK"]
    table = data[cols].copy()
    table["AvgScore"] = table["AvgScore"].round(3)
    table["AUTORANK"] = table["AUTORANK"].round(2)

    # Two left-aligned name columns followed by one centred column per numeric
    # field; derived from the column count so the spec stays valid when the
    # number of metrics changes (three metrics gives "llccccc").
    column_format = "ll" + "c" * (len(cols) - 2)

    # NOTE: float_format applies to every float column, so AUTORANK is printed
    # with three decimals even though it was rounded to two above.
    return table.to_latex(
        float_format="%.3f",
        index=False,
        escape=False,
        caption=caption,
        label=label,
        column_format=column_format,
        bold_rows=False
    )

# Table covering every submitted run
latex_all = to_latex_table(
    data=all_runs,
    caption="AUTORANK results for all submitted runs (custom weighting).",
    label="tab:autorank_all",
)
# Table restricted to each team's best run
latex_best = to_latex_table(
    data=best_runs,
    caption="AUTORANK results using the best run per team (custom weighting).",
    label="tab:autorank_best",
)

# Emit both tables, each terminated by a newline
print(latex_all, latex_best, sep="\n")