# Task B — Ablation Analysis

This notebook evaluates how similarity rankings change under different scoring configurations (weights and multipliers).
It uses the joined RFQ + reference dataset produced in Task B.

In [2]:
import pandas as pd
from utils.similarity_score import DEFAULT_CONFIG, run_topk_with_config, compare_topk
import numpy as np
import os


# Joined RFQ + reference dataset produced in Task B.
joined_df = pd.read_csv("joined_all.csv")

# Directory that receives every CSV written by this notebook. It is created up
# front because DataFrame.to_csv does not create missing parent directories, so
# a fresh "Restart & Run All" would otherwise fail on the first write.
OUT = "ablation_out"
os.makedirs(OUT, exist_ok=True)

## Define configurations
- baseline: DEFAULT_CONFIG (w_dim=0.60, w_cat=0.30, w_cp=0.10)
- dims_only: dimensions only (all other weights 0)
- cat_only: categorical only
- cp_only: mechanical proximity only (Rm_mid)
- grade_dom: all three weights shrunk to 1e-6, so the final ordering is driven mostly by the grade/category multipliers

In [3]:
# Variant name -> weight settings handed to run_topk_with_config.
# 'baseline' passes an empty dict (presumably meaning "use DEFAULT_CONFIG as is" — confirm in utils.similarity_score).
configs = dict(
    baseline={},
    dims_only=dict(w_dim=1.0, w_cat=0.0, w_cp=0.0),
    cat_only=dict(w_dim=0.0, w_cat=1.0, w_cp=0.0),
    cp_only=dict(w_dim=0.0, w_cat=0.0, w_cp=1.0),
    grade_dom=dict(w_dim=1e-6, w_cat=1e-6, w_cp=1e-6),
)

# Number of ranked results kept per query.
top_k = 3

# allow category fallback with penalty
allow_fallback = True


# Run the top-k ranking once per configuration, keep each result table in
# memory (keyed by configuration name) and persist it as a CSV under OUT.
topk_tables = {}
for name, cfg in configs.items():
    df = run_topk_with_config(joined_df, cfg, name, top_k=top_k, allow_category_fallback=allow_fallback)
    topk_tables[name] = df
    # The file prefix follows top_k so the name stays truthful if the cutoff is
    # changed; for top_k = 3 this is still "top3_<name>.csv".
    df.to_csv(os.path.join(OUT, f"top{top_k}_{name}.csv"), index=False)

# Row count of each configuration's result table (displayed as the cell output).
{k: len(v) for k, v in topk_tables.items()}

{'baseline': 2832,
 'dims_only': 2832,
 'cat_only': 2832,
 'cp_only': 2832,
 'grade_dom': 2832}

## Compare variants to baseline
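For each non-baseline configuration, compare its top-k results with the baseline per query, save the comparison, and aggregate it into summary statistics: mean Jaccard@k, percentage of queries whose top-1 changed, mean rank change, and the average number of items added and removed.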

In [4]:

# Reference ranking that every other configuration is measured against.
baseline = topk_tables['baseline']
summ_rows = []

for name, tbl in topk_tables.items():
    # The baseline is the reference itself; only the other variants are compared.
    if name != 'baseline':
        comp = compare_topk(baseline, tbl, k=top_k)

        # Persist the full comparison table for this variant.
        comp_path = os.path.join(OUT, f"ablation_compare_baseline_vs_{name}.csv")
        comp.to_csv(comp_path, index=False)

        # Collapse the comparison columns into a single summary row.
        summ = {'config': name}
        summ['mean_jaccard@k'] = comp['jaccard_k'].mean()
        summ['top1_changed_pct'] = 100 * (1 - comp['top1_same'].mean())
        summ['mean_rank_change'] = comp['mean_rank_change'].mean()
        summ['avg_added'] = comp['added'].mean()
        summ['avg_removed'] = comp['removed'].mean()
        summ_rows.append(summ)

# One row per variant; written in loop order, displayed sorted by overlap with the baseline.
summary = pd.DataFrame(summ_rows)
summary_path = os.path.join(OUT, "ablation_summary.csv")
summary.to_csv(summary_path, index=False)
summary.sort_values(by='mean_jaccard@k', ascending=False)


Unnamed: 0,config,mean_jaccard@k,top1_changed_pct,mean_rank_change,avg_added,avg_removed
3,grade_dom,0.699274,18.703507,0.239837,0.699256,0.697131
0,dims_only,0.642012,24.654623,0.248759,0.855473,0.85441
2,cp_only,0.514984,34.643996,0.307007,1.215728,1.213603
1,cat_only,0.418119,47.715197,0.434532,1.506908,1.501594
