# 09 — Evaluation (Woche 2)

Vergleich von **TF-IDF** und **SBERT** (falls verfügbar) auf einem kleinen Ground-Truth-Set.

- Robust gegenüber dem Arbeitsverzeichnis: lädt `data/sample_corpus.json` oder `../data/sample_corpus.json`; ist keine der Dateien lesbar, wird ein eingebauter Mini-Korpus verwendet
- Metriken: Precision@k, MRR, nDCG
- Speed-Benchmark: Latenz pro Query
- Export: CSV mit Einzelresultaten

> Hinweis: SBERT wird **optional** geladen. Wenn das Paket/Modell fehlt, läuft alles mit TF-IDF weiter.

In [1]:
from __future__ import annotations
import os, json, time, importlib.util, warnings
from pathlib import Path
from typing import List, Dict, Tuple, Optional
from tqdm.auto import TqdmExperimentalWarning
warnings.filterwarnings("ignore", category=TqdmExperimentalWarning)
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# ---- Pfade & Daten laden (robust) ----
def load_texts() -> List[str]:
    """Load the evaluation corpus regardless of the current working directory.

    Candidate locations are probed in this order:

    1. ``<cwd>/data/sample_corpus.json``
    2. ``<cwd>/../data/sample_corpus.json``
    3. ``<directory of this file>/../data/sample_corpus.json`` (only when
       ``__file__`` is defined, which is not the case in a notebook kernel)

    The parsed JSON content of the first candidate that exists and can be
    decoded is returned unchanged. A candidate that exists but cannot be read
    or parsed is skipped with a ``RuntimeWarning`` so that a broken corpus
    file does not silently turn into an evaluation on the fallback data.

    Returns:
        The parsed corpus (expected to be a list of strings), or the built-in
        five-sentence fallback corpus when no candidate is usable.
    """
    relative = Path("data") / "sample_corpus.json"
    candidates = [Path.cwd() / relative, Path.cwd() / ".." / relative]
    if "__file__" in globals():
        candidates.append(Path(__file__).parent / ".." / relative)

    for path in candidates:
        try:
            if not path.exists():
                continue
            return json.loads(path.read_text(encoding="utf-8"))
        except (OSError, ValueError) as exc:
            # OSError: unreadable file; ValueError: invalid UTF-8 or JSON
            # (JSONDecodeError and UnicodeDecodeError both derive from it).
            warnings.warn(
                f"Skipping unusable corpus file {path}: {exc}",
                RuntimeWarning,
                stacklevel=2,
            )

    # Fallback mini corpus
    return [
        "Die Snare ist zu laut und harsch",
        "Kick zu weich, es fehlt der Punch",
        "Vocals klingen nasal, 800 Hz absenken",
        "Bass maskiert die Kick, Sidechain nötig",
        "S-Laute sind scharf, De-Esser einsetzen",
    ]

# Load the corpus once for all following cells. The trailing expression makes
# the notebook display the corpus size and the first three documents.
texts = load_texts()
len(texts), texts[:3]

(10,
 ['Die Kickdrum pumpt im Mix, aber die Snare wirkt zu dünn.',
  'Vocals sitzen zu weit hinten, mehr Präsenz im 3 kHz Bereich.',
  'Die Snare klingt trocken und etwas hart, vielleicht mehr Raumanteil.'])

## Ranker: TF-IDF + (optional) SBERT

In [2]:
# ---- TF-IDF Ranker ----
class TfidfRanker:
    """Rank a fixed document collection against a query using TF-IDF vectors.

    The vectorizer is fitted on the documents at construction time using
    lower-cased unigrams and bigrams; ``rank`` scores a query by the dot
    product between its vector and every document vector.
    """

    def __init__(self, docs: List[str]):
        """Fit the vectorizer on ``docs`` and keep the document matrix."""
        self.docs = docs
        self.vectorizer = TfidfVectorizer(lowercase=True, ngram_range=(1,2), min_df=1)
        self.X = self.vectorizer.fit_transform(docs)

    def rank(self, query: str, k: int = 5) -> Tuple[List[int], List[float]]:
        """Return the indices and scores of the ``k`` best-scoring documents.

        Args:
            query: Free-text search query.
            k: Maximum number of results to return.

        Returns:
            ``(indices, scores)`` as two parallel lists, best match first.
        """
        query_vec = self.vectorizer.transform([query])
        scores = (query_vec @ self.X.T).toarray().ravel()
        # Negating the scores turns the ascending argsort into "best first".
        best = np.argsort(-scores)[:k].tolist()
        return best, scores[best].tolist()

# ---- SBERT Ranker (optional) ----
def try_sbert(docs: List[str], model_name: str = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"):
    """Build an SBERT-based ranker, or return ``None`` if that is not possible.

    The ``sentence_transformers`` package is optional. When it is not
    installed, or when loading the model or encoding the documents raises,
    the function returns ``None`` (printing the reason in the failure case)
    so callers can continue with TF-IDF only.

    Args:
        docs: Documents to embed once up front.
        model_name: Name of the SentenceTransformer model to load on the CPU.

    Returns:
        An object exposing ``rank(query, k=5) -> (indices, scores)``, or
        ``None`` when SBERT is unavailable.
    """

    class _Ranker:
        """Dot-product ranker over pre-computed document embeddings."""

        def __init__(self, model, doc_emb):
            self.model = model
            self.doc_emb = doc_emb

        def rank(self, query: str, k: int = 5):
            """Return indices and scores of the ``k`` best documents."""
            query_emb = self.model.encode([query], normalize_embeddings=True)
            scores = (query_emb @ self.doc_emb.T).ravel()
            # Negating the scores turns the ascending argsort into "best first".
            best = np.argsort(-scores)[:k].tolist()
            return best, scores[best].tolist()

    try:
        if importlib.util.find_spec("sentence_transformers") is None:
            return None
        from sentence_transformers import SentenceTransformer

        # Silence library warnings emitted while loading and encoding.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            encoder = SentenceTransformer(model_name, device="cpu")
            corpus_emb = encoder.encode(docs, normalize_embeddings=True)
        return _Ranker(encoder, corpus_emb)
    except Exception as e:
        # Deliberately broad: any failure only disables the optional backend.
        print("[SBERT] nicht verfügbar:", e)
        return None

# Build both rankers over the same corpus. try_sbert() returns None when the
# optional SBERT backend cannot be set up, so later cells must handle None.
tfidf_ranker = TfidfRanker(texts)
sbert_ranker = try_sbert(texts)
print("SBERT aktiv:", sbert_ranker is not None)

SBERT aktiv: True


## Ground Truth (Mini-Set)
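Die GT mappt Suchanfragen auf relevante Dokument-Indizes (anpassbar/erweiterbar). Die Indizes beziehen sich auf die Positionen in `texts`. **Achtung:** Die unten eingetragenen Werte passen zum eingebauten Fallback-Mini-Korpus aus `load_texts()`; wird stattdessen `sample_corpus.json` geladen, müssen sie an dessen Reihenfolge angepasst werden.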

In [3]:
# Ground truth: query string -> list of relevant document indices into `texts`.
# NOTE(review): these indices line up with the built-in fallback corpus of
# load_texts() (e.g. index 0 = "Die Snare ist zu laut und harsch"). When a
# sample_corpus.json with different content or order is loaded, they
# presumably point at the wrong documents and must be re-labelled - verify.
GT: Dict[str, List[int]] = {
    "snare zu laut": [0],
    "kick mehr punch": [1],
    "vocals nasal 800 hz": [2],
    "bass maskiert kick": [3],
    "s-laute scharf de-esser": [4],
}
# Display the first two entries as a quick check.
list(GT.items())[:2]

[('snare zu laut', [0]), ('kick mehr punch', [1])]

## Metriken: Precision@k, MRR, nDCG

In [4]:
def precision_at_k(pred: List[int], rel: List[int], k: int) -> float:
    """Return the share of relevant documents among the first ``k`` results.

    The denominator is the number of results actually inspected (at most
    ``k``), so a result list shorter than ``k`` is not penalised. An empty
    window yields ``0.0``.

    Args:
        pred: Ranked document indices, best first.
        rel: Indices of the relevant documents.
        k: Cut-off rank.
    """
    window = pred[:k]
    if not window:
        return 0.0
    matched = [doc for doc in window if doc in rel]
    return len(matched) / len(window)

def reciprocal_rank(pred: List[int], rel: List[int]) -> float:
    """Return ``1 / rank`` of the first relevant result, or ``0.0`` if none.

    Args:
        pred: Ranked document indices, best first (rank 1 is ``pred[0]``).
        rel: Indices of the relevant documents.
    """
    hit_ranks = (pos for pos, doc in enumerate(pred, start=1) if doc in rel)
    first_hit = next(hit_ranks, None)
    if first_hit is None:
        return 0.0
    return 1.0 / first_hit

def ndcg_at_k(pred: List[int], rel: List[int], k: int) -> float:
    """Return the binary-gain nDCG of the first ``k`` results.

    Every relevant document contributes a gain of 1, discounted by
    ``log2(rank + 1)``. The ideal DCG assumes that ``min(len(unique rel), k)``
    relevant documents occupy the top ranks.

    Each relevant document is counted at most once: a repeated id in ``pred``
    earns no extra gain and repeated ids in ``rel`` do not inflate the ideal
    DCG, which keeps the result within ``[0, 1]``.

    Args:
        pred: Ranked document indices, best first.
        rel: Indices of the relevant documents.
        k: Cut-off rank; values ``<= 0`` give ``0.0``.

    Returns:
        The nDCG@k value, ``0.0`` when there are no relevant documents.
    """
    relevant = set(rel)
    if k <= 0 or not relevant:
        return 0.0

    credited = set()
    dcg = 0.0
    for rank, doc in enumerate(pred[:k], start=1):
        if doc in relevant and doc not in credited:
            credited.add(doc)
            dcg += 1.0 / np.log2(rank + 1)

    ideal_hits = min(len(relevant), k)
    idcg = sum(1.0 / np.log2(rank + 1) for rank in range(1, ideal_hits + 1))
    return float(dcg / idcg)

def evaluate(ranker, queries: Dict[str, List[int]], k: int = 5) -> pd.DataFrame:
    """Run every ground-truth query through ``ranker`` and score the result.

    Args:
        ranker: Object with ``rank(query, k=...) -> (indices, scores)``.
        queries: Mapping from query string to relevant document indices.
        k: Retrieval depth; at least 5 results are always requested because
            the reported cut-offs (P@1/3/5, nDCG@5) are fixed.

    Returns:
        One row per query with the metric values, the wall-clock latency of
        the ``rank`` call in milliseconds, the predicted indices and the
        similarity scores rounded to four decimals.
    """
    depth = max(k, 5)
    records = []
    for query, relevant in queries.items():
        # Time only the ranking call itself, not the metric computation.
        started = time.perf_counter()
        ranked, scores = ranker.rank(query, k=depth)
        elapsed_ms = (time.perf_counter() - started) * 1000

        record = {"query": query}
        record["P@1"] = precision_at_k(ranked, relevant, 1)
        record["P@3"] = precision_at_k(ranked, relevant, 3)
        record["P@5"] = precision_at_k(ranked, relevant, 5)
        record["MRR"] = reciprocal_rank(ranked, relevant)
        record["nDCG@5"] = ndcg_at_k(ranked, relevant, 5)
        record["latency_ms"] = elapsed_ms
        record["pred"] = ranked
        record["sims"] = [round(float(score), 4) for score in scores]
        records.append(record)
    return pd.DataFrame(records)

# Per-query results for the TF-IDF ranker; the bare name displays the table.
df_tfidf = evaluate(tfidf_ranker, GT)
df_tfidf

Unnamed: 0,query,P@1,P@3,P@5,MRR,nDCG@5,latency_ms,pred,sims
0,snare zu laut,0.0,0.333333,0.2,0.333333,0.5,2.477417,"[9, 5, 0, 2, 1]","[0.2896, 0.1791, 0.121, 0.0794, 0.0498]"
1,kick mehr punch,0.0,0.0,0.2,0.25,0.430677,1.083125,"[3, 8, 2, 1, 6]","[0.3109, 0.1237, 0.0577, 0.0568, 0.056]"
2,vocals nasal 800 hz,0.0,0.0,0.2,0.2,0.386853,0.371875,"[7, 1, 9, 0, 2]","[0.4427, 0.1007, 0.0782, 0.0, 0.0]"
3,bass maskiert kick,0.0,0.0,0.2,0.2,0.386853,0.23925,"[8, 0, 1, 2, 3]","[0.3244, 0.0, 0.0, 0.0, 0.0]"
4,s-laute scharf de-esser,0.0,0.333333,0.2,0.5,0.63093,0.897625,"[5, 4, 0, 1, 2]","[0.3386, 0.1068, 0.0, 0.0, 0.0]"


In [5]:
# Per-query results for SBERT. Without an SBERT ranker an empty frame with
# the same columns as the TF-IDF results keeps the following cells working.
df_sbert = (
    evaluate(sbert_ranker, GT)
    if sbert_ranker is not None
    else pd.DataFrame(columns=df_tfidf.columns)
)
df_sbert

Unnamed: 0,query,P@1,P@3,P@5,MRR,nDCG@5,latency_ms,pred,sims
0,snare zu laut,0.0,0.0,0.0,0.0,0.0,24.172875,"[5, 1, 2, 7, 3]","[0.604, 0.5759, 0.4062, 0.3972, 0.3829]"
1,kick mehr punch,0.0,0.0,0.0,0.0,0.0,16.65625,"[3, 0, 9, 6, 8]","[0.6462, 0.5553, 0.4025, 0.3059, 0.3031]"
2,vocals nasal 800 hz,0.0,0.0,0.0,0.0,0.0,18.422125,"[7, 1, 5, 9, 6]","[0.6452, 0.578, 0.5177, 0.359, 0.3325]"
3,bass maskiert kick,0.0,0.333333,0.2,0.5,0.63093,16.327625,"[8, 3, 0, 7, 6]","[0.7573, 0.6409, 0.502, 0.4062, 0.3781]"
4,s-laute scharf de-esser,0.0,0.0,0.0,0.0,0.0,18.351708,"[2, 0, 5, 6, 9]","[0.5402, 0.4996, 0.4752, 0.4484, 0.4215]"


## Zusammenfassung (Mittelwerte) & Vergleich

In [6]:
def summarize(df: pd.DataFrame, name: str) -> pd.DataFrame:
    """Collapse per-query results into a single row of column means.

    Args:
        df: Per-query result table containing the metric columns.
        name: Label stored in the ``model`` column.

    Returns:
        A one-row frame with ``model`` followed by the mean of each metric
        column; every metric is NaN when ``df`` is empty.
    """
    metric_cols = ["P@1","P@3","P@5","MRR","nDCG@5","latency_ms"]
    if df.empty:
        values = [np.nan] * len(metric_cols)
    else:
        means = df[metric_cols].mean().to_dict()
        values = [means[col] for col in metric_cols]
    return pd.DataFrame([[name, *values]], columns=["model", *metric_cols])

# One summary row per model, stacked into a single comparison table that the
# notebook displays (the SBERT row is all NaN when df_sbert is empty).
sum_tfidf = summarize(df_tfidf, "TF-IDF")
sum_sbert = summarize(df_sbert, "SBERT")
pd.concat([sum_tfidf, sum_sbert], ignore_index=True)

Unnamed: 0,model,P@1,P@3,P@5,MRR,nDCG@5,latency_ms
0,TF-IDF,0.0,0.133333,0.2,0.296667,0.467062,1.013858
1,SBERT,0.0,0.066667,0.04,0.1,0.126186,18.786117


## Export (CSV)

In [7]:
# Write the per-query results as CSV files, creating the directory if needed.
# NOTE(review): the target is always <cwd>/../data, i.e. this assumes the
# notebook runs from a sub-directory of the project (such as notebooks/).
# Unlike load_texts(), no alternative location is tried - confirm that this
# is intended when running from the project root.
out_dir = Path.cwd()/".."/"data"
out_dir.mkdir(parents=True, exist_ok=True)
(out_dir/"eval_week2_tfidf.csv").write_text(df_tfidf.to_csv(index=False), encoding="utf-8")
# The SBERT file is only written when SBERT results exist.
if not df_sbert.empty:
    (out_dir/"eval_week2_sbert.csv").write_text(df_sbert.to_csv(index=False), encoding="utf-8")
print("Export ->", out_dir)

Export -> /Users/sm/Documents/mixing-forum-analyzer/notebooks/../data


## Sanity-Checks (Top-Treffer ansehen)

In [8]:
def top_strings(pred_idx: List[int]) -> List[str]:
    """Map predicted document indices to their texts in the global ``texts``."""
    hits = []
    for doc_id in pred_idx:
        hits.append(texts[doc_id])
    return hits

# Print the top-3 documents of the first three ground-truth queries for every
# available ranker, so the rankings can be checked by eye.
for mname, r in (("TF-IDF", tfidf_ranker), ("SBERT", sbert_ranker)):
    if r is not None:
        print("\n==>", mname)
        for q in list(GT)[:3]:
            pred, _ = r.rank(q, k=3)
            print(f"{q!r} ->", top_strings(pred))


==> TF-IDF
'snare zu laut' -> ['Snare zu boxig, 300–500 Hz absenken, transientenfreundlicher Kompressor.', 'Der Gesang hat zu viel S-Laut, De-Esser vor dem Kompressor?', 'Die Kickdrum pumpt im Mix, aber die Snare wirkt zu dünn.']
'kick mehr punch' -> ['Bassdrum und Kickdrum werden oft verwechselt – ich brauche mehr Punch.', 'Sub-Bass ist maskierend, Sidechain von Kick zur Bassspur einrichten.', 'Die Snare klingt trocken und etwas hart, vielleicht mehr Raumanteil.']
'vocals nasal 800 hz' -> ['Gitarren klingen nasal, 800 Hz leicht absenken, dafür 4 kHz etwas anheben.', 'Vocals sitzen zu weit hinten, mehr Präsenz im 3 kHz Bereich.', 'Snare zu boxig, 300–500 Hz absenken, transientenfreundlicher Kompressor.']

==> SBERT
'snare zu laut' -> ['Der Gesang hat zu viel S-Laut, De-Esser vor dem Kompressor?', 'Vocals sitzen zu weit hinten, mehr Präsenz im 3 kHz Bereich.', 'Die Snare klingt trocken und etwas hart, vielleicht mehr Raumanteil.']
'kick mehr punch' -> ['Bassdrum und Kickdrum werden oft