# Model Evaluation

# 1. Imports and Setup
## 1.1 Import Libraries

In [None]:
import numpy as np
import pandas as pd
import ast
import os

from gensim.models import Word2Vec

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, balanced_accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

from sentence_transformers import SentenceTransformer

import seaborn as sns
import matplotlib.pyplot as plt

## 1.2 Load Data

In [None]:
# Name of the column that holds the genre label of each song.
GENRE_COL = "tag"

df = pd.read_csv("data/clean/data.csv")

# Keep English lyrics only (same filter as in the other notebook).
df = df[df["language_cld3"] == "en"]

# Drop incomplete rows *before* parsing the tokens: ast.literal_eval raises
# on NaN, so parsing first would crash on any song without tokens.
df = df.dropna(subset=["tokens", GENRE_COL]).reset_index(drop=True)

# Token lists that were serialized to CSV come back as their string repr.
# Decide per row instead of looking only at the first row, so the cell also
# works on an empty frame and on columns that mix strings and real lists.
df["tokens"] = df["tokens"].apply(
    lambda toks: ast.literal_eval(toks) if isinstance(toks, str) else toks
)

print(df[[GENRE_COL, "tokens"]].head())
print("Number of songs:", len(df))

## 1.3 Load Word2Vec-Model

In [None]:
# Load the Word2Vec model stored on disk and report its embedding width
# as a quick sanity check that the expected model was picked up.
w2v_model = Word2Vec.load("models/word2vec_lyrics.model")
w2v_dim = w2v_model.vector_size
print("Loaded Word2Vec model with vector size:", w2v_dim)

# 2. Embedding models

In diesem Abschnitt vergleichen wir verschiedene Repräsentationen der Songtexte:

- **2.1 Word2Vec (mean)** – Durchschnitt der Wortvektoren pro Song
- **2.2 TF-IDF (Bag-of-Words)** – klassische sparse Repräsentation
- **2.3 Sentence Transformer** – moderne vortrainierte Text-Embeddings

Für jede Repräsentation trainieren wir eine logistische Regression mit `class_weight='balanced'` auf einem stratifizierten Train/Test-Split und evaluieren sie mit Accuracy, Balanced Accuracy, einem Classification Report und einer zeilenweise normalisierten Confusion Matrix.

In [None]:
def plot_confusion_matrix(cm, class_names, title="Confusion Matrix"):
    """
    Draw a confusion matrix as an annotated seaborn heatmap and show it.

    Args:
        cm: Square matrix to plot. Cells are annotated with two decimals,
            so a normalized matrix (values in [0, 1]) is expected.
        class_names: Tick labels used for both the x and the y axis.
        title: Title placed above the heatmap.
    """
    fig, ax = plt.subplots(figsize=(8, 6))

    sns.heatmap(
        cm,
        ax=ax,
        annot=True,
        fmt=".2f",
        cmap="viridis",  # same look as in the text-classification notebook
        square=True,
        xticklabels=class_names,
        yticklabels=class_names,
        cbar_kws={"shrink": 0.8},
    )

    ax.set_xlabel("Predicted label")
    ax.set_ylabel("True label")
    ax.set_title(title)

    # Slant the column labels so long class names do not overlap.
    plt.setp(ax.get_xticklabels(), rotation=30, ha="right")
    plt.setp(ax.get_yticklabels(), rotation=0)

    fig.tight_layout()
    plt.show()

In [None]:
def train_and_evaluate_representation(name, X, labels, test_size=0.2, random_state=42):
    """
    Train and evaluate a class-balanced Logistic Regression on given features.

    The labels are integer-encoded, the data is split into a stratified
    train/test partition, and the fitted classifier is scored on the test
    part. Accuracy, balanced accuracy and a classification report are
    printed, and a row-normalized confusion matrix is plotted.

    Args:
        name: Display name of the representation (used in prints and titles).
        X: Feature matrix with one row per sample (dense or sparse).
        labels: Class label of each row of ``X``.
        test_size: Fraction of the samples held out for testing.
        random_state: Seed for the train/test split.

    Returns:
        dict with the keys ``"name"``, ``"accuracy"``, ``"balanced_accuracy"``,
        ``"clf"`` (fitted classifier) and ``"label_encoder"`` (fitted encoder).
    """
    # Encode labels
    le = LabelEncoder()
    y = le.fit_transform(labels)
    # Integer ids of *all* classes. Passing them to the metrics below keeps
    # the report rows and matrix axes aligned with le.classes_ even when a
    # class is missing from both the test labels and the predictions.
    class_ids = np.arange(len(le.classes_))

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y,
    )

    # multi_class="auto" was only restating the default and is deprecated in
    # recent scikit-learn releases, so it is no longer passed.
    clf = LogisticRegression(
        max_iter=1000,
        n_jobs=-1,
        class_weight="balanced",
    )
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    bacc = balanced_accuracy_score(y_test, y_pred)

    print(f"\n=== {name} ===")
    print(f"Accuracy:          {acc:.3f}")
    print(f"Balanced accuracy: {bacc:.3f}\n")
    print("Classification report:\n")
    print(
        classification_report(
            y_test, y_pred, labels=class_ids, target_names=le.classes_
        )
    )

    # Row-normalized confusion matrix (each true-class row sums to 1).
    # normalize="true" maps rows without any true samples to 0 instead of
    # producing NaN from a division by zero.
    cm_norm = confusion_matrix(y_test, y_pred, labels=class_ids, normalize="true")
    plot_confusion_matrix(
        cm_norm,
        class_names=le.classes_,
        title=f"Normalized Confusion Matrix – {name}",
    )

    return {
        "name": name,
        "accuracy": acc,
        "balanced_accuracy": bacc,
        "clf": clf,
        "label_encoder": le,
    }


results = []  # to collect all experiments

## 2.1 Word2Vec (mean)

Hier bilden wir für jeden Song den Durchschnitt aller Word2Vec-Vektoren der enthaltenen Tokens.
Das ergibt einen dichten, niedrigdimensionalen Dokumentvektor pro Song. Songs, für die kein einziger Token-Vektor gebildet werden kann, werden verworfen.

In [None]:
def build_mean_w2v_embeddings(df_in, model, token_col="tokens"):
    """Compute simple mean Word2Vec document embeddings.

    For every row the vectors of all tokens known to ``model.wv`` are
    averaged. Rows whose token cell is not a list/tuple, or that contain no
    known token, get no embedding and are removed from the output.

    Args:
        df_in: DataFrame with one document per row.
        model: Object exposing ``wv`` with ``vector_size``, membership tests
            (``token in wv``) and vector lookup (``wv[token]``).
        token_col: Name of the column holding the token sequences.

    Returns:
        Tuple ``(embeddings, df_kept)``: a float32 array with one row per
        kept document and the matching rows of ``df_in`` with a fresh
        0..n-1 index.
    """
    wv = model.wv
    doc_emb = np.zeros((len(df_in), wv.vector_size), dtype=np.float32)
    # Explicit record of which rows received an embedding. Testing the vector
    # norm instead would also discard documents whose token vectors happen to
    # average to exactly zero.
    keep = np.zeros(len(df_in), dtype=bool)

    for i, tokens in enumerate(df_in[token_col]):
        if not isinstance(tokens, (list, tuple)):
            continue
        vectors = [wv[t] for t in tokens if t in wv]
        if vectors:
            doc_emb[i] = np.mean(vectors, axis=0)
            keep[i] = True

    return doc_emb[keep], df_in.loc[keep].reset_index(drop=True)


# Embed every song; the returned frame holds only the songs that received
# an embedding, so the labels must be read from it rather than from df.
X_w2v_mean, df_w2v_mean = build_mean_w2v_embeddings(df, w2v_model)
labels_w2v_mean = df_w2v_mean[GENRE_COL].astype(str).values

# Train/evaluate on the averaged vectors and record the outcome.
res_w2v_mean = train_and_evaluate_representation(
    name="Word2Vec (mean)",
    X=X_w2v_mean,
    labels=labels_w2v_mean,
)
results.append(res_w2v_mean)

## 2.2 TF-IDF (Bag-of-Words)

Als klassische Textrepräsentation verwenden wir hier ein TF-IDF-Vektormodell auf Wortebene.
Jeder Song wird durch einen hochdimensionalen, sparse Vektor beschrieben, der die Wichtigkeit von Wörtern im Song relativ zum Korpus widerspiegelt.

In [None]:
# Build plain text strings from tokens
df["lyrics_text"] = df["tokens"].apply(lambda toks: " ".join(toks))

tfidf_vectorizer = TfidfVectorizer(
    max_features=20000,  # optional: limit vocab size
)

X_tfidf = tfidf_vectorizer.fit_transform(df["lyrics_text"])
labels_tfidf = df[GENRE_COL].astype(str).values

# Note: X_tfidf is sparse; LogisticRegression can handle that.
res_tfidf = train_and_evaluate_representation(
    "TF-IDF (bag-of-words)", X_tfidf, labels_tfidf
)
results.append(res_tfidf)

## 2.3 Sentence Transformer

Hier verwenden wir ein vortrainiertes Sentence-Transformer-Modell (`all-MiniLM-L6-v2`),
das ganze Sätze bzw. Dokumente direkt in dichte semantische Vektoren einbettet.
Solche Repräsentationen sind in der Regel stärker als ein selbsttrainiertes Word2Vec-Modell und reichen oft nah an den Stand der Technik für Textklassifikation heran; ob das auch hier gilt, zeigt der Vergleich in Abschnitt 3.


In [None]:
st_model = SentenceTransformer("all-MiniLM-L6-v2")

# Reuses df["lyrics_text"], which the TF-IDF cell creates — run that cell
# first.
texts_st = df["lyrics_text"].tolist()
labels_st = df[GENRE_COL].astype(str).values

# NOTE(review): presumably lyrics longer than the model's maximum sequence
# length are truncated during encoding — confirm against the model card.
X_st = st_model.encode(
    texts_st,
    convert_to_numpy=True,
    show_progress_bar=True,
)

res_st = train_and_evaluate_representation(
    name="SentenceTransformer (all-MiniLM-L6-v2)",
    X=X_st,
    labels=labels_st,
)
results.append(res_st)


# 3. Vergleich der Embedding-Modelle

Zum Abschluss vergleichen wir die drei Ansätze anhand der Balanced Accuracy.

In [None]:
# One bar per collected experiment, in the order the experiments were run.
model_names = [r["name"] for r in results]
balanced_accs = [r["balanced_accuracy"] for r in results]

fig, ax = plt.subplots(figsize=(8, 5))
bars = ax.bar(model_names, balanced_accs)
ax.set_ylabel("Balanced accuracy")
ax.set_ylim(0, 1.0)  # fixed scale of the score, so bars are comparable
ax.set_title("Comparison of embedding models")

# Print each score slightly above its bar.
for bar, val in zip(bars, balanced_accs):
    x_center = bar.get_x() + bar.get_width() / 2.0
    y_above = bar.get_height() + 0.01
    ax.text(x_center, y_above, f"{val:.3f}", ha="center", va="bottom")

plt.setp(ax.get_xticklabels(), rotation=20, ha="right")
fig.tight_layout()
plt.show()