# Model Evaluation

---

# 1. Imports and Setup
## 1.1 Import Libraries

In [None]:
import numpy as np
import pandas as pd
import ast

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score, balanced_accuracy_score, confusion_matrix


from sentence_transformers import SentenceTransformer

import matplotlib.pyplot as plt
import seaborn as sns
import torch

import re

from gensim.models import Word2Vec

# Log the library versions so results can be traced back to the environment.
for lib in (np, torch):
    print(lib.__version__)

## 1.2 Load Data

In [None]:
# Read the cleaned dataset (path is relative to the working directory).
DATA_PATH = "data/clean/data.csv"
df = pd.read_csv(DATA_PATH)

# The token column is parsed from its textual form back into Python objects
# with the safe literal evaluator.
df["tokens"] = df["tokens"].apply(ast.literal_eval)
texts, labels = df["tokens"], df["tag"]

# Sanity check: preview the parsed column and the type of its first entry.
print(texts.head())
print(type(texts.iloc[0]))

## 1.3 Encode labels

In [None]:
# Map every genre string to an integer class id.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(labels)

# Print the id -> genre mapping used by the reports and plots further down.
print("Classes (genres):")
for class_id, genre in enumerate(label_encoder.classes_):
    print(class_id, genre, sep=": ")

# 2. Model
## 2.1 Train-Test-Split

In [None]:
# Stratified 80/20 hold-out split with a fixed seed, so the class proportions
# of the full dataset are kept in both parts.
split_parts = train_test_split(
    texts,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)
X_train_texts, X_test_texts, y_train, y_test = split_parts

print("Train size:", len(X_train_texts))
print("Test size:", len(X_test_texts))

## 2.2 Embedding (Word2Vec)

In [None]:
# Materialise both splits as plain Python lists (one token list per document).
X_train_tokens, X_test_tokens = (
    split.tolist() for split in (X_train_texts, X_test_texts)
)

In [None]:
# Hyper-parameters for the Word2Vec model (sg=1 selects skip-gram).
w2v_params = {
    "vector_size": 100,
    "window": 5,
    "min_count": 5,
    "workers": 4,
    "sg": 1,
    "epochs": 10,
}

# Train on the training documents only, so the test split stays unseen.
w2v = Word2Vec(sentences=X_train_tokens, **w2v_params)

print("Vocabulary size:", len(w2v.wv))

In [None]:
def embed_sentence(tokens, model):
    """Return the mean word vector of the in-vocabulary tokens.

    Args:
        tokens: Iterable of tokens for one document.
        model: Object exposing ``wv`` (a token -> vector lookup that also
            supports ``in``) and ``vector_size``, e.g. a trained Word2Vec
            model.

    Returns:
        A 1-D array of length ``model.vector_size``. When none of the tokens
        is in the vocabulary (or ``tokens`` is empty), a zero vector is
        returned.
    """
    vectors = [model.wv[w] for w in tokens if w in model.wv]
    if not vectors:
        # gensim keeps its word vectors in float32; returning the fallback in
        # the same dtype avoids upcasting the whole stacked matrix to float64
        # because of a single out-of-vocabulary document.
        return np.zeros(model.vector_size, dtype=np.float32)
    return np.mean(vectors, axis=0)

# Build one row per document for each split by averaging its word vectors.
X_train_emb, X_test_emb = (
    np.vstack([embed_sentence(doc, w2v) for doc in docs])
    for docs in (X_train_tokens, X_test_tokens)
)

print("X_train_emb shape:", X_train_emb.shape)
print("X_test_emb shape:", X_test_emb.shape)

### 2.2.1 Train Classification-Model (LinearSVC)

In [None]:
# Linear SVM on the averaged Word2Vec features; "balanced" reweights classes
# to counter the genre imbalance.
clf_w2v_svc = LinearSVC(
    class_weight="balanced",
    max_iter=10000,
)
clf_w2v_svc.fit(X=X_train_emb, y=y_train)

In [None]:
# Predict genres for the held-out documents.
y_pred_w2v_svc = clf_w2v_svc.predict(X_test_emb)

print("=== Word2Vec + LinearSVC ===")
print("Accuracy:", accuracy_score(y_test, y_pred_w2v_svc))
print("Balanced accuracy:", balanced_accuracy_score(y_test, y_pred_w2v_svc))
# Pass the full label set explicitly so the report rows always line up with
# target_names, even if a class is missing from y_test and the predictions.
print(classification_report(
    y_test,
    y_pred_w2v_svc,
    labels=np.arange(len(label_encoder.classes_)),
    target_names=label_encoder.classes_,
))

In [None]:
# Raw counts: rows are true genres, columns are predicted genres.
# LabelEncoder ids are 0..n_classes-1, so arange covers every class.
cm = confusion_matrix(
    y_test,
    y_pred_w2v_svc,
    labels=np.arange(len(label_encoder.classes_)),
)

# Row-normalise so each row shows how a true class is distributed over the
# predictions. Rows without any true samples stay 0 instead of becoming NaN.
row_totals = cm.sum(axis=1, keepdims=True)
cm_norm = np.divide(
    cm,
    row_totals,
    out=np.zeros(cm.shape, dtype=float),
    where=row_totals != 0,
)

plt.figure()
sns.heatmap(
    cm_norm,
    annot=True,
    fmt='.2f',
    cmap="Blues",
    xticklabels=label_encoder.classes_,
    yticklabels=label_encoder.classes_,
)
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.title("Normalized Confusion Matrix – Word2Vec + LinearSVC")
plt.tight_layout()
plt.show()


### 2.2.2 Train Classification-Model (Logistic Regression)

In [None]:
# Logistic regression on the averaged Word2Vec features, with class
# reweighting to counter the genre imbalance.
w2v_logreg_params = {
    "max_iter": 2000,
    "n_jobs": -1,
    "class_weight": "balanced",
}
clf_w2v_logreg = LogisticRegression(**w2v_logreg_params)

clf_w2v_logreg.fit(X=X_train_emb, y=y_train)

In [None]:
# Predict genres for the held-out documents.
y_pred_w2v_logreg = clf_w2v_logreg.predict(X_test_emb)

print("=== Word2Vec + Logistic Regression ===")
print("Accuracy:", accuracy_score(y_test, y_pred_w2v_logreg))
print("Balanced accuracy:", balanced_accuracy_score(y_test, y_pred_w2v_logreg))
print("\nClassification report:\n")
# Pass the full label set explicitly so the report rows always line up with
# target_names, even if a class is missing from y_test and the predictions.
print(classification_report(
    y_test,
    y_pred_w2v_logreg,
    labels=np.arange(len(label_encoder.classes_)),
    target_names=label_encoder.classes_,
))

In [None]:
# Raw counts: rows are true genres, columns are predicted genres.
# LabelEncoder ids are 0..n_classes-1, so arange covers every class.
cm = confusion_matrix(
    y_test,
    y_pred_w2v_logreg,
    labels=np.arange(len(label_encoder.classes_)),
)

# Row-normalise so each row shows how a true class is distributed over the
# predictions. Rows without any true samples stay 0 instead of becoming NaN.
row_totals = cm.sum(axis=1, keepdims=True)
cm_norm = np.divide(
    cm,
    row_totals,
    out=np.zeros(cm.shape, dtype=float),
    where=row_totals != 0,
)

plt.figure()
sns.heatmap(
    cm_norm,
    annot=True,
    fmt='.2f',
    cmap="Blues",
    xticklabels=label_encoder.classes_,
    yticklabels=label_encoder.classes_,
)
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.title("Normalized Confusion Matrix – Word2Vec + Logistic Regression")
plt.tight_layout()
plt.show()

**Word2Vec + LinearSVC**

    - Accuracy: ~0.574
    - Balanced Accuracy: ~0.502

**Word2Vec + Logistic Regression**

    - Accuracy: ~0.465
    - Balanced Accuracy: ~0.555

The Word2Vec experiments show clear differences between the two classifiers. LinearSVC performs best overall, reaching an accuracy of about 57% and a balanced accuracy of 50%. It handles the dominant genres (rap and pop) well, but struggles with smaller classes, which is expected given their limited representation.

Logistic Regression, in contrast, achieves a lower overall accuracy of around 46%, but a slightly higher balanced accuracy of 55%. This indicates that it distributes attention more evenly across genres and performs better on minority classes such as country, misc, and rb. However, it has difficulty separating the larger and semantically similar classes, particularly pop, which reduces its overall performance.

In summary, LinearSVC is the stronger classifier for Word2Vec embeddings, while Logistic Regression provides a more balanced but less accurate alternative.

## 2.3 Embedding (TF-IDF)

In [None]:
# Re-assemble each token list into one space-separated string, the input
# format expected by the character n-gram vectorizer.
join_tokens = " ".join
X_train_texts_char = X_train_texts.apply(join_tokens)
X_test_texts_char = X_test_texts.apply(join_tokens)

In [None]:
# Character 3- to 5-grams; n-grams seen in fewer than 5 documents or in more
# than 90% of the documents are dropped.
tfidf_params = {
    "analyzer": "char",
    "ngram_range": (3, 5),
    "min_df": 5,
    "max_df": 0.9,
}
tfidf = TfidfVectorizer(**tfidf_params)

# Fit the vocabulary and IDF weights on the training split only, then apply
# the fitted vectorizer to the test split.
X_train_tfidf = tfidf.fit_transform(X_train_texts_char)
X_test_tfidf = tfidf.transform(X_test_texts_char)

### 2.3.1 Train Classification-Model (LinearSVC)

In [None]:
# Linear SVM on the sparse TF-IDF features, with class reweighting to
# counter the genre imbalance.
clf_tfidf_svc = LinearSVC(
    class_weight="balanced",
)
clf_tfidf_svc.fit(X=X_train_tfidf, y=y_train)

In [None]:
# Predict genres for the held-out documents.
y_pred_tfidf_svc = clf_tfidf_svc.predict(X_test_tfidf)

print("=== TF-IDF + LinearSVC ===")
print("Accuracy:", accuracy_score(y_test, y_pred_tfidf_svc))
print("Balanced accuracy:", balanced_accuracy_score(y_test, y_pred_tfidf_svc))
print("\nClassification report:\n")
# Pass the full label set explicitly so the report rows always line up with
# target_names, even if a class is missing from y_test and the predictions.
print(classification_report(
    y_test,
    y_pred_tfidf_svc,
    labels=np.arange(len(label_encoder.classes_)),
    target_names=label_encoder.classes_,
))

In [None]:
# Raw counts: rows are true genres, columns are predicted genres.
# LabelEncoder ids are 0..n_classes-1, so arange covers every class.
cm = confusion_matrix(
    y_test,
    y_pred_tfidf_svc,
    labels=np.arange(len(label_encoder.classes_)),
)

# Row-normalise so each row shows how a true class is distributed over the
# predictions. Rows without any true samples stay 0 instead of becoming NaN.
row_totals = cm.sum(axis=1, keepdims=True)
cm_norm = np.divide(
    cm,
    row_totals,
    out=np.zeros(cm.shape, dtype=float),
    where=row_totals != 0,
)

plt.figure()
sns.heatmap(
    cm_norm,
    annot=True,
    fmt='.2f',
    cmap="Blues",
    xticklabels=label_encoder.classes_,
    yticklabels=label_encoder.classes_,
)
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.title("Normalized Confusion Matrix – TF-IDF + LinearSVC")
plt.tight_layout()
plt.show()

### 2.3.2 Train Classification-Model (Logistic Regression)

In [None]:
# Logistic regression on the sparse TF-IDF features, with class reweighting
# to counter the genre imbalance.
tfidf_logreg_params = {
    "max_iter": 2000,
    "n_jobs": -1,
    "class_weight": "balanced",
}
clf_tfidf_logreg = LogisticRegression(**tfidf_logreg_params)

clf_tfidf_logreg.fit(X=X_train_tfidf, y=y_train)

In [None]:
# Predict genres for the held-out documents.
y_pred_tfidf_logreg = clf_tfidf_logreg.predict(X_test_tfidf)

print("=== TF-IDF + Logistic Regression ===")
print("Accuracy:", accuracy_score(y_test, y_pred_tfidf_logreg))
print("Balanced accuracy:", balanced_accuracy_score(y_test, y_pred_tfidf_logreg))
print("\nClassification report:\n")
# Pass the full label set explicitly so the report rows always line up with
# target_names, even if a class is missing from y_test and the predictions.
print(classification_report(
    y_test,
    y_pred_tfidf_logreg,
    labels=np.arange(len(label_encoder.classes_)),
    target_names=label_encoder.classes_,
))

In [None]:
# Raw counts: rows are true genres, columns are predicted genres.
# LabelEncoder ids are 0..n_classes-1, so arange covers every class.
cm = confusion_matrix(
    y_test,
    y_pred_tfidf_logreg,
    labels=np.arange(len(label_encoder.classes_)),
)

# Row-normalise so each row shows how a true class is distributed over the
# predictions. Rows without any true samples stay 0 instead of becoming NaN.
row_totals = cm.sum(axis=1, keepdims=True)
cm_norm = np.divide(
    cm,
    row_totals,
    out=np.zeros(cm.shape, dtype=float),
    where=row_totals != 0,
)

plt.figure()
sns.heatmap(
    cm_norm,
    annot=True,
    fmt='.2f',
    cmap="Blues",
    xticklabels=label_encoder.classes_,
    yticklabels=label_encoder.classes_,
)
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.title("Normalized Confusion Matrix – TF-IDF + Logistic Regression")
plt.tight_layout()
plt.show()

**TF-IDF + LinearSVC**

    - Accuracy: ~0.59
    - Balanced Accuracy: ~0.46

**TF-IDF + Logistic Regression**

    - Accuracy: ~0.55
    - Balanced Accuracy: ~0.53

With TF-IDF embeddings, LinearSVC achieves the highest overall accuracy, driven mainly by strong performance on the dominant genres such as rap and pop. However, its lower balanced accuracy shows that minority genres are classified less reliably.

Logistic Regression performs slightly worse in overall accuracy but achieves a higher balanced accuracy, indicating better handling of smaller genres like country, misc, and rb. It distributes predictions more evenly across classes but struggles with the large and overlapping genres, especially pop.

In summary, LinearSVC offers the best overall performance with TF-IDF, while Logistic Regression provides a more balanced but slightly weaker alternative.

## 2.4 Embedding (Transformer)

SentenceTransformer models (e.g. `all-MiniLM-L6-v2`) are based on Transformer backbones. These models already ship with an integrated, pre-trained tokenization pipeline, so the token lists are joined back into plain strings before encoding.

The argument `device="cpu"` makes the SentenceTransformer model run on the CPU instead of the GPU. This is useful when GPU memory is limited or the system becomes unstable during encoding. Running on the CPU makes the computations slower but more stable, and requires no specialized hardware.

In [None]:
# Load the pre-trained MiniLM sentence encoder and pin it to the CPU.
model = SentenceTransformer(
    "all-MiniLM-L6-v2",
    device="cpu",
)

In [None]:
# The encoder takes raw strings, so glue each token list back together with
# single spaces.
X_train_sent = list(map(" ".join, X_train_texts))
X_test_sent = list(map(" ".join, X_test_texts))

In [None]:
# Shared encoding options: small batches, visible progress bar, and tensor
# (not NumPy) output.
encode_options = {
    "batch_size": 16,
    "show_progress_bar": True,
    "convert_to_numpy": False,
    "convert_to_tensor": True,
}

# Encode both splits with identical settings.
X_train_emb_st = model.encode(X_train_sent, **encode_options)
X_test_emb_st = model.encode(X_test_sent, **encode_options)

print("Shapes:", X_train_emb_st.shape, X_test_emb_st.shape)

In [None]:
# Turn the encoded tensors into nested Python lists before handing them to
# the scikit-learn classifiers.
X_train_emb_st, X_test_emb_st = (
    X_train_emb_st.tolist(),
    X_test_emb_st.tolist(),
)

### 2.4.1 Train Classification-Model (LinearSVC)

In [None]:
# Linear SVM on the sentence embeddings, with class reweighting to counter
# the genre imbalance.
clf_st_svc = LinearSVC(
    class_weight="balanced",
    max_iter=10000,
)
clf_st_svc.fit(X=X_train_emb_st, y=y_train)

In [None]:
# Predict genres for the held-out documents.
y_pred_st_svc = clf_st_svc.predict(X_test_emb_st)

print("=== SentenceTransformer (MiniLM) + LinearSVC ===")
print("Accuracy:", accuracy_score(y_test, y_pred_st_svc))
print("Balanced accuracy:", balanced_accuracy_score(y_test, y_pred_st_svc))
print("\nClassification report:\n")

# Report every encoded class, named by its genre.
st_svc_report = classification_report(
    y_test,
    y_pred_st_svc,
    labels=np.arange(len(label_encoder.classes_)),
    target_names=label_encoder.classes_,
)
print(st_svc_report)

In [None]:
# Raw counts: rows are true genres, columns are predicted genres.
cm = confusion_matrix(
    y_test,
    y_pred_st_svc,
    labels=np.arange(len(label_encoder.classes_)),
)

# Row-normalise so each row shows how a true class is distributed over the
# predictions. Rows without any true samples stay 0 instead of becoming NaN.
row_totals = cm.sum(axis=1, keepdims=True)
cm_norm = np.divide(
    cm,
    row_totals,
    out=np.zeros(cm.shape, dtype=float),
    where=row_totals != 0,
)

plt.figure()
sns.heatmap(
    cm_norm,
    annot=True,
    cmap="Blues",
    fmt=".2f",
    xticklabels=label_encoder.classes_,
    yticklabels=label_encoder.classes_,
)
plt.title("SentenceTransformer (MiniLM) + LinearSVC")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.tight_layout()
plt.show()

### 2.4.2 Train Classification-Model (Logistic Regression)

In [None]:
# Logistic regression on the sentence embeddings, with class reweighting to
# counter the genre imbalance.
# `multi_class` is intentionally not passed: "auto" was already the default,
# and the parameter is deprecated in recent scikit-learn releases (passing it
# emits a FutureWarning). This also matches the other LogisticRegression
# models in this notebook.
clf_st_logreg = LogisticRegression(
    max_iter=3000,
    n_jobs=-1,
    class_weight="balanced",
)

clf_st_logreg.fit(X_train_emb_st, y_train)

In [None]:
# Predict genres for the held-out documents.
y_pred_st_logreg = clf_st_logreg.predict(X_test_emb_st)

print("=== SentenceTransformer (MiniLM) + Logistic Regression ===")
print("Accuracy:", accuracy_score(y_test, y_pred_st_logreg))
print("Balanced accuracy:", balanced_accuracy_score(y_test, y_pred_st_logreg))
print("\nClassification report:\n")

# Report every encoded class, named by its genre.
st_logreg_report = classification_report(
    y_test,
    y_pred_st_logreg,
    labels=np.arange(len(label_encoder.classes_)),
    target_names=label_encoder.classes_,
)
print(st_logreg_report)

In [None]:
# Raw counts: rows are true genres, columns are predicted genres.
cm = confusion_matrix(
    y_test,
    y_pred_st_logreg,
    labels=np.arange(len(label_encoder.classes_)),
)

# Row-normalise so each row shows how a true class is distributed over the
# predictions. Rows without any true samples stay 0 instead of becoming NaN.
row_totals = cm.sum(axis=1, keepdims=True)
cm_norm = np.divide(
    cm,
    row_totals,
    out=np.zeros(cm.shape, dtype=float),
    where=row_totals != 0,
)

plt.figure()
sns.heatmap(
    cm_norm,
    annot=True,
    cmap="Blues",
    fmt=".2f",
    xticklabels=label_encoder.classes_,
    yticklabels=label_encoder.classes_,
)
plt.title("SentenceTransformer (MiniLM) + Logistic Regression")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.tight_layout()
plt.show()

# 3. Model Comparison Overview

| Embedding | Classifier | Accuracy | Balanced Accuracy | Interpretation |
|-----------|------------|----------|-------------------|----------------|
| **Word2Vec** | LinearSVC | ~0.57 | ~0.50 | Strongest Word2Vec model; good for dominant genres, limited minority class performance |
| **Word2Vec** | Logistic Regression | ~0.46 | ~0.55 | Most balanced across genres; high recall for small classes but low accuracy for major ones |
| **TF-IDF** | LinearSVC | ~0.59 | ~0.46 | Best overall performance for TF-IDF; strong on major genres, weaker on minority classes |
| **TF-IDF** | Logistic Regression | ~0.55 | ~0.53 | More balanced across genres; better recall on smaller classes but weaker on pop |