# Text Classification

---

# 1. Imports and Setup
## 1.1 Import Libraries

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns  # if you don't want seaborn, we can do pure matplotlib

import joblib
import re

from gensim.models import Word2Vec

## 1.2 Load Embeddings, Labels and Metadata

In [None]:
# Features and targets were computed ahead of time and stored as .npy files.
X = np.load("data/features/song_embeddings.npy")
# NOTE(review): allow_pickle=True lets NumPy unpickle object arrays, which can
# execute arbitrary code -- only load this file from a trusted source.
y = np.load("data/features/song_labels.npy", allow_pickle=True)

print("X shape:", X.shape)
print("y shape:", y.shape)

# The metadata table is only used for inspection, so a missing file is not an
# error: df_meta is set to None and a notice is printed instead.
try:
    df_meta = pd.read_csv("data/features/song_metadata.csv")
except FileNotFoundError:
    df_meta = None
    print("No metadata file found.")
else:
    print(df_meta.head())

## 1.3 Encode labels

In [None]:
# Map every genre string to an integer class id. The fitted encoder is kept
# around because its `classes_` attribute translates ids back to genre names.
label_encoder = LabelEncoder()
label_encoder.fit(y)
y_encoded = label_encoder.transform(y)

# Show the id -> genre mapping.
print("Classes (genres):")
for class_id, genre in enumerate(label_encoder.classes_):
    print(f"{class_id}: {genre}")

# 2. Model
## 2.1 Train-Test-Split

In [None]:
# Hold out 20% of the samples for testing. Stratifying on the encoded labels
# keeps the class distribution similar in both partitions, and the fixed seed
# makes the split reproducible.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y_encoded}
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, **split_options)

print("Train size:", len(X_train))
print("Test size:", len(X_test))

## 2.2 Train Classification-Model (Logistic Regression)

In [None]:
# Baseline classifier: logistic regression on the embeddings.
# `multi_class` is deliberately not passed: "auto" was already the default,
# and the parameter is deprecated since scikit-learn 1.5 (passing it emits a
# FutureWarning and it is scheduled for removal).
clf = LogisticRegression(
    max_iter=1000,  # above the library default of 100; presumably to avoid convergence warnings
    n_jobs=-1,      # use all available CPU cores
)

clf.fit(X_train, y_train)

print("Training finished.")

## 2.3 Evaluation

In [None]:
# Predict on the held-out split and summarise the per-class metrics, labelled
# with the original genre names instead of the integer class ids.
y_pred = clf.predict(X_test)

print("Classification report:\n")
baseline_report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)
print(baseline_report)

In [None]:
# Pass the full list of class ids so the matrix always has one row/column per
# genre and stays aligned with the tick labels, even if a genre does not occur
# in y_test or y_pred.
class_ids = np.arange(len(label_encoder.classes_))
cm = confusion_matrix(y_test, y_pred, labels=class_ids)

# Row-normalise: each row shows how the samples of one true class were spread
# over the predicted classes. Rows without any true samples stay at 0 instead
# of producing NaN through a division by zero.
row_totals = cm.sum(axis=1, keepdims=True)
cm_norm = np.divide(
    cm,
    row_totals,
    out=np.zeros(cm.shape, dtype=float),
    where=row_totals != 0,
)

# Same figure size as the class_weight='balanced' confusion matrix plot, so
# the two heatmaps can be compared side by side.
plt.figure(figsize=(10, 8))
sns.heatmap(
    cm_norm,
    annot=True,
    fmt=".2f",
    cmap="viridis",
    xticklabels=label_encoder.classes_,
    yticklabels=label_encoder.classes_,
)

plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.title("Normalized Confusion Matrix (Row-normalized)")
plt.tight_layout()
plt.show()

In [None]:
# Second model: same setup as the baseline, but class_weight="balanced"
# re-weights samples inversely to their class frequency.
# NOTE: this rebinds `clf`, so the unweighted model trained earlier is no
# longer reachable; re-running the earlier evaluation cell after this one
# would evaluate this model instead.
# `multi_class` is deliberately not passed: "auto" was already the default,
# and the parameter is deprecated since scikit-learn 1.5 (passing it emits a
# FutureWarning and it is scheduled for removal).
clf = LogisticRegression(
    max_iter=1000,
    n_jobs=-1,
    class_weight="balanced",
)
clf.fit(X_train, y_train)

In [None]:
# Predict on the held-out split with the class-weighted model and print the
# per-class metrics, labelled with the original genre names.
y_pred_bal = clf.predict(X_test)

print("=== Classification report (class_weight='balanced') ===\n")
balanced_report = classification_report(y_test, y_pred_bal, target_names=label_encoder.classes_)
print(balanced_report)


In [None]:
# Pass the full list of class ids so the matrix always has one row/column per
# genre and stays aligned with the tick labels, even if a genre does not occur
# in y_test or y_pred_bal.
class_ids = np.arange(len(label_encoder.classes_))
cm_bal = confusion_matrix(y_test, y_pred_bal, labels=class_ids)

# Row-normalise: each row shows how the samples of one true class were spread
# over the predicted classes. Rows without any true samples stay at 0 instead
# of producing NaN through a division by zero.
row_totals_bal = cm_bal.sum(axis=1, keepdims=True)
cm_bal_norm = np.divide(
    cm_bal,
    row_totals_bal,
    out=np.zeros(cm_bal.shape, dtype=float),
    where=row_totals_bal != 0,
)

plt.figure(figsize=(10, 8))
sns.heatmap(
    cm_bal_norm,
    annot=True,
    fmt=".2f",
    cmap="viridis",
    xticklabels=label_encoder.classes_,
    yticklabels=label_encoder.classes_,
)
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.title("Normalized Confusion Matrix (class_weight='balanced')")
plt.tight_layout()
plt.show()
