# Text Classification

---

# 1. Imports and Setup
## 1.1 Import Libraries

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score, balanced_accuracy_score, confusion_matrix

from sentence_transformers import SentenceTransformer

import matplotlib.pyplot as plt
import seaborn as sns  # if you don't want seaborn, we can do pure matplotlib

import joblib
import re

from gensim.models import Word2Vec

## 1.2 Load Data

In [None]:
# Read the prepared dataset from disk.
df = pd.read_csv("data/clean/data.csv")

# Pull out the model input column and the target column.
texts, labels = df["tokens"], df["tag"]  # lyrics/token text, genre label

## 1.3 Encode labels

In [None]:
# Map the string genre labels to integer class ids.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(labels)

# Show which integer id was assigned to which genre.
print("Classes (genres):")
for class_id, genre in enumerate(label_encoder.classes_):
    print(f"{class_id}: {genre}")

# 2. Model
## 2.1 Train-Test-Split

In [None]:
# Hold out 20% of the samples for testing. Stratifying on the encoded labels
# keeps the class proportions roughly equal in both partitions, and the fixed
# seed makes the split reproducible.
X_train_texts, X_test_texts, y_train, y_test = train_test_split(
    texts, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

print("Train size:", len(X_train_texts))
print("Test size:", len(X_test_texts))

## 2.2 Build TF-IDF embedding

In [None]:
# Character-level TF-IDF over 3- to 5-character n-grams; n-grams seen in fewer
# than 5 documents or in more than 90% of documents are dropped.
tfidf = TfidfVectorizer(analyzer="char", ngram_range=(3, 5), min_df=5, max_df=0.9)

# Learn the vocabulary and IDF weights from the training texts only, then
# apply the same fitted transform to the test texts.
X_train = tfidf.fit_transform(X_train_texts)
X_test = tfidf.transform(X_test_texts)

### 2.2.1 Train Classification-Model (LinearSVC)

In [None]:
# Linear SVM on the TF-IDF features; "balanced" weights each class inversely
# to its frequency in y_train.
clf = LinearSVC(class_weight="balanced")
clf.fit(X_train, y_train)

In [None]:
# Predict on the held-out features and compute the evaluation metrics.
y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
report = classification_report(
    y_test,
    y_pred,
    target_names=label_encoder.classes_,  # print genre names instead of ids
)

print("Accuracy:", accuracy)
print("Balanced accuracy:", balanced_accuracy)
print("\nClassification report:\n")
print(report)

In [None]:
# Confusion matrix over the classes the classifier was trained on, normalized
# per row so each row shows how the samples of one true class were predicted.
cm = confusion_matrix(y_test, y_pred, labels=clf.classes_)
cm_norm = cm.astype('float') / cm.sum(axis=1, keepdims=True)

# clf.classes_ holds the encoded integer ids; map them back to the genre
# names so the axes are readable (same names the classification report uses).
genre_names = label_encoder.classes_[clf.classes_]

plt.figure()
sns.heatmap(
    cm_norm,
    annot=True,
    fmt='.2f',
    cmap="Blues",
    xticklabels=genre_names,
    yticklabels=genre_names
)
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.title("Normalized Confusion Matrix – TF-IDF + LinearSVC")
plt.show()

### 2.2.2 Train Classification-Model (Logistic Regression)

In [None]:
# Logistic regression on the TF-IDF features.
# `multi_class` is intentionally not passed: "auto" was the default, and the
# parameter is deprecated in recent scikit-learn releases (it emits a
# FutureWarning and is scheduled for removal).
clf = LogisticRegression(
    max_iter=2000,          # raised so the solver reliably converges
    n_jobs=-1,              # use all CPU cores
    class_weight="balanced",
)

clf.fit(X_train, y_train)

In [None]:
# Predict on the held-out TF-IDF features and report the metrics.
y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Balanced accuracy:", balanced_accuracy_score(y_test, y_pred))
print("\nClassification report:\n")
# Pass the genre names so the report rows are labelled with genres rather
# than encoded integer ids, matching the LinearSVC report.
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

In [None]:
# Confusion matrix over the classes the classifier was trained on, normalized
# per row so each row shows how the samples of one true class were predicted.
cm = confusion_matrix(y_test, y_pred, labels=clf.classes_)
cm_norm = cm.astype('float') / cm.sum(axis=1, keepdims=True)

# clf.classes_ holds the encoded integer ids; map them back to the genre
# names so the axes are readable.
genre_names = label_encoder.classes_[clf.classes_]

plt.figure()
sns.heatmap(
    cm_norm,
    annot=True,
    fmt='.2f',
    cmap="Blues",
    xticklabels=genre_names,
    yticklabels=genre_names
)
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.title("Normalized Confusion Matrix – TF-IDF + Logistic Regression")
plt.show()

## 2.3 Build Transformer Embedding

In [None]:
# Load the pretrained sentence-embedding model.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Pass plain lists instead of the pandas Series: after train_test_split the
# Series carry a shuffled, non-contiguous index, and encode() indexes its
# input by integer position, which on a Series is a label lookup and can
# raise KeyError or pair embeddings with the wrong rows.
X_train = model.encode(X_train_texts.tolist(), show_progress_bar=True)
X_test = model.encode(X_test_texts.tolist(), show_progress_bar=True)

In [None]:
# Linear SVM on the sentence-transformer embeddings; 'balanced' weights each
# class inversely to its frequency in y_train.
clf = LinearSVC(class_weight='balanced')
clf.fit(X_train, y_train)

In [None]:
# Predict on the held-out embeddings and report the metrics.
y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Balanced accuracy:", balanced_accuracy_score(y_test, y_pred))
print("\nClassification report:\n")
# Pass the genre names so the report rows are labelled with genres rather
# than encoded integer ids, matching the TF-IDF + LinearSVC report.
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

In [None]:
# y_test and y_pred are integer-encoded, so the confusion matrix must be
# built over the encoded ids (clf.classes_). Passing the string genre names
# as `labels` fails because none of them occur in y_test.
cm = confusion_matrix(y_test, y_pred, labels=clf.classes_)
# Normalize per row so each row shows how one true class was predicted.
cm_norm = cm.astype(float) / cm.sum(axis=1, keepdims=True)

# Genre names in the same order as the matrix rows/columns, used only for
# the axis tick labels.
class_names = label_encoder.classes_[clf.classes_]

plt.figure(figsize=(10, 8))
sns.heatmap(
    cm_norm,
    annot=True,
    cmap="Blues",
    fmt=".2f",
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title("Sentence Transformer + Linear SVC")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.tight_layout()
plt.show()