In [1]:
import os
from pathlib import Path

# Make the working directory the project root (the folder that holds 'data/'),
# so the relative paths used by later cells resolve wherever the kernel started.
CWD = Path.cwd()
DATA_FILE = "data/sample_corpus.json"

# Candidate roots in priority order: the current folder, then up to three ancestors.
# When the kernel starts inside notebooks/, the first ancestor is the repo root.
_candidates = [CWD, *list(CWD.parents)[:3]]
_root = next(
    (folder for folder in _candidates if (folder / DATA_FILE).exists()),
    None,
)

# Change directory only when the corpus was found in an ancestor; if it was not
# found anywhere, stay where we are without raising.
if _root is not None and _root != CWD:
    os.chdir(_root)


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
from sklearn.cluster import KMeans

In [4]:
from sklearn.decomposition import TruncatedSVD

In [5]:
import numpy as np, json, pandas as pd

In [6]:
from pathlib import Path
import json

# Location of the corpus, relative to the current working directory.
DATA_PATH = Path("data", "sample_corpus.json")

if not DATA_PATH.exists():
    # No corpus on disk: use a tiny built-in sample so the notebook still runs.
    texts = [
        "Die Snare ist zu laut und harsch.",
        "Kick zu weich, es fehlt der Punch.",
        "S-Laute sind scharf, De-Esser einsetzen.",
        "Bass maskiert die Kick, Sidechain nötig.",
        "Vocals klingen nasal, 800 Hz absenken.",
    ]
else:
    # The file is read as UTF-8 text and parsed as JSON.
    raw_json = DATA_PATH.read_text(encoding="utf-8")
    texts = json.loads(raw_json)


In [7]:
# TF-IDF over unigrams and bigrams; min_df=1 keeps terms that occur in only one document.
tfidf = TfidfVectorizer(
    min_df=1,
    ngram_range=(1, 2),
)

In [8]:
# Learn the vocabulary and IDF weights from the corpus and vectorize it in one pass;
# X has one row per text and one column per n-gram.
X = tfidf.fit_transform(raw_documents=texts)

In [9]:
# Reduce the TF-IDF matrix to at most 50 latent dimensions.
# TruncatedSVD raises ValueError when n_components exceeds the number of features,
# which a very small corpus can trigger. Clamping to n_features - 1 satisfies both
# the "randomized" (<=) and "arpack" (<) limits; max(1, ...) keeps it at least 1.
n_components = max(1, min(50, X.shape[1] - 1))
Z = TruncatedSVD(n_components=n_components, random_state=42).fit_transform(X)

In [10]:
# Cluster the reduced vectors into at most 5 groups.
# KMeans raises ValueError when n_clusters exceeds the number of samples, so the
# cluster count is capped at the corpus size (Z has one row per text).
n_clusters = min(5, Z.shape[0])
km = KMeans(n_clusters=n_clusters, n_init=10, random_state=42).fit(Z)

In [11]:
# Cluster index assigned to each row of Z by the fitted KMeans model.
labels = km.labels_


In [12]:
# One row per text: a running id, the raw text, and its assigned cluster.
df = pd.DataFrame(
    dict(
        post_id=range(len(texts)),
        text=texts,
        cluster=labels,
    )
)


In [13]:
# Sanity check: number of texts per cluster (ordered by cluster id), then a preview.
cluster_sizes = df["cluster"].value_counts().sort_index()
print("Cluster sizes:\n", cluster_sizes.to_string())
df.head()

Cluster sizes:
 cluster
0    1
1    4
2    2
3    2
4    1


Unnamed: 0,post_id,text,cluster
0,0,"Die Kickdrum pumpt im Mix, aber die Snare wirk...",1
1,1,"Vocals sitzen zu weit hinten, mehr Präsenz im ...",1
2,2,"Die Snare klingt trocken und etwas hart, viell...",1
3,3,Bassdrum und Kickdrum werden oft verwechselt –...,1
4,4,"Die Hi-Hats sind zu scharf, ein sanfter Low-Pa...",0


In [14]:
# Persist the cluster assignments. The notebook can run without the corpus file
# (fallback texts), so 'data/' may not exist yet; to_csv does not create
# directories and would fail with OSError. Create it first.
Path("data").mkdir(parents=True, exist_ok=True)
df.to_csv("data/clusters.csv", index=False)