In [1]:
# 🔧 Robustes Setup (einmal am Anfang des Notebooks ausführen)
from pathlib import Path
import json

# Locate the data folder regardless of whether the kernel was started in
# notebooks/ or in the repository root: probe ./data first, then ../data.
CWD = Path.cwd().resolve()
for _candidate in (CWD / "data", CWD.parent / "data"):
    if _candidate.exists():
        DATA = _candidate
        break
else:
    # Neither location exists, so stop here with a clear message.
    raise FileNotFoundError("Kein 'data' Ordner gefunden (erwarte ./data oder ../data)")

# The sample corpus must be present inside the resolved data folder.
SAMPLE = DATA / "sample_corpus.json"
assert SAMPLE.exists(), f"Fehlt: {SAMPLE}"

In [2]:
import json, pathlib, re
from collections import Counter
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Load the corpus from SAMPLE, the path resolved by the setup cell, instead of a
# hard-coded "../data/sample_corpus.json". The relative path only resolved when
# the kernel was started inside notebooks/; started from the repo root, this cell
# silently fell back to the five built-in sentences below. DATA is no longer
# rebound here, so it keeps pointing at the data directory.
if SAMPLE.exists():
    texts = json.loads(SAMPLE.read_text(encoding="utf-8"))
else:
    # Built-in mini corpus so the remaining cells still run without the data file.
    texts = [
        "Die Snare ist zu laut und harsch",
        "Kick zu weich, es fehlt der Punch",
        "Vocals klingen nasal, 800 Hz absenken",
        "Bass maskiert die Kick, Sidechain nötig",
        "S-Laute sind scharf, De-Esser einsetzen",
    ]
# Show the corpus size and the first three entries as a quick sanity check.
len(texts), texts[:3]

(10,
 ['Die Kickdrum pumpt im Mix, aber die Snare wirkt zu dünn.',
  'Vocals sitzen zu weit hinten, mehr Präsenz im 3 kHz Bereich.',
  'Die Snare klingt trocken und etwas hart, vielleicht mehr Raumanteil.'])

In [3]:
# Bag-of-words counts over unigrams and bigrams, with lower-casing enabled.
cv = CountVectorizer(
    ngram_range=(1, 2),
    min_df=1,
    lowercase=True,
)
Xc = cv.fit_transform(texts)

# Densify the sparse count matrix and label the columns with the learned terms.
cv_df = pd.DataFrame(data=Xc.toarray(), columns=cv.get_feature_names_out())
cv_df.head()

Unnamed: 0,300,300 500,500,500 hz,800,800 hz,aber,aber die,absenken,absenken dafür,...,wirkt,wirkt zu,zu,zu boxig,zu dünn,zu scharf,zu viel,zu weit,zur,zur bassspur
0,0,0,0,0,0,0,1,1,0,0,...,1,1,1,0,1,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0


In [4]:
# TF-IDF weights over unigrams and bigrams, with lower-casing enabled.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=1,
    lowercase=True,
)
Xt = tfidf.fit_transform(texts)

# Densify the sparse weight matrix and label the columns with the learned terms.
tfidf_df = pd.DataFrame(data=Xt.toarray(), columns=tfidf.get_feature_names_out())
# Round only for display; tfidf_df itself keeps full precision.
tfidf_df.round(3).head()

Unnamed: 0,300,300 500,500,500 hz,800,800 hz,aber,aber die,absenken,absenken dafür,...,wirkt,wirkt zu,zu,zu boxig,zu dünn,zu scharf,zu viel,zu weit,zur,zur bassspur
0,0.0,0.0,0.0,0.0,0.0,0.0,0.228,0.228,0.0,0.0,...,0.228,0.228,0.135,0.0,0.228,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.143,0.0,0.0,0.0,0.0,0.241,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.127,0.0,0.0,0.214,0.0,0.0,0.0,0.0


In [5]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF model on `texts` and rank terms by their weight summed over all texts.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=1,
    lowercase=True,
)
X = tfidf.fit_transform(texts)

terms = tfidf.get_feature_names_out()

# Sum over the fitted matrix X (not the vectorizer object) along axis 0, then
# flatten the result to a 1-D array with one total per term.
scores = np.asarray(X.sum(axis=0)).ravel()

# Keep the 20 highest-scoring terms with a fresh 0..n-1 index.
top = (
    pd.DataFrame({"term": terms, "score": scores})
    .sort_values(by="score", ascending=False)
    .iloc[:20]
    .reset_index(drop=True)
)
top

Unnamed: 0,term,score
0,zu,0.692978
1,die,0.67954
2,mehr,0.634499
3,snare,0.54393
4,kompressor,0.537229
5,absenken,0.410805
6,hz,0.410805
7,und,0.408975
8,die snare,0.401521
9,etwas,0.398558


In [6]:
# Small hand-picked set of German function words to exclude from the vocabulary.
GERMAN_STOP = {
    "der", "die", "das",
    "und", "oder", "aber",
    "im", "in", "am",
    "ein", "eine", "einer",
    "zu", "mit", "von", "für", "mehr",
    "ich", "du", "wir", "ihr", "man",
    "den", "dem",
}

# Same unigram + bigram TF-IDF setup, now with the stop words removed.
# The set is handed over as a list via list(GERMAN_STOP).
tfidf_de = TfidfVectorizer(
    stop_words=list(GERMAN_STOP),
    ngram_range=(1, 2),
    min_df=1,
    lowercase=True,
)
Xt_de = tfidf_de.fit_transform(texts)

# Display-only preview: dense frame labelled with the terms, rounded to 3 decimals.
pd.DataFrame(
    data=Xt_de.toarray(),
    columns=tfidf_de.get_feature_names_out(),
).round(3).head()

Unnamed: 0,300,300 500,500,500 hz,800,800 hz,absenken,absenken dafür,absenken transientenfreundlicher,anheben,...,vor,vor kompressor,weit,weit hinten,werden,werden oft,wirkt,wirkt dünn,zur,zur bassspur
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.312,0.312,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.28,0.28,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.28,0.28,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
