In [1]:
# Pakete importieren
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Lift the column cap so wide feature frames are displayed in full
pd.options.display.max_columns = None

In [2]:
# Path to the merged feature/WER dataset
data_path = "../../results/subset_50h/merged_features_wer_tiny_50.csv"

# Number of values each spectral_contrast entry is expanded into
N_SPECTRAL_BANDS = 7

# Load the raw data
raw_df = pd.read_csv(data_path)

# Drop columns that are not needed as features (only if present)
features_df = raw_df.drop(
    columns=["filename", "filepath", "reference", "hypothesis"], errors="ignore"
)

# Rows without a spectral_contrast value cannot be parsed (a missing cell is
# read as float NaN, which has no .strip()). They would be discarded by the
# final dropna() anyway, so remove them up front instead of crashing.
features_df = features_df.dropna(subset=["spectral_contrast"])


def _parse_float_vector(text):
    """Turn a string such as "[1.0 2.5 3.0]" into a list of floats."""
    return [float(token) for token in text.strip("[]").split()]


# Expand spectral_contrast into one numeric column per band. The source index
# is reused so the concat below lines rows up by label even though rows may
# have been removed above.
spectral_values = features_df["spectral_contrast"].apply(_parse_float_vector)
spectral_df = pd.DataFrame(
    spectral_values.to_list(),
    columns=[f"spectral_contrast_{i+1}" for i in range(N_SPECTRAL_BANDS)],
    index=features_df.index,
)

# Replace the old string column with the expanded numeric columns
df = pd.concat([features_df.drop(columns=["spectral_contrast"]), spectral_df], axis=1)

# Remove rows with any remaining missing value
df = df.dropna()

# Sanity check: are any object columns or NaN values left?
print("Object-Spalten:", df.select_dtypes(include="object").columns.tolist())
print("NaN-Werte pro Spalte:\n", df.isna().sum().sort_values(ascending=False))

Object-Spalten: []
NaN-Werte pro Spalte:
 rms                    0
log_energy             0
mfcc_stat_66           0
mfcc_stat_65           0
mfcc_stat_64           0
                      ..
mfcc_stat_9            0
mfcc_stat_8            0
mfcc_stat_7            0
mfcc_stat_6            0
spectral_contrast_7    0
Length: 128, dtype: int64


In [3]:
from pycaret.regression import *  # wildcard import brings setup() into scope

# Column the regressors should predict
target = "wer"

# Experiment options gathered in one place. GPU support stays off because
# LightGBM otherwise emits warnings.
setup_options = dict(
    session_id=42,
    normalize=True,
    train_size=0.8,  # 80 % train / 20 % test, as the recorded setup table shows
    use_gpu=False,
    verbose=True,
)

# Initialise the PyCaret regression experiment on the prepared frame
regression_setup = setup(data=df, target=target, **setup_options)

Unnamed: 0,Description,Value
0,Session id,42
1,Target,wer
2,Target type,Regression
3,Original data shape,"(23290, 128)"
4,Transformed data shape,"(23290, 128)"
5,Transformed train set shape,"(18632, 128)"
6,Transformed test set shape,"(4658, 128)"
7,Numeric features,127
8,Preprocess,True
9,Imputation type,simple


In [4]:
# Compare the candidate regression models and keep the returned one as
# best_model for the analysis below. In the recorded run the comparison table
# is ordered by descending R2, with the CatBoost Regressor in the first row.
best_model = compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,0.1927,0.1071,0.3225,0.1009,0.1945,0.594,2.863
et,Extra Trees Regressor,0.1979,0.1082,0.3243,0.0908,0.1969,0.6108,4.562
br,Bayesian Ridge,0.1982,0.1092,0.3257,0.0832,0.1975,0.6137,0.049
ridge,Ridge Regression,0.1982,0.1094,0.326,0.0811,0.198,0.6237,0.041
lr,Linear Regression,0.1983,0.1094,0.326,0.081,0.1981,0.6249,0.329
omp,Orthogonal Matching Pursuit,0.2004,0.1108,0.3281,0.0692,0.1992,0.6023,0.038
lightgbm,Light Gradient Boosting Machine,0.1946,0.1105,0.3279,0.0689,0.1977,0.5919,0.455
huber,Huber Regressor,0.1853,0.1161,0.3358,0.0252,0.1984,0.562,0.228
rf,Random Forest Regressor,0.2008,0.1162,0.3365,0.0159,0.2013,0.6204,43.071
knn,K Neighbors Regressor,0.1959,0.1184,0.3392,0.0051,0.2053,0.6477,0.07


In [5]:
# Analyse the selected model. The recorded output is an interactive widget
# with a "Plot Type" toggle for switching between the available plots.
evaluate_model(best_model)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…