# Allgemein

- Common Voice 50h
- Datum 2025-07-28  
- Datensatz: `merged_features_wer_tiny_50.csv`

In [14]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Load the merged feature/WER table and preview the first rows
csv_path = "../../results/subset_50h/merged_features_wer_tiny_50.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,filename,filepath,rms,log_energy,clipping_ratio,crest_factor,snr,hnr,f0,phoneme_entropy,...,mfcc_stat_85,mfcc_stat_86,mfcc_stat_87,mfcc_stat_88,mfcc_stat_89,mfcc_stat_90,mfcc_stat_91,reference,hypothesis,wer
0,common_voice_en_19226640.mp3,audio_files/common_voice_subset_50h/common_voi...,0.138755,7.460141,4.4e-05,7.206935,32.660785,23.016971,100.887721,5.278837,...,-0.727515,0.250416,-0.407458,-0.222167,-0.733382,-0.374494,-1.044447,Poor maintenance has often exacerbated these p...,Port maintenance has often exacerbated these p...,0.142857
1,common_voice_en_21783394.mp3,audio_files/common_voice_subset_50h/common_voi...,0.112247,6.645447,3.3e-05,8.908895,70.466809,22.516427,170.403942,,...,-0.718962,-0.91819,-0.804865,-0.316269,-1.033074,1.359872,-0.074727,The comics were aimed at an older audience.,The comics were aimed at an older audience.,0.0
2,common_voice_en_15904858.mp3,audio_files/common_voice_subset_50h/common_voi...,0.107539,7.020273,5.2e-05,9.298929,30.196857,19.716165,135.333079,5.394557,...,-0.979932,0.090703,0.533803,-0.31558,0.121224,0.145466,-0.263237,"Natalie gave me a pleasing massage, when we we...",Natalie gave me a pleasing massage when we wer...,0.0
3,common_voice_en_23830464.mp3,audio_files/common_voice_subset_50h/common_voi...,0.111006,7.192654,6.5e-05,9.00853,21.029661,18.278107,182.299178,5.082705,...,0.288447,2.098648,-0.11377,0.202279,1.172325,-0.161524,0.2626,Joel is currently performing with The Joel Han...,Joel is currently performing with the Joel Han...,0.111111
4,common_voice_en_23889383.mp3,audio_files/common_voice_subset_50h/common_voi...,0.131413,7.289988,0.000212,7.609585,2.103716,16.080009,203.626716,,...,-0.939715,0.1448,0.467994,0.334807,0.557081,0.246998,-0.255028,FileMaker files are compatible between Mac and...,File Maker files are compatible between Mac an...,0.25


In [15]:
# Treat +/-inf as missing, then drop every row that still contains a NaN in
# any column. Rows are removed here, not zero-filled.
df = (
    df
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)

# Keep the target variable (the "wer" column) separately
y = df["wer"]

In [16]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Feature matrix: numeric columns only, without identifiers, transcripts and
# the target variable.
exclude_cols = ["filename", "filepath", "reference", "wer"]

# "Unnamed: N" is the name pandas gives to a header-less CSV column. In this
# file it holds the row counter saved together with the data (0, 1, 2, ... in
# the preview). It is numeric, so select_dtypes() alone would keep it as a
# feature; drop it explicitly.
index_artifact_cols = [col for col in df.columns if str(col).startswith("Unnamed:")]
X = df.drop(columns=exclude_cols + index_artifact_cols).select_dtypes(include='number')

# Isolate the MFCC statistic columns
mfcc_cols = [col for col in X.columns if col.startswith("mfcc_stat_")]
assert mfcc_cols, "no mfcc_stat_* columns found in the feature table"

# Standardise the MFCC statistics so that no single statistic dominates the PCA
scaler_mfcc = StandardScaler()
mfcc_scaled = scaler_mfcc.fit_transform(X[mfcc_cols])

# Depending on the scikit-learn version, the default solver choice for a matrix
# of this size can be the randomized SVD; a fixed seed keeps the components
# reproducible across runs.
pca = PCA(n_components=10, random_state=42)
mfcc_pca = pca.fit_transform(mfcc_scaled)
pca_cols = [f"pca_mfcc_{i+1}" for i in range(mfcc_pca.shape[1])]
# Reuse X's index so the components line up with the remaining features on concat
df_pca = pd.DataFrame(mfcc_pca, columns=pca_cols, index=X.index)

# Replace the raw MFCC statistics by their principal components
X_reduced = X.drop(columns=mfcc_cols).copy()
X_reduced = pd.concat([X_reduced, df_pca], axis=1)

In [17]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler

# Standardise the remaining features. The original row labels are kept so that
# X_scaled stays aligned with X_reduced and y (rows were dropped earlier, so the
# labels are not a plain 0..n-1 range).
scaler = StandardScaler()
X_scaled = pd.DataFrame(
    scaler.fit_transform(X_reduced),
    columns=X_reduced.columns,
    index=X_reduced.index,
)

# Iterative VIF reduction: helper that scores every feature
def calculate_vif(X):
    """Return the variance inflation factor of every column of ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; each column is scored against all other columns.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``X`` with the columns ``"feature"`` (column
        name) and ``"VIF"`` (value from ``variance_inflation_factor``).
    """
    design = X.values
    scores = []
    for col_idx in range(X.shape[1]):
        scores.append(variance_inflation_factor(design, col_idx))
    return pd.DataFrame({"feature": X.columns, "VIF": scores})

# Repeatedly drop the feature with the highest VIF until every remaining
# feature is at or below the threshold.
max_vif = 10

# A VIF needs at least one other column to regress on, so stop once a single
# feature is left instead of calling the helper on a degenerate matrix.
while X_scaled.shape[1] > 1:
    vif_df = calculate_vif(X_scaled)
    max_vif_value = vif_df["VIF"].max()

    # Written as `not (a > b)` so that a NaN maximum (no finite VIF left) also
    # ends the loop instead of dropping features until nothing remains.
    if not max_vif_value > max_vif:
        break

    # idxmax finds the worst feature directly; no full sort needed
    drop_feature = vif_df.loc[vif_df["VIF"].idxmax(), "feature"]
    print(f"- Entferne Feature mit VIF {max_vif_value:.2f}: {drop_feature}")
    # Keep the scaled and the unscaled table in sync
    X_scaled = X_scaled.drop(columns=[drop_feature])
    X_reduced = X_reduced.drop(columns=[drop_feature])

- Entferne Feature mit VIF 105.16: spectral_entropy
- Entferne Feature mit VIF 72.59: spectral_centroid
- Entferne Feature mit VIF 27.76: spectral_rolloff
- Entferne Feature mit VIF 13.72: hnr
- Entferne Feature mit VIF 11.84: chroma_5
- Entferne Feature mit VIF 11.61: chroma_12
- Entferne Feature mit VIF 11.15: chroma_8


In [19]:
# Attach the target column and write the reduced feature set to disk.
# NOTE(review): from here on X_reduced contains "wer". Re-running the VIF cell
# after this one would scale and score the target as if it were a feature.
out_path = "../../results/subset_50h/features_pca_vif.csv"
X_reduced = X_reduced.assign(wer=y)
X_reduced.to_csv(out_path, index=False)
print("✅ Reduziertes Feature-Set gespeichert.")

✅ Reduziertes Feature-Set gespeichert.
