In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
ACOUSTIC_PATH = "/content/drive/MyDrive/E-DAIC_Acoustics"
LABELS_PATH = "/content/drive/MyDrive/DepressionLabels.xlsx"

In [3]:
import os
import pandas as pd
import numpy as np

import pandas as pd

labels = pd.read_excel(LABELS_PATH)

# Rename to a consistent name
labels = labels.rename(columns={"Participant_ID": "participant"})

# Convert to string so it matches acoustic filenames
labels["participant"] = labels["participant"].astype(str)

label_ids = set(labels["participant"])

print("Loaded labels:", len(label_ids))
labels.head()

Loaded labels: 219


Unnamed: 0,participant,PHQ_Score
0,300,2
1,301,3
2,302,4
3,303,0
4,304,6


In [4]:
import glob
import os

# Replace with your actual path to E-DAIC_Acoustics folder
ACOUSTICS_PATH = "/content/drive/MyDrive/E-DAIC_Acoustics"

# Get all acoustic CSV files
acoustic_files = {}

for f in glob.glob(os.path.join(ACOUSTICS_PATH, "*.csv")):
    # Extract participant ID from filename (assumes "300_eGeMAPS.csv" format)
    pid = os.path.basename(f).split('_')[0]
    acoustic_files[pid] = f

acoustic_ids = set(acoustic_files.keys())

# Participants with both labels and acoustic data
valid_ids = label_ids.intersection(acoustic_ids)

print("Total acoustic files:", len(acoustic_ids))
print("Participants with both labels + acoustic data:", len(valid_ids))
print("Sample valid IDs:", sorted(list(valid_ids))[:10])

Total acoustic files: 190
Participants with both labels + acoustic data: 134
Sample valid IDs: ['386', '387', '388', '389', '390', '391', '392', '393', '395', '396']


In [5]:
import pandas as pd

feature_rows = []

for pid in valid_ids:
    csv_path = acoustic_files[pid]
    df = pd.read_csv(csv_path)

    # Keep only numeric columns (remove text like 'Turn', 'Text')
    df_numeric = df.select_dtypes(include='number')

    # Aggregate: mean, std, median
    mean_feats = df_numeric.mean().add_suffix("_mean")
    std_feats = df_numeric.std().add_suffix("_std")
    median_feats = df_numeric.median().add_suffix("_median")

    # Combine into a single Series
    agg_feats = pd.concat([mean_feats, std_feats, median_feats])
    agg_feats["participant"] = pid

    feature_rows.append(agg_feats)

# Create DataFrame
acoustic_df = pd.DataFrame(feature_rows)
acoustic_df = acoustic_df.set_index("participant")
acoustic_df.head()

Unnamed: 0_level_0,ParticipantID_mean,UtteranceIndex_mean,Start_Time_mean,End_Time_mean,Confidence_mean,Loudness_sma3_mean,alphaRatio_sma3_mean,hammarbergIndex_sma3_mean,slope0-500_sma3_mean,slope500-1500_sma3_mean,...,HNRdBACF_sma3nz_median,logRelF0-H1-H2_sma3nz_median,logRelF0-H1-A3_sma3nz_median,F1frequency_sma3nz_median,F1bandwidth_sma3nz_median,F1amplitudeLogRelF0_sma3nz_median,F2frequency_sma3nz_median,F2amplitudeLogRelF0_sma3nz_median,F3frequency_sma3nz_median,F3amplitudeLogRelF0_sma3nz_median
participant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
452,452.0,53.0,445.300952,455.620952,0.941428,0.102239,-14.604514,25.005601,0.016337,0.001389,...,3.140627,2.273898,8.997985,628.163836,1408.28342,-131.820324,1605.922125,-136.946445,2557.047327,-138.595648
450,450.0,65.5,622.196154,628.253077,0.953921,0.158566,-14.538353,25.904828,0.000379,-0.000808,...,3.250364,1.146117,11.011861,625.856342,1419.736542,-116.623163,1591.902766,-121.718469,2575.153026,-126.63585
391,391.0,41.0,339.540741,342.588889,0.875977,0.131829,-18.060763,28.625438,0.004,0.004773,...,4.124039,3.148089,9.585982,641.397473,1402.995801,-124.467745,1638.561938,-130.393962,2609.575344,-133.485549
453,453.0,50.5,514.075,528.99,0.90653,0.070965,-17.408578,29.682919,-0.011188,0.008133,...,1.012639,1.769582,9.0206,576.141322,1400.020685,-137.819976,1533.522793,-142.313899,2466.456443,-145.083158
402,402.0,52.5,473.346154,477.798077,0.939062,0.102933,-16.49824,29.196915,-0.003071,0.001464,...,3.898139,1.670403,11.224732,667.677781,1411.434908,-113.715551,1623.40841,-119.267969,2640.020851,-124.320795


In [6]:
# Keep only participants with valid acoustic data
data = acoustic_df.merge(labels, left_index=True, right_on="participant")
data = data.set_index("participant")

print("Final dataset shape:", data.shape)
data.head()

Final dataset shape: (134, 85)


Unnamed: 0_level_0,ParticipantID_mean,UtteranceIndex_mean,Start_Time_mean,End_Time_mean,Confidence_mean,Loudness_sma3_mean,alphaRatio_sma3_mean,hammarbergIndex_sma3_mean,slope0-500_sma3_mean,slope500-1500_sma3_mean,...,logRelF0-H1-H2_sma3nz_median,logRelF0-H1-A3_sma3nz_median,F1frequency_sma3nz_median,F1bandwidth_sma3nz_median,F1amplitudeLogRelF0_sma3nz_median,F2frequency_sma3nz_median,F2amplitudeLogRelF0_sma3nz_median,F3frequency_sma3nz_median,F3amplitudeLogRelF0_sma3nz_median,PHQ_Score
participant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
452,452.0,53.0,445.300952,455.620952,0.941428,0.102239,-14.604514,25.005601,0.016337,0.001389,...,2.273898,8.997985,628.163836,1408.28342,-131.820324,1605.922125,-136.946445,2557.047327,-138.595648,1
450,450.0,65.5,622.196154,628.253077,0.953921,0.158566,-14.538353,25.904828,0.000379,-0.000808,...,1.146117,11.011861,625.856342,1419.736542,-116.623163,1591.902766,-121.718469,2575.153026,-126.63585,9
391,391.0,41.0,339.540741,342.588889,0.875977,0.131829,-18.060763,28.625438,0.004,0.004773,...,3.148089,9.585982,641.397473,1402.995801,-124.467745,1638.561938,-130.393962,2609.575344,-133.485549,9
453,453.0,50.5,514.075,528.99,0.90653,0.070965,-17.408578,29.682919,-0.011188,0.008133,...,1.769582,9.0206,576.141322,1400.020685,-137.819976,1533.522793,-142.313899,2466.456443,-145.083158,17
402,402.0,52.5,473.346154,477.798077,0.939062,0.102933,-16.49824,29.196915,-0.003071,0.001464,...,1.670403,11.224732,667.677781,1411.434908,-113.715551,1623.40841,-119.267969,2640.020851,-124.320795,11


In [10]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import KFold
from scipy.stats import pearsonr
import numpy as np
import pandas as pd

# --- Absolute Relative Error ---
def absolute_relative_error(y_true, y_pred):
    return np.mean(np.abs(y_pred - y_true) / np.max(y_true))

# --- Simple k-fold experiment to choose best k ---
def run_kfold_experiment(data, k=5, max_depth=5):
    X = data.drop(columns=['PHQ_Score']).values
    y = data['PHQ_Score'].values

    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    r_list, re_list = [], []

    for train_idx, test_idx in kf.split(X):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        model = DecisionTreeRegressor(max_depth=max_depth, random_state=42)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Skip if predictions are constant
        if len(np.unique(y_pred)) < 2:
            continue

        r, _ = pearsonr(y_test, y_pred)
        re = absolute_relative_error(y_test, y_pred)

        r_list.append(r)
        re_list.append(re)

    return np.mean(r_list), np.mean(re_list)

# --- Post-pruning k-fold experiment ---
def run_kfold_with_pruning(data, k=5, min_samples_leaf=5):
    X = data.drop(columns=['PHQ_Score']).values
    y = data['PHQ_Score'].values
    feature_names = data.drop(columns=['PHQ_Score']).columns

    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    r_list, re_list = [], []
    top_features_list = []

    for train_idx, test_idx in kf.split(X):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # 1) Cost-complexity pruning path
        temp_dt = DecisionTreeRegressor(random_state=42)
        temp_dt.fit(X_train, y_train)
        ccp_alphas = temp_dt.cost_complexity_pruning_path(X_train, y_train).ccp_alphas

        best_r_fold, best_model = -999, None

        # 2) Train trees with different alphas
        for ccp in ccp_alphas:
            dt = DecisionTreeRegressor(random_state=42, ccp_alpha=ccp, min_samples_leaf=min_samples_leaf)
            dt.fit(X_train, y_train)
            y_pred = dt.predict(X_test)

            if len(np.unique(y_pred)) < 2:
                continue

            r, _ = pearsonr(y_test, y_pred)
            if r > best_r_fold:
                best_r_fold = r
                best_model = dt

        if best_model is None:
            best_model = temp_dt

        # 3) Evaluate best model on test
        y_pred = best_model.predict(X_test)
        r, _ = pearsonr(y_test, y_pred)
        re = absolute_relative_error(y_test, y_pred)

        r_list.append(r)
        re_list.append(re)

        # 4) Collect top features
        feature_importances = pd.Series(best_model.feature_importances_, index=feature_names)
        top_features_list.append(feature_importances.sort_values(ascending=False).head(10))

    avg_r = np.mean(r_list)
    avg_re = np.mean(re_list)

    # Aggregate top features across folds
    top_features_overall = pd.concat(top_features_list, axis=1).mean(axis=1).sort_values(ascending=False).head(10)

    return avg_r, avg_re, top_features_overall

# --- Step 1: Find best k automatically ---
k_values = [3, 5, 10]
best_r_k = -999
best_k = None

for k in k_values:
    r, re = run_kfold_experiment(data, k=k)
    print(f"Simple CV | k={k}: Pearson r={r:.3f}, Absolute RE={re:.3f}")
    if r > best_r_k:
        best_r_k = r
        best_k = k

print(f"\nSelected best k based on highest Pearson r: k={best_k}")

# --- Step 2: Post-pruning with best k ---
r_final, re_final, top_features = run_kfold_with_pruning(data, k=best_k)
print(f"\nPost-pruning with k={best_k}: Pearson r={r_final:.3f}, Absolute RE={re_final:.3f}")
print("\nTop informative acoustic features:")
print(top_features)

Simple CV | k=3: Pearson r=0.354, Absolute RE=0.237
Simple CV | k=5: Pearson r=0.049, Absolute RE=0.297
Simple CV | k=10: Pearson r=0.050, Absolute RE=0.338

Selected best k based on highest Pearson r: k=3

Post-pruning with k=3: Pearson r=0.196, Absolute RE=0.233

Top informative acoustic features:
mfcc4_sma3_median               0.316434
slope500-1500_sma3_median       0.309696
Start_Time_mean                 0.308123
jitterLocal_sma3nz_median       0.280371
End_Time_mean                   0.275501
End_Time_median                 0.238008
F3frequency_sma3nz_mean         0.126178
jitterLocal_sma3nz_std          0.103290
shimmerLocaldB_sma3nz_median    0.088629
ParticipantID_mean              0.067977
dtype: float64


In [14]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import KFold
from scipy.stats import pearsonr
import numpy as np

# Function to calculate Absolute Relative Error
def absolute_relative_error(y_true, y_pred):
    return np.mean(np.abs(y_pred - y_true) / np.max(y_true))

# Function to run k-fold cross-validation
def run_kfold_experiment(data, k=5, max_depth=5):
    X = data.drop(columns=['PHQ_Score']).values
    y = data['PHQ_Score'].values

    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    r_list = []
    re_list = []

    for train_idx, test_idx in kf.split(X):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        model = DecisionTreeRegressor(max_depth=max_depth, random_state=42)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        r, _ = pearsonr(y_test, y_pred)
        re = absolute_relative_error(y_test, y_pred)

        r_list.append(r)
        re_list.append(re)

    return np.mean(r_list), np.mean(re_list)
for k in [3, 5, 10]:
    r, re = run_kfold_experiment(data, k=k, max_depth=5)
    print(f"k={k}: Pearson r={r:.3f}, Absolute RE={re:.3f}")
X = data.drop(columns=['PHQ_Score'])
y = data['PHQ_Score']

model = DecisionTreeRegressor(max_depth=5, random_state=42)
model.fit(X, y)

importances = pd.Series(model.feature_importances_, index=X.columns)
top_features = importances.sort_values(ascending=False).head(10)
print("Top 10 acoustic features influencing PHQ-Score:")
print(top_features)

k=3: Pearson r=0.354, Absolute RE=0.237
k=5: Pearson r=0.049, Absolute RE=0.297
k=10: Pearson r=0.050, Absolute RE=0.338
Top 10 acoustic features influencing PHQ-Score:
UtteranceIndex_std                 0.207032
End_Time_median                    0.127931
Loudness_sma3_mean                 0.126359
slope500-1500_sma3_median          0.124740
hammarbergIndex_sma3_median        0.081895
F2frequency_sma3nz_std             0.074433
F1frequency_sma3nz_std             0.068324
Loudness_sma3_median               0.036422
F1amplitudeLogRelF0_sma3nz_mean    0.032664
ParticipantID_median               0.030341
dtype: float64


In [15]:
# List of most important acoustic features for PHQ-8 prediction
# These features capture temporal aspects of speech, voice quality, and spectral characteristics
# that have been found to be informative for estimating depression severity.
important_acoustic_features = [
    "Start_Time_mean",           # Temporal: slower speech onset may indicate psychomotor slowing
    "End_Time_mean",             # Temporal: turn end timing alterations in depressed speech
    "End_Time_median",           # Temporal: robust measure of utterance duration
    "UtteranceIndex_std",        # Temporal: variation in pacing across utterances
    "slope500-1500_sma3_median",# Spectral/Energy slope: monotone speech patterns
    "jitterLocal_sma3nz_median",# Voice quality: microfrequency variations
    "jitterLocal_sma3nz_std",   # Voice quality: variability in pitch stability
    "shimmerLocal_sma3nz_median",# Voice quality: amplitude variations
    "mfcc4_sma3_median",        # Spectral/MFCC: timbre and articulation changes
    "F3frequency_sma3nz_mean"   # Pitch/Formant: articulation and resonance changes
]

# You can later use this list to filter your dataset for feature selection or analysis.