In [1]:
# Step 1: Import required libraries
import pandas as pd
import numpy as np
import os

# Step 2: Load the training label file (update the path as needed)
labels_df = pd.read_csv("train_split_Depression_AVEC2017.csv")

# Step 3: Preview the label data
print("Total entries in label file:", len(labels_df))
labels_df.head()


Total entries in label file: 107


Unnamed: 0,Participant_ID,PHQ8_Binary,PHQ8_Score,Gender,PHQ8_NoInterest,PHQ8_Depressed,PHQ8_Sleep,PHQ8_Tired,PHQ8_Appetite,PHQ8_Failure,PHQ8_Concentrating,PHQ8_Moving
0,303,0,0,0,0,0,0.0,0,0,0,0,0
1,304,0,6,0,0,1,1.0,2,2,0,0,0
2,305,0,7,1,0,1,1.0,2,2,1,0,0
3,310,0,4,1,1,1,0.0,0,0,1,1,0
4,312,0,2,1,0,0,1.0,1,0,0,0,0


In [2]:
# Step 1: Choose a patient ID (example: 318)
patient_id = 318

# Step 2: Load COVAREP and FORMANT feature CSVs
covarep_path = f"{patient_id}_COVAREP.csv"
formant_path = f"{patient_id}_FORMANT.csv"

covarep_df = pd.read_csv(covarep_path)
formant_df = pd.read_csv(formant_path)

# Step 3: Merge the two dataframes side by side
merged_df = pd.concat([covarep_df, formant_df], axis=1)

# Step 4: Add the participant ID as a new column
merged_df["Participant_ID"] = patient_id

# Step 5: Fetch and add the PHQ8 score and binary label for the patient
phq_row = labels_df[labels_df["Participant_ID"] == patient_id]
merged_df["PHQ8_Score"] = phq_row["PHQ8_Score"].values[0]
merged_df["PHQ8_Binary"] = phq_row["PHQ8_Binary"].values[0]

# Step 6: Preview the final dataframe for this patient
print(f"Shape of merged feature dataframe for Patient {patient_id}:", merged_df.shape)
merged_df.head()


Shape of merged feature dataframe for Patient 318: (58839, 82)


Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,...,0.47,0.48,818.07,1799,2622.4,3508.4,4171.6,Participant_ID,PHQ8_Score,PHQ8_Binary
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,675.59,1625.1,2620.0,3510.4,4388.1,318,3,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.37861,0.0,0.0,...,0.0,0.0,849.31,1829.5,3266.4,3952.6,4875.4,318,3,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.3915,0.0,0.0,...,0.0,0.0,915.31,2251.2,3431.1,3984.0,4716.6,318,3,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.39443,0.0,0.0,...,0.0,0.0,862.92,1933.0,3046.8,3675.4,4373.7,318,3,0
4,131.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.38289,0.80062,0.52209,...,-0.1064,-0.26434,651.67,1320.4,2448.7,3343.7,4364.4,318,3,0


In [11]:
# Step 1: Define a list of 5 patient IDs you want to use (ensure these files exist in your environment)
patient_ids = [318, 330, 357]  # Change IDs as per your available data

# Step 2: Create an empty list to store each patient's merged data
all_patients_data = []

# Step 3: Loop through each patient
for pid in patient_ids:
    try:
        # Define file paths
        covarep_path = f"{pid}_COVAREP.csv"
        formant_path = f"{pid}_FORMANT.csv"
        
        # Load feature files
        covarep_df = pd.read_csv(covarep_path)
        formant_df = pd.read_csv(formant_path)
        
        # Merge features horizontally
        merged_df = pd.concat([covarep_df, formant_df], axis=1)
        
        # Add metadata
        merged_df["Participant_ID"] = pid
        phq_row = labels_df[labels_df["Participant_ID"] == pid]
        merged_df["PHQ8_Score"] = phq_row["PHQ8_Score"].values[0]
        merged_df["PHQ8_Binary"] = phq_row["PHQ8_Binary"].values[0]
        
        # Append to list
        all_patients_data.append(merged_df)
        print(f"Processed Patient: {pid}")

    except Exception as e:
        print(f"Error processing patient {pid}: {e}")

# Step 4: Combine all patients' data
combined_df = pd.concat(all_patients_data, ignore_index=True)

# Step 5: Preview final combined dataset
print("Shape of combined dataset:", combined_df.shape)
combined_df.head()


Processed Patient: 318
Processed Patient: 330
Processed Patient: 357
Shape of combined dataset: (177469, 142)


Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,...,-0.050876,0.033902,0.0040198,-0.065422,-0.075505,957.59,1998,2689.9,3871.9,4829.2
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.37861,0.0,0.0,...,,,,,,,,,,
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.3915,0.0,0.0,...,,,,,,,,,,
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.39443,0.0,0.0,...,,,,,,,,,,
4,131.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.38289,0.80062,0.52209,...,,,,,,,,,,


In [18]:
# Assuming `combined_df` is the final DataFrame from the previous step
# and contains a column `patient_id` which we added earlier

# Load PHQ-8 labels
phq8_df = pd.read_csv("train_split_Depression_AVEC2017.csv")  # Update path if needed
phq8_df = phq8_df[['Participant_ID', 'PHQ8_Score']].rename(columns={'Participant_ID': 'patient_id'})

# Merge labels into the combined dataset
data_labeled = combined_df.merge(phq8_df, on='patient_id', how='inner')

# Filter out unvoiced frames if needed
data_labeled = data_labeled[data_labeled['VUV'] == 1]

# Drop irrelevant columns
X = data_labeled.drop(columns=['patient_id', 'PHQ8_Score'])
y = data_labeled['PHQ8_Score']

print("Features shape:", X.shape)
print("Labels shape:", y.shape)


KeyError: 'patient_id'

In [19]:
print(combined_df.columns)


Index(['0', '0.1', '0.2', '0.3', '0.4', '0.5', '0.6', '0.7', '0.8', '0.9',
       ...
       '-0.050876', '0.033902', '0.0040198', '-0.065422', '-0.075505',
       '957.59', '1998', '2689.9', '3871.9', '4829.2'],
      dtype='object', length=142)


In [20]:
# After merging covarep and formant
features['patient_id'] = pid  # Add patient ID to each row


NameError: name 'features' is not defined