In [3]:
import os
import numpy as np
import pandas as pd
import re
from sklearn.utils import shuffle

df_target = pd.read_csv("y-enjoyment.csv")

modality = "audio"  # "text", "audio", "video"

X = []
y = []
person_ids = []
for file in sorted(os.listdir("data/audio-embeddings-vad-300-lg")):
    if file.endswith("csv"):
        # search for number in filename
        match = re.search(r"(\d+)", file)
        # print(match.group(1))
        if match is None:
            continue
        person_id = int(match.group(1))
        if person_id == 26:
            # skip these two persons
            continue
        person_ids.append(person_id)
        df = pd.read_csv(
            os.path.join(f"data/audio-embeddings-vad-300-lg", file), header=0
        )
        X_aux = df.iloc[:, 1:].values
        X_aux = X_aux / np.linalg.norm(X_aux, axis=1, keepdims=True)
        X.append(X_aux)
        y.append(df_target[df_target["user_id"] == person_id]["Average"].values)

# sort by person_id
sorted_indices = np.argsort(person_ids)
X = [X[i] for i in sorted_indices]
y = [y[i] for i in sorted_indices]
person_ids = [person_ids[i] for i in sorted_indices]

X, y, person_ids = shuffle(X, y, person_ids) # Shuffle data with a fixed random seed

y = np.concatenate(y)
print(person_ids)



[21, 17, 13, 18, 34, 31, 28, 30, 42, 23, 12, 14, 33, 41, 39, 27, 24, 35, 29, 8, 9, 37, 11, 6, 38, 22, 7, 25, 16, 36, 40, 19, 5, 4, 10, 15, 32, 20]


In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import numpy as np
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import LeaveOneOut
from scipy.stats import pearsonr
from sklearn.preprocessing import StandardScaler

# Set seed for reproducibility
"""torch.manual_seed(10_000)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(10_000)
np.random.seed(10_000)"""


# Configuration
CONFIG = {
    'learning_rate': 1e-2,        # Reduced learning rate
    'num_epochs': 200,            # Max epochs (early stopping might trigger)
    'batch_size': 37,             # Consider adjusting based on N (e.g., len(train_dataset))
    'attn_hidden_dim': 1,        # Increased attention hidden dimension 16
    'fc_hidden_dim': 512,
    'weight_decay': 1e-2,         # Added L2 regularization
    'dropout_rate': 0.5,          # Dropout probability
    'use_dropout': True,          # Flag to enable/disable dropout True
    'device': torch.device("cuda" if torch.cuda.is_available() else
              "mps" if torch.backends.mps.is_available() else "cpu")
}

# Dataset class (Unchanged)
class SequenceDataset(Dataset):
    def __init__(self, sequences, targets):
        # Ensure sequences are tensors
        self.sequences = [torch.as_tensor(seq, dtype=torch.float) for seq in sequences]
        # Ensure targets are tensors
        self.targets = torch.as_tensor(targets, dtype=torch.float)

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return self.sequences[idx], self.targets[idx]

# Collate function for variable length sequences (Unchanged)
def collate_fn(batch):
    sequences, targets = zip(*batch)
    # Ensure sequences are tensors before padding
    sequences = [torch.as_tensor(seq, dtype=torch.float) for seq in sequences]
    targets = torch.as_tensor(targets, dtype=torch.float)

    padded_sequences = pad_sequence(sequences, batch_first=True, padding_value=0.0) # Use 0 for padding
    
    # Create mask: True for padded positions
    mask = torch.zeros(padded_sequences.size(0), padded_sequences.size(1), dtype=torch.bool)
    for i, seq in enumerate(sequences):
        if seq.size(0) < padded_sequences.size(1):
            mask[i, seq.size(0):] = True

    return padded_sequences, targets, mask

# Revised model with optional attention and dropout (Unchanged)
class SequencePredictor(nn.Module):
    def __init__(self, embedding_dim, attn_hidden_dim, fc_hidden_dim, dropout_rate=0.1, use_dropout=True):
        super(SequencePredictor, self).__init__()
        self.use_dropout = use_dropout

        # Attention mechanism
        self.attention = nn.Sequential(
            nn.Linear(embedding_dim, attn_hidden_dim),
            #nn.Tanh(),
            #nn.Linear(attn_hidden_dim, 1)
        )

        # Prediction head
        fc_layers = [
            nn.Linear(embedding_dim, fc_hidden_dim),
            nn.ReLU()
        ]
        if self.use_dropout:
            fc_layers.append(nn.Dropout(dropout_rate)) # Added Dropout

        self.fc_hidden = nn.Sequential(*fc_layers)

        self.output = nn.Linear(fc_hidden_dim, 1)

    def forward(self, x, mask=None):
        # x shape: (batch_size, seq_len, embedding_dim)
        # mask shape: (batch_size, seq_len), True where padded

        # Calculate attention scores
        attn_scores = self.attention(x) # (batch_size, seq_len, 1)
        if mask is not None:
                # mask.unsqueeze(-1) shape: (batch_size, seq_len, 1)
            attn_scores = attn_scores.masked_fill(mask.unsqueeze(-1), float('-inf'))

        # Apply softmax to get attention weights
        attn_weights = F.softmax(attn_scores, dim=1) # (batch_size, seq_len, 1)

        # Apply attention pooling: sum(weights * features)
        # attn_weights * x -> (batch_size, seq_len, embedding_dim)
        pooled_output = torch.sum(attn_weights * x, dim=1) # (batch_size, embedding_dim)

        hidden_representation = self.fc_hidden(pooled_output) # (batch_size, fc_hidden_dim)
        # Store last hidden layer output for later use
        # self.last_pooled_output = hidden_representation.cpu().detach()
    
        # Apply prediction head
        # fc output shape: (batch_size, 1) -> squeeze -> (batch_size,)
        return self.output(hidden_representation).squeeze(-1) # Squeeze last dim

# Revised Training function with early stopping based on train loss (Unchanged)
def train_model(model, train_loader, optimizer, criterion, device, num_epochs):
    model.train() # Set model to training mode (enables dropout)

    for epoch in range(num_epochs):
        epoch_loss = 0.0
        for sequences, targets, mask in train_loader:
            sequences, targets, mask = sequences.to(device), targets.to(device), mask.to(device)

            optimizer.zero_grad()
            predictions = model(sequences, mask)
            loss = criterion(predictions, targets)

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            epoch_loss += loss.item() * sequences.size(0)

        epoch_loss /= len(train_loader.dataset)

        if (epoch + 1) % 50 == 0:
            print(f"Epoch {epoch+1}/{num_epochs}: Train Loss = {epoch_loss:.4f}")

    return model

# Revised Evaluation function (Unchanged)
def evaluate_model(model, data_loader, device):
    model.eval() # Set model to evaluation mode (disables dropout)
    true_values_scaled = []
    pred_values_scaled = []

    with torch.no_grad():
        for sequences, targets, mask in data_loader:
            sequences, targets, mask = sequences.to(device), targets.to(device), mask.to(device)
            predictions = model(sequences, mask)

            # Store scaled values
            true_values_scaled.extend(targets.cpu().numpy())
            pred_values_scaled.extend(predictions.cpu().numpy())

    return np.array(true_values_scaled), np.array(pred_values_scaled)

# *** MODIFIED: Leave-One-Out Cross Validation with internal y scaling ***
def run_loocv(X, y_original, config): # Pass ORIGINAL y
    loo = LeaveOneOut()
    device = config['device']
    all_true_orig = []
    all_pred_orig = []

    # --- Determine embedding_dim (Unchanged) ---
    first_seq = X[0]
    if isinstance(first_seq, (list, np.ndarray)):
        first_element = first_seq[0]
        if isinstance(first_element, (list, np.ndarray)):
             embedding_dim = len(first_element)
        elif torch.is_tensor(first_element):
             embedding_dim = first_element.shape[0]
        else:
             embedding_dim = 1
    elif torch.is_tensor(first_seq):
         embedding_dim = first_seq.shape[1]
    else:
        raise ValueError("Could not determine embedding dimension from X")

    fold = 0
    total_folds = len(X)
    y_indices = np.arange(total_folds) # Use indices for splitting X and y

    # Ensure y_original is a numpy array for easier indexing
    y_original = np.asarray(y_original)

    for train_idx, test_idx in loo.split(y_indices):
        fold += 1
        print(f"LOOCV Fold {fold}/{total_folds}")

        # --- Split original data ---
        X_train_fold = [np.array(X[i]) if isinstance(X[i], list) else X[i] for i in train_idx]
        y_train_fold_orig = y_original[train_idx] # Original y values for training set

        X_test_fold = [np.array(X[i]) if isinstance(X[i], list) else X[i] for i in test_idx]
        y_test_fold_orig = y_original[test_idx] # Original y value(s) for test set

        # --- Scale y INSIDE the loop ---
        # Reshape y_train for scaler
        y_train_fold_reshaped = y_train_fold_orig.reshape(-1, 1)

        # Initialize and fit scaler ONLY on training data for this fold
        y_scaler_fold = StandardScaler()
        y_train_scaled = y_scaler_fold.fit_transform(y_train_fold_reshaped).flatten()

        # Transform test data using the FITTED scaler
        # Reshape y_test for transform (even if it's a single value)
        y_test_fold_reshaped = y_test_fold_orig.reshape(-1, 1)
        y_test_scaled = y_scaler_fold.transform(y_test_fold_reshaped).flatten()
        # ----------------------------------

        # Create datasets and dataloaders using SCALED y for this fold
        train_dataset = SequenceDataset(X_train_fold, y_train_scaled)
        # Use the scaled test value for the test dataset
        test_dataset = SequenceDataset(X_test_fold, y_test_scaled)

        # --- Dynamic Batch Size (Unchanged) ---
        train_batch_size = min(config['batch_size'], len(train_dataset))
        if len(train_dataset) == 0:
             print(f"Warning: Fold {fold} has an empty training set. Skipping.")
             continue
        if train_batch_size == 0: train_batch_size = 1

        train_loader = DataLoader(
            train_dataset,
            batch_size=train_batch_size,
            shuffle=True,
            collate_fn=collate_fn
        )
        test_loader = DataLoader(
            test_dataset,
            batch_size=1, # Keep batch size 1 for test in LOOCV
            shuffle=False,
            collate_fn=collate_fn
        )

        # --- Model Initialization (Unchanged) ---
        model = SequencePredictor(
            embedding_dim=embedding_dim,
            attn_hidden_dim=config['attn_hidden_dim'],
            fc_hidden_dim=config['fc_hidden_dim'],
            dropout_rate=config['dropout_rate'],
            use_dropout=config['use_dropout']
        ).to(device)

        optimizer = optim.Adam(
            model.parameters(),
            lr=config['learning_rate'],
        )
        criterion = nn.MSELoss() # Use MSE for scaled targets

        # --- Train Model (Unchanged) ---
        model = train_model(
            model=model,
            train_loader=train_loader,
            optimizer=optimizer,
            criterion=criterion,
            device=device,
            num_epochs=config['num_epochs'],
        )

        # --- Evaluate Model (gets scaled predictions) ---
        # evaluate_model returns the ground truth scaled y from the dataloader
        # and the model's predictions (also scaled)
        _, pred_vals_scaled = evaluate_model(model, test_loader, device)

        # --- Inverse transform predictions and store original values ---
        # We already have the original true value: y_test_fold_orig
        # Inverse transform predictions using the scaler fitted for THIS FOLD
        pred_vals_orig = y_scaler_fold.inverse_transform(pred_vals_scaled.reshape(-1, 1)).flatten()

        print(f"Fold {fold}/{total_folds} for person {person_ids[test_idx[0]]}: True (orig) = {y_test_fold_orig}, Pred (orig) = {pred_vals_orig}")

        # Store original true value(s) and original-scale prediction(s)
        all_true_orig.extend(y_test_fold_orig) # Use the original test value
        all_pred_orig.extend(pred_vals_orig)   # Use the inverse-transformed prediction

        os.makedirs("model-audio", exist_ok=True) # Ensure directory exists
        # Save the model
        model_save_path = f"model-audio/model_person_{person_ids[test_idx[0]]}.pth"
        torch.save(model.state_dict(), model_save_path)
        print(f"Model for fold {fold} saved to {model_save_path}")

    # --- Compute final metrics (Unchanged section, uses collected orig values) ---
    all_true_orig = np.array(all_true_orig)
    all_pred_orig = np.array(all_pred_orig)

    valid_indices = np.isfinite(all_pred_orig)
    if not np.all(valid_indices):
        print(f"Warning: Found {np.sum(~valid_indices)} non-finite predictions. Evaluating metrics only on finite predictions.")
        all_true_orig = all_true_orig[valid_indices]
        all_pred_orig = all_pred_orig[valid_indices]

    if len(all_true_orig) < 2:
         print("Warning: Less than 2 valid prediction pairs. Cannot calculate metrics.")
         results = {
             'r2': np.nan, 'mse': np.nan, 'correlation': np.nan, 'p_value': np.nan,
             'true_values_orig': all_true_orig.tolist(), 'predicted_values_orig': all_pred_orig.tolist()
         }
    else:
        r2 = r2_score(all_true_orig, all_pred_orig)
        mse = mean_squared_error(all_true_orig, all_pred_orig)
        corr, p_value = pearsonr(all_true_orig.flatten(), all_pred_orig)
        # --- Print metrics (Unchanged) ---
        results = {
            'r2': r2, 'mse': mse, 'correlation': corr, 'p_value': p_value,
            'true_values_orig': all_true_orig.tolist(), 'predicted_values_orig': all_pred_orig.tolist()
        }

    return results

# *** MODIFIED: Main execution function ***
def main(X, y, config=None):
    if config is None:
        config = CONFIG

    print(f"Using device: {config['device']}")
    print(f"Configuration: {config}")

    # --- Data Scaling for y REMOVED from here ---
    # Ensure y is suitable for passing (e.g., list or numpy array)
    y = np.asarray(y)

    # Optional: Scale X features here if necessary (remains unchanged)
    print("Input sequences 'X' are assumed to be appropriately scaled/normalized outside the LOOCV loop.")
    print("Target variable 'y' will be scaled within each LOOCV fold.")

    # --- Run LOOCV ---
    # Pass the ORIGINAL y to run_loocv
    results = run_loocv(X, y, config) # Pass original y

    # --- Print Results (Unchanged) ---
    print("\n--- Final LOOCV Results (Original Scale) ---")
    if np.isnan(results['r2']):
         print("Metrics could not be calculated (too few valid predictions).")
    else:
        print(f"R² Score: {results['r2']:.4f}")
        print(f"MSE: {results['mse']:.4f}")
        print(f"Correlation: {results['correlation']:.4f}")
        print(f"P-value: {results['p_value']:.4f}")

    # print(f"True values (original): {results['true_values_orig']}")
    # print(f"Predicted values (original): {results['predicted_values_orig']}")

    # Save results to CSV
    results_df = pd.DataFrame({
        'Person ID': person_ids,
        'True Values': [round(result, 2) for result in results['true_values_orig']],
        'Predicted Values': [round(result, 2) for result in results['predicted_values_orig']]
    })
    # Order by Person ID
    results_df.sort_values(by='Person ID', inplace=True)
    results_df.to_csv('predictions-audio-attention.csv', index=False)

    return results

# Example usage (replace with your actual data)
if __name__ == "__main__":

    # --- Run the main function ---
    results = main(X, y)

Using device: mps
Configuration: {'learning_rate': 0.01, 'num_epochs': 200, 'batch_size': 37, 'attn_hidden_dim': 1, 'fc_hidden_dim': 512, 'weight_decay': 0.01, 'dropout_rate': 0.5, 'use_dropout': True, 'device': device(type='mps')}
Input sequences 'X' are assumed to be appropriately scaled/normalized outside the LOOCV loop.
Target variable 'y' will be scaled within each LOOCV fold.
LOOCV Fold 1/38
Epoch 50/200: Train Loss = 0.5391
Epoch 100/200: Train Loss = 0.3940
Epoch 150/200: Train Loss = 0.2879
Epoch 200/200: Train Loss = 0.2009
Fold 1/38 for person 21: True (orig) = [6.71428571], Pred (orig) = [5.516598]
Model for fold 1 saved to model-audio/model_person_21.pth
LOOCV Fold 2/38
Epoch 50/200: Train Loss = 0.7504
Epoch 100/200: Train Loss = 0.3432
Epoch 150/200: Train Loss = 0.2927
Epoch 200/200: Train Loss = 0.2281
Fold 2/38 for person 17: True (orig) = [6.85714286], Pred (orig) = [6.095297]
Model for fold 2 saved to model-audio/model_person_17.pth
LOOCV Fold 3/38
Epoch 50/200: Tra