In [6]:

import os
import torch
import torchaudio
import numpy as np
import pandas as pd
from tqdm import tqdm
from transformers import Wav2Vec2Processor, Wav2Vec2Model


## Wav2Vec2 

In [2]:
# Checkpoint that the Wav2Vec2 processor and encoder are loaded from.
MODEL_NAME = "facebook/wav2vec2-base-960h"   # can switch to larger model if GPU memory allows

# Corpus root and the sub-folder that receives the extracted feature files.
DATA_DIR = "pitt_corpus_processed"
FEATURES_DIR = os.path.join(DATA_DIR, "features")
os.makedirs(FEATURES_DIR, exist_ok=True)

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"✅ Using device: {device}")


✅ Using device: cuda


In [3]:
# Load the preprocessing pipeline and the encoder from the same checkpoint,
# then place the encoder on the selected device.
processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
model = Wav2Vec2Model.from_pretrained(MODEL_NAME)
model = model.to(device)
# Put the module in evaluation mode; as the last expression of the cell this
# also displays the module summary in the output.
model.eval()

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Wav2Vec2Model(
  (feature_extractor): Wav2Vec2FeatureEncoder(
    (conv_layers): ModuleList(
      (0): Wav2Vec2GroupNormConvLayer(
        (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
        (activation): GELUActivation()
        (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
      )
      (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
      (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
    )
  )
  (feature_projection): Wav2Vec2FeatureProjection(
    (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (projection): Linear(in_features=512, out_features=768, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): Wav2Vec2Encoder(
    (pos_conv_embed): Wav2Vec2PositionalConvEmbedding(
  

In [4]:
def extract_wav2vec2_features(file_path):
    """Return one mean-pooled Wav2Vec2 embedding vector for the audio at ``file_path``.

    The audio is loaded with torchaudio, resampled to 16 kHz when its native
    rate differs, averaged across channels when it has more than one, passed
    through the notebook-level ``processor`` and ``model`` on ``device``, and
    the final hidden states are averaged over dim 1.

    Args:
        file_path: Path of the audio file to read.

    Returns:
        A flattened NumPy array with the averaged last hidden state.
    """
    target_rate = 16000
    waveform, sample_rate = torchaudio.load(file_path)

    # Bring the signal to the 16 kHz rate that is passed to the processor below.
    if sample_rate != target_rate:
        resampler = torchaudio.transforms.Resample(sample_rate, target_rate)
        waveform = resampler(waveform)
        sample_rate = target_rate

    # Collapse multi-channel audio to a single channel by averaging.
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    encoded = processor(
        waveform.squeeze().numpy(),
        sampling_rate=sample_rate,
        return_tensors="pt",
        padding=True,
    )
    model_inputs = {name: tensor.to(device) for name, tensor in encoded.items()}

    # Inference only: no gradients are needed for feature extraction.
    with torch.no_grad():
        outputs = model(**model_inputs)

    # Average over dim 1 (time), then hand back a flat vector in host memory.
    pooled = outputs.last_hidden_state.mean(dim=1)
    return pooled.cpu().numpy().flatten()

In [7]:
# Table that supplies each file's path ("filepath") and class label ("label").
metadata = pd.read_csv(os.path.join(DATA_DIR, "segment_metadata.csv"))
features, labels = [], []

print("🚀 Extracting Wav2Vec2 embeddings...")

segments = zip(metadata["filepath"], metadata["label"])
for file_path, label in tqdm(segments, total=len(metadata)):
    try:
        emb = extract_wav2vec2_features(file_path)
    except Exception as e:
        # Best effort: report the failing file and carry on with the rest.
        print(f"⚠️ Error processing {file_path}: {e}")
    else:
        # Append both together so features and labels stay aligned.
        features.append(emb)
        labels.append(label)


🚀 Extracting Wav2Vec2 embeddings...


  attn_output = torch.nn.functional.scaled_dot_product_attention(
100%|██████████| 6930/6930 [02:05<00:00, 55.04it/s]


In [8]:
# Stack the per-file results and write them under FEATURES_DIR.
features_np = np.array(features)
labels_np = np.array(labels)

wav2vec2_npy_path = os.path.join(FEATURES_DIR, "wav2vec2_features.npy")
np.save(wav2vec2_npy_path, features_np)
np.save(os.path.join(FEATURES_DIR, "labels.npy"), labels_np)

# CSV copy: the embedding columns followed by a trailing "label" column.
df = pd.DataFrame(features_np).assign(label=labels_np)
df.to_csv(os.path.join(FEATURES_DIR, "wav2vec2_features.csv"), index=False)

print(f"✅ Feature extraction complete!")
print(f"Saved: {wav2vec2_npy_path}")
print(f"Shape: {features_np.shape}")


✅ Feature extraction complete!
Saved: pitt_corpus_processed\features\wav2vec2_features.npy
Shape: (6930, 768)


## OpenSMILE

In [9]:

import os
import glob
import pandas as pd
import numpy as np
import opensmile
from tqdm import tqdm



In [11]:
# Root of the preprocessed audio (10-second segments, per the original note)
# and the folder that receives the extracted feature files.
DATA_DIR = "pitt_corpus_processed"
FEATURES_DIR = os.path.join(DATA_DIR, "features")
os.makedirs(FEATURES_DIR, exist_ok=True)

# Configure the OpenSMILE extractor: eGeMAPS feature set at the Functionals level.
egemaps_set = opensmile.FeatureSet.eGeMAPS
functionals_level = opensmile.FeatureLevel.Functionals
smile = opensmile.Smile(feature_set=egemaps_set, feature_level=functionals_level)


In [12]:
# Collect every WAV file under DATA_DIR. glob returns paths in arbitrary,
# platform-dependent order, so sort them to make the row order of the saved
# arrays reproducible from run to run.
audio_files = sorted(glob.glob(os.path.join(DATA_DIR, "**/*.wav"), recursive=True))

features_list = []
labels_list = []

print("🚀 Extracting OpenSMILE eGeMAPS features...")

for file_path in tqdm(audio_files):
    try:
        # Extract the features for this file and flatten them to a 1-D vector.
        feat_vector = smile.process_file(file_path).values.flatten()

        # The label is the name of the folder that directly contains the file.
        label = os.path.basename(os.path.dirname(file_path))
    except Exception as e:
        # Best effort: report the failing file and continue with the rest.
        print(f"⚠️ Error processing {file_path}: {e}")
    else:
        # Append both only after everything succeeded, so the two lists can
        # never get out of step with each other.
        features_list.append(feat_vector)
        labels_list.append(label)


🚀 Extracting OpenSMILE eGeMAPS features...


100%|██████████| 6930/6930 [21:47<00:00,  5.30it/s]


In [13]:
# Stack the per-file eGeMAPS vectors and labels, then write both as .npy files.
features_np = np.array(features_list)
labels_np = np.array(labels_list)

for npy_name, array in (
    ("egemaps_features.npy", features_np),
    ("egemaps_labels.npy", labels_np),
):
    np.save(os.path.join(FEATURES_DIR, npy_name), array)

# CSV copy for ML use: the feature columns followed by a trailing "label" column.
df = pd.DataFrame(features_np).assign(label=labels_np)
df.to_csv(os.path.join(FEATURES_DIR, "egemaps_features.csv"), index=False)

print(f"✅ eGeMAPS feature extraction complete!")
print(f"Shape: {features_np.shape}")
print(f"Saved at: {FEATURES_DIR}")


✅ eGeMAPS feature extraction complete!
Shape: (6930, 88)
Saved at: pitt_corpus_processed\features


## WavLM

In [14]:
!pip install transformers torchaudio soundfile tqdm



In [19]:
import os
import glob
import torch
import torchaudio
import numpy as np
import pandas as pd
from tqdm import tqdm
from transformers import WavLMModel, Wav2Vec2FeatureExtractor

In [21]:
# Checkpoint that the WavLM feature extractor and encoder are loaded from.
MODEL_NAME = "microsoft/wavlm-base"

# Corpus root and the sub-folder that receives the extracted feature files.
DATA_DIR = "pitt_corpus_processed"
FEATURES_DIR = os.path.join(DATA_DIR, "features")
os.makedirs(FEATURES_DIR, exist_ok=True)

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
print(f"Using device: {DEVICE}")

# Load the preprocessing step and the encoder from the same checkpoint.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_NAME)
model = WavLMModel.from_pretrained(MODEL_NAME)
model = model.to(DEVICE)
# Put the module in evaluation mode; as the last expression of the cell this
# also displays the module summary in the output.
model.eval()



Using device: cuda


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of the model checkpoint at microsoft/wavlm-base were not used when initializing WavLMModel: ['encoder.pos_conv_embed.conv.weight_g', 'encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing WavLMModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing WavLMModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of WavLMModel were not initialized from the model checkpoint at microsoft/wavlm-base and are newly initialized: ['

WavLMModel(
  (feature_extractor): WavLMFeatureEncoder(
    (conv_layers): ModuleList(
      (0): WavLMGroupNormConvLayer(
        (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
        (activation): GELUActivation()
        (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
      )
      (1-4): 4 x WavLMNoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
      (5-6): 2 x WavLMNoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
    )
  )
  (feature_projection): WavLMFeatureProjection(
    (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (projection): Linear(in_features=512, out_features=768, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): WavLMEncoder(
    (pos_conv_embed): WavLMPositionalConvEmbedding(
      (conv): Parametrized

In [22]:
# Collect every WAV file under DATA_DIR. glob returns paths in arbitrary,
# platform-dependent order, so sort them to make the row order of the saved
# arrays reproducible from run to run.
audio_files = sorted(glob.glob(os.path.join(DATA_DIR, "**/*.wav"), recursive=True))

features_list = []
labels_list = []

print("🚀 Extracting WavLM features...")

for file_path in tqdm(audio_files):
    try:
        # Load the audio and average the channels into a single 1-D signal.
        waveform, sr = torchaudio.load(file_path)
        waveform = waveform.mean(dim=0)

        # Resample to the 16 kHz rate that is passed to the feature extractor.
        if sr != 16000:
            waveform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(waveform)

        # Preprocess with the feature extractor loaded from the WavLM
        # checkpoint (not the Wav2Vec2 `processor` from the first section), so
        # the preprocessing settings match the model and this section also
        # runs on a kernel where the Wav2Vec2 cells were never executed.
        input_values = feature_extractor(
            waveform.numpy(), sampling_rate=16000, return_tensors="pt"
        ).input_values
        input_values = input_values.to(DEVICE)

        # Forward pass without gradient tracking (inference only).
        with torch.no_grad():
            outputs = model(input_values)
            hidden_states = outputs.last_hidden_state  # shape: [1, seq_len, hidden_dim]

        # Aggregate over the time dimension (mean pooling).
        feat_vector = hidden_states.mean(dim=1).squeeze().cpu().numpy()

        # The label is the name of the folder that directly contains the file.
        label = os.path.basename(os.path.dirname(file_path))
    except Exception as e:
        # Best effort: report the failing file and continue with the rest.
        print(f"⚠️ Error processing {file_path}: {e}")
    else:
        # Append both only after everything succeeded, so the two lists can
        # never get out of step with each other.
        features_list.append(feat_vector)
        labels_list.append(label)



🚀 Extracting WavLM features...


100%|██████████| 6930/6930 [02:48<00:00, 41.25it/s]


In [23]:
# Stack the per-file WavLM vectors and labels, then write both as .npy files.
features_np = np.array(features_list)
labels_np = np.array(labels_list)

for npy_name, array in (
    ("wavlm_features.npy", features_np),
    ("wavlm_labels.npy", labels_np),
):
    np.save(os.path.join(FEATURES_DIR, npy_name), array)

# CSV copy: the embedding columns followed by a trailing "label" column.
df = pd.DataFrame(features_np).assign(label=labels_np)
df.to_csv(os.path.join(FEATURES_DIR, "wavlm_features.csv"), index=False)

print(f"✅ WavLM feature extraction complete!")
print(f"Shape: {features_np.shape}")
print(f"Saved at: {FEATURES_DIR}")


✅ WavLM feature extraction complete!
Shape: (6930, 768)
Saved at: pitt_corpus_processed\features
