In [1]:
from transformers import *
import librosa
import torch
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# The preprocessing config and the model weights are both pulled from the
# same Hugging Face Hub checkpoint, so name it once and reuse it.
checkpoint_id = "r-f/wav2vec-english-speech-emotion-recognition"

feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(checkpoint_id)
model = Wav2Vec2ForCTC.from_pretrained(checkpoint_id)

loading configuration file preprocessor_config.json from cache at /home1/arpitasa/.cache/huggingface/hub/models--r-f--wav2vec-english-speech-emotion-recognition/snapshots/2c59b3f6092ca069fbf79629eeadd13b9f243ce8/preprocessor_config.json
Feature extractor Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "processor_class": "Wav2Vec2ProcessorWithLM",
  "return_attention_mask": true,
  "sampling_rate": 16000
}

loading configuration file config.json from cache at /home1/arpitasa/.cache/huggingface/hub/models--r-f--wav2vec-english-speech-emotion-recognition/snapshots/2c59b3f6092ca069fbf79629eeadd13b9f243ce8/config.json
Model config Wav2Vec2Config {
  "_name_or_path": "jonatasgrosman/wav2vec2-large-xlsr-53-english",
  "activation_dropout": 0.05,
  "adapter_attn_dim": null,
  "adapter_kernel_size": 3,
  "adapter_stride": 2,
  "add_adapter": false,
  "apply_spec_a

In [3]:
def extract_features_from_segments(audio_path, segment_length=3.0, hop_length=3.0):
    """Cut an audio file into fixed-length windows and embed each window.

    The file is loaded with librosa at 16 kHz and sliced into windows of
    ``segment_length`` seconds whose start points are ``hop_length`` seconds
    apart. Each window is passed through the module-level
    ``feature_extractor`` and ``model``. A trailing remainder shorter than one
    full window is not processed.

    Args:
        audio_path: Path of the audio file to load.
        segment_length: Window length in seconds.
        hop_length: Distance between consecutive window starts, in seconds.

    Returns:
        A NumPy array holding one entry per window, taken from the last
        element of the model's ``hidden_states`` output. The array is empty
        when the audio is shorter than a single window.
    """
    audio, sr = librosa.load(audio_path, sr=16000)  # 16kHz sample rate
    print("length of audio file = ", librosa.get_duration(y=audio, sr=sr))

    segment_samples = int(segment_length * sr)  # Number of samples per segment
    hop_samples = int(hop_length * sr)  # Number of samples between segment starts

    features = []

    # The "+ 1" lets a window that ends exactly on the last sample be included.
    for start in range(0, len(audio) - segment_samples + 1, hop_samples):
        segment = audio[start:start + segment_samples]
        inputs = feature_extractor(segment, sampling_rate=sr, return_tensors="pt", padding=True)

        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            model_output = model(inputs.input_values, output_hidden_states=True)

        # hidden_states[-1] is the final hidden-state tensor; the second [-1]
        # picks the last item along its first axis.
        # NOTE(review): presumably that axis is the batch (size 1 here), giving
        # a (frames, hidden_dim) matrix per window -- confirm.
        vec = model_output.hidden_states[-1][-1]

        # Convert explicitly so np.array() below receives plain ndarrays
        # rather than having to coerce a list of torch tensors.
        features.append(vec.cpu().numpy())

    return np.array(features)

In [4]:
import os

# Input root: one sub-directory per fold, each containing audio files.
audio_path = "/project/msoleyma_1026/personality_detection/first_impressions_v2_dataset/testing/audio_files"
# Output root: mirrors the fold layout, with one .npy file per audio file.
output_dir = "/project/msoleyma_1026/personality_detection/first_impressions_v2_dataset/testing/audio_feature_vectors"
os.makedirs(output_dir, exist_ok=True)

for fold in os.listdir(audio_path):
    fold_path = os.path.join(audio_path, fold)
    if not os.path.isdir(fold_path):
        # A stray file next to the fold directories cannot be listed; skip it.
        continue

    output_fold = os.path.join(output_dir, fold)
    os.makedirs(output_fold, exist_ok=True)

    # Work out which inputs still lack an output, matching by file name so
    # that unrelated files in either directory cannot cause a wrong skip.
    existing = set(os.listdir(output_fold))
    pending = []
    for file in os.listdir(fold_path):
        # splitext handles any extension length (".wav", ".flac", none).
        npy_name = os.path.splitext(file)[0] + ".npy"
        if npy_name not in existing:
            pending.append((file, npy_name))

    if not pending:
        print(fold, "skipping fold")
        continue
    print("starting ", fold)

    for file, npy_name in pending:
        audio_file = os.path.join(fold_path, file)
        output_file = os.path.join(output_fold, npy_name)
        feature_vector = extract_features_from_segments(audio_file)

        # Write to a temporary name and move it into place afterwards, so an
        # interrupted run never leaves a truncated .npy that a later run
        # would mistake for a finished result.
        tmp_file = output_file + ".tmp"
        with open(tmp_file, "wb") as fh:
            np.save(fh, feature_vector)
        os.replace(tmp_file, output_file)



test-1 skipping fold
starting  test-2
length of audio file =  15.32
length of audio file =  15.32
length of audio file =  15.32
length of audio file =  15.33
length of audio file =  15.32
length of audio file =  15.32
length of audio file =  15.32
length of audio file =  15.32
length of audio file =  15.32
length of audio file =  15.32
length of audio file =  15.32
length of audio file =  15.32
length of audio file =  15.32
length of audio file =  15.32
length of audio file =  15.32
length of audio file =  15.32
length of audio file =  15.32
length of audio file =  15.32
length of audio file =  15.32
length of audio file =  15.32
length of audio file =  15.32
length of audio file =  15.32
length of audio file =  15.32
length of audio file =  15.32
length of audio file =  15.32
length of audio file =  15.32
length of audio file =  15.32
length of audio file =  15.32
length of audio file =  15.32
length of audio file =  15.32
length of audio file =  15.32
length of audio file =  15.32
le