In [1]:
import torch
import torchaudio
import torchaudio.transforms as T
import pandas as pd


# Function to extract features and save to CSV
def extract_features_to_csv(audio_files, csv_file, sample_rate=16000, future_offset=10):
    """Extract per-frame MFCCs and future-shifted speech labels; write one CSV row per clip.

    Each clip is loaded as MP3, resampled to ``sample_rate``, mixed down to mono
    and converted to 40 MFCCs per frame (n_fft=400, hop_length=160). Frame-level
    speech labels are derived from ``torchaudio.transforms.Vad`` and shifted
    ``future_offset`` frames into the past, so the label stored for frame ``t``
    is the speech state at frame ``t + future_offset``.

    Args:
        audio_files: Iterable of paths to MP3 files.
        csv_file: Destination path of the CSV file.
        sample_rate: Sample rate (Hz) every clip is resampled to.
        future_offset: Look-ahead, in frames, applied to the labels. Must be >= 0.

    Returns:
        None. The CSV has two columns, ``features`` (nested list, frames x 40)
        and ``labels`` (nested list, frames x 2, both columns identical).

    Raises:
        ValueError: If ``future_offset`` is negative.
    """
    if future_offset < 0:
        raise ValueError(f"future_offset must be >= 0, got {future_offset}")

    hop_length = 160  # shared by the MFCC framing and the sample -> frame conversion
    mfcc_transform = T.MFCC(sample_rate=sample_rate, n_mfcc=40,
                            melkwargs={"n_fft": 400, "hop_length": hop_length, "n_mels": 40})
    vad = T.Vad(sample_rate=sample_rate)

    data_list = []
    for file in audio_files:
        waveform, sr = torchaudio.load(file, format='mp3')
        waveform = torchaudio.functional.resample(waveform, orig_freq=sr, new_freq=sample_rate)
        if waveform.shape[-1] == 0:
            continue  # Skip empty entries
        # Mix down to mono so squeeze(0) below always yields a 2-D (n_mfcc, time) tensor.
        waveform = waveform.mean(dim=0, keepdim=True)

        mfcc = mfcc_transform(waveform).squeeze(0).T  # Convert to (time_steps, mfcc_dim)
        n_frames = mfcc.shape[0]
        if n_frames == 0:
            continue  # Skip empty entries

        frame_labels = _speech_frame_labels(waveform, vad, hop_length, n_frames)
        if frame_labels is None:
            continue  # Vad found no speech; such clips were skipped before as well
        # Two identical label columns, matching the (time_steps, 2) layout used so far.
        vad_labels = frame_labels.unsqueeze(1).repeat(1, 2)

        y_projected = _project_labels(vad_labels, future_offset)

        data_list.append([mfcc.tolist(), y_projected.tolist()])

    df = pd.DataFrame(data_list, columns=['features', 'labels'])
    df.to_csv(csv_file, index=False)
    print(f"Dataset saved to {csv_file}")


def _speech_frame_labels(waveform, vad, hop_length, n_frames):
    """Return a (n_frames,) float tensor marking frames inside the detected speech span.

    ``torchaudio.transforms.Vad`` returns the input with leading silence trimmed,
    not a label sequence, and it only trims from the front. The speech onset is
    therefore the number of samples it removes, and the speech end is found the
    same way on the time-reversed signal. Pauses inside the span are labelled as
    speech. Returns ``None`` when no speech span is found.
    """
    n_samples = waveform.shape[-1]
    onset = n_samples - vad(waveform).shape[-1]
    trailing = n_samples - vad(torch.flip(waveform, dims=[-1])).shape[-1]
    offset = n_samples - trailing
    if offset <= onset:
        return None

    # Assumes the MFCC transform's default centred framing, where frame t sits
    # near sample t * hop_length.
    first_frame = onset // hop_length
    last_frame = min(n_frames, -(-offset // hop_length))  # ceil division
    labels = torch.zeros(n_frames)
    labels[first_frame:last_frame] = 1.0
    return labels


def _project_labels(labels, future_offset):
    """Shift labels so row t holds the label of row t + future_offset; the tail is zero."""
    n_frames = labels.shape[0]
    if n_frames <= future_offset:
        # Clip is shorter than the look-ahead: keep the unshifted labels, as before.
        return labels
    projected = torch.zeros_like(labels)
    # Explicit end index: the previous [:-future_offset] slice is empty when the offset is 0.
    projected[:n_frames - future_offset] = labels[future_offset:]
    return projected

In [7]:
from glob import glob
import os
import pandas as pd

# Directory holding the source clips. Set the VOICE_DATA_DIR environment
# variable to run on another machine; the default is the original local path.
data_dir = os.environ.get(
    "VOICE_DATA_DIR",
    "/Users/shanujha/Desktop/voice_activity_prediction/voice_data_mozilla/en/clips/",
)
# glob() returns files in arbitrary filesystem order; sorting keeps the
# contents of each batch stable across runs.
audio_files = sorted(glob(os.path.join(data_dir, "*.mp3")))

output_dir = "../mfcc_extract_csv/"
os.makedirs(output_dir, exist_ok=True)

batch_size = 100

if not audio_files:
    print(f"No .mp3 files found in {data_dir}")

# Step through the file list in strides of batch_size. Unlike
# range(len // batch_size + 1), this never yields a trailing empty batch when
# the file count is an exact multiple of batch_size.
for i, start in enumerate(range(0, len(audio_files), batch_size)):
    batch = i + 1  # output files are numbered from 1
    extract_features_to_csv(
        audio_files[start:start + batch_size],
        os.path.join(output_dir, f"dataset_{batch}.csv"),
    )
    print(f"Dataset {i} saved successfully!")


Dataset saved to ../mfcc_extract_csv/dataset_1.csv
Dataset 0 saved successfully!
Dataset saved to ../mfcc_extract_csv/dataset_2.csv
Dataset 1 saved successfully!
Dataset saved to ../mfcc_extract_csv/dataset_3.csv
Dataset 2 saved successfully!
Dataset saved to ../mfcc_extract_csv/dataset_4.csv
Dataset 3 saved successfully!
Dataset saved to ../mfcc_extract_csv/dataset_5.csv
Dataset 4 saved successfully!
Dataset saved to ../mfcc_extract_csv/dataset_6.csv
Dataset 5 saved successfully!
Dataset saved to ../mfcc_extract_csv/dataset_7.csv
Dataset 6 saved successfully!
Dataset saved to ../mfcc_extract_csv/dataset_8.csv
Dataset 7 saved successfully!
Dataset saved to ../mfcc_extract_csv/dataset_9.csv
Dataset 8 saved successfully!
Dataset saved to ../mfcc_extract_csv/dataset_10.csv
Dataset 9 saved successfully!
Dataset saved to ../mfcc_extract_csv/dataset_11.csv
Dataset 10 saved successfully!
Dataset saved to ../mfcc_extract_csv/dataset_12.csv
Dataset 11 saved successfully!
Dataset saved to ../mfc

In [6]:
# NOTE(review): looks like a leftover scratch cell — confirm "../hello" is still needed.
# exist_ok=True creates the directory only when it is missing, without the
# check-then-create race, and raises if the path exists but is not a directory.
os.makedirs("../hello", exist_ok=True)

