In [2]:
import os
import pandas as pd
import numpy as np
from google.colab import files
import torch
import torchaudio
import torchaudio.functional as F
import torchaudio.transforms as T
from tqdm import tqdm

In [2]:
!curl -L -o /content/common-voice.zip https://www.kaggle.com/api/v1/datasets/download/mozillaorg/common-voice

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 12.0G  100 12.0G    0     0  90.3M      0  0:02:16  0:02:16 --:--:-- 64.5M


In [3]:
!unzip "/content/common-voice.zip"

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: cv-valid-train/cv-valid-train/sample-190776.mp3  
  inflating: cv-valid-train/cv-valid-train/sample-190777.mp3  
  inflating: cv-valid-train/cv-valid-train/sample-190778.mp3  
  inflating: cv-valid-train/cv-valid-train/sample-190779.mp3  
  inflating: cv-valid-train/cv-valid-train/sample-190780.mp3  
  inflating: cv-valid-train/cv-valid-train/sample-190781.mp3  
  inflating: cv-valid-train/cv-valid-train/sample-190782.mp3  
  inflating: cv-valid-train/cv-valid-train/sample-190783.mp3  
  inflating: cv-valid-train/cv-valid-train/sample-190784.mp3  
  inflating: cv-valid-train/cv-valid-train/sample-190785.mp3  
  inflating: cv-valid-train/cv-valid-train/sample-190786.mp3  
  inflating: cv-valid-train/cv-valid-train/sample-190787.mp3  
  inflating: cv-valid-train/cv-valid-train/sample-190788.mp3  
  inflating: cv-valid-train/cv-valid-train/sample-190789.mp3  
  inflating: cv-valid-train/cv-valid-train/sample-190

In [4]:
!rm "/content/common-voice.zip"

In [5]:
data1 = pd.read_csv("/content/cv-valid-train.csv")
data2 = pd.read_csv("/content/cv-valid-test.csv")
data3 = pd.read_csv("/content/cv-valid-dev.csv")
data4 = pd.read_csv("/content/cv-other-train.csv")
data5 = pd.read_csv("/content/cv-other-test.csv")
data6 = pd.read_csv("/content/cv-other-dev.csv")
data7 = pd.read_csv("/content/cv-invalid.csv")

In [6]:
df = pd.concat([
    data1,
    data2,
    data3,
    data4,
    data5,
    data6,
    data7
])

In [7]:
df

Unnamed: 0,filename,text,up_votes,down_votes,age,gender,accent,duration
0,cv-valid-train/sample-000000.mp3,learn to recognize omens and follow them the o...,1,0,,,,
1,cv-valid-train/sample-000001.mp3,everything in the universe evolved he said,1,0,,,,
2,cv-valid-train/sample-000002.mp3,you came so that you could learn about your dr...,1,0,,,,
3,cv-valid-train/sample-000003.mp3,so now i fear nothing because it was those ome...,1,0,,,,
4,cv-valid-train/sample-000004.mp3,if you start your emails with greetings let me...,3,2,,,,
...,...,...,...,...,...,...,...,...
25398,cv-invalid/sample-025398.mp3,well then we've got a problem,0,4,,,,
25399,cv-invalid/sample-025399.mp3,the boy was surprised at his thoughts,0,6,,,,
25400,cv-invalid/sample-025400.mp3,undefined,1,2,,,,
25401,cv-invalid/sample-025401.mp3,but there was something there in his heart tha...,1,5,,,,


# Feature Extraction Begins Here

In [8]:
# Set up GPU device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [9]:
from concurrent.futures import ThreadPoolExecutor


In [10]:
# You may also want to verify you're using GPU in Colab specifically
if torch.cuda.is_available():
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9} GB")

GPU Name: Tesla T4
GPU Memory: 15.828320256 GB


In [11]:
# Optimized Transform Initialization (Do this ONCE)
SAMPLE_RATE = 16000  # Common sample rate for speech
N_FFT = 1024         # Reduced from 2048 for speed
WIN_LENGTH = 512      # Fixed window size
HOP_LENGTH = 256      # Larger hop = fewer frames
N_MELS = 64           # Reduced from 128 to avoid warning

# Pre-initialize transforms on GPU
mel_transform = T.MelSpectrogram(
    sample_rate=SAMPLE_RATE,
    n_fft=N_FFT,
    win_length=WIN_LENGTH,
    hop_length=HOP_LENGTH,
    n_mels=N_MELS
).to(device)

mfcc_transform = T.MFCC(
    sample_rate=SAMPLE_RATE,
    n_mfcc=13,
    melkwargs={
        'n_fft': N_FFT,
        'win_length': WIN_LENGTH,
        'hop_length': HOP_LENGTH,
        'n_mels': N_MELS
    }
).to(device)

spectrogram_transform = T.Spectrogram(
    n_fft=N_FFT,
    win_length=WIN_LENGTH,
    hop_length=HOP_LENGTH,
    power=2
).to(device)


In [12]:

def extract_audio_features(audio_path):
    try:
        with torch.no_grad():  # Critical for speed
            # Load and preprocess audio
            waveform, sample_rate = torchaudio.load(audio_path)
            if waveform.shape[0] > 1:
                waveform = torch.mean(waveform, dim=0, keepdim=True)
            waveform = waveform.squeeze().to(device)

            # Resample if needed
            if sample_rate != SAMPLE_RATE:
                waveform = torchaudio.functional.resample(waveform, sample_rate, SAMPLE_RATE)
                sample_rate = SAMPLE_RATE

            # Skip very short files
            if len(waveform) < WIN_LENGTH:
                return None

            # Extract features
            features = {'duration': len(waveform)/sample_rate}

            # Mel Spectrogram (pre-initialized)
            mel_spec = mel_transform(waveform)
            features.update({
                'mel_energy_mean': mel_spec.mean().item(),
                'mel_energy_std': mel_spec.std().item()
            })

            # MFCCs (pre-initialized)
            mfccs = mfcc_transform(waveform).squeeze()
            for i in range(13):
                features.update({
                    f'mfcc{i+1}_mean': mfccs[i].mean().item(),
                    f'mfcc{i+1}_std': mfccs[i].std().item()
                })

            # Spectrogram-based features
            spec = spectrogram_transform(waveform)
            freqs = torch.linspace(0, sample_rate//2, spec.shape[1], device=device)

            # Spectral Centroid
            spec_sum = spec.sum(dim=1)
            spec_weighted = (spec * freqs.unsqueeze(0)).sum(dim=1)
            spectral_centroid = spec_weighted / (spec_sum + 1e-8)
            features.update({
                'spectral_centroid_mean': spectral_centroid.mean().item(),
                'spectral_centroid_std': spectral_centroid.std().item()
            })

            # Simplified RMS Energy
            frames = waveform.unfold(0, WIN_LENGTH, HOP_LENGTH)
            rms = torch.sqrt(torch.mean(frames**2, dim=1))
            features.update({
                'rms_mean': rms.mean().item(),
                'rms_std': rms.std().item()
            })

            return features

    except Exception as e:
        print(f"Error with {os.path.basename(audio_path)}: {str(e)}")
        return None

In [13]:

def process_batch(file_batch):
    return [extract_audio_features(f) for f in file_batch]


In [14]:
def main(df, root_dir, batch_size=32, workers=4):
    # Prepare file list with ALL metadata
    file_entries = []
    for _, row in df.iterrows():
        parts = row['filename'].split('/')
        if len(parts) >= 2:
            path = os.path.join(root_dir, parts[0], parts[0], '/'.join(parts[1:]))
        else:
            path = os.path.join(root_dir, row['filename'])

        if os.path.exists(path):
            # Store both path AND all metadata
            file_entries.append({
                'path': path,
                'metadata': row.to_dict()  # Save all original columns
            })

    # Process in parallel batches
    features_list = []
    batch_count = (len(file_entries) + batch_size - 1) // batch_size

    with ThreadPoolExecutor(max_workers=workers) as executor:
        futures = []
        for i in range(0, len(file_entries), batch_size):
            batch = file_entries[i:i+batch_size]
            futures.append(executor.submit(
                process_batch,
                [entry['path'] for entry in batch]
            ))

        for future_idx, future in enumerate(tqdm(futures, total=len(futures), desc="Processing batches")):
            batch_start = future_idx * batch_size
            batch_results = future.result()

            for entry_idx, features in enumerate(batch_results):
                if features:
                    # Get the corresponding metadata
                    original_idx = batch_start + entry_idx
                    metadata = file_entries[original_idx]['metadata']

                    # Merge features with ALL metadata
                    full_record = {**metadata, **features}
                    features_list.append(full_record)

    return pd.DataFrame(features_list)

In [15]:
root_dir = "/content/"

In [16]:
features_df = main(df, root_dir)

Processing batches:  99%|█████████▊| 11725/11887 [1:00:23<12:24,  4.60s/it]

Error with sample-020283.mp3: CUDA out of memory. Tried to allocate 3.46 GiB. GPU 0 has a total capacity of 14.74 GiB of which 3.28 GiB is free. Process 13889 has 11.46 GiB memory in use. Of the allocated memory 8.89 GiB is allocated by PyTorch, and 2.39 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


Processing batches: 100%|██████████| 11887/11887 [1:01:19<00:00,  3.23it/s]


In [17]:
features_df.to_csv("audio_features_optimized_13mfcc_gender", index=False)

In [18]:
features_df

Unnamed: 0,filename,text,up_votes,down_votes,age,gender,accent,duration,mel_energy_mean,mel_energy_std,...,mfcc11_mean,mfcc11_std,mfcc12_mean,mfcc12_std,mfcc13_mean,mfcc13_std,spectral_centroid_mean,spectral_centroid_std,rms_mean,rms_std
0,cv-valid-train/sample-000000.mp3,learn to recognize omens and follow them the o...,1,0,,,,4.104,1.019638e+01,5.594743e+01,...,-5.997583,9.479365,-8.241420,8.680332,-12.179601,8.658770,5025.709473,1555.652222,0.061276,5.431069e-02
1,cv-valid-train/sample-000001.mp3,everything in the universe evolved he said,1,0,,,,7.560,9.113991e-03,9.059318e-02,...,-4.434360,5.779143,-2.137226,5.747810,-4.448286,6.351164,3236.186768,881.136292,0.001168,2.152560e-03
2,cv-valid-train/sample-000002.mp3,you came so that you could learn about your dr...,1,0,,,,5.064,4.557527e+00,5.391312e+01,...,-2.593902,10.273657,-12.882552,9.743185,-0.677140,6.186485,3607.865967,909.344727,0.038856,3.831615e-02
3,cv-valid-train/sample-000003.mp3,so now i fear nothing because it was those ome...,1,0,,,,6.648,2.014356e-01,2.110250e+00,...,-5.088724,7.329194,-4.894209,7.846473,-4.213955,6.762061,4466.413574,936.831604,0.007066,9.041587e-03
4,cv-valid-train/sample-000004.mp3,if you start your emails with greetings let me...,3,2,,,,4.896,3.065309e+02,1.864373e+03,...,-8.902140,7.888978,-11.091557,8.974220,-7.147424,8.081219,2978.172607,872.457275,0.369546,2.548887e-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
380362,cv-invalid/sample-025398.mp3,well then we've got a problem,0,4,,,,5.136,8.433924e-09,3.057379e-08,...,-0.784522,2.043503,0.316677,1.458731,-0.803339,1.332724,2980.936279,787.610046,0.000002,8.673126e-07
380363,cv-invalid/sample-025399.mp3,the boy was surprised at his thoughts,0,6,,,,3.816,8.676028e-02,6.584325e-01,...,-2.253586,5.411062,-1.232574,5.592003,-7.458000,5.937366,3907.376709,305.468781,0.003608,6.633312e-03
380364,cv-invalid/sample-025400.mp3,undefined,1,2,,,,6.744,4.251849e+00,5.320736e+01,...,-9.728217,8.204746,-3.846340,6.451725,-7.610143,8.534258,4074.066162,1017.008423,0.028675,4.430717e-02
380365,cv-invalid/sample-025401.mp3,but there was something there in his heart tha...,1,5,,,,5.088,1.435417e+00,4.191814e+01,...,4.728322,9.431586,-0.891293,8.314837,-6.266241,8.177511,6190.493164,1438.672363,0.010429,2.899921e-02


In [5]:
columns_to_drop = ['up_votes','down_votes', 'age', 'accent', 'duration','text']
df_dropped = features_df.drop(columns=columns_to_drop)

In [6]:
df_dropped

Unnamed: 0,filename,gender,mel_energy_mean,mel_energy_std,mfcc1_mean,mfcc1_std,mfcc2_mean,mfcc2_std,mfcc3_mean,mfcc3_std,...,mfcc11_mean,mfcc11_std,mfcc12_mean,mfcc12_std,mfcc13_mean,mfcc13_std,spectral_centroid_mean,spectral_centroid_std,rms_mean,rms_std
0,cv-valid-train/sample-000000.mp3,,1.019638e+01,5.594743e+01,-94.578201,99.582184,54.950005,47.483868,-17.141302,33.086834,...,-5.997583,9.479365,-8.241420,8.680332,-12.179601,8.658770,5025.709473,1555.652222,0.061276,5.431069e-02
1,cv-valid-train/sample-000001.mp3,,9.113991e-03,9.059318e-02,-399.950836,73.564545,30.894300,41.812756,7.586493,15.731482,...,-4.434360,5.779143,-2.137226,5.747810,-4.448286,6.351164,3236.186768,881.136292,0.001168,2.152560e-03
2,cv-valid-train/sample-000002.mp3,,4.557527e+00,5.391312e+01,-140.311890,86.915146,25.248392,53.395462,23.602057,23.483746,...,-2.593902,10.273657,-12.882552,9.743185,-0.677140,6.186485,3607.865967,909.344727,0.038856,3.831615e-02
3,cv-valid-train/sample-000003.mp3,,2.014356e-01,2.110250e+00,-280.108795,63.318810,45.826618,42.047878,3.654769,20.959541,...,-5.088724,7.329194,-4.894209,7.846473,-4.213955,6.762061,4466.413574,936.831604,0.007066,9.041587e-03
4,cv-valid-train/sample-000004.mp3,,3.065309e+02,1.864373e+03,40.750916,70.305367,40.290054,31.889565,-1.816274,18.777185,...,-8.902140,7.888978,-11.091557,8.974220,-7.147424,8.081219,2978.172607,872.457275,0.369546,2.548887e-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
380362,cv-invalid/sample-025398.mp3,,8.433924e-09,3.057379e-08,-662.156189,16.312298,-19.716421,2.727313,-4.208791,1.513487,...,-0.784522,2.043503,0.316677,1.458731,-0.803339,1.332724,2980.936279,787.610046,0.000002,8.673126e-07
380363,cv-invalid/sample-025399.mp3,,8.676028e-02,6.584325e-01,-345.374878,87.340393,45.378922,40.774281,3.435512,19.483479,...,-2.253586,5.411062,-1.232574,5.592003,-7.458000,5.937366,3907.376709,305.468781,0.003608,6.633312e-03
380364,cv-invalid/sample-025400.mp3,,4.251849e+00,5.320736e+01,-218.651810,92.312340,80.325035,51.727558,-12.459282,24.590633,...,-9.728217,8.204746,-3.846340,6.451725,-7.610143,8.534258,4074.066162,1017.008423,0.028675,4.430717e-02
380365,cv-invalid/sample-025401.mp3,,1.435417e+00,4.191814e+01,-292.477081,62.438580,46.557709,27.444952,1.516252,27.348862,...,4.728322,9.431586,-0.891293,8.314837,-6.266241,8.177511,6190.493164,1438.672363,0.010429,2.899921e-02


In [7]:
gender_df = df_dropped.dropna().reset_index(drop=True)

In [8]:
gender_df

Unnamed: 0,filename,gender,mel_energy_mean,mel_energy_std,mfcc1_mean,mfcc1_std,mfcc2_mean,mfcc2_std,mfcc3_mean,mfcc3_std,...,mfcc11_mean,mfcc11_std,mfcc12_mean,mfcc12_std,mfcc13_mean,mfcc13_std,spectral_centroid_mean,spectral_centroid_std,rms_mean,rms_std
0,cv-valid-train/sample-000005.mp3,female,0.149748,1.272535,-316.900665,103.471497,47.899082,46.986130,13.911417,24.194254,...,-11.211283,7.782578,-2.868432,8.140687,-5.234992,6.010718,3277.356445,905.106506,0.006476,0.007511
1,cv-valid-train/sample-000008.mp3,male,0.664458,7.366398,-206.309662,85.587547,41.109695,60.060863,-18.309647,32.195148,...,-0.484882,7.648843,-6.070928,10.604579,-9.777917,7.884081,3319.829834,1128.139771,0.013810,0.015923
2,cv-valid-train/sample-000013.mp3,female,4.034911,26.427383,-154.644913,107.306824,41.557034,37.320549,-41.540943,37.344933,...,-9.519749,8.841080,-0.609854,7.997808,-8.874586,7.510509,3215.189941,924.654602,0.035384,0.037558
3,cv-valid-train/sample-000014.mp3,male,2.955548,28.237516,-216.377335,109.186287,69.213486,49.862255,11.038281,27.512066,...,-8.234674,11.102791,-1.471613,7.901867,-8.920507,9.228498,4479.957520,590.930176,0.030833,0.031476
4,cv-valid-train/sample-000019.mp3,male,4.429119,20.752028,-104.562904,69.409332,71.672615,45.009129,19.541050,16.550148,...,-1.755862,6.102397,0.369734,6.173415,-3.075884,5.836502,3437.594482,727.089905,0.043010,0.032829
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149275,cv-invalid/sample-025369.mp3,male,0.000000,0.000000,-799.999939,0.000000,0.000015,0.000000,-0.000043,0.000000,...,-0.000089,0.000000,0.000038,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
149276,cv-invalid/sample-025373.mp3,male,1.857215,28.951248,-94.406502,37.173935,45.251644,15.189337,-3.996834,9.444174,...,-9.512240,5.917383,-9.262706,5.536943,-9.571559,6.508159,3796.733887,868.945251,0.023397,0.026022
149277,cv-invalid/sample-025380.mp3,female,3.272084,26.640823,-193.106888,136.145905,38.969685,34.587643,-7.628333,19.475769,...,-7.445274,5.485860,-2.821240,5.789337,-2.532828,6.826046,3462.258301,510.956848,0.026545,0.038080
149278,cv-invalid/sample-025386.mp3,male,1.395478,12.516947,-228.125992,144.229797,13.274532,41.079975,-7.456383,27.683964,...,-4.001202,7.723001,-3.835645,8.190988,-5.525767,9.511915,3517.312012,780.099792,0.017425,0.024802


In [9]:
gender_df.to_csv("final_preprocessed_gender.csv", index=False)