In [2]:
import os
import pandas as pd
import numpy as np
from google.colab import files
import torch
import torchaudio
import torchaudio.functional as F
import torchaudio.transforms as T
from tqdm import tqdm

In [2]:
!curl -L -o /content/common-voice.zip https://www.kaggle.com/api/v1/datasets/download/mozillaorg/common-voice

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 12.0G  100 12.0G    0     0   118M      0  0:01:44  0:01:44 --:--:--  128M


In [3]:
!unzip "/content/common-voice.zip"

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: cv-valid-train/cv-valid-train/sample-190776.mp3  
  inflating: cv-valid-train/cv-valid-train/sample-190777.mp3  
  inflating: cv-valid-train/cv-valid-train/sample-190778.mp3  
  inflating: cv-valid-train/cv-valid-train/sample-190779.mp3  
  inflating: cv-valid-train/cv-valid-train/sample-190780.mp3  
  inflating: cv-valid-train/cv-valid-train/sample-190781.mp3  
  inflating: cv-valid-train/cv-valid-train/sample-190782.mp3  
  inflating: cv-valid-train/cv-valid-train/sample-190783.mp3  
  inflating: cv-valid-train/cv-valid-train/sample-190784.mp3  
  inflating: cv-valid-train/cv-valid-train/sample-190785.mp3  
  inflating: cv-valid-train/cv-valid-train/sample-190786.mp3  
  inflating: cv-valid-train/cv-valid-train/sample-190787.mp3  
  inflating: cv-valid-train/cv-valid-train/sample-190788.mp3  
  inflating: cv-valid-train/cv-valid-train/sample-190789.mp3  
  inflating: cv-valid-train/cv-valid-train/sample-190

In [4]:
!rm "/content/common-voice.zip"

In [5]:
data1 = pd.read_csv("/content/cv-valid-train.csv")
data2 = pd.read_csv("/content/cv-valid-test.csv")
data3 = pd.read_csv("/content/cv-valid-dev.csv")
data4 = pd.read_csv("/content/cv-other-train.csv")
data5 = pd.read_csv("/content/cv-other-test.csv")
data6 = pd.read_csv("/content/cv-other-dev.csv")
data7 = pd.read_csv("/content/cv-invalid.csv")

# Working with invalid data

In [6]:
data7.head()

Unnamed: 0,filename,text,up_votes,down_votes,age,gender,accent,duration
0,cv-invalid/sample-000000.mp3,revenge is not my style but obviously accident...,1,10,,,,
1,cv-invalid/sample-000001.mp3,it was bunched up and he had hardly thought of...,0,2,twenties,male,us,
2,cv-invalid/sample-000002.mp3,then suddenly he noticed it with a start,10,4,thirties,female,canada,
3,cv-invalid/sample-000003.mp3,that's the point at which most people give up,0,1,,,,
4,cv-invalid/sample-000004.mp3,you got someplace to sleep,0,1,,,,


In [6]:
data7.shape

(25403, 8)

In [7]:
# Direct check and print the result
if (data7['up_votes'] > data7['down_votes']).any():
    print("Yes, there are samples where upvotes are greater than downvotes.")
else:
    print("No samples found where upvotes are greater than downvotes.")

Yes, there are samples where upvotes are greater than downvotes.


In [8]:
filtered_invalid = data7[data7['up_votes'] > data7['down_votes']]

In [9]:
filtered_invalid.shape

(834, 8)

# Working with valid_train data

In [10]:
data1.shape

(195776, 8)

In [11]:
# Direct check and print the result
if (data1['up_votes'] < data1['down_votes']).any():
    print("Yes, there are samples where downvotes are greater than upvotes.")
else:
    print("No samples found ")

Yes, there are samples where downvotes are greater than upvotes.


In [12]:
filtered_valid_train = data1[data1['up_votes'] >= data1['down_votes']]

In [13]:
filtered_valid_train

Unnamed: 0,filename,text,up_votes,down_votes,age,gender,accent,duration
0,cv-valid-train/sample-000000.mp3,learn to recognize omens and follow them the o...,1,0,,,,
1,cv-valid-train/sample-000001.mp3,everything in the universe evolved he said,1,0,,,,
2,cv-valid-train/sample-000002.mp3,you came so that you could learn about your dr...,1,0,,,,
3,cv-valid-train/sample-000003.mp3,so now i fear nothing because it was those ome...,1,0,,,,
4,cv-valid-train/sample-000004.mp3,if you start your emails with greetings let me...,3,2,,,,
...,...,...,...,...,...,...,...,...
195771,cv-valid-train/sample-195771.mp3,the englishman said nothing,1,0,thirties,male,england,
195772,cv-valid-train/sample-195772.mp3,the irish man sipped his tea,1,0,,,,
195773,cv-valid-train/sample-195773.mp3,what do you know about that,1,0,,,,
195774,cv-valid-train/sample-195774.mp3,the phone rang while she was awake,2,0,twenties,male,us,


# Working with valid_test data

In [14]:
data2.shape

(3995, 8)

In [15]:
# Direct check and print the result
if (data2['up_votes'] < data2['down_votes']).any():
    print("Yes, there are samples where downvotes are greater than upvotes.")
else:
    print("No samples found ")

Yes, there are samples where downvotes are greater than upvotes.


In [16]:
filtered_valid_test = data2[data2['up_votes'] >= data2['down_votes']]

In [17]:
filtered_valid_test

Unnamed: 0,filename,text,up_votes,down_votes,age,gender,accent,duration
0,cv-valid-test/sample-000000.mp3,without the dataset the article is useless,1,0,,,,
1,cv-valid-test/sample-000001.mp3,i've got to go to him,1,0,twenties,male,,
2,cv-valid-test/sample-000002.mp3,and you know it,1,0,,,,
3,cv-valid-test/sample-000003.mp3,down below in the darkness were hundreds of pe...,4,0,twenties,male,us,
4,cv-valid-test/sample-000004.mp3,hold your nose to keep the smell from disablin...,2,0,,,,
...,...,...,...,...,...,...,...,...
3990,cv-valid-test/sample-003990.mp3,the old man opened his cape and the boy was st...,1,0,,,,
3991,cv-valid-test/sample-003991.mp3,in alchemy it's called the soul of the world,2,1,,,,
3992,cv-valid-test/sample-003992.mp3,at that point in their lives everything is cle...,3,0,,,,
3993,cv-valid-test/sample-003993.mp3,he told them all to be seated,3,0,,,,


# Working with valid_dev data

In [18]:
data3.shape

(4076, 8)

In [19]:
# Direct check and print the result
if (data3['up_votes'] < data3['down_votes']).any():
    print("Yes, there are samples where downvotes are greater than upvotes.")
else:
    print("No samples found ")

No samples found 


In [20]:
filtered_valid_dev=data3

In [21]:
filtered_valid_dev

Unnamed: 0,filename,text,up_votes,down_votes,age,gender,accent,duration
0,cv-valid-dev/sample-000000.mp3,be careful with your prognostications said the...,1,0,,,,
1,cv-valid-dev/sample-000001.mp3,then why should they be surprised when they se...,2,0,,,,
2,cv-valid-dev/sample-000002.mp3,a young arab also loaded down with baggage ent...,2,0,,,,
3,cv-valid-dev/sample-000003.mp3,i thought that everything i owned would be des...,3,0,,,,
4,cv-valid-dev/sample-000004.mp3,he moved about invisible but everyone could he...,1,0,fourties,female,england,
...,...,...,...,...,...,...,...,...
4071,cv-valid-dev/sample-004071.mp3,but they could never have taught him arabic,2,1,,,,
4072,cv-valid-dev/sample-004072.mp3,he decided to concentrate on more practical ma...,1,0,,,,
4073,cv-valid-dev/sample-004073.mp3,that's what i'm not supposed to say,2,0,thirties,male,us,
4074,cv-valid-dev/sample-004074.mp3,just handling them made him feel better,3,0,,,,


# Working with other_train data

In [22]:
data4.shape

(145135, 8)

In [23]:
# Direct check and print the result
if (data4['up_votes'] < data4['down_votes']).any():
    print("Yes, there are samples where downvotes are greater than upvotes.")
else:
    print("No samples found ")

No samples found 


In [24]:
filtered_other_train=data4

In [25]:
filtered_other_train

Unnamed: 0,filename,text,up_votes,down_votes,age,gender,accent,duration
0,cv-other-train/sample-000000.mp3,he had to spit some tobacco out of his mouth,0,0,seventies,male,england,
1,cv-other-train/sample-000001.mp3,it took her a while to get used to it,1,1,twenties,male,scotland,
2,cv-other-train/sample-000002.mp3,you will need some rubber boots,0,0,,,,
3,cv-other-train/sample-000003.mp3,you can speak a label to click on an element,0,0,fourties,male,us,
4,cv-other-train/sample-000004.mp3,the priest collapsed backwards,0,0,,,,
...,...,...,...,...,...,...,...,...
145130,cv-other-train/sample-145130.mp3,hopefully i didn't just set the voice coding c...,0,0,,,,
145131,cv-other-train/sample-145131.mp3,would you like to see where i was tattooed,0,0,,,,
145132,cv-other-train/sample-145132.mp3,the cursor blinked expectantly,0,0,fifties,male,england,
145133,cv-other-train/sample-145133.mp3,nothing's going to happen,0,0,thirties,male,england,


# Working with other_test data

In [26]:
data5.shape

(2961, 8)

In [27]:
# Direct check and print the result
if (data5['up_votes'] > data5['down_votes']).any():
    print("Yes, there are samples where downvotes are greater than upvotes.")
else:
    print("No samples found ")

No samples found 


In [28]:
filtered_other_test=data5

In [29]:
filtered_other_test

Unnamed: 0,filename,text,up_votes,down_votes,age,gender,accent,duration
0,cv-other-test/sample-000000.mp3,you killed all three of them for nothing,0,0,,,,
1,cv-other-test/sample-000001.mp3,will it be warmer in montana,0,0,,,,
2,cv-other-test/sample-000002.mp3,science has been arguing about the zoological ...,0,0,thirties,male,,
3,cv-other-test/sample-000003.mp3,he wasn't asking for help,1,1,,,,
4,cv-other-test/sample-000004.mp3,but he granted the travelers three days,0,0,sixties,male,us,
...,...,...,...,...,...,...,...,...
2956,cv-other-test/sample-002956.mp3,what do you mean by that,0,0,,,,
2957,cv-other-test/sample-002957.mp3,how long will it take to get him examined,0,0,,,,
2958,cv-other-test/sample-002958.mp3,stacey couldn't find the power switch,0,0,,,,
2959,cv-other-test/sample-002959.mp3,and the girl pointed to the south indicating t...,0,0,teens,male,,


# Working with other_dev data

In [30]:
data6.shape

(3022, 8)

In [31]:
# Direct check and print the result
if (data6['up_votes'] > data6['down_votes']).any():
    print("Yes, there are samples where downvotes are greater than upvotes.")
else:
    print("No samples found ")

No samples found 


In [32]:
filtered_other_dev=data6

In [33]:
filtered_other_dev

Unnamed: 0,filename,text,up_votes,down_votes,age,gender,accent,duration
0,cv-other-dev/sample-000000.mp3,she composed an emotional song in her bedroom,0,0,,,,
1,cv-other-dev/sample-000001.mp3,i could die happily and that made me feel good,0,0,,,,
2,cv-other-dev/sample-000002.mp3,what are you doing up here then,0,0,,,,
3,cv-other-dev/sample-000003.mp3,hundreds of observers saw the flame that night...,0,0,,,,
4,cv-other-dev/sample-000004.mp3,are you going to live with your mother,0,0,thirties,male,us,
...,...,...,...,...,...,...,...,...
3017,cv-other-dev/sample-003017.mp3,you forgot to pack it,0,0,thirties,male,scotland,
3018,cv-other-dev/sample-003018.mp3,a tow truck came to clear up the wreckage,0,0,twenties,male,england,
3019,cv-other-dev/sample-003019.mp3,i am upset that they get to have all the time ...,0,0,twenties,female,,
3020,cv-other-dev/sample-003020.mp3,i made a few changes don't worry,0,0,,,,


# Concatenating all upvoted data

In [34]:
df = pd.concat([
    filtered_other_dev,
    filtered_other_test,
    filtered_other_train,
    filtered_valid_dev,
    filtered_valid_test,
    filtered_valid_train,
    filtered_invalid
])

In [35]:
df

Unnamed: 0,filename,text,up_votes,down_votes,age,gender,accent,duration
0,cv-other-dev/sample-000000.mp3,she composed an emotional song in her bedroom,0,0,,,,
1,cv-other-dev/sample-000001.mp3,i could die happily and that made me feel good,0,0,,,,
2,cv-other-dev/sample-000002.mp3,what are you doing up here then,0,0,,,,
3,cv-other-dev/sample-000003.mp3,hundreds of observers saw the flame that night...,0,0,,,,
4,cv-other-dev/sample-000004.mp3,are you going to live with your mother,0,0,thirties,male,us,
...,...,...,...,...,...,...,...,...
25253,cv-invalid/sample-025253.mp3,this was the strangest of all things that ever...,10,2,,,,
25266,cv-invalid/sample-025266.mp3,before guns were invented armies had to throw ...,16,8,,,,
25297,cv-invalid/sample-025297.mp3,i heard a peculiar humming sound from the pit,18,2,,,,
25306,cv-invalid/sample-025306.mp3,without such love one's dreams would have no m...,25,7,,,,


# Feature Extraction Begins Here

In [36]:
# Set up GPU device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [37]:
from concurrent.futures import ThreadPoolExecutor


In [38]:
# You may also want to verify you're using GPU in Colab specifically
if torch.cuda.is_available():
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9} GB")

GPU Name: Tesla T4
GPU Memory: 15.828320256 GB


In [39]:
# Optimized Transform Initialization (Do this ONCE)
SAMPLE_RATE = 16000  # Common sample rate for speech
N_FFT = 1024         # Reduced from 2048 for speed
WIN_LENGTH = 512      # Fixed window size
HOP_LENGTH = 256      # Larger hop = fewer frames
N_MELS = 64           # Reduced from 128 to avoid warning

# Pre-initialize transforms on GPU
mel_transform = T.MelSpectrogram(
    sample_rate=SAMPLE_RATE,
    n_fft=N_FFT,
    win_length=WIN_LENGTH,
    hop_length=HOP_LENGTH,
    n_mels=N_MELS
).to(device)

mfcc_transform = T.MFCC(
    sample_rate=SAMPLE_RATE,
    n_mfcc=13,
    melkwargs={
        'n_fft': N_FFT,
        'win_length': WIN_LENGTH,
        'hop_length': HOP_LENGTH,
        'n_mels': N_MELS
    }
).to(device)

spectrogram_transform = T.Spectrogram(
    n_fft=N_FFT,
    win_length=WIN_LENGTH,
    hop_length=HOP_LENGTH,
    power=2
).to(device)


In [40]:

def extract_audio_features(audio_path):
    try:
        with torch.no_grad():  # Critical for speed
            # Load and preprocess audio
            waveform, sample_rate = torchaudio.load(audio_path)
            if waveform.shape[0] > 1:
                waveform = torch.mean(waveform, dim=0, keepdim=True)
            waveform = waveform.squeeze().to(device)

            # Resample if needed
            if sample_rate != SAMPLE_RATE:
                waveform = torchaudio.functional.resample(waveform, sample_rate, SAMPLE_RATE)
                sample_rate = SAMPLE_RATE

            # Skip very short files
            if len(waveform) < WIN_LENGTH:
                return None

            # Extract features
            features = {'duration': len(waveform)/sample_rate}

            # Mel Spectrogram (pre-initialized)
            mel_spec = mel_transform(waveform)
            features.update({
                'mel_energy_mean': mel_spec.mean().item(),
                'mel_energy_std': mel_spec.std().item()
            })

            # MFCCs (pre-initialized)
            mfccs = mfcc_transform(waveform).squeeze()
            for i in range(13):
                features.update({
                    f'mfcc{i+1}_mean': mfccs[i].mean().item(),
                    f'mfcc{i+1}_std': mfccs[i].std().item()
                })

            # Spectrogram-based features
            spec = spectrogram_transform(waveform)
            freqs = torch.linspace(0, sample_rate//2, spec.shape[1], device=device)

            # Spectral Centroid
            spec_sum = spec.sum(dim=1)
            spec_weighted = (spec * freqs.unsqueeze(0)).sum(dim=1)
            spectral_centroid = spec_weighted / (spec_sum + 1e-8)
            features.update({
                'spectral_centroid_mean': spectral_centroid.mean().item(),
                'spectral_centroid_std': spectral_centroid.std().item()
            })

            # Simplified RMS Energy
            frames = waveform.unfold(0, WIN_LENGTH, HOP_LENGTH)
            rms = torch.sqrt(torch.mean(frames**2, dim=1))
            features.update({
                'rms_mean': rms.mean().item(),
                'rms_std': rms.std().item()
            })

            return features

    except Exception as e:
        print(f"Error with {os.path.basename(audio_path)}: {str(e)}")
        return None

In [41]:

def process_batch(file_batch):
    return [extract_audio_features(f) for f in file_batch]


In [42]:
def main(df, root_dir, batch_size=32, workers=4):
    # Prepare file list with ALL metadata
    file_entries = []
    for _, row in df.iterrows():
        parts = row['filename'].split('/')
        if len(parts) >= 2:
            path = os.path.join(root_dir, parts[0], parts[0], '/'.join(parts[1:]))
        else:
            path = os.path.join(root_dir, row['filename'])

        if os.path.exists(path):
            # Store both path AND all metadata
            file_entries.append({
                'path': path,
                'metadata': row.to_dict()  # Save all original columns
            })

    # Process in parallel batches
    features_list = []
    batch_count = (len(file_entries) + batch_size - 1) // batch_size

    with ThreadPoolExecutor(max_workers=workers) as executor:
        futures = []
        for i in range(0, len(file_entries), batch_size):
            batch = file_entries[i:i+batch_size]
            futures.append(executor.submit(
                process_batch,
                [entry['path'] for entry in batch]
            ))

        for future_idx, future in enumerate(tqdm(futures, total=len(futures), desc="Processing batches")):
            batch_start = future_idx * batch_size
            batch_results = future.result()

            for entry_idx, features in enumerate(batch_results):
                if features:
                    # Get the corresponding metadata
                    original_idx = batch_start + entry_idx
                    metadata = file_entries[original_idx]['metadata']

                    # Merge features with ALL metadata
                    full_record = {**metadata, **features}
                    features_list.append(full_record)

    return pd.DataFrame(features_list)

In [43]:
root_dir = "/content/"

In [44]:
features_df = main(df, root_dir)

Processing batches: 100%|██████████| 11116/11116 [1:00:15<00:00,  3.07it/s]


In [1]:
features_df.to_csv("audio_features_optimized_13mfcc_text.csv", index=False)

NameError: name 'features_df' is not defined

In [4]:
features_df=pd.read_csv("/content/audio_features_optimized_13mfcc_text.csv")

In [5]:
columns_to_drop = ['up_votes','down_votes', 'age', 'accent', 'gender', 'duration']
df_dropped = features_df.drop(columns=columns_to_drop)

In [6]:
text_df = df_dropped.dropna().reset_index(drop=True)

In [7]:
text_df

Unnamed: 0,filename,text,mel_energy_mean,mel_energy_std,mfcc1_mean,mfcc1_std,mfcc2_mean,mfcc2_std,mfcc3_mean,mfcc3_std,...,mfcc11_mean,mfcc11_std,mfcc12_mean,mfcc12_std,mfcc13_mean,mfcc13_std,spectral_centroid_mean,spectral_centroid_std,rms_mean,rms_std
0,cv-other-dev/sample-000000.mp3,she composed an emotional song in her bedroom,1.192264,5.548640,-202.457489,119.633430,61.719204,59.434143,3.400874,23.527859,...,-10.290248,9.828973,2.318956,9.118623,-6.978617,8.762339,3413.846191,609.889099,0.021534,0.017921
1,cv-other-dev/sample-000001.mp3,i could die happily and that made me feel good,11.569006,67.464157,-175.999054,155.344238,45.213055,49.403877,-35.215298,30.196482,...,-7.877884,9.344677,-0.400973,7.091014,-7.392397,11.480834,4149.890137,686.548706,0.053189,0.069092
2,cv-other-dev/sample-000002.mp3,what are you doing up here then,1.241407,10.483788,-288.828430,142.216751,44.931633,43.438560,11.709400,23.032846,...,-8.283107,9.785190,-2.519360,7.930912,-4.480358,7.308479,3455.306396,537.446777,0.017302,0.022877
3,cv-other-dev/sample-000003.mp3,hundreds of observers saw the flame that night...,7.383544,83.951866,-163.577286,112.825394,21.679176,50.796188,-16.379456,33.391361,...,-12.442447,11.631681,-3.227670,11.527441,1.810717,8.291141,2947.227051,738.422119,0.045089,0.053046
4,cv-other-dev/sample-000004.mp3,are you going to live with your mother,14.301460,112.728249,-91.880669,84.635269,31.348875,37.289597,1.078362,17.835985,...,-4.250523,7.457402,-3.837495,7.888173,-5.400354,6.488866,3915.933105,510.349060,0.062811,0.074221
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
355676,cv-invalid/sample-025253.mp3,this was the strangest of all things that ever...,56.697323,349.295776,-36.612419,98.852867,56.337696,43.859650,-19.600307,27.372429,...,-15.134244,9.581176,7.671854,8.929589,-9.651426,7.827696,3519.530273,704.139648,0.142565,0.130106
355677,cv-invalid/sample-025266.mp3,before guns were invented armies had to throw ...,0.734365,5.444271,-243.166214,89.742439,61.344215,61.951180,-14.325352,31.546827,...,-4.676660,10.105835,-7.034593,12.662252,-15.493694,12.867362,3688.896484,818.587036,0.015726,0.015268
355678,cv-invalid/sample-025297.mp3,i heard a peculiar humming sound from the pit,2.809416,32.642891,-153.744354,56.811359,78.526176,21.422098,-37.968369,20.163136,...,-0.339481,6.896972,-6.663773,8.853268,-6.528901,9.522890,3692.457520,799.657898,0.027617,0.033070
355679,cv-invalid/sample-025306.mp3,without such love one's dreams would have no m...,0.429223,3.460915,-263.356995,121.960907,21.673798,46.813477,12.101520,25.013504,...,-6.632248,8.482925,0.999333,5.679144,-4.723859,7.239258,2599.245117,499.343170,0.008750,0.014340
