In [2]:
# Pinned to the version this notebook was last run with; %pip installs into
# the environment of the running kernel rather than whatever `pip` is on PATH.
%pip install -q datasets==2.19.1

Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed datasets-2.19.1 dill-0.3.8 multiprocess-0.70.16 xxhash-3.4.1


In [3]:
# Authenticate this session with the Hugging Face Hub; notebook_login()
# renders an interactive login widget in the cell output.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
from datasets import load_dataset
import pandas as pd
import os
import shutil
import json

# Common Voice language configurations to process. Each entry is passed to
# load_dataset() as the config name, and the total speaker budget is divided
# evenly by the number of entries in this list.
languages = ["zh-CN"]

# Function to balance the dataset by gender and age
def balance_dataset(df, num_speakers, samples_per_speaker):
    balanced_speakers = []

    # Filter out samples without necessary metadata
    df = df.dropna(subset=['gender', 'age'])

    genders = df['gender'].unique()
    ages = df['age'].unique()

    num_speakers_per_gender = num_speakers // len(genders)
    num_speakers_per_age = num_speakers // len(ages)
    num_samples_per_combination = samples_per_speaker // (len(genders) * len(ages))

    for gender in genders:
        for age in ages:
            subset = df[(df['gender'] == gender) & (df['age'] == age)]
            speaker_groups = subset.groupby('client_id')

            for client_id, group in speaker_groups:
                if len(group) >= num_samples_per_combination:
                    balanced_speakers.append(group.sample(n=num_samples_per_combination))
                if len(balanced_speakers) >= num_speakers:
                    break
            if len(balanced_speakers) >= num_speakers:
                break
        if len(balanced_speakers) >= num_speakers:
            break

    if balanced_speakers:
        balanced_df = pd.concat(balanced_speakers)
    else:
        balanced_df = pd.DataFrame()  # Return an empty DataFrame if no speakers found

    return balanced_df

# Parameters
# A total budget of 200 speakers is split evenly across the configured languages.
num_speakers_per_language = 200 // len(languages)
samples_per_speaker = 100
output_dir = '/content/drive/MyDrive/enhancing_speaker_recognition_evaluation/data/balanced_common_voice_samples'

# Create output directory
os.makedirs(output_dir, exist_ok=True)

# Load the train split of every language and keep only its balanced subset
balanced_datasets = []
for lang in languages:
    # This dataset is built by a loader script downloaded from the Hub;
    # `datasets` warns that trust_remote_code=True becomes mandatory for such
    # datasets in its next major release, so opt in explicitly.
    dataset = load_dataset(
        "mozilla-foundation/common_voice_2_0",
        lang,
        split='train',
        trust_remote_code=True,
    )
    dataset_df = pd.DataFrame(dataset)
    balanced_df = balance_dataset(
        dataset_df,
        num_speakers=num_speakers_per_language,
        samples_per_speaker=samples_per_speaker,
    )
    # Languages that yield no eligible speakers are left out of the result.
    if not balanced_df.empty:
        balanced_datasets.append(balanced_df)

# Combine all balanced datasets into one
if balanced_datasets:
    combined_df = pd.concat(balanced_datasets, ignore_index=True)

    # Get unique client IDs and other necessary information
    unique_client_ids = combined_df['client_id'].unique()

    # Print the number of unique client IDs and other details
    print(f'Number of unique client IDs: {len(unique_client_ids)}')
    print(unique_client_ids)
    print(combined_df.head())

    # Check the distribution of gender and age
    print(combined_df['gender'].value_counts())
    print(combined_df['age'].value_counts())

    # Save audio files and metadata locally.
    # Only four columns are needed per clip, so iterate over those columns
    # directly instead of materialising a full row Series with iterrows().
    metadata = []
    clip_columns = zip(
        combined_df['path'],
        combined_df['client_id'],
        combined_df['gender'],
        combined_df['age'],
    )
    for audio_path, client_id, gender, age in clip_columns:
        # Create a new path for the audio file in the output directory
        # (flat layout: only the file name of the source path is kept).
        new_audio_path = os.path.join(output_dir, os.path.basename(audio_path))

        # Copy the audio file
        shutil.copy(audio_path, new_audio_path)

        # Append metadata
        metadata.append({
            'client_id': client_id,
            'gender': gender,
            'age': age,
            'original_path': audio_path,
            'new_path': new_audio_path
        })

    # Save metadata to a JSON file (explicit encoding so the result does not
    # depend on the platform's default text encoding).
    metadata_path = os.path.join(output_dir, 'metadata.json')
    with open(metadata_path, 'w', encoding='utf-8') as f:
        json.dump(metadata, f, indent=4)

    print(f"Metadata and audio files have been saved to {output_dir}")
else:
    print("No balanced data could be extracted.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/11.5k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/9.59k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.29k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/16.4k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/362M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating other split: 0 examples [00:00, ? examples/s]

Generating invalidated split: 0 examples [00:00, ? examples/s]

Number of unique client IDs: 9
['2244dbc149f9716b7582ef61a2e6e5283de18154955fdb1cc812947f2e733ef859fc2303a30e42d97479c8ef6168d034b10135479d469b600bc731547f1cf313'
 '9678e4e3a3117867f408524d6417b17bf7d7378db80b49d0a76adda3164ad9dcce08a684ec357de5defb79a5b914dcba0e5f6c9e468818b08a01c80063347eb9'
 'a98b75688138833376c2638cfcfa5102a127101233bedb358e8faa9393c725c5e3119f7baf6b7da53cb8c672368b32259abff87848cfd23caba31ac44c546079'
 '717c6bc706613454961f3d0f9359d491e656ad6d04e790bc95703abe0dccbab96c6b3a9788afd98755ac60443e8534619c555406839a88ad69581acd3634dc45'
 '89ea39bed3dfc8841e72cc4f6e99bb5699e19863d30bf4e612d99ba360885113e9440ce28cea45f9f233dc89472df039d352b25fd24efa9f0e09e85eace98514'
 'b6a806baeb1824fb6beec68581d9fd80a779d5adb53e27087cd99f3b23d988a18a4918bddca1fc9a6b0691eafcb8a9921edd8127f63a48bb6c99ff739b3285cc'
 '613550964e86ae5803236580fce1227a978910e52f31450ad2e9cb9596fe654a1968ca136c15212a9ec0824a4bad0075d5a31212e92d2f930532a18f93e5af2b'
 'c3305f0ade3d797264fb4e7776b546b4ee31fef4f49