In [1]:
!pip install datasets

Collecting fsspec<=2023.10.0,>=2023.1.0 (from fsspec[http]<=2023.10.0,>=2023.1.0->datasets)
  Downloading fsspec-2023.10.0-py3-none-any.whl.metadata (6.8 kB)
Downloading fsspec-2023.10.0-py3-none-any.whl (166 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m166.4/166.4 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: fsspec
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2023.12.2
    Uninstalling fsspec-2023.12.2:
      Successfully uninstalled fsspec-2023.12.2
Successfully installed fsspec-2023.10.0


In [2]:
from huggingface_hub import notebook_login
# Show the Hugging Face login widget (rendered as the VBox in this cell's output).
# NOTE(review): presumably required so the Common Voice dataset can be downloaded — confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [7]:
from datasets import load_dataset
import pandas as pd
import os
import shutil
import json

# Language codes to process; each one is passed to load_dataset as the
# configuration name. Only "zh-CN" is handled in this cell.
languages = ["zh-CN"]

# Function to balance the dataset by gender and age
def balance_dataset(df, num_speakers, samples_per_speaker, random_state=None):
    """Draw a fixed-size random sample of clips from up to ``num_speakers`` speakers.

    Rows without ``gender`` or ``age`` are discarded. The remaining rows are
    walked one (gender, age) combination at a time and, inside a combination,
    one ``client_id`` at a time. Every speaker owning at least
    ``samples_per_speaker // (n_genders * n_ages)`` clips in that combination
    contributes exactly that many randomly chosen rows. Collection stops as
    soon as ``num_speakers`` per-speaker samples have been gathered.

    Args:
        df: DataFrame with at least ``gender``, ``age`` and ``client_id`` columns.
        num_speakers: Maximum number of per-speaker samples to collect.
        samples_per_speaker: Clip budget; divided by the number of
            (gender, age) combinations to get the per-speaker sample size.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            selection can be reproduced. ``None`` keeps the unseeded behavior.

    Returns:
        The concatenated samples, or an empty DataFrame when nothing qualifies
        (including when no row carries both ``gender`` and ``age``).
    """
    # Filter out samples without necessary metadata
    df = df.dropna(subset=["gender", "age"])

    genders = df["gender"].unique()
    ages = df["age"].unique()

    # No usable rows means no combinations; return before the division below
    # would raise ZeroDivisionError.
    if len(genders) == 0 or len(ages) == 0:
        return pd.DataFrame()

    num_samples_per_combination = samples_per_speaker // (len(genders) * len(ages))

    balanced_speakers = []
    for gender in genders:
        for age in ages:
            subset = df[(df["gender"] == gender) & (df["age"] == age)]

            for _, group in subset.groupby("client_id"):
                if len(group) >= num_samples_per_combination:
                    balanced_speakers.append(
                        group.sample(
                            n=num_samples_per_combination, random_state=random_state
                        )
                    )
                if len(balanced_speakers) >= num_speakers:
                    break
            if len(balanced_speakers) >= num_speakers:
                break
        if len(balanced_speakers) >= num_speakers:
            break

    if balanced_speakers:
        return pd.concat(balanced_speakers)
    return pd.DataFrame()  # Nothing qualified

# Parameters
num_speakers_per_language = 200 // len(languages)
samples_per_speaker = 100
# Expand "~" explicitly: os.makedirs / open / shutil do not do it, so the raw
# string would create a directory literally named "~" under the working directory.
output_dir = os.path.expanduser("~/Dokumente/NLP_daten_p3")

# Create output directory
os.makedirs(output_dir, exist_ok=True)

# Load datasets and filter
balanced_datasets = []
for lang in languages:
    dataset = load_dataset("mozilla-foundation/common_voice_2_0", lang, split='train')
    dataset_df = pd.DataFrame(dataset)
    balanced_df = balance_dataset(dataset_df, num_speakers=num_speakers_per_language, samples_per_speaker=samples_per_speaker)
    # Languages that yield no rows are left out of the combined frame
    if not balanced_df.empty:
        balanced_datasets.append(balanced_df)

# Combine all balanced datasets into one
if balanced_datasets:
    combined_df = pd.concat(balanced_datasets, ignore_index=True)

    # Get unique client IDs and other necessary information
    unique_client_ids = combined_df['client_id'].unique()

    # Print the number of unique client IDs and other details
    print(f'Number of unique client IDs: {len(unique_client_ids)}')
    print(unique_client_ids)
    print(combined_df.head())

    # Check the distribution of gender and age
    print(combined_df['gender'].value_counts())
    print(combined_df['age'].value_counts())

    # Save audio files and metadata locally
    metadata = []
    for _, row in combined_df.iterrows():
        audio_path = row['path']
        client_id = row['client_id']
        gender = row['gender']
        age = row['age']

        # Files are stored flat in output_dir under their original base name
        new_audio_path = os.path.join(output_dir, os.path.basename(audio_path))

        # Copy the audio file
        shutil.copy(audio_path, new_audio_path)

        # One metadata record per copied clip
        metadata.append({
            'client_id': client_id,
            'gender': gender,
            'age': age,
            'original_path': audio_path,
            'new_path': new_audio_path
        })

    # Save metadata to a JSON file
    metadata_path = os.path.join(output_dir, 'metadata.json')
    with open(metadata_path, 'w') as f:
        json.dump(metadata, f, indent=4)

    print(f"Metadata and audio files have been saved to {output_dir}")
else:
    print("No balanced data could be extracted.")

Downloading builder script:   0%|          | 0.00/11.5k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/9.59k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.29k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/16.4k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/362M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating other split: 0 examples [00:00, ? examples/s]

Generating invalidated split: 0 examples [00:00, ? examples/s]

Number of unique client IDs: 9
['2244dbc149f9716b7582ef61a2e6e5283de18154955fdb1cc812947f2e733ef859fc2303a30e42d97479c8ef6168d034b10135479d469b600bc731547f1cf313'
 '9678e4e3a3117867f408524d6417b17bf7d7378db80b49d0a76adda3164ad9dcce08a684ec357de5defb79a5b914dcba0e5f6c9e468818b08a01c80063347eb9'
 'a98b75688138833376c2638cfcfa5102a127101233bedb358e8faa9393c725c5e3119f7baf6b7da53cb8c672368b32259abff87848cfd23caba31ac44c546079'
 '717c6bc706613454961f3d0f9359d491e656ad6d04e790bc95703abe0dccbab96c6b3a9788afd98755ac60443e8534619c555406839a88ad69581acd3634dc45'
 '89ea39bed3dfc8841e72cc4f6e99bb5699e19863d30bf4e612d99ba360885113e9440ce28cea45f9f233dc89472df039d352b25fd24efa9f0e09e85eace98514'
 'b6a806baeb1824fb6beec68581d9fd80a779d5adb53e27087cd99f3b23d988a18a4918bddca1fc9a6b0691eafcb8a9921edd8127f63a48bb6c99ff739b3285cc'
 '613550964e86ae5803236580fce1227a978910e52f31450ad2e9cb9596fe654a1968ca136c15212a9ec0824a4bad0075d5a31212e92d2f930532a18f93e5af2b'
 'c3305f0ade3d797264fb4e7776b546b4ee31fef4f49

In [8]:
from datasets import load_dataset
import pandas as pd
import os
import shutil
import json

# Language codes to process; each one is passed to load_dataset as the
# configuration name. Only "zh-CN" is handled in this cell.
languages = ["zh-CN"]


# Function to balance the dataset by gender and age
def balance_dataset(df, num_speakers, samples_per_speaker, random_state=None):
    """Draw a fixed-size random sample of clips from up to ``num_speakers`` speakers.

    Rows without ``gender`` or ``age`` are discarded. The remaining rows are
    walked one (gender, age) combination at a time and, inside a combination,
    one ``client_id`` at a time. Every speaker owning at least
    ``samples_per_speaker // (n_genders * n_ages)`` clips in that combination
    contributes exactly that many randomly chosen rows. Collection stops as
    soon as ``num_speakers`` per-speaker samples have been gathered.

    Args:
        df: DataFrame with at least ``gender``, ``age`` and ``client_id`` columns.
        num_speakers: Maximum number of per-speaker samples to collect.
        samples_per_speaker: Clip budget; divided by the number of
            (gender, age) combinations to get the per-speaker sample size.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            selection can be reproduced. ``None`` keeps the unseeded behavior.

    Returns:
        The concatenated samples, or an empty DataFrame when nothing qualifies
        (including when no row carries both ``gender`` and ``age``).
    """
    # Filter out samples without necessary metadata
    df = df.dropna(subset=["gender", "age"])

    genders = df["gender"].unique()
    ages = df["age"].unique()

    # No usable rows means no combinations; return before the division below
    # would raise ZeroDivisionError.
    if len(genders) == 0 or len(ages) == 0:
        return pd.DataFrame()

    num_samples_per_combination = samples_per_speaker // (len(genders) * len(ages))

    balanced_speakers = []
    for gender in genders:
        for age in ages:
            subset = df[(df["gender"] == gender) & (df["age"] == age)]

            for _, group in subset.groupby("client_id"):
                if len(group) >= num_samples_per_combination:
                    balanced_speakers.append(
                        group.sample(
                            n=num_samples_per_combination, random_state=random_state
                        )
                    )
                if len(balanced_speakers) >= num_speakers:
                    break
            if len(balanced_speakers) >= num_speakers:
                break
        if len(balanced_speakers) >= num_speakers:
            break

    if balanced_speakers:
        return pd.concat(balanced_speakers)
    return pd.DataFrame()  # Nothing qualified


# Run configuration
num_speakers_per_language = 200 // len(languages)
samples_per_speaker = 100
output_dir = os.path.expanduser("~/Dokumente/NLP_daten_p3")

# The target folder must exist before any clip is copied into it
os.makedirs(output_dir, exist_ok=True)

# Build one balanced frame per language; languages yielding nothing are left out
balanced_datasets = []
for lang in languages:
    dataset = load_dataset("mozilla-foundation/common_voice_2_0", lang, split="train")
    dataset_df = pd.DataFrame(dataset)
    balanced_df = balance_dataset(
        dataset_df,
        num_speakers=num_speakers_per_language,
        samples_per_speaker=samples_per_speaker,
    )
    if balanced_df.empty:
        continue
    balanced_datasets.append(balanced_df)

if not balanced_datasets:
    print("No balanced data could be extracted.")
else:
    # Merge the per-language frames into a single table
    combined_df = pd.concat(balanced_datasets, ignore_index=True)

    if not combined_df.empty:
        # Report which speakers were selected and how they are distributed
        unique_client_ids = combined_df["client_id"].unique()
        print(f"Number of unique client IDs: {len(unique_client_ids)}")
        print(unique_client_ids)
        print(combined_df.head())
        print(combined_df["gender"].value_counts())
        print(combined_df["age"].value_counts())

        # Copy every clip into output_dir and remember where it went
        metadata = []
        for _, row in combined_df.iterrows():
            audio_path, client_id = row["path"], row["client_id"]
            gender, age = row["gender"], row["age"]

            if os.path.isfile(audio_path):
                # Clips are stored flat under their original base name
                new_audio_path = os.path.join(output_dir, os.path.basename(audio_path))
                shutil.copy(audio_path, new_audio_path)
                if not os.path.isfile(new_audio_path):
                    print(f"Failed to copy audio file to: {new_audio_path}")

                metadata.append(
                    dict(
                        client_id=client_id,
                        gender=gender,
                        age=age,
                        original_path=audio_path,
                        new_path=new_audio_path,
                    )
                )
            else:
                print(f"Audio file not found: {audio_path}")

        # Persist the collected records next to the clips
        metadata_path = os.path.join(output_dir, "metadata.json")
        with open(metadata_path, "w") as f:
            json.dump(metadata, f, indent=4)

        if os.path.isfile(metadata_path):
            print(f"Metadata and audio files have been saved to {output_dir}")
        else:
            print(f"Failed to write metadata to: {metadata_path}")
    else:
        print("Combined DataFrame is empty. No files to process.")

Number of unique client IDs: 9
['2244dbc149f9716b7582ef61a2e6e5283de18154955fdb1cc812947f2e733ef859fc2303a30e42d97479c8ef6168d034b10135479d469b600bc731547f1cf313'
 '9678e4e3a3117867f408524d6417b17bf7d7378db80b49d0a76adda3164ad9dcce08a684ec357de5defb79a5b914dcba0e5f6c9e468818b08a01c80063347eb9'
 'a98b75688138833376c2638cfcfa5102a127101233bedb358e8faa9393c725c5e3119f7baf6b7da53cb8c672368b32259abff87848cfd23caba31ac44c546079'
 '717c6bc706613454961f3d0f9359d491e656ad6d04e790bc95703abe0dccbab96c6b3a9788afd98755ac60443e8534619c555406839a88ad69581acd3634dc45'
 '89ea39bed3dfc8841e72cc4f6e99bb5699e19863d30bf4e612d99ba360885113e9440ce28cea45f9f233dc89472df039d352b25fd24efa9f0e09e85eace98514'
 'b6a806baeb1824fb6beec68581d9fd80a779d5adb53e27087cd99f3b23d988a18a4918bddca1fc9a6b0691eafcb8a9921edd8127f63a48bb6c99ff739b3285cc'
 '613550964e86ae5803236580fce1227a978910e52f31450ad2e9cb9596fe654a1968ca136c15212a9ec0824a4bad0075d5a31212e92d2f930532a18f93e5af2b'
 'c3305f0ade3d797264fb4e7776b546b4ee31fef4f49

In [6]:
from datasets import load_dataset
import pandas as pd
import os
import shutil
import json
from tqdm import tqdm

# Language codes to process, mapped to the display names used in the
# progress and summary messages printed by this cell.
LANGUAGES = {
    "zh-CN": "Chinese (China)",
    "en": "English",
    "de": "German",
    "fr": "French",
    "es": "Spanish",
}

# Codes accepted by the loading loop: a LANGUAGES entry that is missing from
# this list is reported as not available and skipped.
# NOTE(review): presumably the configurations published for common_voice_2_0 — confirm.
AVAILABLE_LANGUAGES = [
    "en",
    "de",
    "fr",
    "cy",
    "br",
    "cv",
    "tr",
    "tt",
    "ky",
    "ga-IE",
    "kab",
    "ca",
    "zh-TW",
    "sl",
    "it",
    "nl",
    "cnh",
    "eo",
    "et",
    "eu",
    "es",
    "zh-CN",
    "mn",
    "sah",
    "dv",
    "rw",
    "sv-SE",
    "ru",
]


# Function to balance the dataset by gender and age
def balance_dataset(df, num_speakers, samples_per_speaker, random_state=None):
    """Draw a fixed-size random sample of clips from up to ``num_speakers`` speakers.

    Rows without ``gender`` or ``age`` are discarded. The remaining rows are
    walked one (gender, age) combination at a time and, inside a combination,
    one ``client_id`` at a time. Every speaker owning at least
    ``samples_per_speaker // (n_genders * n_ages)`` clips in that combination
    contributes exactly that many randomly chosen rows. Collection stops as
    soon as ``num_speakers`` per-speaker samples have been gathered.

    Args:
        df: DataFrame with at least ``gender``, ``age`` and ``client_id`` columns.
        num_speakers: Maximum number of per-speaker samples to collect.
        samples_per_speaker: Clip budget; divided by the number of
            (gender, age) combinations to get the per-speaker sample size.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            selection can be reproduced. ``None`` keeps the unseeded behavior.

    Returns:
        The concatenated samples, or an empty DataFrame when nothing qualifies
        (including when no row carries both ``gender`` and ``age``).
    """
    # Filter out samples without necessary metadata
    df = df.dropna(subset=["gender", "age"])

    genders = df["gender"].unique()
    ages = df["age"].unique()

    # No usable rows means no combinations; return before the division below
    # would raise ZeroDivisionError.
    if len(genders) == 0 or len(ages) == 0:
        return pd.DataFrame()

    num_samples_per_combination = samples_per_speaker // (len(genders) * len(ages))

    balanced_speakers = []
    for gender in genders:
        for age in ages:
            subset = df[(df["gender"] == gender) & (df["age"] == age)]

            for _, group in subset.groupby("client_id"):
                if len(group) >= num_samples_per_combination:
                    balanced_speakers.append(
                        group.sample(
                            n=num_samples_per_combination, random_state=random_state
                        )
                    )
                if len(balanced_speakers) >= num_speakers:
                    break
            if len(balanced_speakers) >= num_speakers:
                break
        if len(balanced_speakers) >= num_speakers:
            break

    if balanced_speakers:
        return pd.concat(balanced_speakers)
    return pd.DataFrame()  # Nothing qualified


# Parameters
num_speakers_per_language = 20
samples_per_speaker = 100
output_dir = os.path.expanduser("~/Dokumente/NLP_daten_p3")

# Create output directory
os.makedirs(output_dir, exist_ok=True)

# Load datasets and filter
balanced_datasets = []
for lang_code, lang_name in LANGUAGES.items():
    if lang_code in AVAILABLE_LANGUAGES:
        print(f"Loading dataset for {lang_name} ({lang_code})...")
        dataset = load_dataset(
            "mozilla-foundation/common_voice_2_0", lang_code, split="train"
        )
        dataset_df = pd.DataFrame(dataset)

        # Keep only the first N distinct speakers. N is the parameter defined
        # above rather than a second hard-coded 20, so the two cannot drift apart.
        unique_speakers = dataset_df["client_id"].unique()
        sampled_speakers = unique_speakers[:num_speakers_per_language]
        sampled_df = dataset_df[dataset_df["client_id"].isin(sampled_speakers)]

        balanced_df = balance_dataset(
            sampled_df,
            num_speakers=num_speakers_per_language,
            samples_per_speaker=samples_per_speaker,
        )
        if not balanced_df.empty:
            balanced_datasets.append((lang_code, balanced_df))
    else:
        print(f"Language {lang_name} ({lang_code}) is not available in the dataset.")

# Export every balanced dataset into its own language folder
if balanced_datasets:
    for lang_code, balanced_df in balanced_datasets:
        if balanced_df.empty:
            print(
                f"Combined DataFrame for {LANGUAGES[lang_code]} is empty. No files to process."
            )
        else:
            # Get unique client IDs and other necessary information
            unique_client_ids = balanced_df["client_id"].unique()

            # Print the number of unique client IDs and other details
            print(
                f"Number of unique client IDs for {LANGUAGES[lang_code]}: {len(unique_client_ids)}"
            )
            print(unique_client_ids)
            print(balanced_df.head())

            # Check the distribution of gender and age
            gender_distribution = balanced_df["gender"].value_counts()
            age_distribution = balanced_df["age"].value_counts()

            print(f"Gender Distribution for {LANGUAGES[lang_code]}:")
            print(gender_distribution)
            print(f"\nAge Distribution for {LANGUAGES[lang_code]}:")
            print(age_distribution)

            # Create the language folder up front: metadata.json is written into
            # it even when every clip below is skipped, and open() would fail
            # if the folder had never been created.
            lang_dir = os.path.join(output_dir, lang_code)
            os.makedirs(lang_dir, exist_ok=True)

            # Save audio files and metadata locally
            metadata = []
            for _, row in tqdm(
                balanced_df.iterrows(),
                total=balanced_df.shape[0],
                desc=f"Processing files for {LANGUAGES[lang_code]}",
            ):
                audio_path = row["path"]
                client_id = row["client_id"]
                gender = row["gender"]
                age = row["age"]

                # Ensure audio path exists
                if not os.path.isfile(audio_path):
                    print(f"Audio file not found: {audio_path}")
                    continue

                # Clips are grouped as <language>/<gender>/<age>/<file name>
                new_dir = os.path.join(lang_dir, gender, age)
                os.makedirs(new_dir, exist_ok=True)
                new_audio_path = os.path.join(new_dir, os.path.basename(audio_path))

                # Copy the audio file
                shutil.copy(audio_path, new_audio_path)
                if not os.path.isfile(new_audio_path):
                    print(f"Failed to copy audio file to: {new_audio_path}")

                # Append metadata
                metadata.append(
                    {
                        "client_id": client_id,
                        "gender": gender,
                        "age": age,
                        "original_path": audio_path,
                        "new_path": new_audio_path,
                    }
                )

            # Save metadata to a JSON file
            metadata_path = os.path.join(lang_dir, "metadata.json")
            with open(metadata_path, "w") as f:
                json.dump(metadata, f, indent=4)

            if not os.path.isfile(metadata_path):
                print(f"Failed to write metadata to: {metadata_path}")
            else:
                print(
                    f"Metadata and audio files have been saved to {os.path.join(output_dir, lang_code)}"
                )

            # Print metadata summary
            print(f"\nMetadata Summary for {LANGUAGES[lang_code]}:")
            if metadata:
                metadata_df = pd.DataFrame(metadata)
                print("Gender Distribution:")
                print(metadata_df["gender"].value_counts())
                print("\nAge Distribution:")
                print(metadata_df["age"].value_counts())
            else:
                # An empty record list gives a frame without columns, so the
                # column lookups above would raise KeyError.
                print("No audio files were copied.")
else:
    print("No balanced data could be extracted.")

Loading dataset for Chinese (China) (zh-CN)...
Loading dataset for English (en)...


Downloading data:   0%|          | 0.00/30.5G [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating other split: 0 examples [00:00, ? examples/s]

Generating invalidated split: 0 examples [00:00, ? examples/s]

: 

In [1]:
import os
from datasets import load_dataset

# Root folder for all downloads; download_dataset creates one sub-directory
# per language code inside it.
base_dir = os.path.expanduser("~/Dokumente/NLP_daten_p3")

# Language codes to download; each is passed to load_dataset as the configuration name
languages = ["en", "zh-CN", "fr", "es", "de"]


# Function to download dataset and metadata for a given language
def download_dataset(language_code):
    """Download the train split for one language and store it under ``base_dir``.

    Creates ``<base_dir>/<language_code>/`` and writes two files into it:
    ``<language_code>_data.csv`` (the split exported with ``to_csv``) and
    ``<language_code>_metadata.txt`` (``str()`` of the dataset's ``info``).

    Args:
        language_code: Configuration name passed to ``load_dataset``; also used
            as the sub-directory name and file-name prefix.
    """
    # Create a directory for the language
    language_dir = os.path.join(base_dir, language_code)
    os.makedirs(language_dir, exist_ok=True)

    # Load the dataset
    dataset = load_dataset(
        "mozilla-foundation/common_voice_2_0", language_code, split="train"
    )

    # Save the dataset
    dataset.to_csv(os.path.join(language_dir, f"{language_code}_data.csv"))

    # Save the textual dataset description. The encoding is set explicitly so
    # non-ASCII characters in it are written the same way under any locale.
    metadata = dataset.info
    metadata_file = os.path.join(language_dir, f"{language_code}_metadata.txt")
    with open(metadata_file, "w", encoding="utf-8") as f:
        f.write(str(metadata))


# Download and export every configured language, one after another in list order
for language in languages:
    download_dataset(language)

Creating CSV from Arrow format:   0%|          | 0/60 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Downloading data:   0%|          | 0.00/5.19G [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating other split: 0 examples [00:00, ? examples/s]

Generating invalidated split: 0 examples [00:00, ? examples/s]

Creating CSV from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Downloading data:   0%|          | 0.00/873M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating other split: 0 examples [00:00, ? examples/s]

Generating invalidated split: 0 examples [00:00, ? examples/s]

Creating CSV from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Downloading data:   0%|          | 0.00/9.61G [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating other split: 0 examples [00:00, ? examples/s]

Generating invalidated split: 0 examples [00:00, ? examples/s]

Creating CSV from Arrow format:   0%|          | 0/9 [00:00<?, ?ba/s]

In [2]:
import os
import pandas as pd
import shutil

# Root folder of the downloaded data; process_language reads
# <base_dir>/<language>/<language>_data.csv and <base_dir>/<language>/clips/ from it.
base_dir = os.path.expanduser("~/Dokumente/NLP_daten_p3")

# Organized copies are written below <base_dir>/results/<language>/
result_dir = os.path.join(base_dir, "results")

# Language codes to process; each needs a matching sub-directory in base_dir
languages = ["en", "zh-CN", "fr", "es", "de"]

# Age labels accepted by process_language; rows with any other "age" value are skipped.
# NOTE(review): "fourties" presumably mirrors the dataset's own spelling —
# confirm against the CSV before "correcting" it.
age_categories = [
    "teens",
    "twenties",
    "thirties",
    "fourties",
    "fifties",
    "sixties",
    "seventies",
    "eighties",
    "nineties",
]

# Ensure result directory exists
os.makedirs(result_dir, exist_ok=True)


# Function to process each language
def process_language(language_code):
    """Copy clips of up to 20 male and 20 female speakers into gender/age folders.

    Reads ``<base_dir>/<language_code>/<language_code>_data.csv``, drops rows
    without ``gender`` or ``age``, takes the first 20 distinct ``client_id``
    values per gender and copies their clips from
    ``<base_dir>/<language_code>/clips`` into
    ``<result_dir>/<language_code>/<gender>/<age>/``.

    Args:
        language_code: Language sub-directory / CSV prefix to process.

    Raises:
        FileNotFoundError: If the CSV or a referenced clip does not exist.
    """
    # Create directories for the language
    language_dir = os.path.join(result_dir, language_code)
    os.makedirs(language_dir, exist_ok=True)

    # Load the metadata CSV file
    metadata_path = os.path.join(base_dir, language_code, f"{language_code}_data.csv")
    df = pd.read_csv(metadata_path)

    # Filter out entries missing required information
    df = df.dropna(subset=["gender", "age"])

    # Separate by gender
    males = df[df["gender"] == "male"]
    females = df[df["gender"] == "female"]

    # Get 20 unique speakers for each gender
    male_speakers = males["client_id"].unique()[:20]
    female_speakers = females["client_id"].unique()[:20]

    def save_audio_files(gender, speakers):
        """Copy the clips of ``speakers`` that are labelled with ``gender``."""
        for speaker in speakers:
            # Match on gender as well as speaker: a client_id whose rows carry
            # more than one gender label would otherwise have all of its clips
            # copied into this gender's folder.
            speaker_data = df[(df["client_id"] == speaker) & (df["gender"] == gender)]
            for _, row in speaker_data.iterrows():
                # Ignore age labels outside the configured list
                if row["age"] not in age_categories:
                    continue
                dest_dir = os.path.join(language_dir, gender, row["age"])
                os.makedirs(dest_dir, exist_ok=True)
                src_path = os.path.join(base_dir, language_code, "clips", row["path"])
                dest_path = os.path.join(dest_dir, os.path.basename(row["path"]))
                shutil.copy(src_path, dest_path)

    # Save audio files for males and females
    save_audio_files("male", male_speakers)
    save_audio_files("female", female_speakers)


# Organize the clips of every configured language, one after another in list order
for language in languages:
    process_language(language)

In [1]:
import os
import pandas as pd
import shutil
import json

# Root folder of the downloaded data; process_language reads
# <base_dir>/<language>/<language>_data.csv and <base_dir>/<language>/clips/ from it.
base_dir = os.path.expanduser("~/Dokumente/NLP_daten_p3")

# Organized copies and the per-language metadata JSON are written below
# <base_dir>/results/<language>/
result_dir = os.path.join(base_dir, "results")

# Language codes to process; each needs a matching sub-directory in base_dir
languages = ["en", "zh-CN", "fr", "es", "de"]

# Age labels accepted by process_language; rows with any other "age" value are skipped.
# NOTE(review): "fourties" presumably mirrors the dataset's own spelling —
# confirm against the CSV before "correcting" it.
age_categories = [
    "teens",
    "twenties",
    "thirties",
    "fourties",
    "fifties",
    "sixties",
    "seventies",
    "eighties",
    "nineties",
]

# Ensure result directory exists
os.makedirs(result_dir, exist_ok=True)


# Function to process each language
def process_language(language_code):
    """Copy clips of up to 20 speakers per gender and write a metadata JSON.

    Reads ``<base_dir>/<language_code>/<language_code>_data.csv``, drops rows
    without ``gender`` or ``age``, takes the first 20 distinct ``client_id``
    values per gender and copies their clips from
    ``<base_dir>/<language_code>/clips`` into
    ``<result_dir>/<language_code>/<gender>/<age>/``. One record per copied
    clip is written to ``<result_dir>/<language_code>/<language_code>_metadata.json``.

    Args:
        language_code: Language sub-directory / CSV prefix to process.

    Raises:
        FileNotFoundError: If the CSV or a referenced clip does not exist.
    """
    # Create directories for the language
    language_dir = os.path.join(result_dir, language_code)
    os.makedirs(language_dir, exist_ok=True)

    # Load the metadata CSV file
    metadata_path = os.path.join(base_dir, language_code, f"{language_code}_data.csv")
    df = pd.read_csv(metadata_path)

    # Filter out entries missing required information
    df = df.dropna(subset=["gender", "age"])

    # Separate by gender
    males = df[df["gender"] == "male"]
    females = df[df["gender"] == "female"]

    # Get 20 unique speakers for each gender
    male_speakers = males["client_id"].unique()[:20]
    female_speakers = females["client_id"].unique()[:20]

    metadata_list = []

    def save_audio_files(gender, speakers):
        """Copy the clips of ``speakers`` labelled with ``gender`` and record them."""
        for speaker in speakers:
            # Match on gender as well as speaker: a client_id whose rows carry
            # more than one gender label would otherwise be copied into this
            # gender's folder (and recorded) with a different gender.
            speaker_data = df[(df["client_id"] == speaker) & (df["gender"] == gender)]
            for _, row in speaker_data.iterrows():
                # Ignore age labels outside the configured list
                if row["age"] not in age_categories:
                    continue
                dest_dir = os.path.join(language_dir, gender, row["age"])
                os.makedirs(dest_dir, exist_ok=True)
                src_path = os.path.join(base_dir, language_code, "clips", row["path"])
                dest_path = os.path.join(dest_dir, os.path.basename(row["path"]))
                shutil.copy(src_path, dest_path)

                # Add to metadata list
                metadata_list.append(
                    {
                        "file_path": dest_path,
                        "age": row["age"],
                        "gender": row["gender"],
                        "language": language_code,
                        "unique_speaker_id": speaker,
                    }
                )

    # Save audio files for males and females
    save_audio_files("male", male_speakers)
    save_audio_files("female", female_speakers)

    # Save metadata to JSON
    with open(os.path.join(language_dir, f"{language_code}_metadata.json"), "w") as f:
        json.dump(metadata_list, f, indent=4)


# Organize the clips of every configured language, one after another in list order
for language in languages:
    process_language(language)

In [3]:
import os
import pandas as pd
import shutil
import json

# Root folder of the downloaded data; process_language reads
# <base_dir>/<language>/<language>_data.csv and <base_dir>/<language>/clips/ from it.
base_dir = os.path.expanduser("~/Dokumente/NLP_daten_p3")

# Organized copies and the per-language metadata JSON are written below
# <base_dir>/results/<language>/
result_dir = os.path.join(base_dir, "results")

# Language codes to process; each needs a matching sub-directory in base_dir
languages = ["en", "zh-CN", "fr", "es", "de"]

# Coarse age buckets (used as output folder names) mapped to the "age" labels
# each one absorbs. Labels not listed here make a row be skipped.
# NOTE(review): "fourties" presumably mirrors the dataset's own spelling — confirm.
combined_age_categories = {
    "youth": ["teens", "twenties"],
    "adult": ["thirties", "fourties", "fifties"],
    "senior": ["sixties", "seventies", "eighties", "nineties"],
}

# Ensure result directory exists
os.makedirs(result_dir, exist_ok=True)


# Function to get combined age category
def get_combined_age_category(age):
    """Return the combined bucket whose label list contains ``age``, else ``None``."""
    return next(
        (
            bucket
            for bucket, labels in combined_age_categories.items()
            if age in labels
        ),
        None,
    )


# Function to process each language
def process_language(language_code):
    """Copy clips of up to 20 speakers per gender into gender/age-bucket folders.

    Reads ``<base_dir>/<language_code>/<language_code>_data.csv``, drops rows
    without ``gender`` or ``age``, takes the first 20 distinct ``client_id``
    values per gender and copies their clips from
    ``<base_dir>/<language_code>/clips`` into
    ``<result_dir>/<language_code>/<gender>/<bucket>/``, where the bucket comes
    from ``get_combined_age_category``. One record per copied clip is written
    to ``<result_dir>/<language_code>/<language_code>_metadata.json``.

    Args:
        language_code: Language sub-directory / CSV prefix to process.

    Raises:
        FileNotFoundError: If the CSV or a referenced clip does not exist.
    """
    # Create directories for the language
    language_dir = os.path.join(result_dir, language_code)
    os.makedirs(language_dir, exist_ok=True)

    # Load the metadata CSV file
    metadata_path = os.path.join(base_dir, language_code, f"{language_code}_data.csv")
    df = pd.read_csv(metadata_path)

    # Filter out entries missing required information
    df = df.dropna(subset=["gender", "age"])

    # Separate by gender
    males = df[df["gender"] == "male"]
    females = df[df["gender"] == "female"]

    # Get 20 unique speakers for each gender
    male_speakers = males["client_id"].unique()[:20]
    female_speakers = females["client_id"].unique()[:20]

    metadata_list = []

    def save_audio_files(gender, speakers):
        """Copy the clips of ``speakers`` labelled with ``gender`` and record them."""
        for speaker in speakers:
            # Match on gender as well as speaker: a client_id whose rows carry
            # more than one gender label would otherwise be copied into this
            # gender's folder (and recorded) with a different gender.
            speaker_data = df[(df["client_id"] == speaker) & (df["gender"] == gender)]
            for _, row in speaker_data.iterrows():
                # Rows whose age label belongs to no bucket are skipped
                age_category = get_combined_age_category(row["age"])
                if not age_category:
                    continue
                dest_dir = os.path.join(language_dir, gender, age_category)
                os.makedirs(dest_dir, exist_ok=True)
                src_path = os.path.join(base_dir, language_code, "clips", row["path"])
                dest_path = os.path.join(dest_dir, os.path.basename(row["path"]))
                shutil.copy(src_path, dest_path)

                # Add to metadata list (keeps the original, fine-grained age label)
                metadata_list.append(
                    {
                        "file_path": dest_path,
                        "age": row["age"],
                        "gender": row["gender"],
                        "language": language_code,
                        "unique_speaker_id": speaker,
                    }
                )

    # Save audio files for males and females
    save_audio_files("male", male_speakers)
    save_audio_files("female", female_speakers)

    # Save metadata to JSON
    with open(os.path.join(language_dir, f"{language_code}_metadata.json"), "w") as f:
        json.dump(metadata_list, f, indent=4)


# Organize the clips of every configured language, one after another in list order
for language in languages:
    process_language(language)

In [4]:
import os
import pandas as pd
import shutil
import json
from collections import defaultdict

# Root folder of the downloaded data; process_language reads
# <base_dir>/<language>/<language>_data.csv and <base_dir>/<language>/clips/ from it.
base_dir = os.path.expanduser("~/Dokumente/NLP_daten_p3")

# Organized copies, per-language metadata and the overall summary JSON are
# written below <base_dir>/results/
result_dir = os.path.join(base_dir, "results")

# Language codes to process; each needs a matching sub-directory in base_dir
languages = ["en", "zh-CN", "fr", "es", "de"]

# Coarse age buckets (used as output folder names and summary keys) mapped to
# the "age" labels each one absorbs.
# NOTE(review): "fourties" presumably mirrors the dataset's own spelling — confirm.
combined_age_categories = {
    "youth": ["teens", "twenties"],
    "adult": ["thirties", "fourties", "fifties"],
    "senior": ["sixties", "seventies", "eighties", "nineties"],
}

# Ensure result directory exists
os.makedirs(result_dir, exist_ok=True)


# Function to get combined age category
def get_combined_age_category(age):
    """Return the combined bucket whose label list contains ``age``, else ``None``."""
    return next(
        (
            bucket
            for bucket, labels in combined_age_categories.items()
            if age in labels
        ),
        None,
    )


# Function to process each language
def process_language(language_code):
    """Copy clips of up to 5 speakers per gender/age bucket and count them.

    Reads ``<base_dir>/<language_code>/<language_code>_data.csv`` and drops rows
    without ``gender`` or ``age``. For each of "male"/"female" and each bucket
    in ``combined_age_categories`` it takes the first 5 distinct ``client_id``
    values and copies their matching clips from
    ``<base_dir>/<language_code>/clips`` into
    ``<result_dir>/<language_code>/<gender>/<bucket>/``. A warning is printed
    when fewer than 5 speakers exist. One record per copied clip is written
    to ``<result_dir>/<language_code>/<language_code>_metadata.json``.

    Args:
        language_code: Language sub-directory / CSV prefix to process.

    Returns:
        Nested mapping ``{language_code: {gender: {bucket: clip_count}}}``.
        It is empty when no clip was copied.

    Raises:
        FileNotFoundError: If the CSV or a referenced clip does not exist.
    """
    # Create directories for the language
    language_dir = os.path.join(result_dir, language_code)
    os.makedirs(language_dir, exist_ok=True)

    # Load the metadata CSV file
    metadata_path = os.path.join(base_dir, language_code, f"{language_code}_data.csv")
    df = pd.read_csv(metadata_path)

    # Filter out entries missing required information
    df = df.dropna(subset=["gender", "age"])

    # Prepare to store metadata
    metadata_list = []
    language_summary = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))

    def save_audio_files(gender, age_category, speakers):
        """Copy the clips of ``speakers`` that match ``gender`` and ``age_category``."""
        dest_dir = os.path.join(language_dir, gender, age_category)
        for speaker in speakers:
            # Match on gender too: a client_id whose rows carry more than one
            # gender label would otherwise be copied into, and counted under,
            # this gender with a different label.
            speaker_data = df[
                (df["client_id"] == speaker)
                & (df["gender"] == gender)
                & (df["age"].isin(combined_age_categories[age_category]))
            ]
            for _, row in speaker_data.iterrows():
                os.makedirs(dest_dir, exist_ok=True)
                src_path = os.path.join(base_dir, language_code, "clips", row["path"])
                dest_path = os.path.join(dest_dir, os.path.basename(row["path"]))
                shutil.copy(src_path, dest_path)

                # Add to metadata list (keeps the original, fine-grained age label)
                metadata_list.append(
                    {
                        "file_path": dest_path,
                        "age": row["age"],
                        "gender": row["gender"],
                        "language": language_code,
                        "unique_speaker_id": speaker,
                    }
                )

                # Update language summary
                language_summary[language_code][gender][age_category] += 1

    # Process each gender and age category
    for gender in ["male", "female"]:
        for age_category in combined_age_categories:
            speakers = df[
                (df["gender"] == gender)
                & (df["age"].isin(combined_age_categories[age_category]))
            ]["client_id"].unique()[:5]
            if len(speakers) < 5:
                print(
                    f"Warning: Not enough {gender} speakers in {age_category} category for language {language_code}"
                )
            save_audio_files(gender, age_category, speakers)

    # Save metadata to JSON for each language
    with open(os.path.join(language_dir, f"{language_code}_metadata.json"), "w") as f:
        json.dump(metadata_list, f, indent=4)

    return language_summary


# Overall summary: language code -> gender -> age bucket -> number of copied clips
all_languages_summary = {}

# Each call returns a mapping keyed only by its own language code, so update()
# merges the results without overwriting other languages. A language for which
# no clip was copied contributes no key.
for language in languages:
    language_summary = process_language(language)
    all_languages_summary.update(language_summary)

# Save the overall summary to JSON
with open(os.path.join(result_dir, "summary_metadata.json"), "w") as f:
    json.dump(all_languages_summary, f, indent=4)

print("Processing complete.")

Processing complete.


In [6]:
import os
import pandas as pd
import shutil
import json
from collections import defaultdict

# Base directory where the datasets are saved ("~" is expanded to the current
# user's home). Each language is read from a sub-directory named after its code.
base_dir = os.path.expanduser("~/Dokumente/NLP_daten_p3")

# Result directory where organized files will be saved (lives inside base_dir)
result_dir = os.path.join(base_dir, "results")

# List of languages to process; each code is used as a directory name under
# both base_dir (input) and result_dir (output)
languages = ["en", "zh-CN", "fr", "es", "de"]

# Combined age categories: coarse group name -> the raw "age" labels from the
# metadata CSV that belong to that group.
# NOTE(review): "fourties" is presumably the dataset's own spelling of the
# label -- confirm against the CSV before "correcting" it.
combined_age_categories = {
    "youth": ["teens", "twenties"],
    "adult": ["thirties", "fourties", "fifties"],
    "senior": ["sixties", "seventies", "eighties", "nineties"],
}

# Ensure result directory exists (no error if it is already there)
os.makedirs(result_dir, exist_ok=True)


# Map a raw age label onto its combined age category
def get_combined_age_category(age):
    """Return the combined category whose label list contains ``age``.

    Categories are searched in the order they appear in
    ``combined_age_categories``; ``None`` is returned when no category
    lists the given label.
    """
    matches = (
        name for name, labels in combined_age_categories.items() if age in labels
    )
    return next(matches, None)


# Function to process each language
def process_language(language_code, speakers_per_group=5):
    """Copy a per-gender, per-age-group sample of clips for one language.

    Reads ``<base_dir>/<language_code>/<language_code>_data.csv``, drops rows
    with a missing ``gender`` or ``age``, and for every (gender, combined age
    category) pair takes the first ``speakers_per_group`` distinct
    ``client_id`` values. Each selected speaker's clips that belong to that
    pair are copied from ``<base_dir>/<language_code>/clips`` into
    ``<result_dir>/<language_code>/<gender>/<age_category>``. A
    ``<language_code>_metadata.json`` file listing every copied clip is
    written into ``<result_dir>/<language_code>``.

    Args:
        language_code: Name of the language sub-directory under ``base_dir``.
        speakers_per_group: Maximum number of speakers taken per
            (gender, age category) pair. A warning is printed when fewer
            speakers are available; processing continues with those found.

    Returns:
        Nested ``defaultdict`` of the form
        ``{language_code: {gender: {age_category: {"unique_speakers": int,
        "clips": int}}}}``. Pairs without any speaker are not inserted.

    Raises:
        Errors from ``pd.read_csv`` and ``shutil.copy`` (for example a
        missing CSV or clip file) are not caught and propagate to the caller.
    """
    # Create directories for the language
    language_dir = os.path.join(result_dir, language_code)
    os.makedirs(language_dir, exist_ok=True)

    # Load the metadata CSV file
    metadata_path = os.path.join(base_dir, language_code, f"{language_code}_data.csv")
    df = pd.read_csv(metadata_path)

    # Filter out entries missing required information
    df = df.dropna(subset=["gender", "age"])

    # Source directory of the audio clips; the same for every row
    clips_dir = os.path.join(base_dir, language_code, "clips")

    # Prepare to store metadata
    metadata_list = []
    language_summary = defaultdict(
        lambda: defaultdict(
            lambda: defaultdict(lambda: {"unique_speakers": 0, "clips": 0})
        )
    )

    def group_mask(gender, age_category):
        """Boolean row mask for one (gender, combined age category) pair."""
        return (df["gender"] == gender) & (
            df["age"].isin(combined_age_categories[age_category])
        )

    # Function to save audio files for a given gender and speaker list
    def save_audio_files(gender, age_category, speakers):
        # Nothing to copy: do not create an empty folder or summary entry
        if len(speakers) == 0:
            return

        # The destination only depends on the group, so create it once
        dest_dir = os.path.join(language_dir, gender, age_category)
        os.makedirs(dest_dir, exist_ok=True)

        # Match on gender as well as age group, so a clip is only filed (and
        # counted) under the gender directory it is actually labelled with
        in_group = group_mask(gender, age_category)
        counts = language_summary[language_code][gender][age_category]

        for speaker in speakers:
            speaker_data = df[in_group & (df["client_id"] == speaker)]
            for _, row in speaker_data.iterrows():
                src_path = os.path.join(clips_dir, row["path"])
                dest_path = os.path.join(dest_dir, os.path.basename(row["path"]))
                shutil.copy(src_path, dest_path)

                # Add to metadata list
                metadata_list.append(
                    {
                        "file_path": dest_path,
                        "age": row["age"],
                        "gender": row["gender"],
                        "language": language_code,
                        "unique_speaker_id": speaker,
                    }
                )

                # Update language summary
                counts["clips"] += 1

            counts["unique_speakers"] += 1

    # Process each gender and age category
    for gender in ["male", "female"]:
        for age_category in combined_age_categories:
            # First N distinct speakers in order of appearance in the CSV
            speakers = df[group_mask(gender, age_category)]["client_id"].unique()[
                :speakers_per_group
            ]
            if len(speakers) < speakers_per_group:
                print(
                    f"Warning: Not enough {gender} speakers in {age_category} category for language {language_code}"
                )
            save_audio_files(gender, age_category, speakers)

    # Save metadata to JSON for each language
    with open(os.path.join(language_dir, f"{language_code}_metadata.json"), "w") as f:
        json.dump(metadata_list, f, indent=4)

    return language_summary


# Accumulator for the whole run: language -> gender -> age category -> counters
all_languages_summary = defaultdict(
    lambda: defaultdict(lambda: defaultdict(lambda: {"unique_speakers": 0, "clips": 0}))
)

# Run every language and fold its counters into the accumulator
for language in languages:
    language_summary = process_language(language)
    for lang, gender_data in language_summary.items():
        for gender, age_data in gender_data.items():
            for age_category, counts in age_data.items():
                totals = all_languages_summary[lang][gender][age_category]
                totals["unique_speakers"] += counts["unique_speakers"]
                totals["clips"] += counts["clips"]

# Write the combined summary into the result directory
summary_path = os.path.join(result_dir, "summary_metadata.json")
with open(summary_path, "w") as f:
    json.dump(all_languages_summary, f, indent=4)

print("Processing complete.")

Processing complete.
