In [1]:
!pip install datasets

Collecting fsspec<=2023.10.0,>=2023.1.0 (from fsspec[http]<=2023.10.0,>=2023.1.0->datasets)
  Downloading fsspec-2023.10.0-py3-none-any.whl.metadata (6.8 kB)
Downloading fsspec-2023.10.0-py3-none-any.whl (166 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m166.4/166.4 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: fsspec
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2023.12.2
    Uninstalling fsspec-2023.12.2:
      Successfully uninstalled fsspec-2023.12.2
Successfully installed fsspec-2023.10.0


In [2]:
# Authenticate this notebook session against the Hugging Face Hub.
# notebook_login() renders a login widget (see the VBox output below).
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Download the dataset using this script

In [14]:
import os
from datasets import load_dataset

# Local root folder; every language gets its own sub-folder underneath it.
base_dir = os.path.expanduser("~/Dokumente/NLP_daten_p3")

# Common Voice language configurations to fetch (one download per entry).
languages = [
    "en",
    "zh-CN",
    "fr",
    "es",
    "de",
    "zh-TW",
    "it",
    "ca",
]


# Function to download dataset and metadata for a given language
def download_dataset(language_code):
    """Download the Common Voice 2.0 train split for one language and store it locally.

    Creates ``<base_dir>/<language_code>/`` and writes two files into it:

    * ``<language_code>_data.csv`` - the split exported via ``dataset.to_csv``.
    * ``<language_code>_metadata.txt`` - the string form of ``dataset.info``.

    Args:
        language_code: Dataset configuration name passed to ``load_dataset``,
            e.g. ``"en"`` or ``"zh-TW"``.
    """
    # Create a directory for the language
    language_dir = os.path.join(base_dir, language_code)
    os.makedirs(language_dir, exist_ok=True)

    # Load the dataset
    dataset = load_dataset(
        "mozilla-foundation/common_voice_2_0", language_code, split="train"
    )

    # Save the dataset
    dataset.to_csv(os.path.join(language_dir, f"{language_code}_data.csv"))

    # Save the metadata. The encoding is pinned because the info text is not
    # guaranteed to be ASCII and the default text encoding is platform dependent.
    metadata_path = os.path.join(language_dir, f"{language_code}_metadata.txt")
    with open(metadata_path, "w", encoding="utf-8") as f:
        f.write(str(dataset.info))


# Download datasets and metadata for each language.
# Languages are processed sequentially in the order given by `languages`;
# each call writes its CSV and metadata file below `base_dir`.
for language in languages:
    download_dataset(language)

Creating CSV from Arrow format:   0%|          | 0/60 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/9 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/7 [00:00<?, ?ba/s]

Downloading data:   0%|          | 0.00/3.41G [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating other split: 0 examples [00:00, ? examples/s]

Generating invalidated split: 0 examples [00:00, ? examples/s]

Creating CSV from Arrow format:   0%|          | 0/17 [00:00<?, ?ba/s]

# Use this cell to create a balanced dataset

In [15]:
import os
import pandas as pd
import shutil
import json
from collections import defaultdict

# Base directory where the datasets are saved
# (one sub-folder per language, each holding `<lang>_data.csv`).
base_dir = os.path.expanduser("~/Dokumente/NLP_daten_p3")

# Result directory where organized files will be saved
result_dir = os.path.join(base_dir, "results")

# List of languages to process
languages = ["en", "zh-TW", "fr", "ca", "de"]

# Combined age categories: maps a coarse group name to the raw values of the
# `age` column that belong to it.
# NOTE(review): the spelling "fourties" presumably mirrors the dataset's own
# age labels - confirm before "correcting" it.
combined_age_categories = {
    "youth": ["teens", "twenties"],
    "adult": ["thirties", "fourties", "fifties"],
    "senior": ["sixties", "seventies", "eighties", "nineties"],
}

# Ensure result directory exists
os.makedirs(result_dir, exist_ok=True)


# Function to get combined age category
def get_combined_age_category(age):
    """Map a raw age label to its combined category name.

    Looks ``age`` up in the module-level ``combined_age_categories`` mapping
    and returns the first category whose label list contains it, or ``None``
    when no category lists that label.
    """
    return next(
        (
            group_name
            for group_name, labels in combined_age_categories.items()
            if age in labels
        ),
        None,
    )


# Function to process each language
def process_language(language_code):
    """Build the balanced subset for one language and return its summary.

    Reads ``<base_dir>/<language_code>/<language_code>_data.csv``, drops rows
    without ``gender`` or ``age``, and for every combination of gender
    (``"male"``, ``"female"``) and combined age category takes the first five
    distinct ``client_id`` values. All clips of those speakers that belong to
    that gender AND age category are copied to
    ``<result_dir>/<language_code>/<gender>/<age_category>/speaker_<client_id>/``.
    A warning is printed when fewer than five speakers are available.

    A per-clip metadata list is written to
    ``<result_dir>/<language_code>/<language_code>_metadata.json``.

    Args:
        language_code: Language folder / file prefix, e.g. ``"en"``.

    Returns:
        Nested ``defaultdict`` of the form
        ``{language_code: {gender: {age_category: {"unique_speakers": int, "clips": int}}}}``.
        Combinations without any speaker get no entry.
    """
    # Create directories for the language
    language_dir = os.path.join(result_dir, language_code)
    os.makedirs(language_dir, exist_ok=True)

    # Load the metadata CSV file
    metadata_path = os.path.join(base_dir, language_code, f"{language_code}_data.csv")
    df = pd.read_csv(metadata_path)

    # Filter out entries missing required information
    df = df.dropna(subset=["gender", "age"])

    # Prepare to store metadata
    metadata_list = []
    language_summary = defaultdict(
        lambda: defaultdict(
            lambda: defaultdict(lambda: {"unique_speakers": 0, "clips": 0})
        )
    )

    def bucket_mask(gender, age_category):
        """Boolean row mask selecting one gender / combined-age bucket."""
        return (df["gender"] == gender) & (
            df["age"].isin(combined_age_categories[age_category])
        )

    # Function to save audio files for a given gender and speaker list
    def save_audio_files(gender, age_category, speakers):
        in_bucket = bucket_mask(gender, age_category)
        for speaker in speakers:
            # Restrict to rows of this bucket: the same client_id may carry
            # different gender/age labels on other rows, and those clips must
            # not end up in (or be counted for) this gender/age directory.
            speaker_data = df[(df["client_id"] == speaker) & in_bucket]

            # One directory per speaker; created once rather than per clip.
            speaker_dir = os.path.join(
                language_dir, gender, age_category, f"speaker_{speaker}"
            )
            os.makedirs(speaker_dir, exist_ok=True)

            for _, row in speaker_data.iterrows():
                src_path = os.path.join(base_dir, language_code, "clips", row["path"])
                dest_path = os.path.join(speaker_dir, os.path.basename(row["path"]))
                shutil.copy(src_path, dest_path)

                # Add to metadata list
                metadata_list.append(
                    {
                        "file_path": dest_path,
                        "age": row["age"],
                        "gender": row["gender"],
                        "language": language_code,
                        "unique_speaker_id": speaker,
                    }
                )

                # Update language summary
                language_summary[language_code][gender][age_category]["clips"] += 1

            language_summary[language_code][gender][age_category][
                "unique_speakers"
            ] += 1

    # Process each gender and age category
    for gender in ["male", "female"]:
        for age_category in combined_age_categories:
            speakers = df[bucket_mask(gender, age_category)]["client_id"].unique()[:5]
            if len(speakers) < 5:
                print(
                    f"Warning: Not enough {gender} speakers in {age_category} category for language {language_code}"
                )
            save_audio_files(gender, age_category, speakers)

    # Save metadata to JSON for each language
    with open(os.path.join(language_dir, f"{language_code}_metadata.json"), "w") as f:
        json.dump(metadata_list, f, indent=4)

    return language_summary


# Summary for all languages:
# {language: {gender: {age_category: {"unique_speakers": int, "clips": int}}}}
all_languages_summary = defaultdict(
    lambda: defaultdict(lambda: defaultdict(lambda: {"unique_speakers": 0, "clips": 0}))
)

# Process each language and fold its per-language summary into the overall one.
# process_language() also copies the clips and writes the per-language JSON.
for language in languages:
    language_summary = process_language(language)
    for lang, gender_data in language_summary.items():
        for gender, age_data in gender_data.items():
            for age_category, counts in age_data.items():
                all_languages_summary[lang][gender][age_category][
                    "unique_speakers"
                ] += counts["unique_speakers"]
                all_languages_summary[lang][gender][age_category]["clips"] += counts[
                    "clips"
                ]

# Save the overall summary to JSON (written next to the language folders)
with open(os.path.join(result_dir, "summary_metadata.json"), "w") as f:
    json.dump(all_languages_summary, f, indent=4)

print("Processing complete.")

Processing complete.


# Now let's push the balanced dataset back up

In [16]:
# Authenticate against the Hugging Face Hub before pushing anything.
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [24]:
from huggingface_hub import notebook_login, HfApi, Repository
import subprocess
import os


# Function to check the status of the repository
def check_git_status(repo_path):
    """Run ``git status`` inside ``repo_path`` and echo what it printed.

    Args:
        repo_path: Directory in which the git command is executed.

    Returns:
        ``True`` when git wrote any non-whitespace text to stdout, ``False``
        otherwise. stderr is captured but not inspected.
    """
    completed = subprocess.run(
        ["git", "status"],
        cwd=repo_path,
        capture_output=True,
        encoding="utf-8",
    )
    status_text = completed.stdout
    print(status_text)
    return len(status_text.strip()) > 0


# Function to check if there are changes to commit
def has_changes(repo_path):
    """Report whether the git working tree at ``repo_path`` has pending changes.

    Runs ``git status --porcelain``, which prints one line per changed or
    untracked path and nothing for a clean tree.

    Args:
        repo_path: Directory in which the git command is executed.

    Returns:
        ``True`` if git listed at least one path, ``False`` for a clean tree.

    Raises:
        subprocess.CalledProcessError: If git exits with a non-zero status
            (for example when ``repo_path`` is not a git repository). Without
            this check a failed command would produce empty stdout and be
            misreported as "no changes".
    """
    cmd = ["git", "status", "--porcelain"]
    result = subprocess.run(
        cmd,
        cwd=repo_path,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        encoding="utf-8",
    )
    if result.returncode != 0:
        raise subprocess.CalledProcessError(
            result.returncode, cmd, output=result.stdout, stderr=result.stderr
        )
    return bool(result.stdout.strip())


# Authenticate with Hugging Face
notebook_login()

# Repository name and path
repo_name = "slinusc/CommonVoiceSubset"
repo_path = os.path.expanduser(
    "~/Dokumente/NLP_daten_p3/git_repo"
)  # Local path to the data that should be uploaded

# Hugging Face API client
api = HfApi()

# Create the repository (if it does not exist yet)
# NOTE(review): no repo_type is passed here, so the library default applies,
# whereas the later upload cell targets repo_type="dataset" - confirm which
# kind of repository is intended.
api.create_repo(repo_name, exist_ok=True)

# Initialize the repository (clones `repo_name` into `repo_path`)
repo = Repository(local_dir=repo_path, clone_from=repo_name)

# Stage files in the repository so they can be committed and pushed
repo.git_add(pattern="*")

# Check the status of the repository
print("Git status before committing:")
check_git_status(repo_path)

# Check if there are changes before committing
# (committing with nothing staged would otherwise be attempted needlessly)
if has_changes(repo_path):
    repo.git_commit("Add data files")
    repo.git_push()
    print("Data files committed and pushed successfully.")
else:
    print("No changes to commit")

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
/home/sam/Dokumente/NLP_daten_p3/git_repo is already a clone of https://huggingface.co/slinusc/CommonVoiceSubset. Make sure you pull the latest changes with `repo.git_pull()`.


Git status before committing:
Auf Branch main
nichts zu committen, Arbeitsverzeichnis unverändert

No changes to commit


In [2]:
import os

from huggingface_hub import HfApi, notebook_login

# Authenticate to Hugging Face
notebook_login()

# Initialize the Hugging Face API client
api = HfApi()

# Define the local folder path and repository details.
# The path is derived from the user's home directory instead of being
# hard-coded, matching how the other cells locate the data.
folder_path = os.path.expanduser("~/Dokumente/NLP_daten_p3/results")
repo_id = "slinusc/CommonVoiceSubset"
repo_type = "dataset"  # Assuming you are uploading to a dataset repository

# Upload the folder to Hugging Face.
# Uploading the whole folder as one commit failed with
# "504 Server Error: Gateway Time-out", so every top-level entry (one folder
# per language plus the summary file) is sent as its own, smaller commit.
# The resulting repository layout is the same as for a single upload.
with os.scandir(folder_path) as scanned:
    entries = sorted(scanned, key=lambda entry: entry.name)

for entry in entries:
    if entry.is_dir():
        api.upload_folder(
            folder_path=entry.path,
            path_in_repo=entry.name,
            repo_id=repo_id,
            repo_type=repo_type,
            commit_message=f"Upload {entry.name}",
        )
    else:
        api.upload_file(
            path_or_fileobj=entry.path,
            path_in_repo=entry.name,
            repo_id=repo_id,
            repo_type=repo_type,
            commit_message=f"Upload {entry.name}",
        )

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

HfHubHTTPError: 504 Server Error: Gateway Time-out for url: https://huggingface.co/api/datasets/slinusc/CommonVoiceSubset/commit/main

# Now generate a dataset to use for fine-tuning our model

In [5]:
import os
import pandas as pd
import shutil
import json
from collections import defaultdict

# Base directory where the datasets are saved
base_dir = os.path.expanduser("~/Dokumente/NLP_daten_p3")

# Result directory where organized files will be saved
# (all speaker folders and JSON files of this cell go directly in here)
result_dir = os.path.join(base_dir, "finetuning_dataset_")

# List of languages to process
languages = ["en", "zh-TW", "fr", "ca", "de"]

# Ensure result directory exists
os.makedirs(result_dir, exist_ok=True)

# Load used speakers list: client ids in its `speaker_id` column are excluded
# from the fine-tuning set by process_language().
# NOTE(review): presumably these are the speakers already used for the
# speaker-identification data - confirm against whatever produces this CSV.
used_speakers_path = os.path.join(
    base_dir, "speaker_identification", "used_speakers.csv"
)
used_speakers_df = pd.read_csv(used_speakers_path)
used_speakers = set(used_speakers_df["speaker_id"])


# Function to process each language
def process_language(language_code, clips_per_speaker=30, max_speakers_per_gender=200):
    """Collect a fixed number of clips per speaker for the fine-tuning set.

    Reads ``<base_dir>/<language_code>/<language_code>_data.csv``, drops rows
    without ``gender`` or ``age`` and rows whose ``client_id`` is in
    ``used_speakers``. For each gender (``"male"``, ``"female"``) the first
    ``max_speakers_per_gender`` distinct speakers are considered; a speaker is
    kept only if they have at least ``clips_per_speaker`` clips labelled with
    that gender, and exactly ``clips_per_speaker`` of them are copied to
    ``<result_dir>/speaker_<client_id>/`` (sampled with a fixed seed when the
    speaker has more).

    Per-clip metadata is written to
    ``<result_dir>/<language_code>_metadata.json``.

    Args:
        language_code: Language folder / file prefix, e.g. ``"en"``.
        clips_per_speaker: Minimum number of clips a speaker needs, and the
            number of clips copied per kept speaker.
        max_speakers_per_gender: Upper bound on candidate speakers per gender;
            a warning is printed when fewer candidates exist.

    Returns:
        Tuple ``(language_summary, total_valid_speakers)`` where
        ``language_summary`` maps gender to
        ``{"unique_speakers": int, "clips": int}`` (genders without any kept
        speaker get no entry) and ``total_valid_speakers`` is the number of
        kept speakers summed over both genders.
    """
    # Load the metadata CSV file
    metadata_path = os.path.join(base_dir, language_code, f"{language_code}_data.csv")
    df = pd.read_csv(metadata_path)

    # Filter out entries missing required information and those in used speakers list
    df = df.dropna(subset=["gender", "age"])
    df = df[~df["client_id"].isin(used_speakers)]

    # Prepare to store metadata
    metadata_list = []
    language_summary = defaultdict(lambda: {"unique_speakers": 0, "clips": 0})

    # Function to save audio files for a given gender and speaker list
    def save_audio_files(gender, speakers):
        valid_speakers = 0
        # Only rows labelled with this gender count for the speaker. Without
        # this, a client_id carrying both gender labels would be sampled,
        # copied and counted once per gender, duplicating its metadata.
        gender_rows = df[df["gender"] == gender]
        for speaker in speakers:
            speaker_data = gender_rows[gender_rows["client_id"] == speaker]

            if len(speaker_data) < clips_per_speaker:
                continue  # Skip speakers with too few samples

            # Randomly select a fixed-size subset if the speaker has more;
            # the fixed seed keeps the selection reproducible across runs.
            if len(speaker_data) > clips_per_speaker:
                speaker_data = speaker_data.sample(n=clips_per_speaker, random_state=42)

            valid_speakers += 1

            # One directory per speaker; created once rather than per clip.
            speaker_dir = os.path.join(result_dir, f"speaker_{speaker}")
            os.makedirs(speaker_dir, exist_ok=True)

            for _, row in speaker_data.iterrows():
                src_path = os.path.join(base_dir, language_code, "clips", row["path"])
                dest_path = os.path.join(speaker_dir, os.path.basename(row["path"]))
                shutil.copy(src_path, dest_path)

                # Add to metadata list
                metadata_list.append(
                    {
                        "file_path": dest_path,
                        "age": row["age"],
                        "gender": row["gender"],
                        "language": language_code,
                        "unique_speaker_id": speaker,
                    }
                )

                # Update language summary
                language_summary[gender]["clips"] += 1

            language_summary[gender]["unique_speakers"] += 1

        return valid_speakers

    total_valid_speakers = 0

    # Process each gender
    for gender in ["male", "female"]:
        speakers = df[df["gender"] == gender]["client_id"].unique()[
            :max_speakers_per_gender
        ]
        if len(speakers) < max_speakers_per_gender:
            print(f"Warning: Not enough {gender} speakers for language {language_code}")
        valid_speakers = save_audio_files(gender, speakers)
        total_valid_speakers += valid_speakers
        print(f"Valid {gender} speakers for language {language_code}: {valid_speakers}")

    # Save metadata to JSON for each language
    with open(os.path.join(result_dir, f"{language_code}_metadata.json"), "w") as f:
        json.dump(metadata_list, f, indent=4)

    return language_summary, total_valid_speakers


# Summary for all languages:
# {language: {gender: {"unique_speakers": int, "clips": int}}}
all_languages_summary = defaultdict(
    lambda: defaultdict(lambda: {"unique_speakers": 0, "clips": 0})
)
total_speakers_count = 0

# Process each language; process_language() copies the clips, writes the
# per-language metadata JSON and returns the counts folded in below.
for language in languages:
    language_summary, valid_speakers = process_language(language)
    total_speakers_count += valid_speakers
    for gender, counts in language_summary.items():
        all_languages_summary[language][gender]["unique_speakers"] += counts[
            "unique_speakers"
        ]
        all_languages_summary[language][gender]["clips"] += counts["clips"]

# Save the overall summary to JSON
with open(os.path.join(result_dir, "summary_metadata.json"), "w") as f:
    json.dump(all_languages_summary, f, indent=4)

print("Processing complete.")
print(f"Total valid speakers: {total_speakers_count}")
# Only language/gender pairs that ended up with an entry in the summary are
# listed, so a gender without any kept speaker is not printed at all.
for language in all_languages_summary:
    for gender in all_languages_summary[language]:
        print(
            f"{language} - {gender}: {all_languages_summary[language][gender]['unique_speakers']} speakers"
        )

Valid male speakers for language en: 146
Valid female speakers for language en: 54
Valid male speakers for language zh-TW: 6
Valid female speakers for language zh-TW: 0
Valid male speakers for language fr: 186
Valid female speakers for language fr: 27
Valid male speakers for language ca: 81
Valid female speakers for language ca: 68
Valid male speakers for language de: 72
Valid female speakers for language de: 5
Processing complete.
Total valid speakers: 645
en - male: 146 speakers
en - female: 54 speakers
zh-TW - male: 6 speakers
fr - male: 186 speakers
fr - female: 27 speakers
ca - male: 81 speakers
ca - female: 68 speakers
de - male: 72 speakers
de - female: 5 speakers
