In [205]:
import io
import pandas as pd
from pathlib import Path
from tinytag import TinyTag
from tqdm import tqdm

In [206]:
def get_duration(row):
    """Return the duration TinyTag reports for the audio embedded in ``row``.

    The raw bytes stored under ``row["audio"]["bytes"]`` are wrapped in an
    in-memory buffer and handed to ``TinyTag.get`` as a file object; the
    resulting tag's ``duration`` attribute is returned unchanged
    (presumably seconds -- confirm against the tinytag documentation).
    """
    audio_buffer = io.BytesIO(row["audio"]["bytes"])
    return TinyTag.get(file_obj=audio_buffer).duration

In [215]:
# Directory that holds the parquet shards, relative to the notebook.
liepa_path = Path("../data/raw/liepa2/")

# Collect the training shards in a deterministic (sorted) order.
train_files = sorted(liepa_path.glob("train-*.parquet"))
print(f"Found {len(train_files)} training parquet files")

# Read each shard, derive the per-row metadata we need, and discard the raw
# audio payload so that only lightweight columns are kept.
dfs = []
for file_path in tqdm(train_files, desc="Loading parquet files"):
    shard = pd.read_parquet(file_path)
    df = shard.assign(
        duration=shard.apply(get_duration, axis=1),
        path=shard["audio"].map(lambda audio: audio["path"]),
    ).drop(columns=["audio"])
    dfs.append(df)

# Stack all shards, add the transcript length, and order the rows by path.
full_df = pd.concat(dfs, ignore_index=True)
full_df = full_df.assign(sentence_len=full_df["sentence"].map(len)).sort_values(
    "path"
)

Found 130 training parquet files


Loading parquet files:   0%|          | 0/130 [00:00<?, ?it/s]

Loading parquet files: 100%|██████████| 130/130 [00:59<00:00,  2.19it/s]


In [216]:
# Lookup tables for the fields encoded in a recording's file name, in the
# positional order produced by ``parse_filename``. An empty table means the
# field is passed through verbatim.
parsing_rules = [
    {"L": "lossy", "R": "raw"},  # lossiness
    {"R": "read", "S": "spontaneous"},  # speech type
    {  # source type
        "A": "audiobook",
        "D": "dictaphone",
        "P": "phone",
        "R": "radio",
        "S": "studio",
        "T": "TV",
    },
    {"F": "female", "M": "male"},  # speaker gender
    {"1": "0-12", "2": "13-17", "3": "18-25", "4": "26-60", "5": "60+"},  # age band
    {},  # speaker id (kept as-is)
    {},  # recording id (kept as-is)
    {},  # sentence id (kept as-is)
]


def parse_filename(filename):
    """Split a recording file name into its standardized metadata fields.

    The name (without directory and extension) is expected to consist of
    underscore-separated parts: a one-letter lossiness code, a two-letter
    part (speech type + source type), a two-character part (gender + age
    code), followed by the remaining identifier parts. Each resulting field
    is translated through the table at the same position in
    ``parsing_rules``; codes without an entry are returned unchanged.

    Parameters
    ----------
    filename : str
        File name or path of the recording, e.g.
        ``"L_RA_F4_IS031_01_000234.wav"``.

    Returns
    -------
    list of str
        The standardized fields, one per positional rule.

    Raises
    ------
    IndexError
        If the name has too few parts/characters to be unpacked, or yields
        more fields than there are entries in ``parsing_rules``.
    """
    # Use the path stem instead of chopping a fixed four characters: this
    # strips the extension whatever its length (".wav", ".flac", none at all)
    # and also drops any leading directory components.
    stem = Path(filename).stem
    parts = stem.split("_")
    # The second and third parts each pack two single-character codes.
    fields = [parts[0], parts[1][0], parts[1][1], parts[2][0], parts[2][1], *parts[3:]]
    return [parsing_rules[i].get(field, field) for i, field in enumerate(fields)]

In [217]:
# Target columns, one per element of the list returned by parse_filename,
# in the same positional order.
filename_fields = [
    "lossiness",
    "speech_type",
    "source_type",
    "speaker_gender",
    "speaker_age",
    "speaker_id",
    "recording_id",
    "sentence_id",
]

# Parse every path into a list of fields, then spread the lists over the
# new columns (the assignment is positional, row by row).
parsed_fields = full_df["path"].apply(parse_filename).tolist()
full_df[filename_fields] = parsed_fields

In [218]:
# Keep only read speech from speakers in the three oldest age bands.
is_read_speech = full_df["speech_type"].eq("read")
is_selected_age = full_df["speaker_age"].isin(["18-25", "26-60", "60+"])
filtered_df = full_df.loc[is_read_speech & is_selected_age]

In [254]:
# Utterance count and summed duration for every (gender, speaker) pair.
speaker_totals = filtered_df.groupby(["speaker_gender", "speaker_id"]).agg(
    total_utterances=("sentence", "count"),
    total_duration=("duration", "sum"),
)

# Flatten the group keys back into columns and rank the speakers from the
# longest to the shortest total duration.
speaker_counts_df = speaker_totals.reset_index().sort_values(
    by=["total_duration"], ascending=False
)


def select_speakers(speaker_counts_df, n_per_gender=10):
    """Pick the ``n_per_gender`` speakers with the most audio for each gender.

    Parameters
    ----------
    speaker_counts_df : pandas.DataFrame
        One row per speaker with at least the columns ``speaker_gender``
        and ``total_duration``.
    n_per_gender : int, default 10
        Maximum number of speakers to keep per ``speaker_gender`` value.

    Returns
    -------
    pandas.DataFrame
        The selected rows (original index labels preserved), ordered by
        ``total_duration`` descending.
    """
    # Rank here rather than trusting the caller to pass a pre-sorted frame:
    # ``groupby().head()`` simply takes the first rows of each group.
    # A stable sort leaves an already-sorted frame (and ties) in input order.
    ranked = speaker_counts_df.sort_values(
        by="total_duration", ascending=False, kind="stable"
    )
    return ranked.groupby("speaker_gender").head(n_per_gender)

In [None]:
# Sorted speaker ids chosen for each cohort size, keyed by n_per_gender.
selected_speakers = {}

for n_per_gender in [15, 30, 90]:
    selected_speakers_df = select_speakers(speaker_counts_df, n_per_gender)
    current_ids = sorted(selected_speakers_df["speaker_id"].to_list())
    selected_speakers[n_per_gender] = current_ids

    # Sanity check: every cohort recorded so far (including this one) must be
    # contained in the current cohort, i.e. the selections are nested.
    current_id_set = set(current_ids)
    assert all(
        current_id_set.issuperset(recorded_ids)
        for recorded_ids in selected_speakers.values()
    )

    # Report the smallest per-speaker totals inside this cohort.
    min_utterances = selected_speakers_df["total_utterances"].min()
    min_duration = selected_speakers_df["total_duration"].min()
    print(f"\n=== n_per_gender = {n_per_gender} ===")
    print(
        f"  Minimum count of utterances for selected speakers: "
        f"{min_utterances}"
    )
    print(
        f"  Minimum total duration (s) for selected speakers: "
        f"{min_duration}"
    )


=== n_per_gender = 15 ===
  Minimum count of utterances for selected speakers: 718
  Minimum total duration (s) for selected speakers: 2817.9312

=== n_per_gender = 30 ===
  Minimum count of utterances for selected speakers: 409
  Minimum total duration (s) for selected speakers: 2527.8368

=== n_per_gender = 90 ===
  Minimum count of utterances for selected speakers: 409
  Minimum total duration (s) for selected speakers: 1962.4496


In [None]:
def sample_duration_per_speaker(
    df, selected_speakers, seconds_per_speaker, random_state=42
):
    """Randomly sample utterances up to a duration budget for each speaker.

    For every speaker in ``selected_speakers`` the speaker's rows are
    shuffled and then taken in that order for as long as the duration
    accumulated *before* a row does not exceed ``seconds_per_speaker``.
    The row that crosses the budget is therefore still included, so each
    speaker ends up with at least the budget (if they have enough audio)
    and overshoots it by less than one utterance.

    Parameters
    ----------
    df : pandas.DataFrame
        Utterance table with at least ``speaker_id`` and ``duration`` columns.
    selected_speakers : list
        Speaker ids to sample; output rows follow this order.
    seconds_per_speaker : float
        Duration budget per speaker, in the same unit as ``duration``.
    random_state : int, default 42
        Seed passed to ``DataFrame.sample`` for every speaker's shuffle.

    Returns
    -------
    pandas.DataFrame
        The sampled rows of all speakers with a fresh 0..n-1 index. When
        ``selected_speakers`` is empty, an empty frame with the columns of
        ``df`` is returned.
    """
    selected_df = df[df["speaker_id"].isin(selected_speakers)]
    sampled_dfs = []
    for speaker_id in selected_speakers:
        speaker_df = selected_df[selected_df["speaker_id"] == speaker_id]
        # Shuffle so that the kept utterances are a random subset.
        speaker_df = speaker_df.sample(frac=1, random_state=random_state)
        # Exclusive running total: duration accumulated before each row.
        # Comparing this (not the inclusive total) against the budget keeps
        # the one utterance that crosses it.
        duration_before = speaker_df["duration"].cumsum() - speaker_df["duration"]
        sampled_dfs.append(speaker_df[duration_before <= seconds_per_speaker])
    if not sampled_dfs:
        # pd.concat rejects an empty list; return an empty frame instead.
        return selected_df.iloc[0:0].reset_index(drop=True)
    return pd.concat(sampled_dfs, ignore_index=True)

In [295]:
for n_per_gender in [15, 30, 90]:
    print(f"\n=== n_per_gender = {n_per_gender} ===")

    # Split a fixed budget of 22.5 hours evenly over all selected speakers
    # (n_per_gender speakers for each of the two genders).
    duration_per_speaker = 22.5 * 3600 / (n_per_gender * 2)
    sample_df = sample_duration_per_speaker(
        filtered_df, selected_speakers[n_per_gender], duration_per_speaker
    )

    # Summed duration (seconds) for the whole sample and for each speaker.
    total_seconds = sample_df["duration"].sum()
    per_speaker_seconds = sample_df.groupby("speaker_id")["duration"].sum()

    print(f"  Total sampled duration (minutes): {total_seconds / 60:.3f}")
    print(
        f"  Min/max duration per speaker (minutes): "
        f"{per_speaker_seconds.min() / 60:.2f} / "
        f"{per_speaker_seconds.max() / 60:.2f}"
    )

    # One CSV per cohort, named after the total number of speakers.
    sample_df.to_csv(f"liepa_selected_{n_per_gender * 2}_speakers.csv", index=False)


=== n_per_gender = 15 ===
  Total sampled duration (minutes): 1350.790
  Min/max duration per speaker (minutes): 45.00 / 45.10

=== n_per_gender = 30 ===
  Total sampled duration (minutes): 1351.553
  Min/max duration per speaker (minutes): 22.50 / 22.63

=== n_per_gender = 90 ===
  Total sampled duration (minutes): 1354.763
  Min/max duration per speaker (minutes): 7.50 / 7.61
