In [2]:
import pandas as pd
from pathlib import Path
from tqdm import tqdm

In [95]:
liepa_path = Path("../data/raw/liepa2/")

# Get all train parquet files
train_files = sorted(liepa_path.glob("train-*.parquet"))
print(f"Found {len(train_files)} training parquet files")
if not train_files:
    # Fail here with an actionable message instead of letting pd.concat
    # raise "No objects to concatenate" further down.
    raise FileNotFoundError(f"No train-*.parquet files found under {liepa_path}")

# Load every shard, keeping only the audio file path from the "audio" column.
# Dropping the column per shard (rather than after the concat) means the
# rest of each "audio" record is released as soon as its shard is processed
# instead of being held in memory for all shards at once.
dfs = []
for file_path in tqdm(train_files, desc="Loading parquet files"):
    df = pd.read_parquet(file_path)
    df["path"] = df["audio"].apply(lambda x: x["path"])
    dfs.append(df.drop(columns=["audio"]))

# Combine all shards into one frame ordered by file path. The index is
# renumbered by the concat and then left as-is by the sort, so index labels
# reflect load order, not path order.
full_df = pd.concat(dfs, ignore_index=True).sort_values("path")

Found 130 training parquet files


Loading parquet files: 100%|██████████| 130/130 [00:41<00:00,  3.14it/s]


In [96]:
# Positional decoding tables for the fields of a recording file name, in the
# order parse_filename produces them. Each table maps a field's code to a
# readable label; codes missing from a table (and every value of the three
# trailing id fields, whose tables are empty) are passed through unchanged.
parsing_rules = [
    dict(L="lossy", R="raw"),  # 0: lossiness
    dict(R="read", S="spontaneous"),  # 1: speech type
    dict(  # 2: source type
        A="audiobook",
        D="dictaphone",
        P="phone",
        R="radio",
        S="studio",
        T="TV",
    ),
    dict(F="female", M="male"),  # 3: speaker gender
    {"1": "0-12", "2": "13-17", "3": "18-25", "4": "26-60", "5": "60+"},  # 4: age
    *({} for _ in range(3)),  # 5-7: speaker id, recording id, sentence id
]


def parse_filename(filename, rules=None):
    """Decode a recording file name into its list of metadata fields.

    The name (without directory and extension) is split on "_". The second
    and third chunks each carry two one-character codes and are split into
    separate fields; all other chunks are one field each. Every field is
    then translated through the lookup table at the same position in
    ``rules``; values with no entry in their table are returned unchanged.

    Args:
        filename: File name or path, e.g. "L_RA_F4_IS031_02_000001.wav".
        rules: Sequence of dicts, one per expected field. Defaults to the
            module-level ``parsing_rules``.

    Returns:
        A list with one decoded value per entry of ``rules``.

    Raises:
        ValueError: If the name does not have the expected structure or
            does not yield exactly ``len(rules)`` fields.
    """
    if rules is None:
        rules = parsing_rules

    # Path.stem drops any directory prefix and the extension regardless of
    # its length (a fixed [:-4] slice is only right for 3-letter extensions).
    stem = Path(filename).stem
    parts = stem.split("_")
    if len(parts) < 3 or len(parts[1]) < 2 or len(parts[2]) < 2:
        raise ValueError(f"Unexpected file name structure: {filename!r}")

    fields = [parts[0], parts[1][0], parts[1][1], parts[2][0], parts[2][1], *parts[3:]]
    if len(fields) != len(rules):
        # A wrong field count would otherwise surface later as an IndexError
        # here or as a shape mismatch when the result is assigned to columns.
        raise ValueError(
            f"Expected {len(rules)} fields but got {len(fields)} in {filename!r}"
        )

    return [rule.get(field, field) for rule, field in zip(rules, fields)]

In [97]:
# One column per field returned by parse_filename, in the same order.
metadata_columns = [
    "lossiness",
    "speech_type",
    "source_type",
    "speaker_gender",
    "speaker_age",
    "speaker_id",
    "recording_id",
    "sentence_id",
]

# Decode every path into a row of metadata values and attach the rows to
# the frame positionally.
parsed_rows = [parse_filename(path) for path in full_df["path"]]
full_df[metadata_columns] = parsed_rows

In [98]:
# Keep only read speech from speakers in the 18-25, 26-60 and 60+ age groups.
is_read_speech = full_df["speech_type"].eq("read")
is_selected_age = full_df["speaker_age"].isin(["18-25", "26-60", "60+"])
filtered_df = full_df.loc[is_read_speech & is_selected_age]

In [99]:
# Count rows per (gender, speaker) pair; value_counts returns the pairs
# ordered from most to fewest rows.
rows_per_speaker = filtered_df[["speaker_gender", "speaker_id"]].value_counts()

# The first 10 entries of each gender group are therefore the 10 speakers
# of that gender with the most rows.
top_speakers = rows_per_speaker.groupby("speaker_gender").head(10)
speaker_ids = top_speakers.reset_index()["speaker_id"]

# Restrict the data to rows spoken by the selected speakers.
is_selected_speaker = filtered_df["speaker_id"].isin(speaker_ids)
filtered_df = filtered_df[is_selected_speaker]

In [113]:
# filtered_df