In [1]:
import pandas as pd
from pathlib import Path
from tqdm import tqdm

In [2]:
liepa_path = Path("../data/raw/liepa2/")

# Discover the training shards in a stable (sorted) order.
train_files = sorted(liepa_path.glob("train-*.parquet"))
print(f"Found {len(train_files)} training parquet files")

# Read the selected shards into memory.
# NOTE(review): the `[:1]` slice means only the first shard is read, even
# though every shard is discovered above -- confirm this limit is intended.
dfs = [
    pd.read_parquet(file_path)
    for file_path in tqdm(train_files[:1], desc="Loading parquet files")
]

# Stack the shards, expose each audio record's "path" entry as its own
# column, drop the audio column itself, and order the rows by path.
full_df = (
    pd.concat(dfs, ignore_index=True)
    .assign(path=lambda frame: frame["audio"].apply(lambda x: x["path"]))
    .drop(columns=["audio"])
    .sort_values("path")
)

Found 130 training parquet files


Loading parquet files: 100%|██████████| 1/1 [00:00<00:00,  2.28it/s]


In [65]:
# Positional lookup tables for the fields encoded in an audio file name.
# Entry i translates the i-th field produced by parse_filename; a code that
# is missing from its table (and every field with an empty table) is passed
# through unchanged.
parsing_rules = [
    {"L": "lossy", "R": "raw"},
    {"R": "read", "S": "spontaneous"},
    {
        "A": "audiobook",
        "D": "dictaphone",
        "P": "phone",
        "R": "radio",
        "S": "studio",
        "T": "TV",
    },
    {"F": "female", "M": "male"},
    {"1": "0-12", "2": "13-17", "3": "18-25", "4": "26-60", "5": "60+"},
    {},
    {},
    {},
]


def parse_filename(filename):
    """Decode the underscore-separated fields of an audio file name.

    Example: ``"L_RA_F4_IS031_01_000001.mp3"`` becomes
    ``["lossy", "read", "audiobook", "female", "26-60", "IS031", "01",
    "000001"]``.

    The second and third underscore-separated fields are two-character
    codes; each is split into its two characters before translation, so a
    six-field name yields eight values.

    Args:
        filename: File name (optionally with leading directories) whose stem
            follows the pattern shown above. Any extension is accepted.

    Returns:
        list[str]: The decoded fields, translated through ``parsing_rules``
        by position.

    Raises:
        IndexError: If the name has fewer than three underscore-separated
            fields, if the second or third field is shorter than two
            characters, or if it expands to more fields than
            ``parsing_rules`` has entries.
    """
    # Path.stem removes the extension whatever its length; slicing off a
    # fixed four characters broke for e.g. ".flac" or extension-less names.
    stem = Path(filename).stem
    parts = stem.split("_")
    # Split the two-character code fields into individual characters.
    parts = [parts[0], parts[1][0], parts[1][1], parts[2][0], parts[2][1], *parts[3:]]
    return [parsing_rules[i].get(part, part) for i, part in enumerate(parts)]

In [66]:
# Column names for the eight values parse_filename returns, in order.
parsed_columns = [
    "lossiness",
    "speech_type",
    "source_type",
    "speaker_gender",
    "speaker_age",
    "speaker_id",
    "recording_id",
    "sentence_id",
]

# Decode every path and store the decoded fields as new columns.
full_df[parsed_columns] = [parse_filename(path) for path in full_df["path"]]

In [67]:
# Keep only read speech from speakers in the 18-25, 26-60 and 60+ age groups.
is_read_speech = full_df["speech_type"].eq("read")
is_selected_age = full_df["speaker_age"].isin(["18-25", "26-60", "60+"])
filtered_df = full_df[is_read_speech & is_selected_age]

In [68]:
# Row counts per (gender, speaker) pair; value_counts orders them from most
# to least frequent, so head(10) per gender keeps the ten largest speakers.
rows_per_speaker = filtered_df[["speaker_gender", "speaker_id"]].value_counts()
top_per_gender = rows_per_speaker.groupby("speaker_gender").head(10)
speaker_ids = top_per_gender.reset_index()["speaker_id"]

# Restrict the data to the selected speakers.
filtered_df = filtered_df.loc[filtered_df["speaker_id"].isin(speaker_ids)]

In [79]:
# Display the filtered frame (read speech, selected age groups, top speakers).
filtered_df

Unnamed: 0,sentence,language,path,lossiness,speech_type,source_type,speaker_gender,speaker_age,speaker_id,recording_id,sentence_id
337401,kad galėtum iš tiesų pradėti gyventi prieš tai...,lt,L_RA_F4_IS031_01_000001.mp3,lossy,read,audiobook,female,26-60,IS031,01,000001
720375,Čarls Bukovski,lt,L_RA_F4_IS031_01_000002.mp3,lossy,read,audiobook,female,26-60,IS031,01,000002
627652,skiriu mamai,lt,L_RA_F4_IS031_01_000003.mp3,lossy,read,audiobook,female,26-60,IS031,01,000003
406033,antroji knygos dalis man pačiai atrodo lyg sap...,lt,L_RA_F4_IS031_01_000005.mp3,lossy,read,audiobook,female,26-60,IS031,01,000005
303663,todėl palieku spręsti jums,lt,L_RA_F4_IS031_01_000006.mp3,lossy,read,audiobook,female,26-60,IS031,01,000006
...,...,...,...,...,...,...,...,...,...,...,...
212643,jūs juokaujate,lt,R_RS_F4_VP038_04_000022.mp3,raw,read,studio,female,26-60,VP038,04,000022
403199,aš galiu tai padaryti pats,lt,R_RS_F4_VP038_04_000024.mp3,raw,read,studio,female,26-60,VP038,04,000024
734733,geros kelionės,lt,R_RS_F4_VP038_04_000026.mp3,raw,read,studio,female,26-60,VP038,04,000026
1034149,gero savaitgalio,lt,R_RS_F4_VP038_04_000027.mp3,raw,read,studio,female,26-60,VP038,04,000027


## Adding stress (accent) marks to words

In [66]:
import json
import re
import requests
import time
import numpy as np
from tqdm import tqdm

In [19]:
# The metadata file is pipe-delimited and has no header row, so the column
# names are supplied explicitly.
metadata_columns = ["path", "text", "text_normalized", "speaker_id"]
df_trainset = pd.read_csv(
    "../data/processed/tts_dataset_liepa2_multispeaker/metadata.csv",
    sep="|",
    header=None,
    names=metadata_columns,
)

In [None]:
# Quick sanity check of the word extractor on a punctuated sample.
# NOTE(review): extract_all_words is defined in the cell that follows this
# one, so on a fresh kernel this cell raises NameError unless that cell is
# run first -- consider moving this check below the definition.
extract_all_words("ąžuolas, ąžuolo!")

['ąžuolas', 'ąžuolo']

In [34]:
def extract_all_words(text):
    """Return the whitespace-separated tokens of ``text`` after punctuation removal.

    Every character that is neither a word character (``\\w``) nor whitespace
    is replaced by a space before splitting, so punctuation both disappears
    and acts as a separator.

    Args:
        text: The string to tokenize.

    Returns:
        list[str]: The tokens in their original order (empty for blank input).
    """
    without_punctuation = re.compile(r"[^\w\s]").sub(" ", text)
    return without_punctuation.split()

In [36]:
# Flatten the per-row word lists into one list, preserving row order.
# A nested comprehension is linear in the total number of words, whereas
# Series.sum() concatenates the lists pairwise (quadratic copying) and does
# not produce a list when the Series is empty.
all_words = [
    word
    for row_words in df_trainset["text"].apply(extract_all_words)
    for word in row_words
]

In [47]:
# Collapse duplicates, then sort so the vocabulary order is deterministic.
unique_words = set(all_words)
all_words = sorted(unique_words)

# Size of the vocabulary.
len(all_words)

34455

In [71]:
# Accentuation endpoint and request tuning knobs.
ACCENT_API_URL = 'https://kalbu.vdu.lt/ajax-call'
WORDS_PER_REQUEST = 200   # approximate number of words sent per request
REQUEST_TIMEOUT_S = 60    # fail instead of hanging forever on a stalled call
REQUEST_PAUSE_S = 1       # pause between requests to go easy on the service

all_contents = []

# np.array_split needs at least one section; without the max() a vocabulary
# of fewer than WORDS_PER_REQUEST words would request zero sections and raise.
n_chunks = max(1, len(all_words) // WORDS_PER_REQUEST)

for chunk in tqdm(np.array_split(all_words, n_chunks), desc="Processing chunks"):
    if len(chunk) == 0:
        # Nothing to send (empty vocabulary).
        continue
    response = requests.post(
        ACCENT_API_URL,
        headers={'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'},
        data={'action': 'text_accents', 'body': ' '.join(chunk)},
        timeout=REQUEST_TIMEOUT_S,
    )
    # Surface HTTP errors here rather than as an opaque JSON/KeyError below.
    response.raise_for_status()
    # The response's "message" field is itself a JSON string; its "textParts"
    # entry is the list of per-token results that gets collected.
    content = json.loads(response.json()["message"])["textParts"]
    all_contents.extend(content)
    time.sleep(REQUEST_PAUSE_S)

Processing chunks: 100%|██████████| 172/172 [05:49<00:00,  2.03s/it]


In [72]:
# Turn the collected API results into a table and persist it next to the
# notebook, without writing the row index.
df = pd.DataFrame(data=all_contents)
df.to_csv(path_or_buf="accented_words.csv", index=False)

In [73]:
# How many results fall into each accent type.
accent_type_counts = df["accentType"].value_counts()
accent_type_counts

accentType
ONE                 27908
MULTIPLE_MEANING     4258
NONE                 1701
MULTIPLE_VARIANT      587
Name: count, dtype: int64