In [91]:
import pandas as pd
from pathlib import Path
from tqdm import tqdm

In [95]:
# Directory holding the raw Liepa-2 parquet shards (relative to the notebook).
liepa_path = Path("../data/raw/liepa2/")

# Collect the training shards in a stable (sorted) order.
train_files = sorted(liepa_path.glob("train-*.parquet"))
print(f"Found {len(train_files)} training parquet files")

# Read every shard into memory; the progress bar tracks shard-level progress.
dfs = [
    pd.read_parquet(file_path)
    for file_path in tqdm(train_files, desc="Loading parquet files")
]

# Stack the shards, keep only the file path from each "audio" record,
# drop the "audio" column itself and order the rows by path.
full_df = (
    pd.concat(dfs, ignore_index=True)
    .assign(path=lambda frame: frame["audio"].apply(lambda audio: audio["path"]))
    .drop(columns=["audio"])
    .sort_values("path")
)

Found 130 training parquet files


Loading parquet files: 100%|██████████| 130/130 [00:41<00:00,  3.14it/s]


In [96]:
# Code tables for the fields encoded in a recording filename, in positional
# order. An empty table means the field is kept verbatim.
parsing_rules = [
    # lossiness
    {"L": "lossy", "R": "raw"},
    # speech type
    {"R": "read", "S": "spontaneous"},
    # source type
    {
        "A": "audiobook",
        "D": "dictaphone",
        "P": "phone",
        "R": "radio",
        "S": "studio",
        "T": "TV",
    },
    # speaker gender
    {"F": "female", "M": "male"},
    # speaker age group
    {"1": "0-12", "2": "13-17", "3": "18-25", "4": "26-60", "5": "60+"},
    # speaker id (verbatim)
    {},
    # recording id (verbatim)
    {},
    # sentence id (verbatim)
    {},
]


def parse_filename(filename):
    """Decode the metadata fields encoded in a recording filename.

    The name is expected to look like ``L_RS_F4_MS349_01_000204.mp3``: the
    first token is one code, the second and third tokens each carry two
    single-character codes, and the remaining tokens are kept as they are.
    Codes are translated through ``parsing_rules``; unknown codes are
    returned unchanged.

    Args:
        filename: File name or path of the recording. Any directory prefix
            and the extension (of any length) are ignored.

    Returns:
        List of decoded field values, in the order of ``parsing_rules``.

    Raises:
        ValueError: If the name does not have the expected structure.
    """
    # Strip directory and extension without assuming a 3-letter suffix.
    stem = Path(filename).stem
    tokens = stem.split("_")
    if len(tokens) < 3 or len(tokens[1]) < 2 or len(tokens[2]) < 2:
        raise ValueError(f"Unexpected filename structure: {filename!r}")

    # Tokens 1 and 2 each pack two single-character codes.
    fields = [
        tokens[0],
        tokens[1][0],
        tokens[1][1],
        tokens[2][0],
        tokens[2][1],
        *tokens[3:],
    ]
    if len(fields) > len(parsing_rules):
        raise ValueError(
            f"Filename {filename!r} has {len(fields)} fields, "
            f"expected at most {len(parsing_rules)}"
        )

    return [rule.get(field, field) for rule, field in zip(parsing_rules, fields)]

In [97]:
# Column names for the values returned by parse_filename, in the same order.
filename_fields = [
    "lossiness",
    "speech_type",
    "source_type",
    "speaker_gender",
    "speaker_age",
    "speaker_id",
    "recording_id",
    "sentence_id",
]

# Decode every path once, then attach the decoded values as new columns.
parsed_rows = full_df["path"].apply(parse_filename).tolist()
full_df[filename_fields] = parsed_rows

In [98]:
# Keep rows labelled as read speech from the three oldest age groups.
is_read = full_df["speech_type"] == "read"
is_adult = full_df["speaker_age"].isin(["18-25", "26-60", "60+"])
filtered_df = full_df[is_read & is_adult]

In [99]:
# Rows per (gender, speaker) pair; value_counts orders them by descending count.
speaker_counts = filtered_df[["speaker_gender", "speaker_id"]].value_counts()

# First 10 entries per gender, i.e. the speakers with the most rows.
top_speakers = speaker_counts.groupby("speaker_gender").head(10)
speaker_ids = top_speakers.reset_index()["speaker_id"]

# Restrict the data to the selected speakers.
filtered_df = filtered_df[filtered_df["speaker_id"].isin(speaker_ids)]

In [113]:
# filtered_df

## Word stress (accent) annotation

In [None]:
import json
import re
import requests
import time
from functools import lru_cache

import numpy as np
from tqdm import tqdm

In [138]:
def extract_all_words(text):
    """Return the whitespace-separated words of ``text`` without punctuation.

    Every character that is neither a word character nor whitespace is
    replaced by a space, as are the explicitly listed ʻ and ʼ marks; the
    result is then split on whitespace.
    """
    return re.sub(r"([^\w\s]|[ʻʼ])", " ", text).split()

In [139]:
# Quick check: quote marks, commas, dashes and "!" are all stripped.
sample_text = "ʻąžuolasʼ, ąžuolas - beržas !"
extract_all_words(sample_text)

['ąžuolas', 'ąžuolas', 'beržas']

In [None]:
# Flatten every sentence of the corpus into one long list of words.
all_words = []
for text in tqdm(full_df["sentence"].tolist(), desc="Extracting words"):
    all_words.extend(extract_all_words(text))

# One row per distinct word (as index) with its number of occurrences.
word_counts = pd.Series(all_words).value_counts()
all_words_df = word_counts.to_frame("count")

Extracting words: 100%|██████████| 1132528/1132528 [00:03<00:00, 326602.58it/s]


In [155]:
# all_words_df[all_words_df.index.str.contains(r"[^a-zA-ZąčęėįšųūžĄČĘĖĮŠŲŪŽ]", regex=True)]

In [182]:
@lru_cache(maxsize=None)
def request_accents(words, timeout=60):
    """Fetch accent annotations for a batch of words from kalbu.vdu.lt.

    Results are memoised per argument combination, so repeating a call with
    the same batch does not hit the network again (and does not sleep).

    Args:
        words: Hashable sequence of words (e.g. a tuple); the words are
            joined with single spaces to form the request body.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The ``"textParts"`` value decoded from the JSON string stored in the
        response's ``"message"`` field.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    try:
        response = requests.post(
            "https://kalbu.vdu.lt/ajax-call",
            headers={"Content-Type": "application/x-www-form-urlencoded; charset=UTF-8"},
            data={"action": "text_accents", "body": " ".join(words)},
            timeout=timeout,
        )
        # Fail loudly on HTTP errors instead of trying to parse an error page.
        response.raise_for_status()
        return json.loads(response.json()["message"])["textParts"]
    finally:
        # Pause after every real request, failed ones included, so that a run
        # of errors does not turn into back-to-back requests.
        time.sleep(0.3)

In [196]:
# Approximate number of words sent to the accent service per request.
WORDS_PER_REQUEST = 200

all_contents = []
# Chunks whose request failed, kept so they can be inspected or retried.
failed_chunks = []
# Words excluded from every request.
# NOTE(review): presumably these make the service fail - confirm.
banned_words = ["užsižiops", "nepasidūs"]

# np.array_split needs at least one section; without the max() a vocabulary
# of fewer than WORDS_PER_REQUEST words would raise a ValueError.
n_chunks = max(1, len(all_words_df) // WORDS_PER_REQUEST)

for chunk in tqdm(np.array_split(all_words_df.index, n_chunks), desc="Processing chunks"):
    words = tuple(word for word in chunk if word not in banned_words)
    if not words:
        # Nothing left to send (empty vocabulary or only banned words).
        continue
    try:
        content = request_accents(words)
    except Exception as e:
        # Best effort: report the failing chunk and carry on with the rest.
        print(f"Error processing chunk: {e}")
        print(tuple(chunk))
        failed_chunks.append(words)
        content = []
    all_contents.extend(content)

if failed_chunks:
    print(f"{len(failed_chunks)} chunk(s) failed; see failed_chunks")

Processing chunks: 100%|██████████| 2232/2232 [00:00<00:00, 2827.70it/s] 


In [205]:
# Tabulate the returned text parts, discard the rows whose "string" is a
# single space and renumber the remaining rows from zero.
accents_df = (
    pd.DataFrame(all_contents)
    .loc[lambda parts: parts["string"] != " "]
    .reset_index(drop=True)
)

# Save the accent data next to the notebook.
accents_df.to_csv("accented_words.csv", index=False)

In [206]:
# Join the word counts (indexed by word) with the accent rows whose "string"
# equals that word; words without a matching accent row are dropped.
words_accents_df = all_words_df.merge(
    accents_df,
    left_index=True,
    right_on="string",
)

In [207]:
# Number of rows for each accentType value.
accent_type_counts = words_accents_df["accentType"].value_counts()
accent_type_counts

accentType
ONE                 321643
NONE                 82992
MULTIPLE_MEANING     33766
MULTIPLE_VARIANT      7967
Name: count, dtype: int64

In [218]:
# Preview the first 20 rows for which the service returned accentType "NONE".
has_no_accent = words_accents_df["accentType"] == "NONE"
words_accents_df[has_no_accent].head(20)

Unnamed: 0,count,string,accented,accentType,type
87,6933,kat,,NONE,WORD
122,4968,dvidešim,,NONE,WORD
207,2915,trisdešim,,NONE,WORD
224,2707,vat,,NONE,WORD
281,2142,keturiasdešim,,NONE,WORD
358,1638,penkiasdešim,,NONE,WORD
496,1217,i,,NONE,WORD
546,1132,dešim,,NONE,WORD
567,1094,pa,,NONE,WORD
569,1090,šešiasdešim,,NONE,WORD


In [221]:
# Sentences containing "pa" as a stand-alone token.
contains_pa_token = full_df["sentence"].str.contains(r"\bpa\b", regex=True)
full_df[contains_pa_token]

Unnamed: 0,sentence,language,path,lossiness,speech_type,source_type,speaker_gender,speaker_age,speaker_id,recording_id,sentence_id
263480,dabar būnu nes Telšiuose dabar yra parašyta ma...,lt,L_RS_F4_MS349_01_000204.mp3,lossy,read,studio,female,26-60,MS349,01,000204
53135,valia buvo pa pažeista ar ne tai,lt,L_RS_F4_MS350_01_000401.mp3,lossy,read,studio,female,26-60,MS350,01,000401
636855,daugiausiai aišku tiek jų pa patirtys istorinė...,lt,L_RS_F4_MS350_01_000474.mp3,lossy,read,studio,female,26-60,MS350,01,000474
1044864,archainio pa pasaulėvaizdžio reliktai,lt,L_RS_F5_IS443_01_000393.mp3,lossy,read,studio,female,60+,IS443,01,000393
151470,ūpas toks pa čian čia šalia vat kaip tik ir ar...,lt,L_RS_F5_MS347_01_000044.mp3,lossy,read,studio,female,60+,MS347,01,000044
...,...,...,...,...,...,...,...,...,...,...,...
216576,o gal tu mane gali pa pakal,lt,R_SS_F4_IM006_01_000152.mp3,raw,spontaneous,studio,female,26-60,IM006,01,000152
325595,pakviestųjų buvo ir mano tėvai ir mes kartu te...,lt,R_SS_F4_IM006_01_000191.mp3,raw,spontaneous,studio,female,26-60,IM006,01,000191
489151,bet matyt gal buvo pa pakankamai,lt,R_SS_F4_IM006_01_000391.mp3,raw,spontaneous,studio,female,26-60,IM006,01,000391
862154,nusipirksi galbūt tik trečdalį to pa to paties...,lt,R_SS_F4_IM035_01_000534.mp3,raw,spontaneous,studio,female,26-60,IM035,01,000534


In [204]:
# Sum of the per-word "count" values for each accentType.
accent_type_totals = words_accents_df.groupby("accentType")["count"].sum()
accent_type_totals

accentType
MULTIPLE_MEANING     977437
MULTIPLE_VARIANT      66246
NONE                 230003
ONE                 5402357
Name: count, dtype: int64