# Word Accentuation

In [92]:
import json
import re
import requests
import time
from collections import Counter
from functools import lru_cache
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
liepa_path = Path("../data/raw/liepa2/")

# Collect every training shard; sorting keeps the load order deterministic.
train_files = sorted(liepa_path.glob("train-*.parquet"))
print(f"Found {len(train_files)} training parquet files")
if not train_files:
    # pd.concat([]) would otherwise fail with an opaque "No objects to concatenate".
    raise FileNotFoundError(f"No train-*.parquet files found in {liepa_path.resolve()}")

# Read every shard, then stitch them into one frame with a fresh 0..n-1 index.
dfs = [
    pd.read_parquet(file_path)
    for file_path in tqdm(train_files, desc="Loading parquet files")
]
full_df = pd.concat(dfs, ignore_index=True)

# Each "audio" entry is indexed with ["path"]; keep just that value as its own
# column, then drop the "audio" column and order the rows by path.
# Reassigning (instead of inplace=True) keeps the cell free of in-place mutation.
full_df["path"] = full_df["audio"].apply(lambda x: x["path"])
full_df = full_df.drop(columns=["audio"]).sort_values("path")

Found 130 training parquet files


Loading parquet files: 100%|██████████| 130/130 [00:42<00:00,  3.06it/s]


## Scrape Kirčiuoklis

In [3]:
def extract_all_words(text):
    """Split ``text`` into words, treating punctuation as whitespace.

    Every character that is neither a word character nor whitespace is
    replaced by a space; the two apostrophe-like characters listed explicitly
    in the pattern are replaced as well (presumably because they would
    otherwise count as word characters). The remaining text is split on
    whitespace and returned as a list of tokens.
    """
    without_punctuation = re.sub(r"([^\w\s]|[ʻʼ])", " ", text)
    return without_punctuation.split()

In [4]:
# Sanity check: quote marks, commas, dashes and "!" must not leak into the tokens.
sample_sentence = "ʻąžuolasʼ, ąžuolas - beržas !"
extract_all_words(sample_sentence)

['ąžuolas', 'ąžuolas', 'beržas']

In [5]:
# Flatten the whole corpus into one list of tokens, sentence by sentence.
all_words = []
for text in tqdm(full_df["sentence"].tolist(), desc="Extracting words"):
    all_words.extend(extract_all_words(text))

# One row per distinct word (the index) with its number of occurrences in "count".
word_frequencies = pd.Series(all_words).value_counts()
all_words_df = word_frequencies.to_frame("count")

Extracting words:   0%|          | 0/1132528 [00:00<?, ?it/s]

Extracting words: 100%|██████████| 1132528/1132528 [00:03<00:00, 327283.58it/s]


In [6]:
# all_words_df[all_words_df.index.str.contains(r"[^a-zA-ZąčęėįšųūžĄČĘĖĮŠŲŪŽ]", regex=True)]

In [7]:
@lru_cache(maxsize=None)
def request_accents(words):
    """Fetch accentuation data for ``words`` from the kalbu.vdu.lt endpoint.

    Args:
        words: Hashable iterable of words (e.g. a tuple, so that ``lru_cache``
            can key on it). The words are joined with single spaces and sent
            as one text body.

    Returns:
        The ``textParts`` value decoded from the JSON string stored under the
        response's ``message`` key.

    Raises:
        requests.RequestException: on connection problems, a timeout, or an
            HTTP error status.
        ValueError, KeyError: if the response body is not the expected JSON.

    Results are memoised per distinct ``words`` value, so a repeated call
    returns the same cached object without a new request or sleep; callers
    should not mutate the returned value.
    """
    response = requests.post(
        "https://kalbu.vdu.lt/ajax-call",
        headers={"Content-Type": "application/x-www-form-urlencoded; charset=UTF-8"},
        data={"action": "text_accents", "body": " ".join(words)},
        # Without a timeout a stalled connection would hang the scrape forever.
        timeout=60,
    )
    # Surface HTTP errors directly instead of as a confusing JSON/Key error below.
    response.raise_for_status()
    # "message" itself holds a JSON-encoded string, hence the second decode.
    content = json.loads(response.json()["message"])["textParts"]
    # Throttle: pause after every real (non-cached) request.
    time.sleep(0.3)
    return content

In [9]:
all_contents = []
# Words that are filtered out of every request (the reason is not recorded here).
banned_words = ["užsižiops", "nepasidūs"]
# Chunks whose request failed, kept so they can be inspected or retried later.
failed_chunks = []

# Roughly 200 words per request. Use at least one section: for a vocabulary
# smaller than 200 words the integer division yields 0 and np.array_split raises.
n_chunks = max(1, len(all_words_df) // 200)

for chunk in tqdm(
    np.array_split(all_words_df.index, n_chunks),
    desc="Processing chunks",
):
    # A tuple is hashable, which request_accents' lru_cache requires.
    words = tuple(word for word in chunk if word not in banned_words)
    if not words:
        # Nothing to ask for (empty vocabulary or only banned words).
        continue
    try:
        content = request_accents(words)
    except Exception as e:
        # Best effort: report the failing chunk and carry on with the rest.
        print(f"Error processing chunk: {e}")
        print(tuple(chunk))
        failed_chunks.append(tuple(chunk))
        content = []
    all_contents.extend(content)

In [65]:
# NOTE(review): in notebook order this cell comes before the cell that assigns
# `accents_df`, so it only works if that later cell has already been run — confirm ordering.
accents_df

Unnamed: 0,string,accented,accentType,type
0,ir,ir̃,ONE,WORD
1,kad,kàd,ONE,WORD
2,į,į̃,ONE,WORD
3,tai,taĩ,MULTIPLE_MEANING,WORD
4,iš,ìš,ONE,WORD
...,...,...,...,...
446455,vadovėlinius,vadovė̃linius,ONE,WORD
446456,nebesipyksime,nebesipỹksime,ONE,WORD
446457,virpčiojančiomis,vìrpčiojančiomis,ONE,WORD
446458,gailėjęsi,gailė́jęsi,ONE,WORD


In [53]:
# Load the cached accentuation results from disk.
# keep_default_na=False keeps empty fields as "" and stops pandas from turning
# literal words such as "nan", "null" or "NA" into missing values.
accents_df = pd.read_csv("accented_words.csv", keep_default_na=False)

In [11]:
# accents_df = pd.DataFrame(all_contents)
# accents_df = accents_df[accents_df["string"] != " "].reset_index(drop=True)

# accents_df.to_csv("accented_words.csv", index=False)

In [70]:
# concat columns
words_accents_df = pd.merge(
    all_words_df,
    accents_df,
    # how="left",
    left_index=True,
    right_on="string",
)

In [None]:
# words_accents_df["accentType"].value_counts()

In [85]:
# words_accents_df[words_accents_df["accentType"] == ""]

In [83]:
# words_accents_df[words_accents_df["accentType"] == "NONE"].head(20)

In [None]:
# Show the sentences that contain the letter "x" (presumably to spot foreign words).
contains_x = full_df["sentence"].str.contains(r"x", regex=True)
full_df.loc[contains_x]

Unnamed: 0,sentence,language,path
558124,jau buvau bepradedanti nerimauti tikėjausi kad...,lt,L_RA_F4_IS026_01_000796.mp3
1090112,Laikas prie jo tirpte tirpdavo ypač kai atsira...,lt,L_RA_M4_DK006_01_000227.mp3
211790,patys vargingiausi kokius išgyveno tas kraštas...,lt,L_RA_M4_IS016_01_000472.mp3
162746,su profesoriumi Toru Kawasaki,lt,L_RS_F5_AS019_01_000163.mp3
150258,Paskui Toru Kawasaki veža mane į Waseda univer...,lt,L_RS_F5_AS019_01_000313.mp3
93533,Kaligrafo Sugiwaros paslaptys,lt,L_RS_F5_AS019_01_000333.mp3
872633,paskui Owakidani slėnis,lt,L_RS_F5_AS019_01_001045.mp3
147641,dėvėjo snapeliu atgal apsukta New York Yankees...,lt,L_RS_M4_AS026_01_000949.mp3
573664,"Ar žinote, kuo ypatingas paketas Always Ultra",lt,R_RD_F3_AS115_01_000032.mp3
535564,"Jis teigia, kad pirmasis visiems priimtiną kul...",lt,R_RD_F4_MS010_01_000111.mp3


In [102]:
# Sum of the per-word "count" values within each accentType.
occurrences = words_accents_df["count"]
occurrences.groupby(words_accents_df["accentType"]).sum()

accentType
                        275
MULTIPLE_MEANING     977437
MULTIPLE_VARIANT      66246
NONE                 230003
ONE                 5402357
Name: count, dtype: int64

In [112]:
# words_accents_df["accentType"].value_counts()

In [138]:
# Replace the three accent-mark characters with the standalone symbols
# "~", "`" and "´", so each mark becomes a separate character in the string.
accent_table = str.maketrans({"̃": "~", "̀": "`", "́": "´"})
words_accents_df["accented"] = words_accents_df["accented"].str.translate(accent_table)

In [141]:
# Counter("".join(words_accents_df["accented"]))

In [159]:
# Keep only the rows whose accentType is "ONE" and export the
# plain word -> accented word pairs next to the raw data.
is_single_accent = words_accents_df["accentType"] == "ONE"
final_accents_df = words_accents_df.loc[is_single_accent, ["string", "accented"]]
final_accents_df.to_csv(liepa_path / "final_accented_words.csv", index=False)