In [1]:
import pandas as pd
from pathlib import Path
import pydub
from tqdm.auto import tqdm
import numpy as np

In [2]:
abo16_list = pd.read_csv("../data/abo16_list.txt")
anmp3_dir = Path("../data/anmp3")
anmp3_dir.mkdir(parents=True, exist_ok=True)

In [3]:
abo16_list.head()

Unnamed: 0,filename,lang
0,c5008-2201151100.mp4,Seediq
1,c5008-2201150800.mp4,Thau
2,c5008-2201142000.mp4,Saysiyat
3,c5008-2201141100.mp4,Rukai
4,c5008-2201132000.mp4,Bunun


In [4]:
abo16_map = abo16_list.to_dict(orient="records")
abo16_map = {x["filename"].replace(".mp4", ""): x["lang"] for x in abo16_map}

In [11]:
lang_map = {}
pcmlen_map = {}
for path_x in Path("../data/anpcm").glob("*.pcm"):
    lang = abo16_map.get(path_x.stem, "misc")
    pcm = np.memmap(path_x, np.dtype('int16'), 'r')
    pcmlen_map[path_x.stem+".mp3"] = pcm.shape[0]
    lang_map.setdefault(lang, []).append(path_x.stem+".mp3")

In [12]:
abo16_an_splits = {"train": [], "test": []}
for lang, fnames in lang_map.items():    
    abo16_an_splits["test"].append((fnames[0], lang, pcmlen_map[fnames[0]]))
    abo16_an_splits["train"].extend([(x, lang, pcmlen_map[x]) for x in fnames[1:]])

In [13]:
len(lang_map)

17

In [14]:
[(k, len(v)) for k, v in lang_map.items()]

[('Saysiyat', 8),
 ('Thau', 8),
 ('Yami', 9),
 ('Atayal', 7),
 ('misc', 8),
 ('Rukai', 9),
 ("Hla'alua", 8),
 ('Bunun', 8),
 ('Sakizaya', 9),
 ('Kavalan', 8),
 ('Seediq', 8),
 ('Kanakanavu', 8),
 ('Truku', 8),
 ('Pinuyumayan', 9),
 ('Paiwan', 6),
 ('Amis', 8),
 ('Cou', 8)]

In [15]:
import json
with open("../data/abo16_an_splits.json", "w") as fout:
    json.dump(abo16_an_splits, fout)

## LangMap

In [16]:
lang_map

{'Saysiyat': ['c5008-2112242000.mp3',
  'c5008-2112312000.mp3',
  'c5008-2201142000.mp3',
  'c5008-2201072000.mp3',
  'c5008-2111052000.mp3',
  'c5008-2111192000.mp3',
  'c5008-2112102000.mp3',
  'c5008-2110292000.mp3'],
 'Thau': ['c5008-2201010800.mp3',
  'c5008-2201150800.mp3',
  'c5008-2111060800.mp3',
  'c5008-2112110800.mp3',
  'c5008-2112250800.mp3',
  'c5008-2201080800.mp3',
  'c5008-2111200800.mp3',
  'c5008-2112040800.mp3'],
 'Yami': ['c5008-2112282000.mp3',
  'c5008-2201042000.mp3',
  'c5008-2110262000.mp3',
  'c5008-2111232000.mp3',
  'c5008-2201112000.mp3',
  'c5008-2111022000.mp3',
  'c5008-2110192000.mp3',
  'c5008-2111162000.mp3',
  'c5008-2111092000.mp3'],
 'Atayal': ['c5008-2111081100.mp3',
  'c5008-2111011100.mp3',
  'c5008-2112131100.mp3',
  'c5008-2111291100.mp3',
  'c5008-2201031100.mp3',
  'c5008-2112271100.mp3',
  'c5008-2201101100.mp3'],
 'misc': ['misc_c5008-2201131100.mp3',
  'misc_c5008-2112311100.mp3',
  'misc_c5008-2112301100.mp3',
  'misc_c5008-2201132000.

In [17]:
with open("../data/abo16_lang_map.csv", "w") as fout:
    for k, fnames in lang_map.items():
        for fname_x in fnames:
            fout.write(f"{fname_x},{k}\n")