In [1]:
import numpy as np
import json
import os

import wandb

from decimal import Decimal
from glob import glob
from datasets import Dataset, Audio, load_dataset
from tqdm import tqdm

In [None]:
# Hub identifier of the dataset handed to `load_dataset`.
DATASET_NAME = "mozilla-foundation/common_voice_11_0"
# Dataset configuration name (second positional argument of `load_dataset`);
# presumably the Common Voice language code -- confirm against the dataset card.
NAME = "cs"
# Split whose audio durations are measured in the cells that follow.
SPLIT = "test"
# Sampling rate (samples per second) the audio column is cast to; it is also the
# divisor used to turn sample counts into seconds.
SAMPLING_RATE = 16_000

In [None]:
# Load the requested split and resample its audio column in one expression, so
# re-running the cell always rebuilds `hf_dataset` from scratch.
hf_dataset = load_dataset(DATASET_NAME, NAME, split=SPLIT).cast_column(
    "audio",
    Audio(sampling_rate=SAMPLING_RATE),
)

In [None]:
# Stream the examples behind a progress bar and record the number of samples
# in each decoded waveform.
audio_examples = tqdm(hf_dataset.to_iterable_dataset(), total=len(hf_dataset))
wav_lenghts = np.array([len(sample['audio']['array']) for sample in audio_examples])

In [None]:
# Sample counts divided by samples-per-second give durations in seconds.
wav_lenghts_secs = np.divide(wav_lenghts, SAMPLING_RATE)

In [None]:
# Mean clip duration in seconds (displayed as the cell output).
wav_lenghts_secs.mean()

In [None]:
# Total audio duration; the cell displays it as (seconds, hours).
total_secs = wav_lenghts_secs.sum()
(total_secs, total_secs / 3600)

Training table summarisation

In [None]:
# Model family whose training output directories are summarised.
model = "wav2vec2" # wav2vec2 / t5
# `glob` yields matches in arbitrary (filesystem) order; sort them so the row
# order of the summary table and the tie-breaking for the best run are
# reproducible between executions.
train_paths = sorted(glob(f'/home/sulcm/models/{model}/*'))

In [None]:
def format_latex_table(results: dict, best_results: dict) -> str:
    """Render per-run results as the body rows of a LaTeX table.

    Args:
        results: Maps a run label to an ordered mapping of column name -> value.
        best_results: Maps a column name to a tuple whose first element is the
            label of the run holding the best value for that column.

    Returns:
        One line per run of the form ``label & v1 & v2 ... \\\\`` followed by a
        newline. A value is wrapped in ``\\textbf{}`` when its column appears in
        ``best_results`` and its run is the recorded best run for that column.
    """
    def render_cell(run_label, column, value):
        # Bold only the cell belonging to the best run of a tracked column.
        is_best = column in best_results and best_results[column][0] == run_label
        return f'\\textbf{{{value}}}' if is_best else str(value)

    rows = []
    for run_label, columns in results.items():
        cells = [render_cell(run_label, column, value) for column, value in columns.items()]
        rows.append(run_label + ' & ' + ' & '.join(cells) + ' \\\\\n')

    return ''.join(rows)

In [None]:
# Metrics read from every run's all_results.json, stored there as "eval_<metric>".
metrics = ['wer', 'cer']
# Lowest value seen so far per metric, as (run label, raw value). The sentinel
# is +inf rather than 1.0: these are error rates that can exceed 1.0, and a run
# above 1.0 must still be able to become the best one.
best_results = dict.fromkeys(metrics, ('', float('inf')))
# Run label -> ordered columns (formatted metrics first, then hyper-parameters).
table_prep = {}

wandb_api = wandb.Api()
runs = wandb_api.runs("sulcm/huggingface")
run_params = ["learning_rate", "batch_size", "max_steps"]

# Index the finished W&B runs once by the directory component of their
# "run_name" instead of rescanning every run for every training directory.
# `setdefault` keeps the first match, mirroring a first-hit-then-break scan.
finished_runs = {}
for run in runs:
    if run.state != "finished":
        continue
    # Runs without a usable "run_name" cannot be matched to a directory; skip
    # them instead of failing the whole cell with KeyError/IndexError.
    name_parts = str(run.config.get("run_name", "")).split("/")
    if len(name_parts) < 2:
        continue
    finished_runs.setdefault(name_parts[-2], run)

for train_res in train_paths:
    path2res = os.path.join(train_res, 'all_results.json')
    if not os.path.exists(path2res):
        continue

    with open(path2res, 'r') as f:
        results = json.load(f)

    # Directory name identifies the W&B run; its last '-'-separated token is the
    # short label used as the table row name.
    run_dir = os.path.basename(train_res)
    run_label = run_dir.split('-')[-1]

    # Metrics are shown as percentages with two decimals.
    table_prep[run_label] = {m: f'{100.0*results[f"eval_{m}"]:.02f}' for m in metrics}

    matched_run = finished_runs.get(run_dir)
    if matched_run is not None:
        run_params_dict = {}
        for param in run_params:
            if param == "batch_size":
                # Product of per-device batch size and accumulation steps only;
                # the number of devices is not taken into account here.
                run_params_dict[param] = (
                    matched_run.config["per_device_train_batch_size"]
                    * matched_run.config["gradient_accumulation_steps"]
                )
            elif param == "learning_rate":
                run_params_dict[param] = "%.2e" % Decimal(matched_run.config[param])
            else:
                run_params_dict[param] = matched_run.config[param]
        table_prep[run_label].update(run_params_dict)

    # Track the best run per metric on the raw (unrounded) values.
    for metric in metrics:
        if results[f"eval_{metric}"] < best_results[metric][1]:
            best_results[metric] = (run_label, results[f"eval_{metric}"])

print(table_prep)
print(best_results)

In [None]:
# Printed rather than displayed so the backslashes appear unescaped and the
# rows can be pasted straight into a LaTeX document.
latex_rows = format_latex_table(table_prep, best_results)
print(latex_rows)