This notebook is used to prepare `training-report.pdf` for Task 4.

### Imports and globals

In [51]:
import os
import sys
from pathlib import Path

# Root of the repository checkout. Set the HTX_XDATA_DIR environment variable to point
# at your own clone; when it is unset, the original default under the home directory is used.
PROJECT_DIR = Path(os.environ.get("HTX_XDATA_DIR", Path.home() / "work/htx-xdata")).expanduser()
TASK_DIR = PROJECT_DIR / "asr-train"
src_dir = TASK_DIR / "src"


# Prepend the task's `src` directory to sys.path so modules living there can be imported.
# The membership check keeps sys.path free of duplicates when this cell is re-run.
if src_dir.as_posix() not in sys.path:
    sys.path.insert(0, src_dir.as_posix())
# NOTE: You may also want to add `"python.analysis.extraPaths": ["./asr-train/src"]` to your VSCode workspace

In [52]:
import json
import logging
import re
import shlex
from functools import partial
from io import BytesIO
from pathlib import Path
from subprocess import check_output
from typing import List, Tuple

import evaluate
import numpy as np
import pandas as pd
import torch
from app.config import (  # pth_df_wer_test,
    best_checkpoint_dir,
    ds_dev_dir,
    ds_test_dir,
    ds_train_dir,
    ds_val_dir,
    final_model_dir,
    pth_analyze_largest_wer_diff,
    pth_df_wer_dev,
    pth_valid_dev_raw,
    pth_valid_test_raw,
    pth_valid_train_raw,
    sampling_rate,
    valid_dev_raw_dir,
    valid_test_raw_dir,
    valid_train_raw_dir,
)
from app.model import ASRModel
from datasets import Dataset, load_dataset
from pydub import AudioSegment
from sklearn.model_selection import train_test_split
from utils_ds import (
    array_to_audio,
    backup_file,
    disp_audio,
    get_df_valid_dev,
    get_df_valid_test,
    get_df_valid_train,
    get_df_wer,
    get_df_wer_dev,
    get_ds_cur_chunk_dir,
    get_ds_fingerprint_chunk,
    load_ds_from_disk,
    preprocess_text,
    save_best_model,
)
from utils_train import CommonVoiceDataLoader, batch_predict, count_parameters

# Widen text columns so long transcripts are not truncated when DataFrames are displayed.
pd.set_option("display.max_colwidth", 300)

In [15]:
## Load dev dataset
ds_dev = Dataset.load_from_disk(ds_dev_dir)

## Load dev set predictions dataframe (or predict if not available)

df_dev = get_df_valid_dev()

# case: No prediction yet -> predict
# The finetuned predictions are cached as a column of the raw dev CSV, so inference
# only runs when that column is still missing.
if "generated_text_finetuned" not in df_dev:

    asr = ASRModel(pth_model=final_model_dir)
    dev_cv_dataloader = CommonVoiceDataLoader(ds_dev, asr.processor, batch_size=32)
    # shuffle=False keeps the prediction order deterministic across runs.
    dev_loader = dev_cv_dataloader.get_dataloader(shuffle=False)

    # The raw CSV is overwritten below, so back it up first.
    # NOTE(review): assumes backup_file makes a copy of the given path - confirm in utils_ds.
    backup_file(pth_valid_dev_raw)
    df_dev_raw = pd.read_csv(pth_valid_dev_raw)

    preds, labels, filenames = batch_predict(dev_loader, asr.model, asr.processor, asr.device)
    df_wip = pd.DataFrame(
        {
            "filename": filenames,
            "generated_text_finetuned": preds,
            "label": labels,
        }
    )
    # validate="many_to_one" makes pandas raise a MergeError if a filename occurs more than
    # once in the predictions. Without it, a duplicate would silently add rows to the frame
    # that is about to replace the raw CSV on disk.
    df_dev_raw_w_preds = df_dev_raw.merge(df_wip, on="filename", how="left", validate="many_to_one")

    # Write
    df_dev_raw_w_preds.to_csv(pth_valid_dev_raw, index=False)

    # reload
    df_dev = get_df_valid_dev()

In [16]:
## Compute WER for dev set
# [result] WER: (original, finetuned) = (0.11030, 0.14030)

# Corpus-level WER is only recomputed on demand; set RUN to True to refresh the numbers above.
RUN = False
if RUN:
    wer_metric = evaluate.load("wer")
    # Score both prediction columns against the same reference transcripts.
    wer, wer_finetuned = (
        wer_metric.compute(predictions=df_dev[pred_col], references=df_dev["label"])
        for pred_col in ("generated_text", "generated_text_finetuned")
    )
    print(f"WER: (original, finetuned) = ({wer:.5f}, {wer_finetuned:.5f})")
# Drop the switch so it does not leak into later cells.
del RUN

In [None]:
# Compute the WER for both the original and finetuned models
# Built as one non-mutating chain (no `inplace=True`), so the frame returned by
# `get_df_wer_dev` is left untouched and the cell gives the same result when re-run.
cols = ["filename", "stats", "text", "label", "pred_old", "pred_new", "wer_old", "wer_new"]
df_wer_dev = (
    get_df_wer_dev(pth_df_wer_dev, df_dev, ds_dev)
    # "old" = original model, "new" = finetuned model
    .rename(
        columns={
            "generated_text": "pred_old",
            "generated_text_finetuned": "pred_new",
            "wer": "wer_old",
            "wer_finetuned": "wer_new",
        }
    )
    # Keep only the analysis columns, in a fixed order.
    .reindex(columns=cols)
    # Positive wer_diff means the finetuned model is worse on that sample.
    .assign(wer_diff=lambda df: df["wer_new"] - df["wer_old"])
)

In [None]:
# Per-sample comparison of the finetuned ("new") WER against the original ("old") WER.
cond_same_wer = df_wer_dev["wer_new"].eq(df_wer_dev["wer_old"])
cond_new_better = df_wer_dev["wer_new"].lt(df_wer_dev["wer_old"])
cond_new_worse = df_wer_dev["wer_new"].gt(df_wer_dev["wer_old"])

# Separates "both perfect" from "both equally wrong" among the ties.
cond_old_0wer = df_wer_dev["wer_old"].eq(0)

# Counts per category
n_rows = len(df_wer_dev)
n_same_0wer = (cond_same_wer & cond_old_0wer).sum()
n_same_wer_nonzero = (cond_same_wer & ~cond_old_0wer).sum()
n_new_better = cond_new_better.sum()
n_new_worse = cond_new_worse.sum()

# Share of the dev set per category
pct_same_0wer = n_same_0wer / n_rows
pct_same_wer_nonzero = n_same_wer_nonzero / n_rows
pct_new_better = n_new_better / n_rows
pct_new_worse = n_new_worse / n_rows


summary_lines = [
    f"Both fully-correct (0 WER): {n_same_0wer:12d} ({pct_same_0wer:06.2%})",
    f"Both same non-zero WER    : {n_same_wer_nonzero:12d} ({pct_same_wer_nonzero:06.2%})",
    f"New model better          : {n_new_better:12d} ({pct_new_better:06.2%})",
    f"New model worse           : {n_new_worse:12d} ({pct_new_worse:06.2%})",
]
print("\n".join(summary_lines))

# # [results]
# Both fully-correct (0 WER):         1713 (42.04%)
# Both same non-zero WER    :          941 (23.09%)
# New model better          :          376 (09.23%)
# New model worse           :         1045 (25.64%)

#### [New model worse] Analyze largest WER difference

In [31]:
# Samples where the finetuned model has a higher WER than the original, largest increase first.
df_debug = (
    df_wer_dev
    .sort_values("wer_diff", ascending=False)
    .loc[lambda df: df["wer_diff"] > 0]
)

In [None]:
# Position of the sample to inspect within the current `df_debug` ranking.
i_row = 7
row = df_debug.iloc[i_row]
# Show the sample as a one-row table, then render its audio.
display(row.to_frame().transpose())
disp_audio(row["filename"])

In [None]:
# `pth_analyze_largest_wer_diff` is imported from `app.config` in the notebook's main import cell.
# Load the CSV stored at that path and display it.
# NOTE(review): presumably the manually written analysis of the largest WER differences - confirm.
df_analyze = pd.read_csv(pth_analyze_largest_wer_diff)
display(df_analyze)

In [19]:
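# Scratch helper (left disabled): prints a markdown list stub for the first 20 files in `df_debug`.
# for x in df_debug['filename'].iloc[:20]:
#     print(f"1. `{x}` ->")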

#### [New model better] Analyze largest WER difference

In [50]:
# Samples where the finetuned model has a lower WER than the original, largest decrease first.
df_debug = (
    df_wer_dev
    .sort_values("wer_diff", ascending=True)
    .loc[lambda df: df["wer_diff"] < 0]
)

1. `cv-valid-dev/sample-001346.mp3` -> (US; US; male)
1. `cv-valid-dev/sample-003047.mp3` -> (US; US; male). The 20+ s audio contains additional conversation that the original model picked up fairly well, but that is not part of the provided ground truth.
1. `cv-valid-dev/sample-002782.mp3` -> (US; US; male). Model was able to predict correct locations of `'` and the word "invasion". 20+s audio with additional conversation.
1. `cv-valid-dev/sample-001352.mp3` -> (non-US; UK; male). Model was able to predict "I'm", whereas the original model predicted "I AM".
1. `cv-valid-dev/sample-001015.mp3` -> (non-US; UK; male). Model was able to get "NOT" and "SO" correct over the original model.
1. `cv-valid-dev/sample-000606.mp3` -> (non-US; unk; male). Model was able to predict "I'm" and correct words.
1. `cv-valid-dev/sample-000723.mp3` -> (non-US; unk; male). Model was able to predict "I'll".
1. `cv-valid-dev/sample-002313.mp3` -> (non-US; unk; male). Model was able to predict "LET'S" (original predicted "LAT'S").
1. `cv-valid-dev/sample-004001.mp3` -> (non-US; UK; male). Proper English audio. Model was able to predict correctly.
1. `cv-valid-dev/sample-003659.mp3` -> (non-US; unk; male). Muffled audio, heavily accented. Model was able to predict correctly.

In [None]:
# Position of the sample to inspect within the current `df_debug` ranking.
i_row = 9
row = df_debug.iloc[i_row]
# Show the sample as a one-row table, then render its audio.
display(row.to_frame().transpose())
disp_audio(row["filename"])

#### [KIV] Manually analyzing random samples

#### Misc explorations

In [None]:
def explore_accent_classifier(
    n_samples: int = 10,
    random_state: int = 42,
    model_dir: str = "pretrained_models/accent-id-commonaccent_xlsr-en-english",
) -> None:
    """Just a quick exploration to do accent classification.

    Draws a random sample of dev-set rows, runs the accent classifier on the
    corresponding audio files and displays the rows with an extra
    ``accent_pred`` column. Reads the notebook globals ``df_dev`` and
    ``valid_dev_raw_dir``.

    Args:
        n_samples: Number of rows to sample from ``df_dev``.
        random_state: Seed passed to ``DataFrame.sample`` so the pick is reproducible.
        model_dir: Path handed to ``AccentClassifier``.

    Returns:
        None. The result is displayed, not returned.
    """
    # Imported here so the notebook does not need the experimental module
    # unless this exploration is actually run.
    from utils_experimental import AccentClassifier

    clf = AccentClassifier(model_dir)
    filenames = df_dev.sample(n_samples, random_state=random_state)["filename"]
    # Select the sampled files from df_dev (rows come back in df_dev order); copy before adding a column.
    df_eda = df_dev.query("filename in @filenames").copy()
    # `filename` values are resolved against the parent of the raw dev directory.
    pths = [valid_dev_raw_dir.parent / name for name in df_eda["filename"]]
    preds = clf.batch_predict_accent(pths)
    df_eda["accent_pred"] = preds
    display(df_eda)