<u>**Response to Task 5(a)**</u>
1. Assume that hotword detection means detecting the words the speaker **intended** to utter (even though he/she may have spoken with an accent or made a mistake vocalizing them, OR there was a transcription mistake).
1. Therefore, being able to tolerate errors in the transcript is paramount.
1. Without going into complex manual post-ASR correction techniques (e.g. passing the transcript into yet another Language Model to 'correct' it, or estimating phonemes from graphemes), my proposal uses Fuzzy RegEx as a quick-and-dirty way to detect such hotwords.

In [1]:
import sys
from pathlib import Path

# Repository layout used by this notebook: <repo>/asr-train/src is put on the import path.
PROJECT_DIR = Path.home() / "work/htx-xdata"  # TODO change this to the path of your repo
TASK_DIR = PROJECT_DIR / "asr-train"
src_dir = TASK_DIR / "src"

# Prepend the source folder only when it is absent, so re-running this cell
# does not pile up duplicate entries in `sys.path`.
if sys.path.count(src_dir.as_posix()) == 0:
    sys.path.insert(0, src_dir.as_posix())
# NOTE: You may also want to add `"python.analysis.extraPaths": ["./asr-train/src"]` to your VSCode workspace

In [None]:
import pandas as pd
import regex as re
from app.config import pth_hotwords_txt, pth_valid_dev_raw
from utils_ds import get_df_valid_dev

# Show up to 100 characters per cell so long transcripts stay readable in displayed frames.
pd.set_option("display.max_colwidth", 100)

In [3]:
# Known false positives
KNOWN_BLACKLIST = {"STRONGER", "STRANGE"}

# Case-insensitive fuzzy pattern: any hotword, bounded by word boundaries, with
# at most 1 edit (insertion, deletion, or substitution).
RGX_hotword_w_1edits = re.compile(
    r"\b(?:BE CAREFUL|DESTROY|STRANGER){e<=1}\b",
    flags=re.I,
)
# NOTE: Assume we are NOT interested in past tense e.g. 'was careful', 'destroyed'.


def detect_hotwords_fuzzy(txt: str) -> "re.Match | None":
    """Search `txt` for the first fuzzy hotword match (up to 1 edit).

    Args:
        txt: Transcribed text to scan. Anything that is not a string (e.g. the
            None/NaN of a missing transcription) is treated as "no hotword".

    Returns:
        The first match of `RGX_hotword_w_1edits` in `txt`, or None when there
        is no match or `txt` is not a string. The match is returned as-is; it
        is NOT filtered against the blacklist of known false positives.
    """
    # A missing transcription would otherwise raise TypeError inside the regex
    # search; report it as "nothing detected" instead.
    if not isinstance(txt, str):
        return None
    # TODO For cases where the hotword is very short (e.g. "NO"), we may want to use pure exact match instead.
    # In our case, the shortest hotword is "DESTROY". 1 edit out of 7 characters is still roughly acceptable (~14%),
    # but expect some false positives and corner cases... May need to maintain blacklist of false positives.
    return RGX_hotword_w_1edits.search(txt)


def postprocess_fuzzy(mtc: "re.Match | None", blacklist=None) -> "str | None":
    """Return the first hotword at or after `mtc` that is not a known false positive.

    Args:
        mtc: Match produced by the hotword regex search, or None when the
            search found nothing.
        blacklist: Upper-case strings to reject as false positives. Defaults
            to the module-level `KNOWN_BLACKLIST`.

    Returns:
        The matched text exactly as it appears in the searched string, or None
        when there is no match or every match is blacklisted.
    """
    if mtc is None:
        return None
    if blacklist is None:
        blacklist = KNOWN_BLACKLIST

    while mtc is not None:
        txt = mtc.group()
        if txt.strip().upper() not in blacklist:
            return txt
        # A blacklisted hit must not hide a genuine hotword later in the same
        # text, so rerun the same pattern on the same string just past it.
        # The +1 guards against looping forever on an empty match.
        resume_at = mtc.end() if mtc.end() > mtc.start() else mtc.end() + 1
        mtc = mtc.re.search(mtc.string, resume_at)
    return None


def test_detect_hotwords_fuzzy():
    """Smoke-test `detect_hotwords_fuzzy` on hand-written examples.

    Each case pairs a text with whether a hotword is expected to be detected.
    Nothing is raised or returned: every case whose outcome differs from the
    expectation is printed, so all failures are visible in a single run.
    """
    txt_n_should_match_tpls = [
        ("You must be carful", True),  # Misspelled "be careful", 1 edit
        ("You mustbe carful", False),  # Misspelled "be careful" but partial-word match
        ("destroyy", True),  # Extra character
        ("strangr", True),  # Missing 'e'
        ("You must be careful of the Dark.", True),  # Exact match
        ("random text", False),
    ]

    for txt, should_match in txt_n_should_match_tpls:
        # Go through the public helper so that the function named in this test is what gets exercised.
        mtc = detect_hotwords_fuzzy(txt)
        did_match = mtc is not None
        if did_match == should_match:
            continue
        if did_match:
            print(f"[failed case;YES]: '{txt}': '{mtc.group()}'")
        else:
            print(f"[failed case;NO] : '{txt}'")

In [4]:
# Load the dev-split frame, then blank out missing fine-tuned transcriptions:
# a totally silent clip yields no transcription.
df_dev = get_df_valid_dev(pth_valid_dev_raw)
df_dev.fillna(value={"generated_text_finetuned": ""}, inplace=True)

In [5]:
# Step 1: raw fuzzy regex match per transcription (kept for inspection).
_transcripts = df_dev["generated_text_finetuned"]
df_dev["mtc"] = _transcripts.apply(detect_hotwords_fuzzy)
# Step 2: hotword text derived from each raw match by `postprocess_fuzzy`.
_matches = df_dev["mtc"]
df_dev["hotword"] = _matches.apply(postprocess_fuzzy)

In [6]:
# ## [EDA] Regex matches
# df_debug = df_dev.dropna(subset=['mtc'])
# df_debug = df_debug[['filename', 'text', 'generated_text_finetuned', 'mtc', 'stats']]
# df_debug

In [None]:
# ## [EDA] hotwords
# df_debug = df_dev.dropna(subset=['hotword'])
# df_debug = df_debug[['filename', 'text', 'generated_text_finetuned', 'mtc', 'hotword', 'stats']]
# df_debug

In [None]:
# Write the filename of every row that has a detected hotword, one per line,
# with no index column and no header row.
_rows_with_hotword = df_dev.query("hotword.notna()")
_rows_with_hotword["filename"].to_csv(pth_hotwords_txt, index=False, header=False)