<u>**Response to Task 5(b)**</u>

In [1]:
import sys
from pathlib import Path

# Repository layout: <PROJECT_DIR>/asr-train/src holds the importable helper modules.
PROJECT_DIR = Path.home() / "work/htx-xdata"  # TODO change this to the path of your repo
TASK_DIR = PROJECT_DIR / "asr-train"
src_dir = TASK_DIR / "src"

# Put the source directory at the front of the import path, but only once, so that
# re-running this cell does not keep prepending duplicates.
_src_entry = src_dir.as_posix()
if sys.path.count(_src_entry) == 0:
    sys.path.insert(0, _src_entry)
# NOTE: You may also want to add `"python.analysis.extraPaths": ["./asr-train/src"]` to your VSCode workspace

In [202]:
import numpy as np
import pandas as pd
import torch
from app.config import pth_valid_dev_raw
from InstructorEmbedding import INSTRUCTOR
from sklearn.metrics.pairwise import cosine_similarity
from utils_ds import get_df_valid_dev

# Show up to 500 characters per cell so long transcripts are not truncated when displayed.
pd.set_option("display.max_colwidth", 500)

In [None]:
# Load the instruction-tuned sentence embedding model.
model = INSTRUCTOR("hkunlp/instructor-large")

# Run on the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model.to(device)

In [107]:
# Load the dev-set frame and keep only rows that have a fine-tuned transcript.
# `dropna` is chained (not `inplace=True`) so the object returned by
# `get_df_valid_dev()` is never mutated; the resulting `df_dev` has the same rows
# and index labels as the in-place version would produce.
df_dev = get_df_valid_dev().dropna(subset=["generated_text_finetuned"])

In [129]:
# Instruction prefixes paired with each text before it is embedded:
# one for the transcripts being searched, one for the hotword queries.
instruction_doc = "Represent the Speech Transcript sentence for retrieval:"
instruction_qry = "Represent the Speech Transcript sentence:"

# Alternative (query, document) instruction pairs, kept for reference:
# instruction_qry = "Represent the sentence for retrieving supporting speech transcripts:"
# instruction_doc = "Represent the speech transcript for retrieval:"
# instruction_qry = "Represent the Safety sentence for retrieving supporting speech transcripts:"
# instruction_doc = "Represent the Safety speech transcript for retrieval:"
# instruction_qry = "Represent the Safety sentence for retrieving supporting speech transcripts with cautionary, destructive, or suspicious meanings:"
# instruction_doc = "Represent the Safety speech transcript for retrieval with cautionary, destructive, or suspicious meanings:"
# instruction_qry = "Represent the Safety question for retrieving speech transcripts with cautionary, destructive, or suspicious meanings (accounting for transcription errors):"
# instruction_doc = "Represent the Safety speech transcript for retrieval with cautionary, destructive, or suspicious meanings (accounting for transcription errors):"

In [155]:
# One paraphrase per hotword. The order matters: row i of `similarities`
# belongs to the i-th phrase listed here.
hotword_paraphrases = [
    "Take caution",  # BE CAREFUL
    "Seek to destroy",  # DESTROY
    "Stranger is present",  # STRANGER
]
query = [[instruction_qry, phrase] for phrase in hotword_paraphrases]
query_embeddings = model.encode(query)

# Every fine-tuned transcript, lower-cased, paired with the document instruction.
transcripts_lower = [text.lower() for text in df_dev["generated_text_finetuned"]]
corpus = [[instruction_doc, text] for text in transcripts_lower]
corpus_embeddings = model.encode(corpus)

# Resulting matrix: one row per query, one column per row of `df_dev` (same order).
similarities = cosine_similarity(query_embeddings, corpus_embeddings)

In [None]:
## [EDA] tuning the threshold (and the instructions)
# Minimum cosine similarity at which a transcript counts as a hit for each hotword.
# Key order is significant: the i-th hotword is scored with row i of `similarities`.
hotword_to_threshold = {
    "BE CAREFUL": 0.83,
    "DESTROY": 0.83,
    "STRANGER": 0.85,
}
assert similarities.shape[0] == len(hotword_to_threshold), "expected one similarity row per hotword"

DEBUG = False
# DEBUG = True # TODO disable
TGT_QRY_IDX = 0

# DEBUG only chooses which hotword's threshold neighbourhood is displayed below
# (the target when debugging, otherwise the last hotword). Detections are always
# collected for every hotword, so `df_detecteds` stays aligned with
# `hotword_to_threshold` and later cells never consume a partial result.
shown_qry_idx = TGT_QRY_IDX if DEBUG else len(hotword_to_threshold) - 1

df_detecteds = []
for qry_idx, threshold in enumerate(hotword_to_threshold.values()):
    scores = similarities[qry_idx]
    sorted_indices = np.argsort(scores)  # positions into df_dev, ascending by score
    sorted_scores = scores[sorted_indices]

    # First sorted position whose score is >= threshold; everything from there on is a detection
    above_idx = np.searchsorted(sorted_scores, threshold, side="left")
    df_detecteds.append(df_dev.iloc[sorted_indices[above_idx:]].assign(score=sorted_scores[above_idx:]))

    if qry_idx == shown_qry_idx:
        # Up to 3 documents just below and up to 3 at/above the threshold, for eyeballing the cut-off
        start_idx = max(0, above_idx - 3)
        end_idx = min(len(sorted_scores), above_idx + 3)
        df_debug = df_dev.assign(score=scores)
        df_window = df_debug.iloc[sorted_indices[start_idx:end_idx]]

display(df_window)

In [None]:
# # [debug]
# filenames = ["cv-valid-dev/sample-001791.mp3", "cv-valid-dev/sample-001440.mp3"]
# df_debug.query(f"filename in @filenames")

In [None]:
# # [debug]
# for hotword, df_detected in zip(hotword_to_threshold, df_detecteds):
#     print(f"Detected {hotword}:")
#     display(df_detected)

In [227]:
# Filenames flagged for at least one hotword.
filenames = pd.concat([df_detected[["filename"]] for df_detected in df_detecteds])["filename"].unique()

# Start from the backup copy of the raw CSV so every original row is present in the output.
df_orig = pd.read_csv(pth_valid_dev_raw.with_suffix(".csv.bak"))

# Attach the transcription columns, then compute the flag on the *merged* frame.
# Computing it before the left merge would leave NaN (not False) for rows of
# `df_orig` that are absent from `df_dev`, turning the boolean column into a
# mixed True/False/NaN column.
df_dev_out = df_orig.merge(
    df_dev[["filename", "generated_text", "generated_text_finetuned", "label"]],
    how="left",
    on="filename",
).assign(similarity=lambda df: df["filename"].isin(filenames))

# A left merge only keeps the row count when `filename` is unique in `df_dev`.
assert len(df_dev_out) == len(df_orig), "merge changed the number of rows; check for duplicate filenames"

In [230]:
## Write
# Persist the augmented table to `pth_valid_dev_raw` (overwriting any file at that path), without the index column.
df_dev_out.to_csv(path_or_buf=pth_valid_dev_raw, index=False)