In [3]:
import argparse
import math
import os
import re
import sys
from typing import List, Tuple, Dict, Optional

import numpy as np
import pandas as pd
from tqdm import tqdm
from unidecode import unidecode

# RapidFuzz is preferred over fuzzywuzzy for speed and licensing
from rapidfuzz import fuzz, process
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors


# ---------------------------
# Default data locations
# ---------------------------
# Task 1 inputs (query matching); used as the argparse defaults in parse_args().
DEFAULT_RESOLVED_QUERIES = "data/resolved_queries.csv"
DEFAULT_NEW_QUERIES = "data/new_queries.csv"

# Task 2 inputs (name matching); used as the argparse defaults in parse_args().
DEFAULT_BASE_NAMES = "data/base_names.csv"
DEFAULT_NAME_VARIATIONS = "data/name_variations.csv"


# ---------------------------
# Helpers: Text normalization
# ---------------------------
WS_RE = re.compile(r"\s+")  # any run of whitespace; callers replace it with one space
PUNCT_RE = re.compile(r"[^\w\s]")  # keep alphanumerics and underscore
COMMA_SPLIT_RE = re.compile(r"\s*,\s*")  # a comma plus any whitespace around it

# Tokens dropped by normalize_name(): titles and generational suffixes.
# They are compared against lowercased tokens after punctuation removal.
# NOTE(review): "v" also matches a genuine single-letter initial, e.g.
# "V. S. Naipaul" normalizes to "s naipaul" -- confirm this is intended.
HONORIFICS = {
    "mr", "mrs", "ms", "miss", "dr", "prof", "sir", "madam", "mx",
    "jr", "sr", "ii", "iii", "iv", "v"
}

def normalize_text_generic(text: str) -> str:
    """
    Generic normalization for queries:
    - ASCII fold
    - Lowercase
    - Collapse whitespace
    - Strip

    Missing values normalize to the empty string: both None and a float NaN
    (what pandas produces for an empty CSV cell) return "". Without the NaN
    check, str(nan) would yield the literal text "nan", which would then be
    fuzzy-matched as if it were real content. Any other non-string value is
    converted with str() before normalization.
    """
    if text is None or (isinstance(text, float) and math.isnan(text)):
        return ""
    if not isinstance(text, str):
        text = str(text)
    folded = unidecode(text).lower().strip()
    # Collapse interior whitespace runs (tabs, newlines, repeated spaces).
    return WS_RE.sub(" ", folded)


def normalize_text_alnum(text: str) -> str:
    """
    Generic normalization plus punctuation stripping, for fuzzy scorers.

    Every character that is neither a word character nor whitespace is
    replaced by a space, then whitespace is collapsed and trimmed again.
    """
    depunctuated = PUNCT_RE.sub(" ", normalize_text_generic(text))
    return WS_RE.sub(" ", depunctuated).strip()


def normalize_name(text: str) -> str:
    """
    Name-specific normalization:
    - ASCII fold, lowercase
    - Handle "Last, First Middle" -> "First Middle Last"
    - Remove punctuation (each punctuation character becomes a space)
    - Remove honorifics and suffixes (tokens listed in HONORIFICS)
    - Normalize initials: "J. R. R." -> "j r r"
    - Collapse multiple spaces

    Missing values (None or a float NaN, which pandas produces for empty CSV
    cells) return "" rather than the literal text "nan". Any other
    non-string value is converted with str() first.
    """
    if text is None or (isinstance(text, float) and math.isnan(text)):
        return ""
    if not isinstance(text, str):
        text = str(text)
    t = unidecode(text).lower().strip()

    # If contains a comma, treat as "last, first ..."
    if "," in t:
        parts = [p.strip() for p in COMMA_SPLIT_RE.split(t) if p.strip()]
        if len(parts) >= 2:
            # Move the leading (last-name) segment to the end
            t = " ".join(parts[1:] + [parts[0]])

    # Punctuation becomes spaces, so "j.r.r." splits into separate initials.
    t = PUNCT_RE.sub(" ", t)

    # split() already discards empty tokens and surrounding whitespace, so the
    # joined result needs no further whitespace collapsing.
    tokens = [tok for tok in t.split() if tok not in HONORIFICS]
    return " ".join(tokens)


def sorted_token_form(text: str) -> str:
    """
    Return the whitespace-separated tokens of *text* in sorted order, joined
    by single spaces (a canonical form for token_set/token_sort ratios).
    """
    # str.split() with no separator never yields empty tokens.
    return " ".join(sorted(text.split()))


# ---------------------------
# RapidFuzz matching utilities
# ---------------------------
# Scorer name -> RapidFuzz scoring function. The keys are the values accepted
# for the `method` argument of rapidfuzz_topk().
RAPIDFUZZ_SCORERS = {
    "ratio": fuzz.ratio,
    "partial_ratio": fuzz.partial_ratio,
    "token_sort_ratio": fuzz.token_sort_ratio,
    "token_set_ratio": fuzz.token_set_ratio,
    "WRatio": fuzz.WRatio,
}

def rapidfuzz_topk(
    queries: List[str],
    candidates: List[str],
    method: str,
    topk: int = 3,
    score_cutoff: int = 0,
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Find the best `topk` candidates for every query using one RapidFuzz scorer.

    `method` must be a key of RAPIDFUZZ_SCORERS (a KeyError is raised
    otherwise). A tqdm progress bar is shown while iterating the queries.

    Returns:
      indices: shape (len(queries), topk), positions in `candidates`;
               slots with no match keep the fill value -1
      scores:  shape (len(queries), topk), similarity scores (0..100);
               slots with no match keep the fill value 0.0
    """
    scorer = RAPIDFUZZ_SCORERS[method]
    n_queries = len(queries)
    idx_mat = np.full((n_queries, topk), -1, dtype=int)
    score_mat = np.zeros((n_queries, topk), dtype=float)

    for row, query in enumerate(tqdm(queries, desc=f"RapidFuzz:{method}")):
        hits = process.extract(query, candidates, scorer=scorer, limit=topk, score_cutoff=score_cutoff)
        # Each hit is (candidate string, score, index into candidates).
        for col, (_, hit_score, hit_index) in enumerate(hits):
            idx_mat[row, col] = hit_index
            score_mat[row, col] = hit_score
    return idx_mat, score_mat


def suggest_threshold_from_gaps(top1: np.ndarray, top2: np.ndarray, hard_max: float, gap_margin: float) -> float:
    """
    Suggest an acceptance threshold when no ground truth is available.

    Two NaN-ignoring percentiles are combined:
      - the 95th percentile of `top2` plus `gap_margin`, standing in for the
        strongest runner-up (likely incorrect) scores
      - the 60th percentile of `top1`, standing in for a typical best score
    The larger of the two is taken and then clamped into [0, hard_max].
    An empty `top1` yields 0.0.
    """
    if len(top1) == 0:
        return 0.0
    runner_up_ceiling = np.nanpercentile(top2, 95) + gap_margin
    typical_best = np.nanpercentile(top1, 60)
    suggestion = max(runner_up_ceiling, typical_best)
    capped = min(hard_max, suggestion)
    return float(max(0.0, capped))


def method_recommendation(score_mat: np.ndarray) -> Dict[str, float]:
    """
    Summarize a (n_queries, topk) score matrix for method/threshold selection.

    Returns a dict of NaN-ignoring medians as plain floats:
    - "median_top1": median of the first column
    - "median_top2": median of the second column (zeros if there is none)
    - "median_gap":  median of the per-row difference top1 - top2
    """
    best = score_mat[:, 0]
    if score_mat.shape[1] > 1:
        runner_up = score_mat[:, 1]
    else:
        # Only one column available: compare against an all-zero runner-up.
        runner_up = np.zeros_like(best)

    medians = {
        "median_top1": np.nanmedian(best),
        "median_top2": np.nanmedian(runner_up),
        "median_gap": np.nanmedian(best - runner_up),
    }
    return {label: float(value) for label, value in medians.items()}


# ---------------------------
# TF-IDF + cosine utilities
# ---------------------------
def fit_tfidf_vectorizer(
    corpus_ref: List[str],
    analyzer: str = "char_wb",
    ngram_range: Tuple[int, int] = (3, 5),
    min_df: int = 1,
    max_df: float = 1.0,
) -> TfidfVectorizer:
    """
    Build a TfidfVectorizer with the given settings and fit it on `corpus_ref`.

    Lowercasing is disabled and L2 normalization is enabled; the remaining
    arguments are forwarded to the vectorizer unchanged. Returns the fitted
    vectorizer.
    """
    # lowercase=False: inputs are already lowercased by the normalizers.
    return TfidfVectorizer(
        analyzer=analyzer,
        ngram_range=ngram_range,
        min_df=min_df,
        max_df=max_df,
        lowercase=False,
        norm="l2",
    ).fit(corpus_ref)


def tfidf_topk(
    vec: TfidfVectorizer,
    queries: List[str],
    candidates: List[str],
    topk: int = 3,
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Retrieve the nearest candidates for each query by cosine distance between
    TF-IDF vectors, using a brute-force NearestNeighbors search.

    The number of neighbours is min(topk, number of candidates), so the
    returned arrays can have fewer than `topk` columns.

    Returns:
      indices: shape (n_queries, k), positions in `candidates`
      sims:    shape (n_queries, k), computed as 1 - cosine distance
    """
    cand_vectors = vec.transform(candidates)
    query_vectors = vec.transform(queries)

    k = min(topk, cand_vectors.shape[0])
    searcher = NearestNeighbors(n_neighbors=k, metric="cosine", algorithm="brute").fit(cand_vectors)
    distances, indices = searcher.kneighbors(query_vectors)
    return indices, 1.0 - distances


# ---------------------------
# Data loading helpers
# ---------------------------
def load_csv_guess_text_column(path: str, preferred_cols: List[str]) -> Tuple[pd.DataFrame, str]:
    """
    Load a CSV and guess which column holds the text to match.

    Selection order:
      1. The first name in `preferred_cols` that matches a column,
         compared case-insensitively; the column's real label is returned.
      2. Otherwise the first text-like column. Both the classic `object`
         dtype and pandas' dedicated string dtypes are recognized, so the
         fallback still works when pandas does not store text as `object`.
      3. Otherwise the first column, returned as-is (no cast is applied).

    Returns: (df, chosen_column_name)
    """
    df = pd.read_csv(path)

    # str() keeps the lookup safe for non-string column labels.
    by_lower = {str(c).lower(): c for c in df.columns}
    for wanted in preferred_cols:
        match = by_lower.get(wanted.lower())
        if match is not None:
            return df, match

    # Fallback to the first string-like column.
    for c in df.columns:
        if pd.api.types.is_object_dtype(df[c]) or pd.api.types.is_string_dtype(df[c]):
            return df, c

    # Nothing looks like text: fall back to the first column.
    return df, df.columns[0]


def ensure_id_column(df: pd.DataFrame, base_name: str = "id") -> pd.DataFrame:
    """
    Ensure a column named exactly `base_name` exists for output.

    - If `base_name` is already a column, `df` is returned unchanged.
    - If a column matches only case-insensitively (e.g. "ID"), its values are
      copied into a new `base_name` column, so callers that read
      df[base_name] do not fail with KeyError.
    - Otherwise a sequential 0..len(df)-1 column is created.

    The input frame is never mutated; a copy is returned whenever a column
    has to be added.
    """
    if base_name in df.columns:
        return df

    df = df.copy()
    wanted = base_name.lower()
    # str() keeps the comparison safe for non-string column labels.
    existing = next((c for c in df.columns if str(c).lower() == wanted), None)
    if existing is not None:
        df[base_name] = df[existing]
    else:
        df[base_name] = np.arange(len(df))
    return df


# ---------------------------
# Main pipelines
# ---------------------------
def task1_match_queries(
    resolved_csv: str,
    new_csv: str,
    out_dir: str,
    topk: int = 3,
) -> None:
    """
    Task 1: match each new query against the resolved queries.

    Steps:
      1. Load both CSVs, pick the text column, and make sure an id column
         is available.
      2. Normalize both text columns with normalize_text_alnum.
      3. For each of five RapidFuzz scorers, compute top-k matches and write
         them to `task1_fuzzy_<method>.csv`.
      4. Pick the scorer with the largest median top1-top2 gap, derive a
         heuristic threshold, and write `task1_fuzzy_<method>_accepted.csv`.
      5. Repeat the matching with char_wb (3,5) TF-IDF + cosine similarity,
         writing `task1_tfidf_char.csv` and `task1_tfidf_char_accepted.csv`.

    All files are written under `out_dir` (created if missing); progress and
    recommendations are printed to stdout. Returns nothing.
    """
    os.makedirs(out_dir, exist_ok=True)
    print("\n=== Task 1: Matching new queries to resolved queries ===")

    # Load data
    resolved_df, resolved_col = load_csv_guess_text_column(resolved_csv, preferred_cols=["query", "resolved_query", "text", "question"])
    new_df, new_col = load_csv_guess_text_column(new_csv, preferred_cols=["query", "new_query", "text", "question"])
    resolved_df = ensure_id_column(resolved_df, base_name="id")
    new_df = ensure_id_column(new_df, base_name="id")

    # Normalize text
    resolved_df["_norm"] = resolved_df[resolved_col].map(normalize_text_alnum)
    new_df["_norm"] = new_df[new_col].map(normalize_text_alnum)

    # normalize_text_alnum always returns a str, so fillna("") is only a safeguard.
    resolved_texts = resolved_df["_norm"].fillna("").tolist()
    new_texts = new_df["_norm"].fillna("").tolist()

    # A. RapidFuzz with multiple scorers
    rf_results = {}
    rf_stats = {}
    for method in ["token_set_ratio", "token_sort_ratio", "partial_ratio", "ratio", "WRatio"]:
        idx_mat, score_mat = rapidfuzz_topk(new_texts, resolved_texts, method=method, topk=topk)
        rf_results[method] = (idx_mat, score_mat)
        rf_stats[method] = method_recommendation(score_mat)

        # Save detailed CSV
        rows = []
        for i in range(len(new_df)):
            for j in range(topk):
                ridx = idx_mat[i, j]
                score = score_mat[i, j]
                # -1 marks a rank slot for which rapidfuzz_topk found no match.
                if ridx < 0:
                    continue
                rows.append({
                    "new_id": new_df.iloc[i]["id"],
                    "new_query": new_df.iloc[i][new_col],
                    "resolved_id": resolved_df.iloc[ridx]["id"],
                    "resolved_query": resolved_df.iloc[ridx][resolved_col],
                    "method": method,
                    "rank": j + 1,
                    "score": score,
                })
        out_path = os.path.join(out_dir, f"task1_fuzzy_{method}.csv")
        pd.DataFrame(rows).to_csv(out_path, index=False)
        print(f"- Saved RapidFuzz matches ({method}) -> {out_path}")

    # Suggest best method based on median confidence gap
    method_scores = {m: rf_stats[m]["median_gap"] for m in rf_stats}
    # max() returns the first key on ties, i.e. the earliest method in the loop order above.
    best_rf_method = max(method_scores, key=method_scores.get)
    best_idx, best_score = rf_results[best_rf_method]
    best_top1 = best_score[:, 0]
    best_top2 = best_score[:, 1] if best_score.shape[1] > 1 else np.zeros_like(best_top1)
    rf_threshold = suggest_threshold_from_gaps(best_top1, best_top2, hard_max=100.0, gap_margin=5.0)

    print("\nRapidFuzz method statistics (Task 1):")
    for m in rf_stats:
        s = rf_stats[m]
        print(f"  {m:16s} median_top1={s['median_top1']:.1f} median_top2={s['median_top2']:.1f} median_gap={s['median_gap']:.1f}")
    print(f"=> Recommended RapidFuzz method: {best_rf_method} with threshold ~= {rf_threshold:.1f} (0-100 scale)")

    # Produce a consolidated "accepted" CSV for the recommended RF method
    # One row per new query: its top-1 match plus whether it clears the threshold.
    accepted_rows = []
    for i in range(len(new_df)):
        ridx = best_idx[i, 0]
        s1 = best_score[i, 0]
        s2 = best_score[i, 1] if best_score.shape[1] > 1 else 0.0
        accepted_rows.append({
            "new_id": new_df.iloc[i]["id"],
            "new_query": new_df.iloc[i][new_col],
            "resolved_id": resolved_df.iloc[ridx]["id"] if ridx >= 0 else None,
            "resolved_query": resolved_df.iloc[ridx][resolved_col] if ridx >= 0 else None,
            "method": best_rf_method,
            "score_top1": s1,
            "score_top2": s2,
            "confidence_gap": s1 - s2,
            "accepted_match": bool(s1 >= rf_threshold),
        })
    rf_accept_path = os.path.join(out_dir, f"task1_fuzzy_{best_rf_method}_accepted.csv")
    pd.DataFrame(accepted_rows).to_csv(rf_accept_path, index=False)
    print(f"- Saved recommended RF accepted matches -> {rf_accept_path}")

    # B. TF-IDF + cosine (char n-grams are robust for short queries)
    # The vectorizer is fitted on the resolved queries only; new queries are just transformed.
    vec = fit_tfidf_vectorizer(
        corpus_ref=resolved_texts,
        analyzer="char_wb",
        ngram_range=(3, 5),
        min_df=1,
        max_df=1.0,
    )
    tf_idx, tf_sims = tfidf_topk(vec, new_texts, resolved_texts, topk=topk)
    # Save detailed CSV
    rows = []
    for i in range(len(new_df)):
        # tfidf_topk may return fewer than topk columns, so iterate its actual width.
        for j in range(tf_idx.shape[1]):
            ridx = tf_idx[i, j]
            sim = tf_sims[i, j]
            rows.append({
                "new_id": new_df.iloc[i]["id"],
                "new_query": new_df.iloc[i][new_col],
                "resolved_id": resolved_df.iloc[ridx]["id"],
                "resolved_query": resolved_df.iloc[ridx][resolved_col],
                "method": "tfidf_char_wb_3_5",
                "rank": j + 1,
                "cosine_similarity": sim,
            })
    tfidf_out = os.path.join(out_dir, "task1_tfidf_char.csv")
    pd.DataFrame(rows).to_csv(tfidf_out, index=False)
    print(f"- Saved TF-IDF matches -> {tfidf_out}")

    # Threshold suggestion for TF-IDF
    tf_top1 = tf_sims[:, 0]
    tf_top2 = tf_sims[:, 1] if tf_sims.shape[1] > 1 else np.zeros_like(tf_top1)
    tf_threshold = suggest_threshold_from_gaps(tf_top1, tf_top2, hard_max=1.0, gap_margin=0.05)

    tf_accepted_rows = []
    for i in range(len(new_df)):
        ridx = tf_idx[i, 0]
        s1 = tf_sims[i, 0]
        s2 = tf_sims[i, 1] if tf_sims.shape[1] > 1 else 0.0
        tf_accepted_rows.append({
            "new_id": new_df.iloc[i]["id"],
            "new_query": new_df.iloc[i][new_col],
            "resolved_id": resolved_df.iloc[ridx]["id"],
            "resolved_query": resolved_df.iloc[ridx][resolved_col],
            "method": "tfidf_char_wb_3_5",
            "cosine_top1": s1,
            "cosine_top2": s2,
            "confidence_gap": s1 - s2,
            "accepted_match": bool(s1 >= tf_threshold),
        })
    tf_accept_path = os.path.join(out_dir, "task1_tfidf_char_accepted.csv")
    pd.DataFrame(tf_accepted_rows).to_csv(tf_accept_path, index=False)
    print(f"=> Recommended TF-IDF cosine threshold ~= {tf_threshold:.2f} (0-1 scale)")
    print(f"- Saved recommended TF-IDF accepted matches -> {tf_accept_path}")

    print("\nNotes:")
    print("- For query matching, token_set_ratio is often robust to reordering and extra tokens.")
    print("- Char-level TF-IDF (3-5) tends to work well for short queries with typos/variants.")
    print("- The suggested thresholds are heuristic; adjust after spot-checking some pairs.")


def task2_match_names(
    base_names_csv: str,
    name_variations_csv: str,
    out_dir: str,
    topk: int = 3,
) -> None:
    """
    Task 2: match each name variation against the base names.

    Steps:
      1. Load both CSVs, pick the name column, and make sure an id column
         is available.
      2. Normalize names with normalize_name and also build a token-sorted
         form with sorted_token_form.
      3. For each of five RapidFuzz scorers, compute top-k matches on the
         token-sorted forms and write them to `task2_fuzzy_<method>.csv`.
      4. Pick the scorer with the largest median top1-top2 gap, derive a
         heuristic threshold, and write `task2_fuzzy_<method>_accepted.csv`.
      5. Repeat the matching with char_wb (2,4) TF-IDF + cosine similarity on
         the unsorted normalized names, writing `task2_tfidf_char.csv` and
         `task2_tfidf_char_accepted.csv`.

    All files are written under `out_dir` (created if missing); progress and
    recommendations are printed to stdout. Returns nothing.
    """
    os.makedirs(out_dir, exist_ok=True)
    print("\n=== Task 2: Matching names with variations ===")

    # Load data
    base_df, base_col = load_csv_guess_text_column(base_names_csv, preferred_cols=["name", "full_name"])
    var_df, var_col = load_csv_guess_text_column(name_variations_csv, preferred_cols=["name", "full_name"])
    base_df = ensure_id_column(base_df, base_name="id")
    var_df = ensure_id_column(var_df, base_name="id")

    # Normalize names
    base_df["_norm"] = base_df[base_col].map(normalize_name)
    var_df["_norm"] = var_df[var_col].map(normalize_name)
    # Also prepare token-sorted form to help token_sort/set ratios
    base_df["_norm_sorted"] = base_df["_norm"].map(sorted_token_form)
    var_df["_norm_sorted"] = var_df["_norm"].map(sorted_token_form)

    # normalize_name and sorted_token_form always return a str, so fillna("") is only a safeguard.
    base_norm = base_df["_norm"].fillna("").tolist()
    var_norm = var_df["_norm"].fillna("").tolist()
    base_norm_sorted = base_df["_norm_sorted"].fillna("").tolist()
    var_norm_sorted = var_df["_norm_sorted"].fillna("").tolist()

    # A. RapidFuzz on normalized and token-sorted forms
    rf_results = {}
    rf_stats = {}

    # For names, token_set_ratio often performs best due to order invariance and duplicate handling
    # We'll compute scores on the sorted token form to stabilize ratios
    candidates = base_norm_sorted
    queries = var_norm_sorted

    for method in ["token_set_ratio", "token_sort_ratio", "partial_ratio", "ratio", "WRatio"]:
        idx_mat, score_mat = rapidfuzz_topk(queries, candidates, method=method, topk=topk)
        rf_results[method] = (idx_mat, score_mat)
        rf_stats[method] = method_recommendation(score_mat)

        # Save detailed CSV
        rows = []
        for i in range(len(var_df)):
            for j in range(topk):
                ridx = idx_mat[i, j]
                score = score_mat[i, j]
                # -1 marks a rank slot for which rapidfuzz_topk found no match.
                if ridx < 0:
                    continue
                rows.append({
                    "variant_id": var_df.iloc[i]["id"],
                    "variant_name": var_df.iloc[i][var_col],
                    "base_id": base_df.iloc[ridx]["id"],
                    "base_name": base_df.iloc[ridx][base_col],
                    "method": method,
                    "rank": j + 1,
                    "score": score,
                })
        out_path = os.path.join(out_dir, f"task2_fuzzy_{method}.csv")
        pd.DataFrame(rows).to_csv(out_path, index=False)
        print(f"- Saved RapidFuzz name matches ({method}) -> {out_path}")

    # Recommend the scorer with the largest median top1-top2 gap.
    method_scores = {m: rf_stats[m]["median_gap"] for m in rf_stats}
    # max() returns the first key on ties, i.e. the earliest method in the loop order above.
    best_rf_method = max(method_scores, key=method_scores.get)
    best_idx, best_score = rf_results[best_rf_method]
    best_top1 = best_score[:, 0]
    best_top2 = best_score[:, 1] if best_score.shape[1] > 1 else np.zeros_like(best_top1)
    rf_threshold = suggest_threshold_from_gaps(best_top1, best_top2, hard_max=100.0, gap_margin=5.0)

    print("\nRapidFuzz method statistics (Task 2):")
    for m in rf_stats:
        s = rf_stats[m]
        print(f"  {m:16s} median_top1={s['median_top1']:.1f} median_top2={s['median_top2']:.1f} median_gap={s['median_gap']:.1f}")
    print(f"=> Recommended RapidFuzz method (names): {best_rf_method} with threshold ~= {rf_threshold:.1f} (0-100 scale)")

    # Consolidated accepted CSV for recommended method
    # One row per variation: its top-1 match plus whether it clears the threshold.
    accepted_rows = []
    for i in range(len(var_df)):
        ridx = best_idx[i, 0]
        s1 = best_score[i, 0]
        s2 = best_score[i, 1] if best_score.shape[1] > 1 else 0.0
        accepted_rows.append({
            "variant_id": var_df.iloc[i]["id"],
            "variant_name": var_df.iloc[i][var_col],
            "base_id": base_df.iloc[ridx]["id"] if ridx >= 0 else None,
            "base_name": base_df.iloc[ridx][base_col] if ridx >= 0 else None,
            "method": best_rf_method,
            "score_top1": s1,
            "score_top2": s2,
            "confidence_gap": s1 - s2,
            "accepted_match": bool(s1 >= rf_threshold),
        })
    rf_accept_path = os.path.join(out_dir, f"task2_fuzzy_{best_rf_method}_accepted.csv")
    pd.DataFrame(accepted_rows).to_csv(rf_accept_path, index=False)
    print(f"- Saved recommended RF accepted name matches -> {rf_accept_path}")

    # B. TF-IDF for names: char n-grams (2,4) capture short tokens and initials
    # The vectorizer is fitted on the base names only; variations are just transformed.
    vec = fit_tfidf_vectorizer(
        corpus_ref=base_norm,
        analyzer="char_wb",
        ngram_range=(2, 4),
        min_df=1,
        max_df=1.0,
    )
    tf_idx, tf_sims = tfidf_topk(vec, var_norm, base_norm, topk=topk)

    rows = []
    for i in range(len(var_df)):
        # tfidf_topk may return fewer than topk columns, so iterate its actual width.
        for j in range(tf_idx.shape[1]):
            ridx = tf_idx[i, j]
            sim = tf_sims[i, j]
            rows.append({
                "variant_id": var_df.iloc[i]["id"],
                "variant_name": var_df.iloc[i][var_col],
                "base_id": base_df.iloc[ridx]["id"],
                "base_name": base_df.iloc[ridx][base_col],
                "method": "tfidf_char_wb_2_4",
                "rank": j + 1,
                "cosine_similarity": sim,
            })
    tfidf_out = os.path.join(out_dir, "task2_tfidf_char.csv")
    pd.DataFrame(rows).to_csv(tfidf_out, index=False)
    print(f"- Saved TF-IDF name matches -> {tfidf_out}")

    # Threshold suggestion for TF-IDF (cosine scale, hence hard_max=1.0 and a 0.05 margin)
    tf_top1 = tf_sims[:, 0]
    tf_top2 = tf_sims[:, 1] if tf_sims.shape[1] > 1 else np.zeros_like(tf_top1)
    tf_threshold = suggest_threshold_from_gaps(tf_top1, tf_top2, hard_max=1.0, gap_margin=0.05)

    tf_accepted_rows = []
    for i in range(len(var_df)):
        ridx = tf_idx[i, 0]
        s1 = tf_sims[i, 0]
        s2 = tf_sims[i, 1] if tf_sims.shape[1] > 1 else 0.0
        tf_accepted_rows.append({
            "variant_id": var_df.iloc[i]["id"],
            "variant_name": var_df.iloc[i][var_col],
            "base_id": base_df.iloc[ridx]["id"],
            "base_name": base_df.iloc[ridx][base_col],
            "method": "tfidf_char_wb_2_4",
            "cosine_top1": s1,
            "cosine_top2": s2,
            "confidence_gap": s1 - s2,
            "accepted_match": bool(s1 >= tf_threshold),
        })
    tf_accept_path = os.path.join(out_dir, "task2_tfidf_char_accepted.csv")
    pd.DataFrame(tf_accepted_rows).to_csv(tf_accept_path, index=False)
    print(f"=> Recommended TF-IDF cosine threshold (names) ~= {tf_threshold:.2f} (0-1 scale)")
    print(f"- Saved recommended TF-IDF accepted name matches -> {tf_accept_path}")

    print("\nNotes:")
    print("- For names, token_set_ratio on token-sorted normalized names is typically best.")
    print("- Char TF-IDF with (2,4) n-grams works well for initials and short tokens.")
    print("- Consider post-filters: same first letter, matching last token, or length ratio within 0.6-1.6 for higher precision.")


def parse_args() -> argparse.Namespace:
    """
    Parse the command-line options for both matching tasks.

    Unrecognized arguments are ignored rather than rejected, because
    parse_known_args() is used and only the recognized namespace is returned.
    """
    parser = argparse.ArgumentParser(description="Fuzzy and TF-IDF matching for queries and names")

    # (flag, type, default, help) in registration order.
    option_specs = [
        # Task 1
        ("--resolved_csv", str, DEFAULT_RESOLVED_QUERIES, "Resolved queries CSV (path or URL)"),
        ("--new_csv", str, DEFAULT_NEW_QUERIES, "New queries CSV (path or URL)"),
        # Task 2
        ("--base_names_csv", str, DEFAULT_BASE_NAMES, "Base names CSV (path or URL)"),
        ("--name_variations_csv", str, DEFAULT_NAME_VARIATIONS, "Name variations CSV (path or URL)"),
        # General
        ("--output_dir", str, "outputs_matching", "Directory to save outputs"),
        ("--topk", int, 3, "Top-k matches to save per item"),
    ]
    for flag, value_type, default, help_text in option_specs:
        parser.add_argument(flag, type=value_type, default=default, help=help_text)

    recognized, _ignored = parser.parse_known_args()
    return recognized


def main():
    """
    Entry point: parse options, echo the configuration, then run Task 1
    (query matching) and Task 2 (name matching), each writing into its own
    subdirectory of the output directory.
    """
    args = parse_args()

    print("Config:")
    print(f"- Resolved CSV: {args.resolved_csv}")
    print(f"- New CSV:      {args.new_csv}")
    print(f"- Base names:   {args.base_names_csv}")
    print(f"- Variations:   {args.name_variations_csv}")
    print(f"- Output dir:   {args.output_dir}")
    print(f"- Top-k:        {args.topk}")

    # Task 1
    task1_match_queries(
        resolved_csv=args.resolved_csv,
        new_csv=args.new_csv,
        out_dir=os.path.join(args.output_dir, "task1"),
        topk=args.topk,
    )

    # Task 2
    task2_match_names(
        base_names_csv=args.base_names_csv,
        name_variations_csv=args.name_variations_csv,
        out_dir=os.path.join(args.output_dir, "task2"),
        topk=args.topk,
    )

    print("\nDone.")

# Run both matching tasks only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

Config:
- Resolved CSV: data/resolved_queries.csv
- New CSV:      data/new_queries.csv
- Base names:   data/base_names.csv
- Variations:   data/name_variations.csv
- Output dir:   outputs_matching
- Top-k:        3

=== Task 1: Matching new queries to resolved queries ===


RapidFuzz:token_set_ratio: 100%|██████████| 20/20 [00:00<00:00, 6605.20it/s]


- Saved RapidFuzz matches (token_set_ratio) -> outputs_matching/task1/task1_fuzzy_token_set_ratio.csv


RapidFuzz:token_sort_ratio: 100%|██████████| 20/20 [00:00<00:00, 41262.21it/s]


- Saved RapidFuzz matches (token_sort_ratio) -> outputs_matching/task1/task1_fuzzy_token_sort_ratio.csv


RapidFuzz:partial_ratio: 100%|██████████| 20/20 [00:00<00:00, 14508.14it/s]


- Saved RapidFuzz matches (partial_ratio) -> outputs_matching/task1/task1_fuzzy_partial_ratio.csv


RapidFuzz:ratio: 100%|██████████| 20/20 [00:00<00:00, 128266.18it/s]


- Saved RapidFuzz matches (ratio) -> outputs_matching/task1/task1_fuzzy_ratio.csv


RapidFuzz:WRatio: 100%|██████████| 20/20 [00:00<00:00, 33447.40it/s]


- Saved RapidFuzz matches (WRatio) -> outputs_matching/task1/task1_fuzzy_WRatio.csv

RapidFuzz method statistics (Task 1):
  token_set_ratio  median_top1=75.2 median_top2=41.7 median_gap=33.9
  token_sort_ratio median_top1=70.5 median_top2=40.6 median_gap=29.4
  partial_ratio    median_top1=66.7 median_top2=46.4 median_gap=20.3
  ratio            median_top1=70.9 median_top2=41.1 median_gap=30.2
  WRatio           median_top1=76.9 median_top2=42.5 median_gap=30.7
=> Recommended RapidFuzz method: token_set_ratio with threshold ~= 84.1 (0-100 scale)
- Saved recommended RF accepted matches -> outputs_matching/task1/task1_fuzzy_token_set_ratio_accepted.csv
- Saved TF-IDF matches -> outputs_matching/task1/task1_tfidf_char.csv
=> Recommended TF-IDF cosine threshold ~= 0.81 (0-1 scale)
- Saved recommended TF-IDF accepted matches -> outputs_matching/task1/task1_tfidf_char_accepted.csv

Notes:
- For query matching, token_set_ratio is often robust to reordering and extra tokens.
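- Char-level TF-IDF (3-5) tends to work well for short queries with typos/variants.
- The suggested thresholds are heuristic; adjust after spot-checking some pairs.

=== Task 2: Matching names with variations ===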

RapidFuzz:token_set_ratio: 100%|██████████| 100/100 [00:00<00:00, 68111.46it/s]


- Saved RapidFuzz name matches (token_set_ratio) -> outputs_matching/task2/task2_fuzzy_token_set_ratio.csv


RapidFuzz:token_sort_ratio: 100%|██████████| 100/100 [00:00<00:00, 209296.61it/s]


- Saved RapidFuzz name matches (token_sort_ratio) -> outputs_matching/task2/task2_fuzzy_token_sort_ratio.csv


RapidFuzz:partial_ratio: 100%|██████████| 100/100 [00:00<00:00, 74778.11it/s]


- Saved RapidFuzz name matches (partial_ratio) -> outputs_matching/task2/task2_fuzzy_partial_ratio.csv


RapidFuzz:ratio: 100%|██████████| 100/100 [00:00<00:00, 350694.31it/s]


- Saved RapidFuzz name matches (ratio) -> outputs_matching/task2/task2_fuzzy_ratio.csv


RapidFuzz:WRatio: 100%|██████████| 100/100 [00:00<00:00, 49246.26it/s]


- Saved RapidFuzz name matches (WRatio) -> outputs_matching/task2/task2_fuzzy_WRatio.csv

RapidFuzz method statistics (Task 2):
  token_set_ratio  median_top1=100.0 median_top2=45.5 median_gap=53.8
  token_sort_ratio median_top1=100.0 median_top2=45.5 median_gap=53.8
  partial_ratio    median_top1=100.0 median_top2=53.0 median_gap=41.2
  ratio            median_top1=100.0 median_top2=45.5 median_gap=53.8
  WRatio           median_top1=100.0 median_top2=45.5 median_gap=52.4
=> Recommended RapidFuzz method (names): token_set_ratio with threshold ~= 100.0 (0-100 scale)
- Saved recommended RF accepted name matches -> outputs_matching/task2/task2_fuzzy_token_set_ratio_accepted.csv
- Saved TF-IDF name matches -> outputs_matching/task2/task2_tfidf_char.csv
=> Recommended TF-IDF cosine threshold (names) ~= 1.00 (0-1 scale)
- Saved recommended TF-IDF accepted name matches -> outputs_matching/task2/task2_tfidf_char_accepted.csv

Notes:
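- For names, token_set_ratio on token-sorted normalized names is typically best.
- Char TF-IDF with (2,4) n-grams works well for initials and short tokens.
- Consider post-filters: same first letter, matching last token, or length ratio within 0.6-1.6 for higher precision.

Done.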