# Keyword Frequency Threshold Analysis & Visualization

This notebook analyzes and visualizes optimal frequency thresholds for keyword-based category prediction. It helps determine which minimum frequency cutoff maximizes classification accuracy while reducing noise.

## Analysis
- Loads scored transactions and calculates performance metrics across thresholds
- Generates multiple visualization types (line graphs, bar charts, etc.)
- Tests different minimum frequency requirements for keywords
- Reports true positives (TP), false positives (FP), false negatives (FN), and true negatives (TN) for each threshold
- Identifies optimal threshold for balanced performance

In [1]:
import pandas as pd
import numpy as np
import re
from collections import Counter


In [2]:
# Load the debit transaction sheet; later cells read its "Narration" and
# "Category" columns.
all_df = pd.read_excel(io="debit_txn_v5.xlsx")

In [3]:
def normalize(text):
    """Return a lower-cased, punctuation-free, whitespace-collapsed string.

    Every character other than ``a-z``, ``0-9`` and space is replaced by a
    space, runs of whitespace are collapsed to one space, and the result is
    stripped.

    Parameters
    ----------
    text : Any
        Raw value; non-strings are converted with ``str()``.

    Returns
    -------
    str
        The normalised text. Missing values (``None``, NaN, ``pd.NA``) yield
        ``""`` instead of the literal word ``"nan"``/``"none"``, so a missing
        narration never produces a spurious token.
    """
    # str(np.nan) == "nan" would otherwise survive as a real token and make
    # any later ``.fillna("")`` on the normalised column a no-op.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return ""
    text = str(text).lower()
    text = re.sub(r"[^a-z0-9 ]", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

In [4]:
# Normalised narration text, computed element-wise with normalize().
all_df["narr_norm"] = all_df["Narration"].map(normalize)

In [5]:
# Load the keyword table; later cells read its "keyword", "category",
# "score" and "freq" columns.
kw_df = pd.read_excel(io="keyword_freq_by_category_debit_v5.xlsx")

In [6]:
# Trim and lower-case keywords so they can be compared with narration tokens.
kw_df["keyword"] = kw_df["keyword"].str.strip().str.lower()

In [7]:
# Squared keyword score; summed per category and square-rooted further down.
kw_df["score_sq"] = kw_df["score"].pow(2)
# Disabled alternative that zeroes out scores below 10:
# kw_df['score_sq'] = np.where(kw_df['score'] < 10, 0, kw_df['score'] ** 2)

In [8]:
# (keyword, category) -> squared score. When a pair occurs more than once the
# last row wins.
kw_lookup = dict(
    zip(
        zip(kw_df["keyword"], kw_df["category"]),
        kw_df["score_sq"],
    )
)


In [9]:
# Distinct category labels, in order of first appearance in the keyword table.
categories = pd.unique(kw_df["category"])

In [10]:
# Whitespace-split token list for every narration (missing text -> empty list).
all_df = all_df.assign(
    tokens=lambda frame: frame["narr_norm"].fillna("").str.lower().str.split()
)


In [11]:
# One score column per category: the sum of squared keyword scores over every
# token occurrence in the narration (tokens without an entry contribute 0).
for cat in categories:
    cat_scores = [
        sum(kw_lookup.get((tok, cat), 0) for tok in tok_list)
        for tok_list in all_df["tokens"]
    ]
    all_df[cat] = cat_scores


In [12]:
# The token lists are only needed for scoring; remove the helper column.
all_df = all_df.drop(columns=["tokens"])


In [13]:
# Turn each summed squared score into its square root (missing -> 0).
# Note: running this cell twice applies the square root twice.
all_df[categories] = np.sqrt(all_df[categories].fillna(0))


In [14]:
# Highest-scoring category per row; rows whose best score is not positive
# are left as NaN (no prediction).
category_scores = all_df[categories]
has_match = category_scores.max(axis=1) > 0
all_df["predicted_category"] = category_scores.idxmax(axis=1).where(has_match)


In [15]:
# Disabled: export of the scored frame to Excel.
# NOTE(review): the target file name says "credit" while this notebook loads
# "debit_txn_v5.xlsx" -- confirm the intended name before re-enabling.
# all_df.to_excel(
#    "credit_category_score_v5.xlsx",
#     index=False
# )


In [16]:
# Inspect the distinct ground-truth labels (NaN marks unlabelled rows).
pd.unique(all_df["Category"])

array(['Personal Expense', nan, 'Business Expense - Food Expense',
       'Transfer to Demat',
       'Business Expense - Travelling and Conveyance',
       'Internal Bank Transfers',
       'Business Expense - Printing and Stationary', 'Medical Expenses',
       'Business Expense - Telephone and Internet Charges',
       'Investment in Gold and Silver', 'Business Expense - Bank Charges',
       'Purchase of Mutual Fund', 'TDS Paid', 'Cash Withdrawal',
       'Rent Payments', 'Business Expense - Electricity Expense',
       'Income Tax Paid', 'EMIs paid', 'Investment in Recurring Deposit',
       'Investment in Fixed Deposit', 'Credit Card Payment',
       'Business Expense - Labour Charges',
       'Business Expense - Promotion',
       'Business Expense - Advertisement and Marketing',
       'Business Expense - Salary Paid',
       'Business Expense - Transportation Charges', 'Investment in PPF',
       'Business Expense - Bank/Loan Interest', 'Municipal Tax Paid',
       'Investment

In [17]:
# Disabled: per-category confusion counts (TP / FN / FP / TN) comparing
# all_df["Category"] with all_df["predicted_category"].
# NOTE(review): labels are compared as-is here (no strip/lower), and rows
# whose predicted_category is NaN count as "!= ref_category".
# for ref_category in categories:

#     TP = (
#         (all_df["Category"] == ref_category) &
#         (all_df["predicted_category"] == ref_category)
#     ).sum()

#     FN = (
#         (all_df["Category"] == ref_category) &
#         (all_df["predicted_category"] != ref_category)
#     ).sum()

#     FP = (
#         (all_df["Category"] != ref_category) &
#         (all_df["predicted_category"] == ref_category)
#     ).sum()

#     TN = (
#         (all_df["Category"] != ref_category) &
#         (all_df["predicted_category"] != ref_category)
#     ).sum()

#     print("Category:", ref_category)
#     print("TP:", TP)
#     print("FN:", FN)
#     print("FP:", FP)
#     print("TN:", TN)
#     print("-" * 40)


In [19]:
# Number of transactions whose ground-truth label (trimmed, lower-cased)
# matches one of the keyword-table categories; used as the accuracy
# denominator.
known_labels = [label.strip().lower() for label in categories]
labels_norm = all_df["Category"].str.strip().str.lower()
count = labels_norm.isin(known_labels).sum()

print(count)

N_VALID = count

113484


In [20]:
def accuracy_for_freq(freq_threshold):
    """Accuracy (%) of keyword-based prediction using keywords with freq >= threshold.

    Keywords from the global ``kw_df`` whose ``freq`` is at least
    ``freq_threshold`` are kept. Each narration in the global ``all_df`` is
    split into a set of tokens; for every category the squared scores of the
    keywords present in that set are summed, and the highest-scoring category
    becomes the prediction (ties go to the first category in sorted order).

    Rows with no keyword hit at all receive **no** prediction. Previously
    ``idxmax`` silently assigned them the first category, which counted as a
    correct prediction whenever that happened to be the true label.

    Parameters
    ----------
    freq_threshold : number
        Minimum ``freq`` a keyword needs in order to be used.

    Returns
    -------
    float
        ``correct * 100 / N_VALID`` where ``N_VALID`` is the global
        denominator, or ``np.nan`` when no keyword passes the threshold or
        ``N_VALID`` is zero.

    Side effects
    ------------
    Overwrites ``all_df["_pred"]`` with the lower-cased predicted category
    (``None`` for rows without any keyword hit).
    """
    # --- filter kw_df ---
    kw_filt = kw_df[kw_df["freq"] >= freq_threshold]
    if kw_filt.empty:
        return np.nan

    # --- build (category -> keyword -> score²) ---
    # Categories that differ only by case/whitespace are merged instead of
    # the later one replacing the earlier one.
    kw_map = {}
    for cat, grp in kw_filt.groupby("category"):
        kw_map.setdefault(cat.strip().lower(), {}).update(
            zip(grp["keyword"].str.lower(), grp["score"] ** 2)
        )
    cat_names = list(kw_map)

    # --- inverted index: token -> [(category column, score²), ...] ---
    # Lets each row be scored by looking up its own tokens rather than
    # scanning every keyword of every category.
    token_hits = {}
    for col, cat in enumerate(cat_names):
        for kw, sc in kw_map[cat].items():
            token_hits.setdefault(kw, []).append((col, sc))

    # --- compute scores ---
    narrations = all_df["narr_norm"].fillna("").str.lower()
    n_rows = len(narrations)
    score_mat = np.zeros((n_rows, len(cat_names)), dtype=float)
    for row, narr in enumerate(narrations):
        for tok in set(narr.split()):
            for col, sc in token_hits.get(tok, ()):
                score_mat[row, col] += sc
    # Missing scores count as 0.
    score_mat[np.isnan(score_mat)] = 0.0

    # --- predicted category (only where at least one keyword matched) ---
    pred = np.full(n_rows, None, dtype=object)
    if n_rows:
        best_col = score_mat.argmax(axis=1)  # first maximum wins on ties
        matched = score_mat.max(axis=1) > 0
        pred[matched] = np.asarray(cat_names, dtype=object)[best_col[matched]]

    # Positional assignment (plain array), so a non-default index on all_df
    # cannot misalign predictions with rows.
    all_df["_pred"] = pred

    # --- correctness check (case-insensitive) ---
    actual = all_df["Category"].str.strip().str.lower()
    correct = int((all_df["_pred"] == actual).sum())

    if not N_VALID:
        return np.nan
    return (correct * 100) / N_VALID


In [None]:
# Candidate thresholds: every distinct keyword frequency below 200, ascending.
below_cap = kw_df["freq"] < 200
freq_thresholds = sorted(kw_df.loc[below_cap, "freq"].unique())
n_thresholds = len(freq_thresholds)

# Evaluate accuracy at each threshold, logging progress as we go.
results = []
for step, threshold in enumerate(freq_thresholds, start=1):
    print(f"\n[{step}/{n_thresholds}] Processing freq_threshold = {threshold}")

    threshold_accuracy = accuracy_for_freq(threshold)
    print(f"    → accuracy returned: {threshold_accuracy}")

    results.append({"freq_threshold": threshold, "accuracy": threshold_accuracy})

    # Progress marker every fifth threshold.
    if step % 5 == 0:
        print("    ✓ checkpoint reached")

result_df = pd.DataFrame(results)
print("\n✅ result_df created successfully")
print(result_df.head())




[1/186] Processing freq_threshold = 11


In [None]:
import matplotlib.pyplot as plt

# Line chart of accuracy against the minimum keyword frequency threshold.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(result_df["freq_threshold"], result_df["accuracy"], marker="o")

ax.set_xlabel("Minimum Keyword Frequency Threshold")
ax.set_ylabel("Accuracy (%)")
ax.set_title("Accuracy vs Keyword Frequency Threshold")
ax.grid(True)
plt.show()
