# Token-Based Keyword Matching & Category Prediction

This notebook implements token-based keyword matching using exact word matching. It tokenizes narrations and looks up matching keywords in a lookup table to assign category scores.

## Methodology
1. Load transaction data and normalize narrations
2. Tokenize normalized narrations by whitespace
3. Build efficient lookup table for keyword-category-score mappings
4. Calculate a score for each category by summing the squared scores of the matching keywords and taking the square root of that sum
5. Predict categories based on highest scores
6. Evaluate performance with one-vs-rest confusion-matrix counts (TP, FN, FP, TN) for each category

In [None]:
# Import required libraries
import pandas as pd  # Data manipulation
import numpy as np   # Numerical operations
import re            # Text processing
from collections import Counter  # Count frequencies

In [None]:
# Read the credit transactions workbook into a DataFrame.
all_df = pd.read_excel(io="credit_txn_v6.xlsx")

In [None]:
def normalize(text):
    """Normalize narration text for consistent processing.

    Lowercases the input, replaces every character other than ``a-z``,
    ``0-9`` and space with a space, collapses runs of whitespace into a
    single space and trims both ends.

    Args:
        text: Raw narration. Non-string values are converted with ``str``.

    Returns:
        The cleaned string, or ``""`` when *text* is missing (``None`` or a
        float NaN).
    """
    # A missing value would otherwise be stringified into the bogus token
    # "none" / "nan", which could then be matched like a real keyword.
    # ``text != text`` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()
    text = re.sub(r"[^a-z0-9 ]", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

In [None]:
# Store the cleaned form of every narration in a new "narr_norm" column.
all_df["narr_norm"] = all_df["Narration"].map(normalize)

In [None]:
# Read the keyword table; later cells use its keyword, category and score columns.
kw_df = pd.read_excel(io="keyword_freq_by_category_credit_v5.xlsx")

In [None]:
# Lowercase the keywords, then trim surrounding whitespace.
# NOTE(review): narrations additionally have punctuation replaced by spaces;
# a keyword containing punctuation or inner spaces cannot equal a single
# token — confirm the keyword table only holds plain single words.
lowered_keywords = kw_df['keyword'].str.lower()
kw_df['keyword'] = lowered_keywords.str.strip()

In [None]:
# Store the squared score so that larger scores weigh more in the sums.
kw_df['score_sq'] = kw_df['score'].pow(2)

In [None]:
# Map each (keyword, category) pair to its squared score so that scoring can
# use plain dictionary lookups.
# NOTE(review): if a pair occurs more than once, only one of its scores
# survives the dict conversion — confirm the keyword table has unique pairs.
kw_lookup = kw_df.set_index(['keyword', 'category'])
kw_lookup = kw_lookup['score_sq'].to_dict()

In [None]:
# Split each normalized narration into whitespace-separated tokens.
# Missing values become empty strings first, so they yield an empty list.
narr_text = all_df['narr_norm'].fillna('')
all_df['tokens'] = narr_text.str.lower().str.split()

In [None]:
# Distinct category labels found in the keyword table, in order of first appearance.
categories = kw_df.loc[:, 'category'].unique()

In [None]:
# Add one score column per category: for every transaction, total the
# squared scores of the tokens that are keywords of that category.
for cat in categories:

    def _score_tokens(toks, _cat=cat):
        """Return the summed lookup score of *toks* for category *_cat*."""
        total = 0
        for tok in toks:
            # Tokens without an entry for this category contribute nothing.
            total += kw_lookup.get((tok, _cat), 0)
        return total

    all_df[cat] = all_df['tokens'].apply(_score_tokens)

In [None]:
# The token lists are only needed for scoring; remove the column in place.
del all_df['tokens']

In [None]:
# Replace every category score with its square root.
# The stored value is the square root of a sum of squared keyword scores,
# which is not the same as the sum of the unsquared scores.
for cat in categories:
    filled_scores = all_df[cat].fillna(0)
    all_df[cat] = np.sqrt(filled_scores)

In [None]:
# Predict, for each transaction, the category with the highest score.
# idxmax returns the first column label when several columns tie, so a row in
# which no keyword matched (every score is 0) would silently be labelled with
# the first category. Such rows are left without a prediction (NaN) instead.
_cat_scores = all_df[categories].fillna(0)
all_df["predicted_category"] = (
    _cat_scores
        .idxmax(axis=1)
        .where(_cat_scores.max(axis=1) > 0)  # keep the label only if something matched
)

In [None]:
# Write the scored transactions to an Excel workbook without the index column.
all_df.to_excel("credit_category_score_v6.xlsx", index=False)

In [None]:
# Print one-vs-rest confusion counts for every category of the keyword table.
# The "Category" column is compared with "predicted_category" row by row.

for ref_category in categories:

    # Row masks: is the recorded / predicted label equal to this category?
    is_actual = all_df["Category"] == ref_category
    is_predicted = all_df["predicted_category"] == ref_category

    # TP: labelled as this category and predicted as it
    TP = (is_actual & is_predicted).sum()

    # FN: labelled as this category but predicted as something else
    FN = (is_actual & ~is_predicted).sum()

    # FP: predicted as this category but labelled as something else
    FP = (~is_actual & is_predicted).sum()

    # TN: neither labelled nor predicted as this category
    TN = (~is_actual & ~is_predicted).sum()

    for label, count in (("TP:", TP), ("FN:", FN), ("FP:", FP), ("TN:", TN)):
        if label == "TP:":
            print("Category:", ref_category)
        print(label, count)
    print("-" * 40)