# Keyword Matching in Narration Strings (Aho-Corasick)

This notebook implements string-based keyword matching using the Aho-Corasick algorithm to categorize transactions. It searches for keyword patterns directly in normalized narration strings and assigns category scores based on matches.

## Approach
- Uses Aho-Corasick automaton for efficient multi-pattern string matching
- Removes spaces from narrations and keywords for continuous matching
- Sums the squared scores of all matching keywords per category, then takes the square root of each total
- Measures prediction accuracy by counting predictions that match the actual category (case-insensitive)

In [None]:
# Import required libraries
import pandas as pd  # Data manipulation
import numpy as np   # Numerical operations
import re            # Text processing
from collections import Counter  # Count matches

In [None]:
# Read the debit-transaction workbook into the working dataframe
df = pd.read_excel(io="debit_txn_v5.xlsx")

In [None]:
def normalize(text):
    """Normalize narration text for consistent matching.

    Lowercases the input, turns every run of characters other than
    ``a-z`` / ``0-9`` into a single space, and trims both ends.

    Args:
        text: Raw narration value. Non-string values are converted with
            ``str()``; ``None`` and float NaN are treated as missing.

    Returns:
        str: The normalized narration, or ``""`` for a missing value.
    """
    # A missing value must not become the literal text "none"/"nan", which
    # keywords could otherwise match. NaN is the only float unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    lowered = str(text).lower()
    # Single pass: any run of non-alphanumerics (punctuation, whitespace, ...)
    # collapses to one separating space.
    return re.sub(r"[^a-z0-9]+", " ", lowered).strip()

In [None]:
# Store a cleaned copy of every narration in the "narr_norm" column
df["narr_norm"] = df["Narration"].map(normalize)

In [None]:
# Read the keyword table; later cells use its "keyword", "category" and
# "score" columns
kw_df = pd.read_excel(io="clean_keyword_frequency_by_category_debit.xlsx")

In [None]:
# Prepare narration strings without spaces for continuous substring matching.
# "narr_norm" contains only a-z, 0-9 and single spaces, so removing the spaces
# leaves a purely alphanumeric string.
df["narr_str"] = (
    df["narr_norm"]
    .str.lower()
    .str.replace(" ", "", regex=False)  # Remove spaces for continuous matching
)

# Prepare keyword strings the same way. Take a copy after dropping rows with
# no keyword so the column assignment below writes to an independent frame
# instead of a slice of the loaded one (avoids SettingWithCopyWarning).
kw_df = kw_df.dropna(subset=["keyword"]).copy()

kw_df["keyword_str"] = (
    kw_df["keyword"]
    .astype(str)
    .str.lower()
    # Drop everything that is not a-z / 0-9 (not only spaces) so keywords use
    # the same alphabet as "narr_str"; a keyword made up solely of other
    # characters ends up as an empty string.
    .str.replace(r"[^a-z0-9]", "", regex=True)
)

In [None]:
# Collect the distinct category labels that occur in the keyword table
categories = pd.unique(kw_df["category"])

In [None]:
# Implement Aho-Corasick based string matching for category prediction
import ahocorasick
from collections import Counter
import time

def build_category_automaton(kw_df, keyword_col="keyword"):
    """
    Build Aho-Corasick automaton from keyword dataframe.

    Every row contributes one ``(category, score ** 2)`` pair. Pairs are
    grouped by lowercased keyword and stored together as a list, because
    ``Automaton.add_word`` keeps a single value per key: adding the same
    keyword once per category would retain only the last category.

    Args:
        kw_df (DataFrame): Keywords with categories and scores; needs
            "category" and "score" columns plus the keyword column.
        keyword_col (str): Column holding the pattern text. Defaults to
            "keyword"; pass e.g. "keyword_str" to match on the space-free
            form instead.

    Returns:
        ahocorasick.Automaton: Compiled automaton whose value for each
        keyword is a list of ``(category, squared_score)`` tuples.
    """
    pairs_by_keyword = {}

    rows = zip(kw_df[keyword_col], kw_df["category"], kw_df["score"])
    for kw, cat, score in rows:
        if pd.isna(kw):
            continue
        # str() so numeric keywords read from the sheet do not break .lower()
        key = str(kw).lower()
        if not key:
            continue
        # Square scores for emphasis
        pairs_by_keyword.setdefault(key, []).append((cat, score ** 2))

    automaton = ahocorasick.Automaton()
    for key, pairs in pairs_by_keyword.items():
        automaton.add_word(key, pairs)

    automaton.make_automaton()
    return automaton

def category_score_map_aho(narr_str, automaton, row_idx=None):
    """
    Calculate category scores using Aho-Corasick matching.

    Args:
        narr_str (str): Narration string to search in (lowercased here
            before matching)
        automaton: Compiled Aho-Corasick automaton. The value stored for a
            keyword may be one ``(category, squared_score)`` tuple or a
            list of such tuples.
        row_idx (int): Row index (optional, for tracking); it does not
            affect the result

    Returns:
        Counter: Squared scores summed per category over every keyword
        match reported by the automaton
    """
    scores = Counter()

    # Find all matching keywords
    for _, payload in automaton.iter(narr_str.lower()):
        # Accept the bare-tuple form as well as the list-of-tuples form
        pairs = [payload] if isinstance(payload, tuple) else payload
        for cat, score_sq in pairs:
            scores[cat] += score_sq

    return scores

# Build automaton from keyword data
automaton = build_category_automaton(kw_df)

# Apply Aho-Corasick scoring to all transactions.
# NOTE(review): matching runs on "narr_norm" (spaces kept), not on the
# space-free "narr_str" column prepared earlier -- confirm which is intended.
results = [
    category_score_map_aho(narr, automaton, row_idx=idx)
    for idx, narr in enumerate(df["narr_norm"])
]

# Convert to dataframe (one column per category) and normalize scores.
# Reusing df's index keeps each score row paired with its own transaction in
# the axis=1 concat below even when df's index is not the default 0..n-1.
scores_df = pd.DataFrame(results, index=df.index)
scores_df = scores_df.fillna(0)  # a category with no match scores 0
scores_df = np.sqrt(scores_df)  # Reverse the squaring

# Concatenate with original dataframe
df = pd.concat([df, scores_df], axis=1)

In [None]:
# Predict category as the one with highest score. Rows where no keyword
# matched (every score is 0) are left unpredicted (NaN); idxmax alone would
# return the first category column for them.
df["predicted_category"] = scores_df.idxmax(axis=1).where(scores_df.max(axis=1) > 0)

In [None]:
# Number of rows whose predicted category equals the labelled category,
# ignoring case and surrounding whitespace
df["Category"].str.strip().str.lower().eq(
    df["predicted_category"].str.strip().str.lower()
).sum()

np.int64(78353)

In [None]:
# Number of transactions that carry a non-null "Category" label
df["Category"].notna().sum()

np.int64(113484)