In [1]:
import pandas as pd
import numpy as np
import re
from collections import Counter


In [2]:
df=pd.read_excel("debit_txn_v5.xlsx")


In [3]:
def normalize(text):
    text = str(text).lower()
    text = re.sub(r"[^a-z0-9 ]", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

In [4]:
df["narr_norm"] = df["Narration"].apply(normalize)


In [5]:
kw_df = pd.read_excel("clean_keyword_frequency_by_category_debit.xlsx")


In [6]:
df["narr_str"] = (
    df["narr_norm"]
    .str.lower()
    .str.replace(" ", "", regex=False)
)

kw_df = kw_df.dropna(subset=["keyword"])

kw_df["keyword_str"] = (
    kw_df["keyword"]
    .astype(str)
    .str.lower()
    .str.replace(" ", "", regex=False)
)


In [7]:
categories = kw_df["category"].unique()

In [8]:
import ahocorasick
from collections import Counter
import time

def build_category_automaton(kw_df):
    automaton = ahocorasick.Automaton()

    for _, row in kw_df.iterrows():
        kw = row["keyword"]
        if not kw:
            continue
        cat = row["category"]
        score_sq = row["score"] ** 2
        automaton.add_word(kw.lower(), (cat, score_sq))

    automaton.make_automaton()
    return automaton

def category_score_map_aho(narr_str, automaton, row_idx=None):
    scores = Counter()
    narr_lower = narr_str.lower()

    for _, (cat, score_sq) in automaton.iter(narr_lower):
        scores[cat] += score_sq

    return scores

automaton = build_category_automaton(kw_df)

results = []
for idx, narr in enumerate(df["narr_norm"]):
    res = category_score_map_aho(narr, automaton, row_idx=idx)
    results.append(res)

scores_df = pd.DataFrame(results)
scores_df = scores_df.fillna(0)
scores_df = np.sqrt(scores_df)

df = pd.concat([df, scores_df], axis=1)

In [9]:
df["predicted_category"] = scores_df.idxmax(axis=1)

In [10]:
(df['Category'].str.strip().str.lower()==df['predicted_category'].str.strip().str.lower()).sum()


np.int64(78353)

In [11]:
df["Category"].count()


np.int64(113484)