In [1]:
import pandas as pd
import numpy as np
import re
from collections import Counter
import ahocorasick

In [2]:
df=pd.read_excel("debit_txn_v5.xlsx")


In [3]:
def normalize(text):
    text = str(text).lower()
    text = re.sub(r"[^a-z0-9 ]", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

In [4]:
df["narr_norm"] = df["Narration"].apply(normalize)


In [5]:
kw_df = pd.read_excel("clean_keyword_frequency_by_category_debit.xlsx")


In [6]:
df['tokens'] = (
    df['narr_norm']
    .fillna('')
    .str.lower()
    .str.split()
)


In [7]:
df["narr_str"] = (
    df["narr_norm"]
    .str.lower()
    .str.replace(" ", "", regex=False)
)

kw_df = kw_df.dropna(subset=["keyword"])

kw_df["keyword_str"] = (
    kw_df["keyword"]
    .astype(str)
    .str.lower()
    .str.replace(" ", "", regex=False)
)


In [8]:
categories = kw_df['category'].unique()

In [9]:
kw_df['score_sq'] = kw_df['score'] ** 2

In [10]:
short_kw_mask = kw_df['keyword'].str.len() < 4
kw_lookup_short = (
    kw_df[short_kw_mask]
    .set_index(['keyword', 'category'])['score_sq']
    .to_dict()
)


long_kw_df = kw_df[~short_kw_mask]

def build_long_category_automaton(df_long):
    automaton = ahocorasick.Automaton()
    for _, row in df_long.iterrows():
        kw = str(row["keyword"]).lower()
        cat = row["category"]
        score_sq = row["score"] ** 2
        automaton.add_word(kw, (cat, score_sq))
    automaton.make_automaton()
    return automaton

long_automaton = build_long_category_automaton(long_kw_df)

In [11]:
def token_check(tokens):
    scores = Counter()
    for t in tokens:
        for cat in categories:
            val = kw_lookup.get((t, cat), 0)
            if val > 0:
                scores[cat] += val
    return scores

In [12]:
def hybrid_score(row):
    scores = Counter()
    narr_norm = row["narr_norm"]
    tokens = row["tokens"]
    
    for t in tokens:
        for cat in categories:
            val = kw_lookup_short.get((t, cat), 0)
            if val > 0:
                scores[cat] += val
                
    for _, (cat, score_sq) in long_automaton.iter(narr_norm.lower()):
        scores[cat] += score_sq
        
    return scores

In [13]:
results = df.apply(hybrid_score, axis=1)
scores_df = pd.DataFrame(results.tolist()).fillna(0)
scores_df = np.sqrt(scores_df)

df = pd.concat([df, scores_df], axis=1)

In [14]:
df["predicted_category"] = scores_df.idxmax(axis=1)

In [15]:
(df['Category'].str.strip().str.lower()==df['predicted_category'].str.strip().str.lower()).sum()


np.int64(86960)

In [16]:
df["Category"].count()


np.int64(113484)