In [1]:
import pandas as pd
import numpy as np
import re
from collections import Counter
import ahocorasick
import Levenshtein
from joblib import Parallel, delayed
from tqdm.notebook import tqdm

In [2]:
# Transaction workbook loaded with pd.read_excel; the notebook reads its
# "Narration" column (and later compares against its "Category" column).
INPUT_FILE = "debit_txn_v5.xlsx"
# Keyword workbook loaded with pd.read_excel; the notebook reads its
# "keyword", "category" and "score" columns.
KEYWORD_FILE = "clean_keyword_frequency_by_category_debit.xlsx"
# Target workbook for the merged results; only referenced by the export call
# at the end of the notebook, which is currently commented out.
OUTPUT_FILE = "hybrid_approach.xlsx"

In [3]:
def normalize(text):
    """Return a canonical, match-friendly form of ``text``.

    The value is converted to ``str``, lower-cased, every character outside
    ``[a-z0-9 ]`` is replaced by a space, and runs of whitespace are collapsed
    to single spaces with no leading/trailing space.

    Missing values -- ``None`` or a float NaN (what pandas yields for an empty
    cell) -- normalize to ``""`` instead of the literal text "none"/"nan", so
    a blank narration is not scored as if it contained those letters.
    """
    # NaN is the only float that does not compare equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    lowered = str(text).lower()
    cleaned = re.sub(r"[^a-z0-9 ]", " ", lowered)
    return " ".join(cleaned.split())


In [4]:
def is_edit_distance_one_or_less(kw, text):
    """Tell whether some substring of ``text`` is within one edit of ``kw``.

    Substrings of length ``len(kw) - 1``, ``len(kw)`` and ``len(kw) + 1`` are
    slid across ``text`` and each is compared to ``kw`` with
    ``Levenshtein.distance``. Returns ``True`` on the first window whose
    distance is 0 or 1, ``False`` if no window qualifies (including when
    ``text`` is shorter than every candidate window).
    """
    kw_len = len(kw)
    text_len = len(text)
    # Non-positive widths (possible only for very short keywords) are skipped.
    widths = [w for w in (kw_len - 1, kw_len, kw_len + 1) if w > 0]
    return any(
        Levenshtein.distance(kw, text[start:start + width]) <= 1
        for width in widths
        for start in range(text_len - width + 1)
    )


In [5]:
def score_transaction(narr, automaton, fuzzy_keywords):
    """Accumulate per-category scores for one normalized narration.

    Every hit yielded by ``automaton.iter(narr)`` adds its stored squared
    score to the hit's category. Only when there is no exact hit at all are
    ``fuzzy_keywords`` (``(keyword, category, squared_score)`` tuples) tried
    in order; the first keyword within one edit of some substring of ``narr``
    contributes its score and the search stops.

    Returns a ``Counter`` mapping category -> summed squared score (empty if
    nothing matched).
    """
    scores = Counter()

    # The first element yielded by the automaton is not needed here.
    for _end, (category, squared) in automaton.iter(narr):
        scores[category] += squared

    # Any exact hit leaves a key in the counter, so an empty counter means
    # the exact pass found nothing and the fuzzy fallback should run.
    if scores:
        return scores

    first_fuzzy = next(
        (
            (category, squared)
            for keyword, category, squared in fuzzy_keywords
            if is_edit_distance_one_or_less(keyword, narr)
        ),
        None,
    )
    if first_fuzzy is not None:
        category, squared = first_fuzzy
        scores[category] += squared

    return scores

In [6]:
def process_chunk(chunk_series, automaton, fuzzy_keywords):
    """Score each narration in ``chunk_series`` and return the results in order.

    ``automaton`` and ``fuzzy_keywords`` are passed through unchanged to
    ``score_transaction`` for every narration. Returns a list with one score
    mapping per input element.
    """
    chunk_scores = []
    for narration in chunk_series:
        chunk_scores.append(score_transaction(narration, automaton, fuzzy_keywords))
    return chunk_scores

In [None]:
print("Loading data...")
# Transactions to classify.
df = pd.read_excel(INPUT_FILE)
# Keyword table; rows without a keyword are discarded before any lookup
# structure is built from it.
kw_df = pd.read_excel(KEYWORD_FILE)
kw_df = kw_df.dropna(subset=["keyword"])

print("Normalizing transactions...")
# Keep the raw "Narration" column and store its normalized form next to it.
narrations = df["Narration"]
df["narr_norm"] = narrations.apply(normalize)

Loading data...


In [None]:
print("Building lookup structures...")
automaton = ahocorasick.Automaton()
fuzzy_keywords = []

for _, row in kw_df.iterrows():
    # Keywords must go through the same normalize() as the narrations they are
    # searched in: narrations contain only [a-z0-9 ] with single spaces, so a
    # keyword that keeps punctuation or repeated spaces could never match.
    kw = normalize(row["keyword"])
    if not kw:
        # Nothing searchable is left after normalization.
        continue
    cat = row["category"]
    # Scores are accumulated as squares; the square root is taken after
    # summing, in the results cell.
    score_sq = row["score"] ** 2

    # NOTE(review): pyahocorasick appears to replace the stored value when the
    # same key is added again, so a keyword listed under several categories
    # would keep only its last row -- confirm keywords are unique.
    automaton.add_word(kw, (cat, score_sq))
    # Only keywords longer than 5 characters take part in the fuzzy fallback.
    if len(kw) > 5:
        fuzzy_keywords.append((kw, cat, score_sq))

automaton.make_automaton()
print(f"Setup complete. Fuzzy keywords to check: {len(fuzzy_keywords)}")

In [None]:
n_chunks = 100
# Split the underlying NumPy array rather than the Series itself:
# np.array_split on a pandas object goes through the deprecated
# Series.swapaxes, and the workers only need the strings, not the index.
# Order is preserved, so the flattened results still line up with df's rows.
narr_values = df["narr_norm"].to_numpy()
chunks = np.array_split(narr_values, n_chunks)

print("Starting processing (this should take ~60-90 minutes)...")
# One task per chunk; the automaton and fuzzy keyword list are passed to
# every task. The progress bar advances as chunks are handed to joblib.
results_nested = Parallel(n_jobs=-1, backend="loky")(
    delayed(process_chunk)(chunk, automaton, fuzzy_keywords)
    for chunk in tqdm(chunks, desc="Processing Batches")
)

In [None]:
# Flatten the per-chunk lists back into one score mapping per transaction.
results = [item for sublist in results_nested for item in sublist]

# Reuse df's index so the concat below pairs each transaction with its own
# scores even if df does not carry a default 0..n-1 index; a row-count
# mismatch fails here instead of silently misaligning.
scores_df = pd.DataFrame(results, index=df.index).fillna(0)
# Scores were accumulated as sums of squares; take the root once, at the end.
scores_df = np.sqrt(scores_df)

# Merge and identify winners
df_final = pd.concat([df, scores_df], axis=1)

if scores_df.shape[1] == 0:
    # No keyword matched anywhere, so there are no category columns to rank.
    df_final["predicted_category"] = pd.Series(np.nan, index=scores_df.index, dtype=object)
else:
    # idxmax on an all-zero row returns the first column, which would hand an
    # arbitrary category to transactions that matched nothing; leave those
    # rows unpredicted (NaN) instead.
    has_match = scores_df.gt(0).any(axis=1)
    df_final["predicted_category"] = scores_df.idxmax(axis=1).where(has_match)

In [None]:

# Compare labels case-insensitively and ignoring surrounding whitespace.
category_true = df_final['Category'].str.strip().str.lower()
category_pred = df_final['predicted_category'].str.strip().str.lower()
# Number of rows whose predicted category equals the labelled one.
(category_true == category_pred).sum()


In [None]:

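# Uncomment to write the merged frame (transactions, per-category scores and
# the predicted category) to OUTPUT_FILE.
# df_final.to_excel(OUTPUT_FILE, index=False)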