In [1]:
import pandas as pd
import numpy as np
import re
from collections import Counter


In [2]:
# Load the credit-transaction workbook; later cells read its "Narration"
# and "Category" columns.
txn_workbook = "credit_txn_v6.xlsx"
all_df = pd.read_excel(txn_workbook)

In [3]:
def normalize(text):
    """Return a lower-cased, alphanumeric-only version of ``text``.

    Every character outside ``a-z``, ``0-9`` and space is replaced by a
    space, runs of whitespace are collapsed to a single space, and the
    result is stripped of leading/trailing spaces.

    Parameters
    ----------
    text : object
        Value to normalise. Non-string values are converted with ``str``.
        Missing values (``None`` or a float NaN) yield an empty string.

    Returns
    -------
    str
        The normalised text, possibly empty.
    """
    # Missing cells arrive as None or float NaN. Passing them through str()
    # would produce the literal words "none"/"nan", which would then be
    # tokenised and scored as if they were real narration words.
    # (NaN is the only float that is not equal to itself.)
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    # Anything that is not a lowercase letter, digit or space becomes a space
    # so punctuation acts as a word separator instead of gluing words together.
    text = re.sub(r"[^a-z0-9 ]", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

In [4]:
# Store a cleaned copy of each narration next to the original text.
raw_narration = all_df["Narration"]
all_df["narr_norm"] = raw_narration.apply(normalize)


In [5]:
# Keyword table; later cells use its "keyword", "category" and "score" columns.
keyword_workbook = "keyword_freq_by_category_credit_v5.xlsx"
kw_df = pd.read_excel(keyword_workbook)

In [6]:
# Lower-case the keywords, then trim surrounding whitespace, so they can be
# compared against the lower-cased narration tokens.
lowered_keywords = kw_df['keyword'].str.lower()
kw_df['keyword'] = lowered_keywords.str.strip()

In [7]:
# Squared score per keyword row; these squares are summed per category later
# and square-rooted afterwards.
kw_df['score_sq'] = kw_df['score'].pow(2)

In [8]:
# Dictionary keyed by (keyword, category) -> squared score, used for
# constant-time lookups while scoring narration tokens.
score_sq_by_pair = kw_df.set_index(['keyword', 'category'])['score_sq']
kw_lookup = score_sq_by_pair.to_dict()


In [9]:
# Split each normalised narration into a list of whitespace-separated tokens.
narration_text = all_df['narr_norm'].fillna('')
narration_lower = narration_text.str.lower()
all_df['tokens'] = narration_lower.str.split()


In [10]:
# Distinct category labels, in order of first appearance in the keyword table.
category_column = kw_df['category']
categories = category_column.unique()

In [11]:
def _score_tokens(tokens, category):
    """Add up the ``kw_lookup`` values of ``tokens`` for one ``category``.

    Tokens with no ``(token, category)`` entry in ``kw_lookup`` contribute 0.
    """
    total = 0
    for token in tokens:
        total += kw_lookup.get((token, category), 0)
    return total


# One new column per category, named after the category itself, holding the
# summed squared keyword scores of each row's tokens.
for cat in categories:
    all_df[cat] = all_df['tokens'].apply(_score_tokens, args=(cat,))


In [12]:
# The token lists were only needed for scoring; remove the column in place.
del all_df['tokens']


In [13]:
# Replace each category's summed squared scores with their square root.
# NOTE: re-running this cell takes the square root again; run it once per
# fresh scoring pass.
for cat in categories:
    summed_squares = all_df[cat].fillna(0)
    all_df[cat] = np.sqrt(summed_squares)


In [14]:
# Predict the category whose score column is largest for each row.
#
# idxmax returns the FIRST column on ties, so a row with no keyword hit at all
# (every score 0) would silently be labelled with the first category and
# inflate that category's counts. Such rows are left unlabelled (NaN) instead.
category_scores = all_df[categories].fillna(0)
has_match = category_scores.gt(0).any(axis=1)
all_df["predicted_category"] = (
    category_scores
        .idxmax(axis=1)
        .where(has_match)
)


In [15]:
# Persist the scored frame (without the index column) for inspection.
scored_workbook = "credit_category_score_v6.xlsx"
all_df.to_excel(scored_workbook, index=False)


In [17]:
# One-vs-rest confusion counts: for every category, compare the reference
# label in "Category" with "predicted_category" and print the four cells.
for ref_category in categories:
    is_actual = all_df["Category"] == ref_category
    is_predicted = all_df["predicted_category"] == ref_category
    not_actual = all_df["Category"] != ref_category
    not_predicted = all_df["predicted_category"] != ref_category

    TP = (is_actual & is_predicted).sum()    # labelled and predicted as this category
    FN = (is_actual & not_predicted).sum()   # labelled as this category, predicted otherwise
    FP = (not_actual & is_predicted).sum()   # predicted as this category, labelled otherwise
    TN = (not_actual & not_predicted).sum()  # neither labelled nor predicted as this category

    for label, count in (("TP:", TP), ("FN:", FN), ("FP:", FP), ("TN:", TN)):
        if label == "TP:":
            print("Category:", ref_category)
        print(label, count)
    print("-" * 40)
