In [None]:
import pandas as pd
import numpy as np
import re
from collections import Counter


In [None]:
# Source workbook of transactions; it is read into `all_df` with pd.read_excel.
input_file="credit_txn_v5.xlsx"
# NOTE(review): `output_file` is reassigned in the export cell right before the
# workbook is written, so "output.xlsx" is not used by any cell shown here.
output_file = "output.xlsx"

In [None]:
# Load the raw transactions. With no sheet_name given, read_excel reads the
# first sheet. Later cells expect 'Debit/Credit' and 'narr_norm' columns.
all_df = pd.read_excel(input_file)

In [None]:
def normalize(text):
    """Return a lower-cased, punctuation-free, single-spaced version of ``text``.

    Parameters
    ----------
    text : object
        Value to normalize. Non-string values are converted with ``str()``.
        Missing scalars (``None``, ``NaN``, ``NaT``, ``pd.NA``) are treated as
        "no text".

    Returns
    -------
    str
        Only the characters ``a-z``, ``0-9`` and single spaces, with no
        leading/trailing whitespace. An empty string for missing input.
    """
    # A missing cell must not be stringified: str(None) / str(nan) would yield
    # the literal tokens "none" / "nan", which would then be counted as
    # keywords. Returning "" keeps the return type a str and produces no
    # tokens when the result is split.
    if text is None or (pd.api.types.is_scalar(text) and pd.isna(text)):
        return ""

    text = str(text).lower()
    # Anything outside a-z, 0-9 and space becomes a space so that punctuation
    # acts as a token separator rather than gluing words together.
    text = re.sub(r"[^a-z0-9 ]", " ", text)
    # Collapse the runs of whitespace created by the substitution above.
    text = re.sub(r"\s+", " ", text)
    return text.strip()

In [None]:
# Distinct transaction-type labels found in the data, compared
# case-insensitively and ignoring surrounding whitespace. Missing values are
# removed first so they never become a label.
_txn_labels = all_df['Debit/Credit'].dropna()
txn_types = _txn_labels.str.strip().str.lower().unique()

In [None]:
def keyword_freq_by_txn_type(df, narr_col, txn_col, txn_type, min_freq=10):
    """Count whitespace-separated narration tokens for one transaction type.

    Rows are selected where ``df[txn_col]``, stripped and lower-cased, equals
    ``txn_type``. Every non-null value of ``narr_col`` in those rows is split
    on whitespace and the resulting tokens are tallied.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both ``narr_col`` and ``txn_col``.
    narr_col : str
        Column holding the narration text to tokenize.
    txn_col : str
        Column holding the transaction-type label (string values).
    txn_type : str
        Label to select. It is compared as given against the stripped,
        lower-cased column values, so pass it already stripped/lower-cased.
    min_freq : int, default 10
        Minimum number of occurrences (inclusive) a token needs to be kept.

    Returns
    -------
    pandas.DataFrame or None
        ``None`` when no row matches ``txn_type``. Otherwise a frame with the
        columns ``keyword``, ``freq`` and ``transaction_type``, sorted by
        ``freq`` descending; it may be empty when no token reaches
        ``min_freq``.
    """
    txn_labels = df[txn_col].str.strip().str.lower()
    ref_df = df[txn_labels == txn_type]

    if ref_df.empty:
        return None

    # str() guards against non-string narrations (e.g. purely numeric cells),
    # which have no .split() method.
    freq = Counter(
        token
        for narration in ref_df[narr_col].dropna()
        for token in str(narration).split()
    )

    freq_df = pd.DataFrame(list(freq.items()), columns=['keyword', 'freq'])
    # Inclusive threshold: a token seen exactly `min_freq` times satisfies the
    # "minimum frequency" and is kept.
    freq_df = (
        freq_df[freq_df['freq'] >= min_freq]
          .sort_values('freq', ascending=False)
          .reset_index(drop=True)
    )

    freq_df['transaction_type'] = txn_type

    return freq_df


In [None]:
# NOTE(review): no cell shown here creates all_df['narr_norm'], and normalize()
# is defined but never called. Unless the input workbook already contains that
# column, this cell raises KeyError on a fresh kernel -- confirm where
# 'narr_norm' is supposed to come from.
all_tokens = [
    token
    for narration in all_df['narr_norm'].dropna()
    for token in narration.split()
]

# Token counts over the whole dataset, regardless of transaction type.
global_freq_counter = Counter(all_tokens)


In [None]:
# One keyword-frequency table per transaction type; types for which the helper
# returns None (no matching rows) are left out.
all_txn_freq_dfs = []

for txn_type in txn_types:
    freq_df = keyword_freq_by_txn_type(
        all_df, 'narr_norm', 'Debit/Credit', txn_type, min_freq=10
    )
    if freq_df is None:
        continue
    all_txn_freq_dfs.append(freq_df)


In [None]:
# Stack the per-type tables into one frame. pd.concat raises ValueError on an
# empty list, so fall back to an empty frame with the expected columns when no
# transaction type produced a table (e.g. the input has no usable rows).
if all_txn_freq_dfs:
    final_txn_df = pd.concat(all_txn_freq_dfs, ignore_index=True)
else:
    final_txn_df = pd.DataFrame(columns=['keyword', 'freq', 'transaction_type'])

# Group rows by transaction type, most frequent keyword first within each type.
final_txn_df = (
    final_txn_df
      .sort_values(
          by=['transaction_type', 'freq'],
          ascending=[True, False]
      )
      .reset_index(drop=True)
)


In [None]:
final_txn_df['freq_total_dataset'] = final_txn_df['keyword'].map(
    lambda k: global_freq_counter.get(k, 0)
)

In [None]:
# Split the combined table into its credit and debit parts. Rows whose
# transaction_type is anything other than 'credit' or 'debit' end up in neither.
_is_credit = final_txn_df['transaction_type'].eq('credit')
_is_debit = final_txn_df['transaction_type'].eq('debit')

credit_df = final_txn_df.loc[_is_credit]
debit_df = final_txn_df.loc[_is_debit]


In [None]:
# The type is implied by the sheet each frame is written to, so the column is
# redundant. Re-running this cell on its own raises KeyError because the column
# is already gone; re-run the split cell first.
credit_df = credit_df.drop('transaction_type', axis='columns')
debit_df = debit_df.drop('transaction_type', axis='columns')


In [None]:
# Destination workbook: one sheet per transaction type.
output_file = "keyword_frequency_by_transaction_type.xlsx"

with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
    # Write order determines sheet order: "credit" first, then "debit".
    for sheet_name, sheet_df in (("credit", credit_df), ("debit", debit_df)):
        sheet_df.to_excel(writer, sheet_name=sheet_name, index=False)
