In [1]:
import pandas as pd
import numpy as np
import re
from collections import Counter


In [None]:
# Workbook that is loaded into `all_df` by pd.read_excel.
input_file = "credit_txn_v5.xlsx"

# Workbook the cleaned per-category keyword table (`df_clean`) is written to.
output_file = "keyword_freq_by_category_credit_v5.xlsx"

In [14]:
# Load the transaction workbook; every later cell reads from this frame.
all_df = pd.read_excel(io=input_file)

In [15]:
def normalize(text):
    """Lower-case a narration and reduce it to space-separated [a-z0-9] tokens.

    Parameters
    ----------
    text : object
        Raw narration value. Non-string values are converted with ``str``.

    Returns
    -------
    str
        The cleaned text with every character outside ``a-z``, ``0-9`` and
        space replaced by a space, runs of whitespace collapsed and the ends
        trimmed. Missing values (``None``, NaN, ``pd.NA``, ``NaT``) yield an
        empty string so they contribute no tokens when the result is split.
    """
    # str(np.nan) is "nan" and str(None) is "None"; without this guard a
    # missing narration would be counted as the keyword "nan"/"none".
    if text is None or (pd.api.types.is_scalar(text) and pd.isna(text)):
        return ""

    text = str(text).lower()
    text = re.sub(r"[^a-z0-9 ]", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

In [16]:
# Store the cleaned narration next to the raw one; the token-counting cells
# read the "narr_norm" column.
_narration_raw = all_df["Narration"]
all_df["narr_norm"] = _narration_raw.apply(normalize)

In [17]:
# Distinct non-null values of the Category column, exactly as they appear in
# the data (no trimming or case folding is applied here).
categories = all_df['Category'].dropna().unique()


In [18]:
def keyword_freq_df(df, narr_col, category_name, min_freq=10):
    """Build a keyword-frequency table for the rows of one category.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows belonging to the category.
    narr_col : str
        Column holding whitespace-separated narration text; null entries
        are skipped.
    category_name : object
        Label written to the ``category`` column of the result.
    min_freq : int, default 10
        Exclusive threshold: only keywords occurring strictly more than
        ``min_freq`` times are kept.

    Returns
    -------
    pandas.DataFrame
        Columns ``keyword``, ``freq``, ``category`` and
        ``no_of_transactions`` (the row count of ``df``), ordered by
        descending ``freq`` with a fresh 0-based index.
    """
    narrations = df[narr_col].dropna()
    words = [word for narration in narrations for word in narration.split()]
    counts = Counter(words)

    table = pd.DataFrame(counts.items(), columns=['keyword', 'freq'])
    table = table[table['freq'] > min_freq]
    table = table.sort_values('freq', ascending=False).reset_index(drop=True)

    table['category'] = category_name
    # Count of all rows passed in, including those with a null narration.
    table['no_of_transactions'] = len(df)

    return table


In [20]:
all_freq_dfs = []

# Normalise the Category column once instead of on every loop iteration.
_category_keys = all_df['Category'].str.strip().str.lower()
_seen_category_keys = set()

for category in categories:
    _category_key = category.strip().lower()

    # `categories` holds raw labels, while rows are matched on the trimmed,
    # lower-cased label. Two raw labels that differ only in case/whitespace
    # select the same rows, so each normalised key is processed only once
    # (under the first raw label encountered) to avoid duplicate output.
    if _category_key in _seen_category_keys:
        continue
    _seen_category_keys.add(_category_key)

    ref_df = all_df[_category_keys == _category_key]

    if ref_df.empty:
        continue

    freq_df = keyword_freq_df(
        ref_df,
        narr_col='narr_norm',
        category_name=category,
        min_freq=10
    )

    all_freq_dfs.append(freq_df)

# pd.concat raises ValueError on an empty list; fall back to an empty table
# with the same columns keyword_freq_df produces.
if all_freq_dfs:
    final_df = pd.concat(all_freq_dfs, ignore_index=True)
else:
    final_df = pd.DataFrame(
        columns=['keyword', 'freq', 'category', 'no_of_transactions']
    )

# Group rows by category, most frequent keyword first within each group.
final_df = (
    final_df
    .sort_values(
        by=['category', 'freq'],
        ascending=[True, False]
    )
    .reset_index(drop=True)
)


In [23]:
# Token counts over every normalised narration in the dataset, independent
# of category.
all_tokens = [
    token
    for narration in all_df['narr_norm'].dropna()
    for token in narration.split()
]

total_freq_counter = Counter(all_tokens)


In [24]:
# Attach each keyword's count across the whole dataset. Indexing a Counter
# with an unseen key yields 0 and does not insert the key.
final_df['freq_total_dataset'] = final_df['keyword'].map(
    lambda keyword: total_freq_counter[keyword]
)


In [None]:
# Function words removed from the keyword table before export.
STOPWORDS = {
    # prepositions
    "in", "on", "at", "for", "to", "from", "by", "with",
    "about", "against", "between", "into", "through", "during",
    "before", "after", "above", "below", "of", "off", "over", "under",
    # conjunctions
    "and", "or",
    # articles
    "the", "a", "an",
    # verbs
    "is", "are",
    # demonstratives
    "this", "that", "these", "those",
}

def is_numeric_like(word):
    """Return True if the string form of ``word`` contains at least one digit."""
    for character in str(word):
        if character.isdigit():
            return True
    return False


In [None]:
# Drop keywords that carry little meaning: anything containing a digit,
# stopwords, and tokens of two characters or fewer.
#
# Each filter is a callable so it is evaluated on the frame produced by the
# previous step. The masks are therefore built from the normalised keyword
# column created by `.assign`, and always share the index of the frame being
# filtered, rather than relying on alignment against the original `final_df`.
df_clean = (
    final_df
      .assign(keyword=lambda d: d['keyword'].astype(str).str.lower().str.strip())
      .loc[lambda d: ~d['keyword'].apply(is_numeric_like).astype(bool)]
      .loc[lambda d: ~d['keyword'].isin(STOPWORDS)]
      .loc[lambda d: d['keyword'].str.len() > 2]
      .reset_index(drop=True)
)


In [25]:
# Write the cleaned keyword table to the configured workbook, omitting the
# row index.
df_clean.to_excel(output_file, index=False)


In [14]:
# Distinct transaction types, trimmed and lower-cased before de-duplication.
_txn_labels = all_df['Debit/Credit'].dropna()
txn_types = _txn_labels.str.strip().str.lower().unique()


In [15]:
def keyword_freq_by_txn_type(df, narr_col, txn_col, txn_type, min_freq=10):
    """Build a keyword-frequency table for the rows of one transaction type.

    Parameters
    ----------
    df : pandas.DataFrame
        Full transaction table.
    narr_col : str
        Column holding whitespace-separated narration text; null entries
        are skipped.
    txn_col : str
        Column holding the transaction type as strings.
    txn_type : str
        Transaction type to select. Matching ignores surrounding whitespace
        and case on both the column values and this argument.
    min_freq : int, default 10
        Exclusive threshold: only keywords occurring strictly more than
        ``min_freq`` times are kept.

    Returns
    -------
    pandas.DataFrame or None
        ``None`` when no row matches ``txn_type``. Otherwise a frame with
        columns ``keyword``, ``freq`` and ``transaction_type`` (the trimmed,
        lower-cased type), ordered by descending ``freq`` with a fresh
        0-based index.
    """
    # Normalise the requested type the same way as the column so that e.g.
    # "Credit " selects the same rows as "credit" instead of matching nothing.
    wanted_type = txn_type.strip().lower()

    ref_df = df[
        df[txn_col].str.strip().str.lower() == wanted_type
    ]

    if ref_df.empty:
        return None

    tokens = [
        token
        for narration in ref_df[narr_col].dropna()
        for token in narration.split()
    ]
    freq = Counter(tokens)

    freq_df = pd.DataFrame(freq.items(), columns=['keyword', 'freq'])
    freq_df = freq_df[freq_df['freq'] > min_freq]
    freq_df = freq_df.sort_values('freq', ascending=False).reset_index(drop=True)

    # For an already-normalised argument this equals the value passed in.
    freq_df['transaction_type'] = wanted_type

    return freq_df


In [16]:
# Token counts over every normalised narration in the dataset, independent
# of transaction type.
all_tokens = [
    token
    for narration in all_df['narr_norm'].dropna()
    for token in narration.split()
]

global_freq_counter = Counter(all_tokens)


In [17]:
# One keyword-frequency table per transaction type.
all_txn_freq_dfs = []

for txn_type in txn_types:
    freq_df = keyword_freq_by_txn_type(
        all_df, 'narr_norm', 'Debit/Credit', txn_type, min_freq=10,
    )

    # The helper returns None when no row matches the transaction type.
    if freq_df is None:
        continue

    all_txn_freq_dfs.append(freq_df)


In [18]:
# Stack the per-type tables, then order by transaction type and, within each
# type, by descending keyword frequency.
_txn_stacked = pd.concat(all_txn_freq_dfs, ignore_index=True)
_txn_sorted = _txn_stacked.sort_values(
    by=['transaction_type', 'freq'],
    ascending=[True, False],
)
final_txn_df = _txn_sorted.reset_index(drop=True)


In [19]:
# Attach each keyword's count across the whole dataset. Indexing a Counter
# with an unseen key yields 0 and does not insert the key.
final_txn_df['freq_total_dataset'] = final_txn_df['keyword'].map(
    lambda keyword: global_freq_counter[keyword]
)

In [20]:
# Split the combined table into one frame per transaction type.
_txn_kind = final_txn_df['transaction_type']

credit_df = final_txn_df[_txn_kind == 'credit']
debit_df = final_txn_df[_txn_kind == 'debit']


In [22]:
# The type column is redundant once each frame holds a single type.
# Note: the names are rebound, so running this cell a second time raises
# KeyError because the column is already gone.
credit_df = credit_df.drop(labels=['transaction_type'], axis=1)
debit_df = debit_df.drop(labels=['transaction_type'], axis=1)


In [23]:
# Note: this rebinds `output_file`, which earlier named the per-category
# workbook.
output_file = "keyword_frequency_by_transaction_type.xlsx"

# Write one sheet per transaction type, credit first, without the row index.
with pd.ExcelWriter(output_file, engine="openpyxl") as writer:
    for _sheet_name, _sheet_frame in (("credit", credit_df), ("debit", debit_df)):
        _sheet_frame.to_excel(writer, sheet_name=_sheet_name, index=False)
