In [1]:
import pandas as pd

from flashtext import KeywordProcessor

import re

import os

import warnings

import random

warnings.filterwarnings("ignore")

In [2]:
# Directory that holds the processed CSV exports. The default is the original
# author's location; set the SETTLEKING_DATA_DIR environment variable to run the
# notebook on another machine without editing this cell.
DATA_DIR = os.environ.get("SETTLEKING_DATA_DIR", r"C:\Users\rohit\settleking_test\data\processed")

# Credit-card transactions file (read with a parsed "Date" column in the next cell).
credit_data_path = os.path.join(DATA_DIR, "credit_card.csv")

# Merchant file; later cells read its "alias" and "category" columns.
merchant_data_path = os.path.join(DATA_DIR, "merch_credit.csv")

In [3]:
# Transactions: the "Date" column is parsed into datetimes while reading.
credit_df = pd.read_csv(
    credit_data_path,
    parse_dates = ["Date"],
)

# Merchant file, read with default settings.
merchant_df = pd.read_csv(merchant_data_path)

In [4]:
# Preview the first five transactions.
credit_df.head(5)

Unnamed: 0,Date,Amount,abs_Amount,Transaction_Type,Cleaned_Desc,Description,Day,Month,Year
0,2025-06-30,-11.02,217.67,1,pony mailbox and businessbellevue wa,PONY MAILBOX AND BUSINESSBELLEVUE WA,30,June,2025
1,2025-06-29,-150.0,7500.0,1,delta air upgrades seattle wa,DELTA AIR Upgrades SEATTLE WA,29,June,2025
2,2025-06-29,-6.33,3875.0,1,amazon mktpl amzn,AMAZON MKTPL*N35R76AA2 Amzn.com/billWA,29,June,2025
3,2025-06-29,-6.33,330.0,1,amazon mktpl amzn,AMAZON MKTPL*N34153A22 Amzn.com/billWA,29,June,2025
4,2025-06-29,-13.17,9975.0,1,amazon mktpl amzn,AMAZON MKTPL*N37EB3J32 Amzn.com/billWA,29,June,2025


In [5]:
# Print the index range, per-column dtypes and non-null counts of the transactions frame.
credit_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 212 entries, 0 to 211
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Date              212 non-null    datetime64[ns]
 1   Amount            212 non-null    float64       
 2   abs_Amount        99 non-null     float64       
 3   Transaction_Type  212 non-null    int64         
 4   Cleaned_Desc      212 non-null    object        
 5   Description       212 non-null    object        
 6   Day               212 non-null    int64         
 7   Month             212 non-null    object        
 8   Year              212 non-null    int64         
dtypes: datetime64[ns](1), float64(2), int64(3), object(3)
memory usage: 15.0+ KB


In [6]:
# Summary statistics (count, mean, min, quartiles, max, std) using pandas' default column selection.
credit_df.describe()

Unnamed: 0,Date,Amount,abs_Amount,Transaction_Type,Day,Year
count,212,212.0,99.0,212.0,212.0,212.0
mean,2025-05-20 03:37:21.509433856,-30.215377,6450.139495,0.933962,15.254717,2025.0
min,2025-04-01 00:00:00,-4445.25,1.25,0.0,1.0,2025.0
25%,2025-05-02 00:00:00,-109.5525,427.5,1.0,8.0,2025.0
50%,2025-05-20 00:00:00,-35.715,2500.0,1.0,15.0,2025.0
75%,2025-06-11 00:00:00,-10.9925,7250.0,1.0,24.0,2025.0
max,2025-06-30 00:00:00,3500.0,100000.0,1.0,31.0,2025.0
std,,623.218113,12328.191835,0.248936,9.260384,0.0


In [7]:
# Show the column labels of the transactions frame.
credit_df.columns

Index(['Date', 'Amount', 'abs_Amount', 'Transaction_Type', 'Cleaned_Desc',
       'Description', 'Day', 'Month', 'Year'],
      dtype='object')

In [8]:
# First distinct year found in the data (only the first one is reported).
print("Year : ", credit_df.loc[:, "Year"].unique()[0])

# Every distinct month label, in order of first appearance.
print("Months : ", credit_df.loc[:, "Month"].unique())

Year :  2025
Months :  ['June' 'May' 'April']


In [9]:
# Case-insensitive matcher that maps each merchant alias to the category it is reported as.
keyword_processor = KeywordProcessor(case_sensitive = False)

# Register one alias -> category pair per row of the merchant file.
for alias, category in zip(merchant_df["alias"], merchant_df["category"]):

    keyword_processor.add_keyword(alias, category)

In [10]:
def test_categorize(test):

    match = keyword_processor.extract_keywords(test)

    if match:

        match = list(dict.fromkeys(match))

        match = match + (["None"] * (3-len(match)))

        return match[:3]
    
    else:

        return ["Miscellaneous","None","None"]

In [11]:
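# NOTE: disabled draft of a priority-ordered variant of test_categorize. It relies on a
# `priority` list that is not defined in the visible cells, so it cannot run as written.
# def categorize_multi(text):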

#     match = keyword_processor.extract_keywords(text)

#     if match:

#         match = list(dict.fromkeys(match))
    
#         match_sorted = sorted(

#         match,
#         key=lambda x: priority.index(x) if x in priority else len(priority)
#         )
    
#         match_sorted += ["None"] * (3 - len(match_sorted))
    
#         return match_sorted[:3]
    
#     else:
        
#         return ["Miscellaneous","None","None"]

In [12]:
# Spot-check sample: 15 row positions drawn (with replacement) from the whole frame.
# A dedicated, seeded generator makes the sample identical on every Restart & Run All.
# Sizing the range from the frame replaces the hard-coded 1..200 bounds, which never
# picked row 0 or rows past 200 and would fail on a shorter frame.
_sample_rng = random.Random(42)

test_range = [_sample_rng.randrange(len(credit_df)) for _ in range(15)]

# Positional lookup (.iloc), so the draw does not depend on the index labels.
cc_test_desc = credit_df["Cleaned_Desc"].iloc[test_range].tolist()

cc_test_desc

['pony mailbox and businessbellevue wa',
 'ici bellevue wa ma',
 'uber eats help',
 'online payment thank you',
 'netflix netflix ca',
 'taco bell redmond wa',
 'codakid codakid az',
 'uber eats help',
 'sq ruchi inc bellevue wa',
 'water coffee delivery fl',
 'amazon mktplace pmts amzn',
 'gdp kids dentistry bellevue wa',
 'vfs services usa inc washington dcdc',
 'cabe annemasse fr',
 'wsdot online renton wa']

In [21]:
# Show each sampled description next to the tags assigned to it, one ruled-off entry per sample.
for sample_description in cc_test_desc:

    print(f"The description : {sample_description}\n")

    sample_tags = test_categorize(sample_description)

    print(f"The tag : {sample_tags}\n")

    print("-------------------------------------------------\n")

The description : pony mailbox and businessbellevue wa

The tag : ['Miscellaneous', 'None', 'None']

-------------------------------------------------

The description : ici bellevue wa ma

The tag : ['Miscellaneous', 'None', 'None']

-------------------------------------------------

The description : uber eats help

The tag : ['Food Delivery', 'None', 'None']

-------------------------------------------------

The description : online payment thank you

The tag : ['Card Payment', 'None', 'None']

-------------------------------------------------

The description : netflix netflix ca

The tag : ['Entertainment', 'None', 'None']

-------------------------------------------------

The description : taco bell redmond wa

The tag : ['Food', 'None', 'None']

-------------------------------------------------

The description : codakid codakid az

The tag : ['Miscellaneous', 'None', 'None']

-------------------------------------------------

The description : uber eats help

The tag : ['Food

In [14]:
# Tag every transaction, then spread each 3-item tag list across the three category columns.
# The lists are expanded in one DataFrame construction, aligned on the frame's own index.
credit_df[["Category_1","Category_2","Category_3"]] = pd.DataFrame(
    credit_df["Cleaned_Desc"].apply(test_categorize).tolist(),
    index = credit_df.index,
)

In [15]:
# Cleaned description next to its three category slots.
credit_df.loc[:, ["Cleaned_Desc","Category_1","Category_2","Category_3"]]

Unnamed: 0,Cleaned_Desc,Category_1,Category_2,Category_3
0,pony mailbox and businessbellevue wa,Miscellaneous,,
1,delta air upgrades seattle wa,Travel,,
2,amazon mktpl amzn,Online Purchase,,
3,amazon mktpl amzn,Online Purchase,,
4,amazon mktpl amzn,Online Purchase,,
...,...,...,...,...
207,convenfee ma,Miscellaneous,,
208,amazon mktpl amzn,Online Purchase,,
209,city center plazabellevue wa,Retail,,
210,sor bellevue wa,Fees,,


In [16]:
# Number of transactions (non-null descriptions) per primary category.
# NOTE(review): the recorded output lists both "Online Purchase" and "Online Pusrchse";
# the second looks like a typo in the merchant category data. Confirm and merge.
grp = credit_df.groupby("Category_1")["Cleaned_Desc"].count()

grp

Category_1
Card Payment       11
Entertainment      24
Fees                8
Food                3
Food Delivery      14
Health/Beauty       4
Healthcare          1
Insurance           6
Miscellaneous      97
Online Purchase    23
Online Pusrchse     4
Retail              5
Toll                3
Travel              1
Utility Payment     8
Name: Cleaned_Desc, dtype: int64

In [17]:
# Descriptions grouped by primary category, kept un-aggregated for per-category lookups.
# Grouping by the scalar label (not a one-element list) lets get_group() take a plain
# string key. With a length-1 list, pandas 2.2+ emits a FutureWarning asking for a
# 1-tuple key, and the notebook-wide warnings filter would hide that warning.
grp_2 = credit_df.groupby("Category_1")["Cleaned_Desc"]

In [18]:
# Print the descriptions that were tagged "Utility Payment".
print(grp_2.get_group(name = "Utility Payment"))

9                 comcast xfinity wa
41         puget sound energy inc wa
63          city of bellevue util wa
83        comcast xfinity comcast wa
117        puget sound energy inc wa
168    comcast cable comm comcast wa
192        puget sound energy inc wa
202         city of bellevue util wa
Name: Cleaned_Desc, dtype: object


In [19]:
# Transactions whose primary category is the "Miscellaneous" fallback, i.e. no keyword matched.
misc_df = credit_df.loc[credit_df["Category_1"].eq("Miscellaneous")]

In [20]:
# Display the unmatched ("Miscellaneous") transactions for manual review.
misc_df

Unnamed: 0,Date,Amount,abs_Amount,Transaction_Type,Cleaned_Desc,Description,Day,Month,Year,Category_1,Category_2,Category_3
0,2025-06-30,-11.02,217.67,1,pony mailbox and businessbellevue wa,PONY MAILBOX AND BUSINESSBELLEVUE WA,30,June,2025,Miscellaneous,,
17,2025-06-25,-107.75,2000.00,1,water coffee delivery fl,WATER COFFEE DELIVERY 800-7285508 FL,25,June,2025,Miscellaneous,,
19,2025-06-25,-2745.00,1000.00,1,in the ferncliff law wa,IN *THE FERNCLIFF LAW OFF872-2165463 WA,25,June,2025,Miscellaneous,,
22,2025-06-25,-9.99,1000.00,1,uber one help,UBER *ONE HELP.UBER.COMCA,25,June,2025,Miscellaneous,,
23,2025-06-24,-13.21,6715.00,1,kindle unltd wa,Kindle Unltd*NO0VG5I00 888-802-3080 WA,24,June,2025,Miscellaneous,,
...,...,...,...,...,...,...,...,...,...,...,...,...
195,2025-04-10,-72.59,,1,water coffee delivery fl,WATER COFFEE DELIVERY 800-7285508 FL,10,April,2025,Miscellaneous,,
197,2025-04-09,-16.00,,1,sea airport parking wa,SEA AIRPORT PARKING 2067874069 WA,9,April,2025,Miscellaneous,,
201,2025-04-04,-18.28,,1,ici bellevue wa ma,ICI*FEE BELLEVUE WA 866-342-9267 MA,4,April,2025,Miscellaneous,,
205,2025-04-02,-114.29,,1,sq joey bellevue bellevue wa,SQ *JOEY BELLEVUE Bellevue WA,2,April,2025,Miscellaneous,,
