In [12]:
import pandas as pd

import re

import os

import warnings

warnings.filterwarnings("ignore")

In [13]:
def clean_desc(text):

    # Convert description to lower case

    text = text.lower()

    # Remove words with numbers, not starting with digit

    text = re.sub(r"\b(?!\d)\w*\d\w*\b", "", text)

    # Remove standalone numbers

    text = re.sub(r"\b\d+\b", "", text)

    # Remove words starting with special chars

    text = re.sub(r"\b[^\w\s]\S*", "", text)

    # Remove leftover special chars (not word chars, not spaces)

    text = re.sub(r"[^\w\s]", "", text)

    # Collapse multiple spaces

    return re.sub(r"\s+", " ", text).strip()



In [14]:
account_data_path = r"C:\Users\rohit\settleking_test\data\raw\account.csv"

credircard_data_path = r"C:\Users\rohit\settleking_test\data\raw\creditcard.csv"

In [15]:
account_df = pd.read_csv(account_data_path)

creditcard_df = pd.read_csv(credircard_data_path)

In [16]:
account_df.head(n = 5)

Unnamed: 0,Date,Amount,Cheque_Number,Description,Transaction_Type
0,2024-07-24,-217.67,,STATE FARM RO 27 SFPP 15 S 1074398715 SV,0
1,2024-07-24,-7500.0,,ONLINE TRANSFER REF #IB0NYCH2M4 TO VISA SIGNAT...,0
2,2024-07-24,-3875.0,,ONLINE TRANSFER TO 9LLABS LLC REF #IB0NYCGZRL ...,0
3,2024-07-18,-330.0,127.0,DEPOSITED OR CASHED CHECK # 127,0
4,2024-07-08,-9975.0,,ONLINE TRANSFER TO 9LLABS LLC REF #IB0NSKMG2J ...,0


In [17]:
creditcard_df.head(n = 5)

Unnamed: 0,Date,Amount,Description,Transaction_Type
0,2025-06-30,-11.02,PONY MAILBOX AND BUSINESSBELLEVUE WA,1
1,2025-06-29,-150.0,DELTA AIR Upgrades SEATTLE WA,1
2,2025-06-29,-6.33,AMAZON MKTPL*N35R76AA2 Amzn.com/billWA,1
3,2025-06-29,-6.33,AMAZON MKTPL*N34153A22 Amzn.com/billWA,1
4,2025-06-29,-13.17,AMAZON MKTPL*N37EB3J32 Amzn.com/billWA,1


In [None]:
account_df["Date"] = pd.to_datetime(account_df["Date"], errors = "coerce")

creditcard_df["Date"] = pd.to_datetime(creditcard_df["Date"], errors = "coerce")

In [19]:
account_df["Month"] = account_df["Date"].dt.month_name()

account_df["Year"] = account_df["Date"].dt.year

account_df["Day"] = account_df["Date"].dt.day

In [20]:
creditcard_df["Month"] = creditcard_df["Date"].dt.month_name()

creditcard_df["Year"] = creditcard_df["Date"].dt.year

creditcard_df["Day"] = creditcard_df["Date"].dt.day

In [21]:
account_df["Cheque_Number"] = account_df["Cheque_Number"].astype("Int64")

In [22]:
account_df["abs_Amount"] = account_df["Amount"].abs()

creditcard_df["abs_Amount"] = account_df["Amount"].abs()

In [23]:
account_df["Cleaned_Desc"] = account_df["Description"].map(lambda x: clean_desc(x))

creditcard_df["Cleaned_Desc"] = creditcard_df["Description"].map(lambda x: clean_desc(x))

In [24]:
# To arrange columns in an order

account_df = account_df[["Date","Amount","abs_Amount","Cheque_Number","Transaction_Type","Cleaned_Desc","Description","Day","Month","Year"]]

creditcard_df = creditcard_df[["Date","Amount","abs_Amount","Transaction_Type","Cleaned_Desc","Description","Day","Month","Year"]]

In [25]:
account_df.head(n = 5)

Unnamed: 0,Date,Amount,abs_Amount,Cheque_Number,Transaction_Type,Cleaned_Desc,Description,Day,Month,Year
0,2024-07-24,-217.67,217.67,,0,state farm ro sfpp s sv,STATE FARM RO 27 SFPP 15 S 1074398715 SV,24,July,2024
1,2024-07-24,-7500.0,7500.0,,0,online transfer ref to visa signature card on,ONLINE TRANSFER REF #IB0NYCH2M4 TO VISA SIGNAT...,24,July,2024
2,2024-07-24,-3875.0,3875.0,,0,online transfer to 9llabs llc ref business che...,ONLINE TRANSFER TO 9LLABS LLC REF #IB0NYCGZRL ...,24,July,2024
3,2024-07-18,-330.0,330.0,127.0,0,deposited or cashed check,DEPOSITED OR CASHED CHECK # 127,18,July,2024
4,2024-07-08,-9975.0,9975.0,,0,online transfer to 9llabs llc ref business che...,ONLINE TRANSFER TO 9LLABS LLC REF #IB0NSKMG2J ...,8,July,2024


In [26]:
creditcard_df.head(n = 5)

Unnamed: 0,Date,Amount,abs_Amount,Transaction_Type,Cleaned_Desc,Description,Day,Month,Year
0,2025-06-30,-11.02,217.67,1,pony mailbox and businessbellevue wa,PONY MAILBOX AND BUSINESSBELLEVUE WA,30,June,2025
1,2025-06-29,-150.0,7500.0,1,delta air upgrades seattle wa,DELTA AIR Upgrades SEATTLE WA,29,June,2025
2,2025-06-29,-6.33,3875.0,1,amazon mktpl amzn,AMAZON MKTPL*N35R76AA2 Amzn.com/billWA,29,June,2025
3,2025-06-29,-6.33,330.0,1,amazon mktpl amzn,AMAZON MKTPL*N34153A22 Amzn.com/billWA,29,June,2025
4,2025-06-29,-13.17,9975.0,1,amazon mktpl amzn,AMAZON MKTPL*N37EB3J32 Amzn.com/billWA,29,June,2025


In [27]:
os.makedirs("../data/processed", exist_ok = True)

In [28]:
account_df.to_csv("../data/processed/account_activity.csv",index = False,encoding = "utf-8",date_format = "%Y-%m-%d")

creditcard_df.to_csv("../data/processed/credit_card.csv",index = False,encoding = "utf-8",date_format = "%Y-%m-%d")