In [1]:
import pandas as pd
from transaction_keywords import expense_keywords, income_keywords

In [None]:
# TODO Include SLM for unknown transaction
def categorize_transactions(df, expense_map=None, income_map=None):
    """Assign a keyword-based category to every transaction in ``df``.

    The ``Name`` column is first cleaned by replacing ``*`` and ``/`` with
    spaces. Each row is then matched, case-insensitively, against the keyword
    table selected by its ``IsCredit`` value: the expense table for ``debit``
    rows and the income table for ``credit`` rows. The first category (in
    table order) that has a keyword contained in the description wins.

    Args:
        df: DataFrame with a ``Name`` column (description text) and an
            ``IsCredit`` column holding ``"Debit"`` / ``"Credit"`` (any case,
            surrounding whitespace ignored).
        expense_map: Optional ``{category: [keywords]}`` mapping used for
            debit rows. Defaults to the imported ``expense_keywords``.
        income_map: Optional ``{category: [keywords]}`` mapping used for
            credit rows. Defaults to the imported ``income_keywords``.

    Returns:
        The same ``df`` object, modified in place: ``Name`` is cleaned and a
        ``Category`` column is added. Rows without a keyword match get
        ``'Uncategorized Expense'`` or ``'Uncategorized Income'``; rows whose
        ``IsCredit`` is missing or is neither debit nor credit get
        ``'Uncategorized'``.
    """
    # Fall back to the keyword tables imported at the top of the notebook.
    if expense_map is None:
        expense_map = expense_keywords
    if income_map is None:
        income_map = income_keywords

    def lowered(keyword_map):
        # Lower-case every keyword once up front instead of once per row.
        return [
            (category, [keyword.lower() for keyword in keywords])
            for category, keywords in keyword_map.items()
        ]

    # transaction type -> (keyword table, label used when nothing matches)
    rules = {
        'debit': (lowered(expense_map), 'Uncategorized Expense'),
        'credit': (lowered(income_map), 'Uncategorized Income'),
    }

    # Preprocess the 'Name' column to remove special characters
    df['Name'] = df['Name'].str.replace('*', ' ', regex=False).str.replace('/', ' ', regex=False)

    def assign_category(description, transaction_type):
        # Missing values arrive as NaN/None rather than str, so guard before
        # calling string methods on them.
        if not isinstance(transaction_type, str):
            return 'Uncategorized'
        rule = rules.get(transaction_type.strip().lower())
        if rule is None:
            return 'Uncategorized'
        table, fallback = rule
        text = description.lower() if isinstance(description, str) else ''
        for category, keywords in table:
            if any(keyword in text for keyword in keywords):
                return category
        return fallback

    # Apply categorization. A plain comprehension over the two columns avoids
    # building a Series per row and also works when the frame has no rows.
    df['Category'] = [
        assign_category(name, kind)
        for name, kind in zip(df['Name'], df['IsCredit'])
    ]
    return df

In [None]:
# Read the data: one raw CSV export per bank (ING first, then Revolut)
df_ing, df_revolut = [
    pd.read_csv(statement_path)
    for statement_path in (
        "../data/raw/ing_bank_statement_sample.csv",
        "../data/raw/revolut_bank_statement_sample.csv",
    )
]


# Clean the data
## ING
# Column information -- what happens to each column of the raw ING export:
#
#     'Date',                -> Year/ Month/ Day
#     'Name / Description',  -> Name
#     'Account',             -> Remove
#     'Counterparty',        -> Remove
#     'Code',                -> Remove
#     'Debit/credit',        -> Debit/Credit -> Positive/Negative
#     'Amount (EUR)',        -> Keep
#     'Transaction type',    -> Remove
#     'Notifications'        -> Remove

# Columns kept from the ING export
col_for_ING = [
    "Date",
    "Name / Description",
    "Debit/credit",
    "Amount (EUR)",
]
# ING column name -> shared column name used after the merge
col_rename_dict_ING = {
    "Name / Description": "Name",
    "Debit/credit": "IsCredit",
    "Amount (EUR)": "Amount",
}

# Keep only the needed columns and give them the shared names so this frame
# can later be merged with the other data source
df_ing = df_ing.loc[:, col_for_ING].rename(columns=col_rename_dict_ING)

## Revolut
# Column information -- what happens to each column of the raw Revolut export:
#
#     'Type',           -> Remove
#     'Product',        -> Remove
#     'Started Date',   -> Remove
#     'Completed Date', -> Remove
#     'Description',    -> Name
#     'Amount',         -> Keep
#     'Fee',            -> Remove
#     'Currency',       -> Remove (So far only Euro)
#     'State',          -> Remove
#     'Balance'         -> Remove
#
# NOTE(review): the table marks 'Completed Date' as removed, but the code
# below keeps it and renames it to 'Date'.

# Columns kept from the Revolut export
col_for_revolut = [
    "Description",
    "Amount",
    "Completed Date",
]
# Revolut column name -> shared column name used after the merge.
# (The variable keeps its original spelling in case other cells refer to it.)
col_rename_dict_Revlout = {
    "Description": "Name",
    "Completed Date": "Date",
}

# Keep only the needed columns and give them the shared names
df_revolut = df_revolut.loc[:, col_for_revolut].rename(columns=col_rename_dict_Revlout)

## ING, Revolut (check that the columns match)

# Transform the data

## ING
### Date -> Year/ Month/ Day
# ING dates are parsed with the compact YYYYMMDD format.
df_ing["Date"] = pd.to_datetime(df_ing["Date"], format="%Y%m%d")
df_ing["Year"] = df_ing["Date"].dt.year
df_ing["Month"] = df_ing["Date"].dt.month
df_ing["Day"] = df_ing["Date"].dt.day

### Amount -> 100,00 => 100.00
# Vectorized replace + cast instead of a Python-level call per row; missing
# amounts stay NaN rather than raising.
df_ing["Amount"] = (
    df_ing["Amount"].str.replace(",", ".", regex=False).astype(float)
)

### Amount + IsCredit -> Amount (Positive/Negative)
# Debits become negative, everything else keeps its sign. The label is
# stripped and lower-cased first, the same normalization that
# categorize_transactions applies, so " debit" / "DEBIT" are handled too.
ing_is_debit = df_ing["IsCredit"].astype(str).str.strip().str.lower().eq("debit")
df_ing["Amount"] = df_ing["Amount"].mask(ing_is_debit, -df_ing["Amount"])

### Create Source column -> Indicate where the data is from
df_ing["Source"] = "ING"

## Revolut

### Date -> Year/ Month/ Day
df_revolut["Date"] = pd.to_datetime(df_revolut["Date"])
# Nullable Int32 allows NA, so rows without a date do not break the cast
df_revolut["Year"] = df_revolut["Date"].dt.year.astype("Int32")
df_revolut["Month"] = df_revolut["Date"].dt.month.astype("Int32")
df_revolut["Day"] = df_revolut["Date"].dt.day.astype("Int32")

### Create IsCredit column
# Negative amounts are debits; everything else (zero included) is a credit.
# Done as a vectorized assignment instead of a row-wise apply.
df_revolut["IsCredit"] = "Credit"
df_revolut.loc[df_revolut["Amount"] < 0, "IsCredit"] = "Debit"

### Create Source column -> Indicate where the data is from
df_revolut["Source"] = "Revolut"


# Reorder columns
# Both sources are put into one shared layout so they can be stacked directly.
output_columns = ["Year", "Month", "Day", "Name", "Amount", "IsCredit", "Source"]
df_ing = df_ing[output_columns]
df_revolut = df_revolut[output_columns]


# Merge two data sources
# ignore_index=True gives the merged frame a fresh 0..n-1 index. Without it the
# row labels of the two sources repeat, and those duplicates end up in the
# index column of the CSV written below.
df = pd.concat([df_ing, df_revolut], axis=0, ignore_index=True)

# Assign category
df = categorize_transactions(df)


# Output file
# The index is still written as the first CSV column, as before.
df.to_csv("../data/processed/cleaned_transactions.csv")
