In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from enum import Enum
import os
import glob
import pdb
import warnings
from utils import *
warnings.filterwarnings('ignore')

TODO: Write a script to combine new finance data with old data

In [4]:
# Directory (relative to the working directory) that bank-export CSV files are read from.
INPUT_DATA_DIR = 'data'
# Directory (relative to the working directory) that the filtered CSV is written to.
OUTPUT_DATA_DIR = 'output'

# TODO: Add an emoji after each enum value so entries can be identified quickly at a glance

# * Don't keep a separate _notes.csv; keep only one copy instead.
# * Keep the original file names, which mention the account number.
# * For each CSV, print a summary of the earliest and latest dates to verify that all data was loaded.

class AccHolder(Enum):
    """Account owners; each value is the lowercase label written to the 'AccHolder' column."""
    SACHIN = 'sachin'
    NOWRIN = 'nowrin'
    JOINT = 'joint'

class BankName(Enum):
    """Banks known to the notebook; each value is the lowercase label written to the 'BankName' column."""
    PNC = 'pnc'
    DISCOVER = 'discover'
    CHASE = 'chase'
    
class AccType(Enum):
    """Account types; each value is the lowercase label written to the 'AccType' column."""
    CHECKING = 'checking'
    CREDIT = 'credit'
    SAVINGS = 'savings'

In [11]:
# Chase account codes per account holder and account type. Chase.load_data()
# looks for files named 'Chase<code>_Activity*.CSV' in INPUT_DATA_DIR for each code.
# A value is either a single code string or a list of codes (several accounts of
# the same type); an account type the holder does not have is simply absent.
CHASE_ACC_MAP = {
    AccHolder.SACHIN: {
        AccType.CHECKING: "5231",
        AccType.SAVINGS: "1756"
    },
    AccHolder.NOWRIN: {
        AccType.CHECKING: "7138",
        AccType.SAVINGS: "2267",
        AccType.CREDIT: ["6562", "7345"]
    },
    AccHolder.JOINT: {
        AccType.CHECKING: "7386",
        AccType.CREDIT: "6611"
    }
}

In [92]:
class Bank:
    """Base class for one (owner, account type, bank) statement source.

    Subclasses override ``load_data`` to put the raw export into ``self.data``
    and may extend ``clean_data`` to map bank-specific columns onto the
    'Date' and 'Category' columns that the base implementation expects.
    """

    # Columns kept (in this order) by clean_data().
    INTERESTED_COLUMNS = ['Date', 'Description', 'Amount', 'AutoCategory',
                          'AccHolder', 'BankName', 'AccType']

    def __init__(self, owner, acc_type, bank_name):
        """Store the identifying enums; no data is read at construction time."""
        self.owner = owner
        self.acc_type = acc_type
        self.bank_name = bank_name
        self.data = None  # None means "nothing loaded yet"

    def is_data_loaded(self):
        """Return True once ``self.data`` holds a frame."""
        return self.data is not None

    def get_data(self):
        """Return the current frame (raw or cleaned), or None if nothing is loaded."""
        return self.data

    def load_data(self):
        """Hook for subclasses: read the raw export into ``self.data``."""
        pass

    def clean_data(self):
        """Normalise ``self.data`` to the common column layout.

        Parses 'Date', renames 'Category' to 'AutoCategory', tags every row
        with the owner / bank / account-type labels and keeps only
        ``INTERESTED_COLUMNS``. Does nothing when no data is loaded, which
        matches the guard in ``summary``. The raw frame is left untouched;
        ``self.data`` is rebound to a new frame.
        """
        if self.data is None:
            return

        # rename() without inplace returns a new frame, so the column
        # assignments below never write into the caller's raw data.
        data = self.data.rename(columns={"Category": "AutoCategory"})
        data['Date'] = pd.to_datetime(data['Date'])
        data['AccHolder'] = self.owner.value
        data['BankName'] = self.bank_name.value
        data['AccType'] = self.acc_type.value

        # copy() so later edits to self.data are not writes into a slice.
        self.data = data[self.INTERESTED_COLUMNS].copy()

    def summary(self):
        """Print the account label, covered date range and row count."""
        if self.data is None:
            print("Error: Data not loaded")
            return
        no_of_entries = len(self.data)
        oldest_date = self.data['Date'].dt.date.min()
        newest_date = self.data['Date'].dt.date.max()

        print(f'{self.bank_name.value.upper()} | {self.owner.value.upper()} | {self.acc_type.value.upper()}')
        print(f'{oldest_date} -> {newest_date}')
        print('Total Entries:', no_of_entries)
        print()

class Chase(Bank):
    """Chase statement source; export files are located through ``CHASE_ACC_MAP``."""

    def __init__(self, owner, acc_type):
        super().__init__(owner, acc_type, BankName.CHASE)

    def load_data(self):
        """Read the export file(s) for this owner / account type into ``self.data``.

        For every account code listed in ``CHASE_ACC_MAP`` the first file
        matching ``Chase<code>_Activity*.CSV`` in ``INPUT_DATA_DIR`` is read,
        and the frames are concatenated. A code without a matching file is
        reported and skipped instead of raising. If no file could be read at
        all, ``self.data`` is set to None so ``is_data_loaded()`` is False.
        Returns None in every case.
        """
        # .get() chain: an owner or account type missing from the map means
        # "no such account", not an error.
        codes = CHASE_ACC_MAP.get(self.owner, {}).get(self.acc_type)
        if not codes:
            return None
        if isinstance(codes, str):
            codes = [codes]

        dataframes = []
        for cd in codes:
            filename = f'Chase{cd}_Activity*.CSV'
            # Sorted so the file picked below does not depend on directory order.
            matched_files = sorted(glob.glob(f'{INPUT_DATA_DIR}/{filename}'))
            if not matched_files:
                print(f"Error: File missing for code: {cd}")
                continue  # nothing to read for this code
            if len(matched_files) > 1:
                print(f"Error: Multiple files for code: {cd}")
            dataframes.append(pd.read_csv(matched_files[0], index_col=False))

        # pd.concat raises on an empty list, so only concatenate when something was read.
        self.data = pd.concat(dataframes, ignore_index=True) if dataframes else None

    def clean_data(self):
        """Map Chase-specific columns onto 'Date' / 'Category', then run the base cleaning."""
        data = self.data
        if data is None:
            return
        if self.acc_type == AccType.CREDIT:
            data['Date'] = data['Transaction Date']
        else:
            data['Date'] = data['Posting Date']
            # Blank category for non-credit accounts
            # (presumably those exports carry no Category column — TODO confirm).
            data['Category'] = ''

        super().clean_data()

class Discover(Bank):
    """Placeholder for Discover accounts; loading is not implemented yet."""

    def load_data(self):
        # Stub: returns a literal string and never assigns self.data.
        return "Meow!"

# Smoke-run a single account: load, clean, print its summary.
# Alternative account to try:
# chase = Chase(owner=AccHolder.SACHIN, acc_type=AccType.CHECKING)

chase = Chase(owner=AccHolder.NOWRIN, acc_type=AccType.CREDIT)
chase.load_data()
chase.clean_data()
chase.summary()

CHASE | NOWRIN | CREDIT
2023-01-06 -> 2023-08-17
Total Entries: 94



In [22]:
def preprocess_data(filename):
    """Load one '<holder>_<bank>_<type>_*.csv' file and normalise its columns.

    Parameters
    ----------
    filename : str
        File name (not a path) inside ``INPUT_DATA_DIR``. Its first three
        underscore-separated parts must be valid ``AccHolder``, ``BankName``
        and ``AccType`` values.

    Returns
    -------
    pandas.DataFrame or None
        Frame with the columns Date, Description, Amount, AutoCategory,
        AccHolder, BankName, AccType. ``None`` is returned, after printing a
        message, when the file name cannot be parsed, so a caller can skip the
        file instead of crashing later on.
    """
    parts = filename.split('_')
    try:
        account_holder = AccHolder(parts[0])
        bank_name = BankName(parts[1])
        account_type = AccType(parts[2])
    except (IndexError, ValueError):
        # IndexError: fewer than three parts; ValueError: unknown enum value.
        print("Invalid filename:", filename)
        return None

    data = pd.read_csv(f'{INPUT_DATA_DIR}/{filename}', index_col=False)

    if bank_name == BankName.PNC:
        # Build a signed amount: withdrawals negative, deposits positive.
        data['Amount'] = 0.0
        data['Amount'] -= convert_currency_to_num(data['Withdrawals'])
        data['Amount'] += convert_currency_to_num(data['Deposits'])
    if bank_name == BankName.CHASE:
        if account_type == AccType.CREDIT:
            data['Date'] = data['Transaction Date']
        else:
            data['Date'] = data['Posting Date']
            data['Category'] = ''
    if bank_name == BankName.DISCOVER:
        # Sign is flipped for Discover
        # (presumably charges are exported as positive numbers — TODO confirm).
        data['Amount'] *= -1
        data['Date'] = data['Trans. Date']

    data['Date'] = pd.to_datetime(data['Date'])
    data = data.rename(columns={"Category": "AutoCategory"})
    data['AccHolder'] = account_holder.value
    data['BankName'] = bank_name.value
    data['AccType'] = account_type.value

    interested_columns = ['Date', 'Description', 'Amount', 'AutoCategory', 'AccHolder',
                          'BankName', 'AccType']
    return data[interested_columns]


In [94]:
def get_obj(owner, bank, acc_type):
    """Return the loader object for the given account, or None if the bank has no loader.

    Only Chase is wired up; every other bank yields None.
    """
    if bank != BankName.CHASE:
        return None
    return Chase(owner=owner, acc_type=acc_type)

def merge_all_data():
    """Load, clean and combine every account that has a loader and data.

    Iterates over all AccHolder x BankName x AccType combinations, skips those
    for which ``get_obj`` returns nothing or no data gets loaded, prints a
    summary for each loaded account, and returns one frame sorted by
    'Date' then 'Amount' with missing 'AutoCategory' values replaced by ''.

    Raises
    ------
    ValueError
        If no account produced any data.
    """
    data_list = []
    for owner in AccHolder:
        for bank in BankName:
            for acc_type in AccType:
                obj = get_obj(owner, bank, acc_type)
                if obj is None:
                    continue
                obj.load_data()
                if not obj.is_data_loaded():
                    continue
                obj.clean_data()
                obj.summary()
                data_list.append(obj.get_data())

    if not data_list:
        # pd.concat would raise the same exception type with a far less helpful message.
        raise ValueError(f"No account data could be loaded from '{INPUT_DATA_DIR}'")

    full_data = pd.concat(data_list).sort_values(by=['Date', 'Amount'])
    # Assign back instead of calling fillna(inplace=True) on the column: the chained
    # in-place form acts on a temporary and has no effect under Copy-on-Write.
    full_data['AutoCategory'] = full_data['AutoCategory'].fillna('')
    return full_data

# Build the combined transaction table and preview its first rows.
data = merge_all_data()
data.head()

CHASE | SACHIN | CHECKING
2022-08-23 -> 2023-08-15
Total Entries: 107

CHASE | SACHIN | SAVINGS
2022-08-23 -> 2023-08-01
Total Entries: 11

CHASE | NOWRIN | CHECKING
2022-08-23 -> 2023-08-22
Total Entries: 130

CHASE | NOWRIN | CREDIT
2023-01-06 -> 2023-08-17
Total Entries: 94

CHASE | NOWRIN | SAVINGS
2022-08-23 -> 2023-08-15
Total Entries: 27

CHASE | JOINT | CHECKING
2023-03-20 -> 2023-08-22
Total Entries: 104

CHASE | JOINT | CREDIT
2023-03-25 -> 2023-08-20
Total Entries: 269



Unnamed: 0,Date,Description,Amount,AutoCategory,AccHolder,BankName,AccType
106,2022-08-23,DEPOSIT ID NUMBER 769345,25.0,,sachin,chase,checking
10,2022-08-23,DEPOSIT ID NUMBER 769346,25.0,,sachin,chase,savings
129,2022-08-23,DEPOSIT ID NUMBER 769347,25.0,,nowrin,chase,checking
26,2022-08-23,DEPOSIT ID NUMBER 769348,25.0,,nowrin,chase,savings
105,2022-09-16,WITHDRAWAL 09/16,-0.02,,sachin,chase,checking


In [23]:
def _merge_all_data():
    """File-name-driven merge: combine every '*.csv' in INPUT_DATA_DIR via preprocess_data.

    Returns one frame sorted by 'Date' then 'Amount' with missing
    'AutoCategory' values replaced by ''.

    Raises
    ------
    ValueError
        If no file produced any data.
    """
    # Sorted so the processing order does not depend on directory order.
    csv_files = sorted(name for name in os.listdir(INPUT_DATA_DIR) if name.endswith('.csv'))
    frames = [preprocess_data(name) for name in csv_files]
    # Skip anything that is not a frame (e.g. a file that was rejected).
    frames = [frame for frame in frames if frame is not None]
    if not frames:
        raise ValueError(f"No '.csv' data could be loaded from '{INPUT_DATA_DIR}'")

    full_data = pd.concat(frames).sort_values(by=['Date', 'Amount'])
    # Assign back instead of calling fillna(inplace=True) on the column: the chained
    # in-place form acts on a temporary and has no effect under Copy-on-Write.
    full_data['AutoCategory'] = full_data['AutoCategory'].fillna('')
    return full_data

In [68]:
# NOTE(review): the output recorded below (Name:/Value: lines) is not what the
# merge_all_data defined in this notebook prints — it looks like a stale run; re-execute.
data = merge_all_data()

Name: SACHIN
Value: sachin
Name: SACHIN
Value: sachin
Name: SACHIN
Value: sachin
Name: SACHIN
Value: sachin
Name: SACHIN
Value: sachin
Name: SACHIN
Value: sachin
Name: SACHIN
Value: sachin
Name: SACHIN
Value: sachin
Name: SACHIN
Value: sachin
Name: NOWRIN
Value: nowrin
Name: NOWRIN
Value: nowrin
Name: NOWRIN
Value: nowrin
Name: NOWRIN
Value: nowrin
Name: NOWRIN
Value: nowrin
Name: NOWRIN
Value: nowrin
Name: NOWRIN
Value: nowrin
Name: NOWRIN
Value: nowrin
Name: NOWRIN
Value: nowrin
Name: JOINT
Value: joint
Name: JOINT
Value: joint
Name: JOINT
Value: joint
Name: JOINT
Value: joint
Name: JOINT
Value: joint
Name: JOINT
Value: joint
Name: JOINT
Value: joint
Name: JOINT
Value: joint
Name: JOINT
Value: joint


In [25]:
# Month to export.
YEAR = 2023
MONTH = 6
# filter_data_by_date is not defined in this notebook
# (presumably provided by `from utils import *` — TODO confirm).
data = filter_data_by_date(data, YEAR, MONTH)
# to_csv does not create missing directories, so make sure the target exists first.
os.makedirs(OUTPUT_DATA_DIR, exist_ok=True)
data.to_csv(f'{OUTPUT_DATA_DIR}/{YEAR}_{MONTH}.csv', index=False)

In [26]:
# Preview the first 20 rows of `data`.
data.head(20)

Unnamed: 0,Date,Description,Amount,AutoCategory,AccHolder,BankName,AccType
55,2023-06-01,KROGER 605,-23.8,Groceries,joint,chase,credit
53,2023-06-01,DD DOORDASH MARCOSPIZ,-15.9,Food & Drink,joint,chase,credit
46,2023-06-01,TST* Jojos Shake Bar - D,-15.88,Food & Drink,joint,chase,credit
49,2023-06-01,PATEL BROTHERS OF ANN ARB,-12.25,Groceries,joint,chase,credit
56,2023-06-01,KROGER 605,-4.69,Groceries,joint,chase,credit
57,2023-06-01,TST* PICASSO NCRC,-2.99,Food & Drink,joint,chase,credit
51,2023-06-01,MARKET@WORK 2067379149,-2.29,Food & Drink,joint,chase,credit
26,2023-06-02,Zelle payment to Smit Kothari 17521925843,-606.72,,joint,chase,checking
27,2023-06-02,Online Transfer to CHK ...5231 transaction#: 1...,-50.0,,joint,chase,checking
54,2023-06-02,DOORDASH DASHPASS,-4.99,Food & Drink,joint,chase,credit
