In [224]:
import pandas as pd, numpy as np

In [225]:
import copy
import os
from datetime import datetime, date
from typing import Tuple

In [226]:
# Folder holding the HSBC exports. Later cells append sub-folder names
# directly to this string, so the trailing slash matters.
path = '../../../../MyDocuments/Logs/HSBC/'

# Raw transaction export; cleaned further down by process_transactions().
df = pd.read_csv(f'{path}DCTransactions.csv')

In [227]:
def _strip_tokens(raw: pd.Series, *tokens: str) -> pd.Series:
    """Remove each literal token (no regex interpretation) from a string Series."""
    for token in tokens:
        # regex=False pins literal matching regardless of the pandas version's default.
        raw = raw.str.replace(token, '', regex=False)
    return raw


def _parse_short_date(raw: pd.Series) -> pd.Series:
    """Build timestamps from characters [0:2], [3:6] and the last two of each string.

    Presumably the cells look like 'DD Mon YY'; the two-digit year is always
    prefixed with '20'.
    """
    return pd.to_datetime(raw.str[:2] + '-' + raw.str[3:6] + '-20' + raw.str[-2:])


def process_transactions(df_param: pd.DataFrame) -> pd.DataFrame:
    """Turn the raw statement export into typed transaction rows.

    Each cell has its own column label removed before conversion (presumably
    the export embeds the header text in every cell), thousands separators are
    dropped from the numeric columns, and the statement date is forward-filled
    so every row carries the date of the statement it belongs to.

    Parameters
    ----------
    df_param : pd.DataFrame
        Raw frame with string columns 'Description', 'Amount', 'Balance',
        'Date' and 'Statement Date'. It is not modified.

    Returns
    -------
    pd.DataFrame
        Columns 'StatementDate', 'TransactionDate', 'Payee', 'Amount' and
        'Balance'. 'Balance' is NaN on every row that has an amount, so it is
        only kept on rows without one.
    """
    df_mod = df_param.copy()

    df_mod['Payee'] = _strip_tokens(df_mod['Description'], 'Description')
    df_mod['Amount'] = pd.to_numeric(_strip_tokens(df_mod['Amount'], 'Amount', ','))

    balance = pd.to_numeric(
        _strip_tokens(df_mod['Balance'], 'Balance', ',', ' not applicable')
    )
    # Blank the balance wherever an amount is present.
    df_mod['Balance'] = balance.mask(df_mod['Amount'].notnull())

    df_mod['TransactionDate'] = _parse_short_date(_strip_tokens(df_mod['Date'], 'Date'))

    # Rows without a statement date inherit the last one seen above them.
    df_mod['StatementDate'] = _parse_short_date(df_mod['Statement Date'].ffill())

    return df_mod[['StatementDate', 'TransactionDate', 'Payee', 'Amount', 'Balance']]

In [228]:
# Clean the raw export, then preview the first rows of the result.
df_a = process_transactions(df_param=df)
df_a.head(5)

Unnamed: 0,StatementDate,TransactionDate,Payee,Amount,Balance
0,2020-03-25,2020-02-25,Opening balance this month,,54438.33
1,2020-03-25,2020-02-26,TESCO METRO 2265 CHEAPSIDELON,-2.15,
2,2020-03-25,2020-02-27,CASH SAINSBY FEB27Sainsburys B@18:16,-40.0,
3,2020-03-25,2020-02-28,ASDA SUPERSTORE ISLE OFDOGS,-1.98,
4,2020-03-25,2020-02-28,SAINSBURYS S/MKTS THE CITY-MAN,-2.0,


In [229]:
# Distinct statement dates; each one becomes its own output file below.
statements = df_a['StatementDate'].unique()

In [230]:
# Write one cleaned CSV per statement date into the 'Processed' sub-folder.
# os.path.join picks the right separator for the host OS; the previous
# hard-coded backslash only produced a sub-folder path on Windows.
processed_dir = os.path.join(path, 'Processed')
os.makedirs(processed_dir, exist_ok=True)  # to_csv does not create missing folders

for statement in sorted(statements):
    file_name = 'HSBC_DC_' + pd.to_datetime(statement).strftime('%Y%m%d') + '.csv'
    out_path = os.path.join(processed_dir, file_name)
    print('Printing to [{out}]'.format(out=out_path))
    df_a.loc[df_a['StatementDate'] == statement, :].to_csv(out_path, index=False)

Printing to [../../../../MyDocuments/Logs/HSBC/Processed\HSBC_DC_20140825.csv]
Printing to [../../../../MyDocuments/Logs/HSBC/Processed\HSBC_DC_20140925.csv]
Printing to [../../../../MyDocuments/Logs/HSBC/Processed\HSBC_DC_20141025.csv]
Printing to [../../../../MyDocuments/Logs/HSBC/Processed\HSBC_DC_20141125.csv]
Printing to [../../../../MyDocuments/Logs/HSBC/Processed\HSBC_DC_20141225.csv]
Printing to [../../../../MyDocuments/Logs/HSBC/Processed\HSBC_DC_20150125.csv]
Printing to [../../../../MyDocuments/Logs/HSBC/Processed\HSBC_DC_20150225.csv]
Printing to [../../../../MyDocuments/Logs/HSBC/Processed\HSBC_DC_20150325.csv]
Printing to [../../../../MyDocuments/Logs/HSBC/Processed\HSBC_DC_20150425.csv]
Printing to [../../../../MyDocuments/Logs/HSBC/Processed\HSBC_DC_20150525.csv]
Printing to [../../../../MyDocuments/Logs/HSBC/Processed\HSBC_DC_20150625.csv]
Printing to [../../../../MyDocuments/Logs/HSBC/Processed\HSBC_DC_20150725.csv]
Printing to [../../../../MyDocuments/Logs/HSBC/Proce

In [231]:
# Substring rules as (needle, display name, category), checked against the
# lower-cased payee. Every rule is tried and the LAST match wins. A display
# name of None keeps the original payee text.
_PAYEE_RULES = (
    ('asda', 'Asda', 'Basics: Groceries'),
    ('netflix', 'Netflix', 'Monthly: Streaming'),
    ('amazonprime', 'Amazon Prime', 'Monthly: Streaming'),
    ('tesco', 'Tesco', 'Basics: Groceries'),
    ('sainsburys', 'Sainsburys', 'Basics: Groceries'),
    ('amazon.co.uk', 'Amazon', 'Discretionary: Misc Purchases'),
    ('ee limited', 'EE Limited', 'Basics: Phone'),
    ('just eat', 'Just Eat', 'Discretionary: Entertainment'),
    ('s j p fish', None, 'Basics: Rent'),
)

# Rules that need ALL keywords present. They override the substring rules,
# and the first matching entry wins.
_PAYEE_KEYWORD_RULES = (
    (('pizza', 'hut'), 'Pizza Hut', 'Discretionary: Entertainment'),
    (('papa', 'john'), 'Papa Johns', 'Discretionary: Entertainment'),
)


def transform_payee(in_str: str) -> Tuple[str, str]:
    """Map a raw statement payee to a tidy payee name and a budget category.

    Parameters
    ----------
    in_str : str
        Payee text from the statement. Matching is case-insensitive.

    Returns
    -------
    Tuple[str, str]
        ``(payee, category)``. An unrecognised payee is returned unchanged
        with an empty category. A missing payee (None / NaN, i.e. anything
        that is not a string) yields ``("", "")`` instead of raising.
    """
    if not isinstance(in_str, str):
        return "", ""

    lowered = in_str.lower()
    out_str, category = in_str, ""

    # No early exit on purpose: a later rule overrides an earlier match.
    for needle, display, rule_category in _PAYEE_RULES:
        if needle in lowered:
            out_str = in_str if display is None else display
            category = rule_category

    for keywords, display, rule_category in _PAYEE_KEYWORD_RULES:
        if all(keyword in lowered for keyword in keywords):
            out_str, category = display, rule_category
            break

    return out_str, category

In [232]:
def process_ynab_csv(df_param: pd.DataFrame, statement_date: date) -> pd.DataFrame:
    """Build the YNAB import rows for a single statement.

    Parameters
    ----------
    df_param : pd.DataFrame
        Frame with 'StatementDate', 'TransactionDate', 'Payee' and 'Amount'
        columns. It is not modified.
    statement_date : date
        Statement to export. Coerced with ``pd.Timestamp`` so a
        ``datetime.date``, ``datetime``, ``numpy.datetime64`` or ISO string
        all select the same rows.

    Returns
    -------
    pd.DataFrame
        Columns 'Date' ('YYYY-MM-DD' text), 'Payee', 'Category', 'Memo',
        'Outflow' and 'Inflow'. Rows without an amount are dropped. 'Inflow'
        holds positive amounts and 'Outflow' the absolute value of negative
        ones; the opposite column is an empty string. A zero amount leaves
        both as NaN.
    """
    # Normalise first: a plain datetime.date does not compare equal to
    # datetime64 values in recent pandas versions.
    statement_ts = pd.Timestamp(statement_date)
    wanted = (df_param['StatementDate'] == statement_ts) & df_param['Amount'].notnull()

    cols = ['StatementDate', 'TransactionDate', 'Payee', 'Amount']
    # Explicit copy so the assignments below never touch (or warn about) df_param.
    df_ynab = df_param.loc[wanted, cols].copy()
    amount = df_ynab['Amount']

    df_ynab['Memo'] = ""

    # Object dtype up front: these columns deliberately mix numbers and "".
    inflow = amount.where(amount > 0).astype(object)
    inflow[amount < 0] = ""
    outflow = (amount * -1).where(amount < 0).astype(object)
    outflow[amount > 0] = ""
    df_ynab['Inflow'] = inflow
    df_ynab['Outflow'] = outflow

    df_ynab['Date'] = pd.to_datetime(df_ynab['TransactionDate']).dt.strftime('%Y-%m-%d')

    # Plain lists instead of .apply(pd.Series): also works when no row matched.
    labelled = [transform_payee(payee) for payee in df_ynab['Payee']]
    df_ynab['Payee'] = [name for name, _ in labelled]
    df_ynab['Category'] = [category for _, category in labelled]

    out_cols = ['Date', 'Payee', 'Category', 'Memo', 'Outflow', 'Inflow']
    return df_ynab.loc[:, out_cols]

In [233]:
# Write one YNAB-formatted CSV per statement date into the 'YNAB' sub-folder.
# os.path.join picks the right separator for the host OS; the previous
# hard-coded backslash only produced a sub-folder path on Windows.
ynab_dir = os.path.join(path, 'YNAB')
os.makedirs(ynab_dir, exist_ok=True)  # to_csv does not create missing folders

for statement in sorted(statements):
    file_name = 'HSBC_DC_' + pd.to_datetime(statement).strftime('%Y%m%d') + '.csv'
    out_path = os.path.join(ynab_dir, file_name)
    df_out = process_ynab_csv(df_a, statement)
    print('Printing to [{out}]'.format(out=out_path))
    df_out.to_csv(out_path, index=False)

Printing to [../../../../MyDocuments/Logs/HSBC/YNAB\HSBC_DC_20140825.csv]
Printing to [../../../../MyDocuments/Logs/HSBC/YNAB\HSBC_DC_20140925.csv]
Printing to [../../../../MyDocuments/Logs/HSBC/YNAB\HSBC_DC_20141025.csv]
Printing to [../../../../MyDocuments/Logs/HSBC/YNAB\HSBC_DC_20141125.csv]
Printing to [../../../../MyDocuments/Logs/HSBC/YNAB\HSBC_DC_20141225.csv]
Printing to [../../../../MyDocuments/Logs/HSBC/YNAB\HSBC_DC_20150125.csv]
Printing to [../../../../MyDocuments/Logs/HSBC/YNAB\HSBC_DC_20150225.csv]
Printing to [../../../../MyDocuments/Logs/HSBC/YNAB\HSBC_DC_20150325.csv]
Printing to [../../../../MyDocuments/Logs/HSBC/YNAB\HSBC_DC_20150425.csv]
Printing to [../../../../MyDocuments/Logs/HSBC/YNAB\HSBC_DC_20150525.csv]
Printing to [../../../../MyDocuments/Logs/HSBC/YNAB\HSBC_DC_20150625.csv]
Printing to [../../../../MyDocuments/Logs/HSBC/YNAB\HSBC_DC_20150725.csv]
Printing to [../../../../MyDocuments/Logs/HSBC/YNAB\HSBC_DC_20150825.csv]
Printing to [../../../../MyDocuments/L