In [1]:
import os

# Define constants
INPUT_DIR = "data/input"
OUTPUT_DIR = "data/output"
INSTITUTION = "Kotak"

# Password used to open the statement PDFs (CRN).
# Prefer supplying it through the KOTAK_STATEMENT_PASSWORD environment
# variable so the real value is never saved inside the notebook; "123" is
# only a placeholder and is rejected below.
STATEMENT_PASSWORD = os.environ.get("KOTAK_STATEMENT_PASSWORD", "123")  # CRN

# Refuse to continue with the placeholder password.
if STATEMENT_PASSWORD == "123":
    raise Exception("Please change password to actual one")

# Ensure output dir exists. exist_ok=True avoids the check-then-create race
# and raises early if OUTPUT_DIR exists but is not a directory.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [2]:
# Load common functions.
# The helper module is loaded directly from its file path (relative to the
# current working directory) and bound to the name `common_lib`.
# NOTE(review): `load_module()` is documented as deprecated in the importlib
# docs in favour of `exec_module()` — confirm against the Python version in use.
from importlib.machinery import SourceFileLoader

common_lib = SourceFileLoader("common_lib", "../common_lib/common_functions.py").load_module()

# Pull every public name from the loaded module into the notebook namespace.
# Presumably this is where get_all_pdf_paths, get_pdf_reader and write_pdf
# (called unqualified in later cells) come from — verify against the module.
from common_lib import *

In [3]:
from datetime import datetime

def extract_date_old(date_str):
    """Parse the statement date from an old-format "Date AsOn " line.

    Takes the text after the last "Date AsOn " marker (or the whole string
    when the marker is absent), strips surrounding whitespace and parses it
    as day-abbreviated month-two digit year, e.g. "05-Jan-23".

    Returns a ``datetime.date``; ``datetime.strptime`` raises ``ValueError``
    when the remaining text does not match that format.
    """
    _, _, tail = date_str.rpartition("Date AsOn ")
    parsed = datetime.strptime(tail.strip(), "%d-%b-%y")
    return parsed.date()

def extract_date_new(date_str):
    """Parse the statement date from a new-format summary line.

    Takes the text after the last "Summary of all your assets and
    liabilities as on" marker (or the whole string when the marker is
    absent), strips surrounding whitespace and parses it as
    "day abbreviated-month, four-digit-year", e.g. "31 Mar, 2023".

    Returns a ``datetime.date``; ``datetime.strptime`` raises ``ValueError``
    when the remaining text does not match that format.
    """
    _, _, tail = date_str.rpartition("Summary of all your assets and liabilities as on")
    parsed = datetime.strptime(tail.strip(), "%d %b, %Y")
    return parsed.date()

In [4]:
import re

def extract_balance_old(line):
    """Return the trailing amount on an old-format balance line as a float.

    The " (Cr)" suffix is removed, the last space-separated token is taken,
    and thousands-separator commas are dropped before conversion.
    ``float`` raises ``ValueError`` if that token is not numeric.
    """
    without_suffix = line.replace(" (Cr)", "")
    last_token = without_suffix.rsplit(" ", 1)[-1].strip()
    digits = last_token.replace(",", "")
    return float(digits)

def extract_balance_new(line):
    """Return the savings balance from a new-format summary line.

    The line must start with "Savings Account" and contain
    "Closing balance" after the amount. The first decimal amount following
    "Savings Account" is captured; it may contain any number of comma
    separators (e.g. "500.00", "12,345.67", "1,23,456.78").

    Returns the amount as a float, or ``None`` when the line does not match.
    """
    # \d[\d,]*\.\d+ accepts zero or more comma groups, so lakh-style amounts
    # are captured whole instead of only their last two groups.
    pattern = re.compile(r"Savings Account.*?(\d[\d,]*\.\d+).*Closing balance")
    m = pattern.match(line)
    if m is None:
        return None
    # group(1) is the captured amount; group(0) would be the entire matched
    # text and can never be converted to a float.
    return float(m.group(1).replace(",", ""))

In [5]:
import re

def extract_balances(page_text):
    """Extract the statement date and balance rows from one page of text.

    Each line of ``page_text`` is classified as:
      * a date line (old "Date AsOn " or new "Summary of all your assets and
        liabilities as on" format), which sets the statement date;
      * a savings line ("Savings Account (s)" or a "Closing Balance ... (Cr)"
        line), or a deposit line ("Term Deposit (s)"), whose trailing amount
        is read with ``extract_balance_old``.

    Returns a two-element list ``[statement_date, rows]`` where ``rows`` are
    CSV fragments "<iso date>,<INSTITUTION>,<type>,<balance>,,".
    If no date line is found the result is ``[None, []]`` so the caller can
    report the file as unparseable instead of hitting an AttributeError.
    """
    balance_rows = []
    statement_date = None

    for line in page_text.splitlines():
        # Named account_type rather than `type` to avoid shadowing the builtin.
        account_type = None
        if "Date AsOn " in line:
            statement_date = extract_date_old(line)
            continue
        elif "Summary of all your assets and liabilities as on" in line:
            statement_date = extract_date_new(line)
            continue
        elif "Savings Account (s)" in line or re.search(r"Closing Balance.*?(\d+,\d+\.\d+).*\(Cr\)", line):
            account_type = "Savings"
        elif "Term Deposit (s)" in line:
            account_type = "Deposit"

        if account_type is not None:
            inr_balance = extract_balance_old(line)
            # Trailing empty fields are the "Balance USD" and "Comments" columns.
            balance_rows.append(f"{account_type},{inr_balance:.2f},,")

    # Without a date the rows cannot be keyed; signal that to the caller.
    if statement_date is None:
        return [None, []]

    row_prefix = f"{statement_date.isoformat()},{INSTITUTION}"
    final_output = [f"{row_prefix},{row}" for row in balance_rows]
    return [statement_date, final_output]

In [8]:
def generate_monthly_balances():
    """Collect balance rows from every statement PDF under INPUT_DIR.

    For each path returned by ``get_all_pdf_paths(INPUT_DIR)`` a reader is
    obtained via ``get_pdf_reader`` with STATEMENT_PASSWORD, and only the
    first page's text is passed to ``extract_balances``. Files with no
    parseable date or no balance rows are reported with ``print`` and
    skipped. For the rest, the rows are accumulated and the reader is handed
    to ``write_pdf`` with a path of the form
    "<OUTPUT_DIR>/<INSTITUTION>-Statement_<YYYY>-<MM>.pdf".

    Returns the accumulated list of CSV row strings.
    """
    collected_rows = []

    for pdf in get_all_pdf_paths(INPUT_DIR):
        reader = get_pdf_reader(pdf, STATEMENT_PASSWORD)
        first_page_text = reader.pages[0].extract_text()
        statement_date, rows = extract_balances(first_page_text)

        if statement_date is None:
            print("Date not parseable for file ", pdf)
            continue
        if not rows:
            print("Output not present for date ", statement_date.isoformat(), " file ", pdf)
            continue

        collected_rows.extend(rows)

        # File name is keyed by statement year and zero-padded month.
        output_path = f"{OUTPUT_DIR}/{INSTITUTION}-Statement_{statement_date.year}-{statement_date.month:02}.pdf"
        write_pdf(reader, output_path)

    return collected_rows

In [None]:
# Gather all balance rows; rows begin with an ISO date, so a reverse
# lexicographic sort puts the most recent statement first.
output = generate_monthly_balances()
output.sort(reverse=True)

# Write the header followed by one row per line.
with open(f"{OUTPUT_DIR}/{INSTITUTION}-Monthly-Balances.csv", "w") as f:
    f.write("Date,Institution,Account Type,Balance INR,Balance USD,Comments\n")
    f.writelines(f"{line}\n" for line in output)