In [2]:
import os

# Define constants
INPUT_DIR = "data/input"
OUTPUT_DIR = "data/output"
INSTITUTION = "SBI"

# Password that opens the statement PDFs:
# last 5 digits of mobile + birth-date in DDMMYY format.
# It is read from the STATEMENT_PASSWORD environment variable when that is set,
# so the real value never has to be saved inside the notebook. Replacing the
# "123" placeholder default below still works as before.
STATEMENT_PASSWORD = os.environ.get("STATEMENT_PASSWORD", "123")

# Fail fast while the placeholder is still in place.
if STATEMENT_PASSWORD == "123":
    raise Exception("Please change password to actual one")

# Ensure output dir exists (exist_ok avoids the check-then-create race)
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [3]:
# load common functions
from importlib.machinery import SourceFileLoader

# Load the shared helper module directly from its file path.
# Loader.load_module() is deprecated, so build a module spec from the loader
# and run the module with exec_module() instead.
import importlib.util
import sys

_common_lib_loader = SourceFileLoader("common_lib", "../common_lib/common_functions.py")
_common_lib_spec = importlib.util.spec_from_loader("common_lib", _common_lib_loader)
common_lib = importlib.util.module_from_spec(_common_lib_spec)
# Register the module before executing it so that the star-import below
# resolves "common_lib" to this object.
sys.modules["common_lib"] = common_lib
_common_lib_loader.exec_module(common_lib)

from common_lib import *

In [4]:
from datetime import datetime

def extract_date(date_str):
    """Parse the statement date from a line such as "As on 30-04-22".

    Everything after the last "As on " marker (or the whole string when the
    marker is absent) is stripped of surrounding whitespace and parsed as
    DD-MM-YY.

    Args:
        date_str: Text containing the date.

    Returns:
        datetime.date: The parsed date.

    Raises:
        ValueError: If the remaining text is not a DD-MM-YY date.
    """
    _, _, tail = date_str.rpartition("As on ")
    parsed = datetime.strptime(tail.strip(), "%d-%m-%y")
    return parsed.date()

In [5]:
import re

def extract_balance(line):
    """Return the first monetary amount found in ``line`` as a float.

    An amount is a run of digits, optionally split into comma-separated
    groups, followed by a decimal point and exactly two digits, e.g.
    ``18871.99`` or ``1,88,871.99``. Commas are removed before conversion so
    a grouped amount is not cut down to its last group.

    Args:
        line: A text line expected to contain a balance.

    Returns:
        float: The first amount in the line.

    Raises:
        IndexError: If the line contains no amount.
    """
    match = re.search(r"\d+(?:,\d+)*\.\d{2}", line)
    if match is None:
        raise IndexError(f"no balance amount found in line: {line!r}")
    # float() does not accept thousands separators, so drop them first.
    return float(match.group(0).replace(",", ""))

In [6]:
import re

def extract_balances(page_text):
    """Extract the statement date and per-account balance rows from page text.

    Lines starting with "As on " set the statement date (the last such line
    wins). Lines starting with one of the known account prefixes contribute a
    balance row for that account type. The date line may appear before or
    after the account lines, because rows are only formatted once the whole
    page has been scanned.

    Args:
        page_text: Text of one statement page.

    Returns:
        list: ``[statement_date, rows]``. ``rows`` is a list of strings of the
        form ``"<ISO date>,<INSTITUTION>,<account type>,<balance>,,"``.
        When no date line is found the result is ``[None, []]`` so the caller
        can report the file instead of hitting an AttributeError on ``None``.
    """
    # Line prefix -> account type label used in the output rows.
    account_prefixes = (
        ("SAVING ACCOUNT INR ", "Savings"),
        ("TERM DEPOSIT INR ", "Deposit"),
        ("PPF INR ", "PPF"),
    )

    statement_date = None
    balances = []  # (account_type, balance) pairs in page order

    for line in page_text.splitlines():
        if line.startswith("As on "):
            statement_date = extract_date(line)
            continue

        for prefix, account_type in account_prefixes:
            if line.startswith(prefix):
                balances.append((account_type, extract_balance(line)))
                break

    # Without a date the rows cannot be labelled; signal that to the caller.
    if statement_date is None:
        return [None, []]

    iso_date = statement_date.isoformat()
    rows = [
        f"{iso_date},{INSTITUTION},{account_type},{balance:.2f},,"
        for account_type, balance in balances
    ]
    return [statement_date, rows]

In [11]:
def generate_monthly_balances():
    """Collect balance rows from every statement PDF found under INPUT_DIR.

    Each PDF is opened with STATEMENT_PASSWORD and the text of its first page
    is handed to extract_balances(). A file with no parseable date, or with no
    balance rows, is reported on stdout and skipped. For every other file the
    rows are collected and the reader is passed to write_pdf() with a
    year-month based path inside OUTPUT_DIR.

    Returns:
        list: Balance rows of all accepted statements, in processing order.
    """
    collected_rows = []

    for pdf in get_all_pdf_paths(INPUT_DIR):
        reader = get_pdf_reader(pdf, STATEMENT_PASSWORD)
        first_page_text = reader.pages[0].extract_text()
        statement_date, rows = extract_balances(first_page_text)

        if statement_date is None:
            print("Date not parseable for file ", pdf)
        elif not rows:
            print("Output not present for date ", statement_date.isoformat(), " file ", pdf)
        else:
            collected_rows.extend(rows)

            output_path = f"{OUTPUT_DIR}/{INSTITUTION}-Statement_{statement_date.year}-{statement_date.month:02}.pdf"
            write_pdf(reader, output_path)

    return collected_rows

In [12]:
# Gather all rows, drop duplicates and sort in descending order; rows start
# with an ISO date, so the most recent statements come first.
output = sorted(set(generate_monthly_balances()), reverse=True)

csv_path = f"{OUTPUT_DIR}/{INSTITUTION}-Monthly-Balances.csv"
with open(csv_path, "w") as f:
    # Header row, then one line per balance row.
    f.write("Date,Institution,Account Type,Balance INR,Balance USD,Comments\n")
    f.writelines(f"{line}\n" for line in output)