In [None]:
"""
Utility function for showing images.

Intended to be imported in Jupyter notebooks to display pixmap images.

Invocation: "show_image(item, title)", where item is a PyMuPDF object
which has a "get_pixmap" method, and title is an optional string.

The function executes "item.get_pixmap(dpi=150)" and shows the resulting
image.


Dependencies
------------
numpy, matplotlib, pymupdf
"""


def show_image(item, title="", dpi=150):
    """Render a PyMuPDF object to a pixmap and display it with matplotlib.

    Args:
        item: any PyMuPDF object having a "get_pixmap" method that accepts a
            "dpi" keyword argument.
        title: a string to be used as image title.
        dpi: resolution used both to render the pixmap and to size the
            matplotlib figure. Defaults to 150, the value that used to be
            hard-coded, so existing two-argument calls behave as before.

    Returns:
        None. The image is drawn on a new matplotlib figure.

    Raises:
        ValueError: if "dpi" is not greater than zero (it is used as a divisor
            when scaling the axes).

    The pixmap samples are wrapped - not copied - in a numpy array of shape
    (height, width, 3), so the pixmap is expected to hold exactly three
    components per pixel (RGB without alpha).
    """
    # Imported lazily so that importing this helper does not itself require
    # numpy / matplotlib until an image is actually shown.
    import numpy as np
    import matplotlib.pyplot as plt

    if dpi <= 0:
        raise ValueError(f"dpi must be a positive number, got {dpi!r}")

    pix = item.get_pixmap(dpi=dpi)

    # View the raw sample buffer as rows x columns x 3 bytes.
    img = np.ndarray([pix.h, pix.w, 3], dtype=np.uint8, buffer=pix.samples_mv)

    plt.figure(dpi=dpi)  # set the figure's DPI
    plt.title(title)  # set title of image

    # Scale the axes from pixels at `dpi` to 1/72-inch units; listing the
    # bottom edge before the top edge (h..0) keeps the origin at the top-left.
    plt.imshow(img, extent=(0, pix.w * 72 / dpi, pix.h * 72 / dpi, 0))

In [None]:
import pandas as pd  # import pandas
import fitz  # import PyMuPDF
# Fail fast when the installed PyMuPDF has no Page.find_tables attribute;
# the table-extraction cell further down calls page.find_tables().
if not hasattr(fitz.Page, "find_tables"):
    raise RuntimeError("This PyMuPDF version does not support the table feature")

In [None]:
from pathlib import Path
# List the example PDFs in sorted order before indexing: a directory glob
# yields entries in a filesystem-dependent order, so picking "[2]" from an
# unsorted listing could open a different file on another machine or run.
example_pdfs = sorted((Path.cwd().parent / "data" / "statements").glob("*.pdf"))
doc = fitz.open(example_pdfs[2])  # open example file (third PDF by name)
page = doc[0]  # read first page to demo the layout
show_image(page, "First Page Content")

In [None]:
# Run table detection on the page and show the first detected table via its
# to_pandas() conversion.
# NOTE(review): `.tables[0]` presumably fails with IndexError when no table is
# detected on the page - confirm before reusing on arbitrary PDFs.
page.find_tables().tables[0].to_pandas()

In [None]:
# Cell result is the value returned by get_text(); the notebook shows it as a
# repr (presumably a plain-text string with escaped newlines - confirm the
# default output format).
page.get_text()

In [None]:
# Same get_text() call, but printed, so the output is rendered through
# print() rather than as the cell's repr.
print(page.get_text())

In [None]:
import re

## LaBanquePostale

In [None]:
from pathlib import Path
from parse_bank_statement import read
import pandas as pd
import numpy as np
import fitz
import re

In [None]:
# Parse every PDF found in the "LaBanquePostale_clean" folder and stack the
# per-statement `transactions` objects into one frame, shown sorted by index.
lbp_files = sorted(
    Path.cwd().parent.joinpath("data", "statements", "LaBanquePostale_clean").glob("*.pdf")
)
transactions_df = [
    read.Statement(pdf=Path(pdf_path)).transactions
    for pdf_path in lbp_files
]
statements = pd.concat(transactions_df)
statements.sort_index()

In [None]:
# Sum of the "amount" column for each "account_id" in the combined statements.
statements.groupby("account_id")["amount"].sum()

In [None]:
# Parse one specific statement from the "LaBanquePostale" folder and show its
# emission date. The path is built from the current working directory's parent,
# like the other cells in this notebook, instead of an absolute path into one
# user's home directory.
# NOTE(review): assumes the notebook runs from a folder directly under the
# project root (as the other cells already do) - confirm.
s = read.Statement(
    pdf=Path.cwd().parent / "data" / "statements" / "LaBanquePostale" / "Relevé de compte(10).pdf"
)
s.emission_date

In [None]:
# One row per PDF in the "LaBanquePostale_clean" folder: "len" holds
# `accounts.shape[0]` of the parsed statement, "file" holds the PDF's stem.
lbp_files = sorted(
    Path.cwd().parent.joinpath("data", "statements", "LaBanquePostale_clean").glob("*.pdf")
)
pd.DataFrame(
    {
        "len": [
            read.Statement(pdf=Path(pdf_path)).accounts.shape[0]
            for pdf_path in lbp_files
        ],
        "file": [pdf_path.stem for pdf_path in lbp_files],
    }
)