In [46]:
import numpy as np
from PIL import Image
import fitz
from pytesseract import image_to_data, Output
from io import BytesIO
import pandas as pd
from os import makedirs

In [47]:
def apply_smth(df: pd.DataFrame, target_words: int = 4):
    """Collapse the leading words of one OCR line into a single token.

    The last ``target_words`` rows of ``df`` are kept as they are. Every row
    in front of them is capitalized and concatenated (without a separator)
    into the first row, so a line with more than ``target_words`` words always
    comes back as ``target_words + 1`` tokens.

    Args:
        df: Words of one line, one row per word, with a ``"text"`` column of
            strings. The frame passed in is not modified.
        target_words: Number of trailing words to leave unmerged.

    Returns:
        The ``"text"`` column labelled from 0 (the merged token sits at
        label 0, labels of absorbed rows are gone), the unchanged column when
        the line has exactly ``target_words`` words, or ``None`` when it has
        fewer.
    """
    words = df.reset_index()
    surplus = len(words) - target_words
    if surplus < 0:
        return None
    if surplus > 0:
        merged = "".join(word.capitalize() for word in words["text"].iloc[:surplus])
        if surplus > 1:
            # Rows 1..surplus-1 are absorbed into row 0; drop them in one call
            # instead of re-copying the frame once per row.
            words = words.drop(index=list(range(1, surplus)))
        words.loc[0, "text"] = merged
    return words["text"]

In [48]:
def crop_pdf(passed_page, start_word: str, end_word: str, dpi=300):
    """Crop a page to the horizontal band between two OCR-located words.

    The page is OCR'd, the first hit of each word is looked up, and the page's
    crop box is set to span the full (rounded) page width from the bottom edge
    of ``start_word`` down to the top edge of ``end_word``.

    Args:
        passed_page: PyMuPDF page to crop. It is modified in place.
        start_word: Text whose first hit marks the upper limit of the crop.
        end_word: Text whose first hit marks the lower limit of the crop.
        dpi: Resolution passed on to the OCR step.

    Returns:
        The same page object that was passed in, with its crop box updated.

    Raises:
        IndexError: If either word is not found in the OCR text.
    """
    pg = passed_page
    page_width = round(pg.rect.width)

    # OCR is the expensive part and does not depend on the search word, so the
    # text page is built once and shared by both lookups. Positional arguments
    # follow PyMuPDF's signature: flags, language, dpi, full.
    text_page = pg.get_textpage_ocr(3, "pol", dpi, True)

    def _first_hit_rect(word):
        # Return the bounding rectangle of the first occurrence of `word`.
        hits = text_page.search(word)
        if not hits:
            raise IndexError(f"{word!r} was not found in the OCR text of the page")
        return hits[0].rect

    start_rect = _first_hit_rect(start_word)
    end_rect = _first_hit_rect(end_word)

    # Index 1 of a corner point is its y coordinate.
    start_y = start_rect.round().bottom_left[1]
    end_y = end_rect.round().top_left[1]
    pg.set_cropbox(fitz.Rect(0, start_y, page_width, end_y))
    return pg

In [49]:
file_name = "a-2"
prefix = "output"

# Render the first page of the input PDF, OCR it with Tesseract and dump the
# raw word table next to the other per-file outputs.
with fitz.open(f"input/{file_name}.pdf") as doc:
    # Create the per-file output folder; "done" is only reported when the
    # folder did not exist yet.
    try:
        makedirs(f"{prefix}/{file_name}")
    except FileExistsError:
        pass
    else:
        print("done")
    # Only the first page is used, so index it directly instead of loading
    # every page of the document into a list first.
    page = doc[0]
    # 300 dpi grayscale render with the colours of the whole pixmap inverted.
    pix = page.get_pixmap(dpi=300, colorspace=fitz.csGRAY)
    pix.invert_irect(pix.irect)
    # Despite the name, these are the encoded image bytes of the render,
    # which PIL opens below.
    pdf_bytes = pix.tobytes()
    img = Image.open(BytesIO(pdf_bytes))
    # One row per OCR element; "pol" selects the Tesseract language and
    # "--psm 4" the page segmentation mode.
    df = image_to_data(img, "pol", config="--psm 4", output_type=Output.DATAFRAME)
    df.to_excel(f"{prefix}/{file_name}/df.xlsx")

In [50]:
# Turn the raw OCR word table into one row per purchased item.
output_dir = f"{prefix}/{file_name}"

# Geometry and confidence columns are not used below; rows without text are
# dropped and the remaining text is lower-cased for matching.
df_copy = df.copy()
df_copy = df_copy.drop(["width", "height", "level", "left", "top", "conf"], axis=1)
df_copy = df_copy.dropna(subset="text")
df_copy["text"] = df_copy["text"].str.lower()

# "biedronka" is the fallback whenever no OCR token equals "auchan".
shop = "auchan" if "auchan" in df_copy["text"].array else "biedronka"

# The OCR block that contains the token "niefiskalny" is the one parsed below.
marker_blocks = df_copy.loc[df_copy["text"] == "niefiskalny", "block_num"]
if marker_blocks.empty:
    raise ValueError('token "niefiskalny" was not found in the OCR output; cannot locate the receipt block')
main_index = marker_blocks.values[0]
df_copy = df_copy[df_copy["block_num"] == main_index]

df_copy_copy = df_copy.copy()
df_copy_copy.to_excel(f"{output_dir}/df_copy_copy.xlsx")
df_copy_copy = df_copy_copy.drop(["block_num"], axis=1)
df_copy_copy = df_copy_copy[df_copy_copy["par_num"] == 1]
df_copy.to_excel(f"{output_dir}/df_copy.xlsx")

# Keep only lines with more than two words.
long_lines = df_copy_copy.groupby("line_num").filter(lambda x: x["line_num"].count() > 2)

# Merge the words of each line explicitly instead of going through
# groupby.apply: the shape and column name of apply's result depend on whether
# the per-line Series happen to share an index and on whether any line
# returned None, which made the former `reset_index()[0]` lookup unreliable.
merged_lines = [apply_smth(line_words) for _, line_words in long_lines.groupby("line_num")]
gbl: pd.Series = pd.concat(
    [tokens for tokens in merged_lines if tokens is not None],
    ignore_index=True,
).dropna()
gbl.to_excel(f"{output_dir}/gbl.xlsx")

# Re-chunk the flat token stream into rows of five fields. A short trailing
# chunk is padded with missing values by the DataFrame constructor and removed
# by dropna().
np_array = gbl.values
indices = np.arange(5, len(np_array), 5)
sub_arrays = np.split(np_array, indices)
final_dframe = pd.DataFrame(sub_arrays, columns=["name", "amount", "star", "price_per_unit", "price"])
final_dframe = final_dframe.dropna()
final_dframe = final_dframe.map(lambda x: x.replace(",", "."))

# Cut the table at the first row whose price_per_unit sorts before "0". This
# is a lexicographic string comparison, i.e. true for values starting with a
# character such as "-", "*" or ".".
# NOTE(review): presumably meant to detect the first non-item row (e.g. a
# negative value) - confirm the intent.
# Positions (not index labels) are used because the slice below is positional.
cut_positions = np.flatnonzero((final_dframe["price_per_unit"] < "0").to_numpy())
index = int(cut_positions[0]) if len(cut_positions) else len(final_dframe)

final_dframe = final_dframe[:index]
final_dframe = final_dframe.drop(["star"], axis=1)

# Everything from the first row named "sprzed" (dots ignored) onwards is cut.
is_tax_row = final_dframe["name"].str.lower().str.replace(".", "", regex=False) == "sprzed"
tax_positions = np.flatnonzero(is_tax_row.to_numpy())
tax_index = int(tax_positions[0]) if len(tax_positions) else len(final_dframe)

# OCR tends to read the digit 1 as a lower-case "l" in the amount column.
final_dframe["amount"] = final_dframe["amount"].str.replace("l", "1", regex=False)
final_dframe = final_dframe[:tax_index]
final_dframe.to_excel(f"{output_dir}/final_dframe.xlsx")