In [333]:
import numpy as np
from PIL import Image
import fitz
from pytesseract import image_to_data, Output
from io import BytesIO
import pandas as pd
from os import makedirs

In [334]:
def merge_rows(df, max_rows=5):
    """
    Joins the words of each paragraph into one string and returns the strings.

    Rows are grouped by ('page_num', 'par_num') and the 'text' values of every
    group are joined with single spaces. When more than ``max_rows`` groups
    exist, the surplus trailing ones are folded into the last kept entry so
    that at most ``max_rows`` values come back. Shorter results are returned
    as they are; nothing is padded.

    Args:
        df (pd.DataFrame): Frame with 'page_num', 'par_num' and string-valued
            'text' columns. Other columns are ignored.
        max_rows (int): Upper bound on the number of returned values.

    Returns:
        str or pd.Series: A plain string when exactly one group is present
        (the effect of ``Series.squeeze``), otherwise a Series of strings
        indexed 0..n-1.

    Raises:
        ValueError: If ``max_rows`` is smaller than 1.
    """
    if max_rows < 1:
        raise ValueError("max_rows must be at least 1")

    # One joined string per (page, paragraph); groupby keeps the keys sorted.
    texts = (
        df.groupby(['page_num', 'par_num'])['text']
        .agg(' '.join)
        .reset_index(drop=True)
    )

    # Collapse everything from position max_rows - 1 onwards into one entry.
    # This must shrink the result: a loop that appends rows while the length
    # is above the limit would never finish.
    if len(texts) > max_rows:
        kept = texts.iloc[:max_rows - 1]
        overflow = pd.Series([' '.join(texts.iloc[max_rows - 1:])])
        texts = pd.concat([kept, overflow], ignore_index=True)

    return texts.squeeze()

In [335]:
def crop_pdf(passed_page, start_word: str, end_word: str, dpi=300):
    """
    Crops a page to the vertical band between two OCR-located words.

    The page is OCR'd once with ``get_textpage_ocr(3, "pol", dpi, True)``.
    The first hit for each word is used: the band starts at the bottom edge
    of ``start_word`` and ends at the top edge of ``end_word``, and spans
    from x = 0 to the rounded page width.

    Args:
        passed_page: Page object offering ``rect``, ``get_textpage_ocr`` and
            ``set_cropbox`` (a PyMuPDF page in this notebook).
        start_word (str): Text whose first occurrence marks the top of the crop.
        end_word (str): Text whose first occurrence marks the bottom of the crop.
        dpi (int): Resolution handed to the OCR step.

    Returns:
        The same page object, with its crop box changed in place.

    Raises:
        IndexError: If either word is not found in the OCR result.
    """
    # OCR is the costly step, so run it once and search the same text page
    # for both words.
    text_page = passed_page.get_textpage_ocr(3, "pol", dpi, True)

    def first_hit(word):
        # Rounded rectangle of the first search hit for ``word``.
        hits = text_page.search(word)
        if not hits:
            raise IndexError(f"{word!r} was not found on the page by OCR")
        return hits[0].rect.round()

    start_y = first_hit(start_word).bottom_left[1]
    end_y = first_hit(end_word).top_left[1]
    page_width = round(passed_page.rect.width)

    passed_page.set_cropbox(fitz.Rect(0, start_y, page_width, end_y))
    return passed_page

In [336]:
# Receipt to process: input/<file_name>.pdf is read, results go to <prefix>/<file_name>/.
file_name = "a-2"
prefix = "output"

with fitz.open(f"input/{file_name}.pdf") as doc:
    # exist_ok keeps the cell re-runnable without wrapping makedirs in try/except.
    makedirs(f"{prefix}/{file_name}", exist_ok=True)

    # Only the first page is used; index it directly instead of building a
    # list of every page in the document.
    page = doc[0]
    # NOTE(review): a crop_pdf(page, <year>, "Sprzed", 72) call used to sit here,
    # disabled; re-enable it if the OCR should be limited to part of the page.

    # Render in grayscale at 300 dpi and invert the colours before OCR.
    pix = page.get_pixmap(dpi=300, colorspace=fitz.csGRAY)
    pix.invert_irect(pix.irect)

    # Hand the rendered page to PIL through an in-memory buffer
    # (the buffer holds the rendered image, not PDF data, despite the name).
    pdf_bytes = pix.tobytes()
    img = Image.open(BytesIO(pdf_bytes))

    # Tesseract output as a DataFrame, using the "pol" language data and
    # page segmentation mode 4.
    df = image_to_data(img, "pol", config="--psm 4", output_type=Output.DATAFRAME)

In [337]:
# Clean the OCR table into a NEW frame. Dropping columns from `df` in place
# made this cell raise a KeyError when it was executed a second time.
words = (
    df.drop(columns=["width", "height", "level", "left", "top", "conf"])
    .dropna(subset=["text"])
    .assign(text=lambda frame: frame["text"].str.lower())
)

# The shop is "auchan" when that exact word was recognised, otherwise "biedronka".
shop = "auchan" if "auchan" in words["text"].array else "biedronka"

# The block containing the word "niefiskalny" is the one that gets processed.
marker_blocks = words.loc[words["text"] == "niefiskalny", "block_num"]
if marker_blocks.empty:
    raise ValueError('no "niefiskalny" word in the OCR output; cannot pick the receipt block')
main_index = marker_blocks.iloc[0]

# Intermediate frames are dumped to Excel for inspection after each step.
df_copy_copy = words[words["block_num"] == main_index].copy()
df_copy_copy.to_excel(f"{prefix}/{file_name}/df_copy_copy.xlsx")
df_copy_copy = df_copy_copy.drop(["block_num"], axis=1)
df_copy_copy.to_excel(f"{prefix}/{file_name}/wat.xlsx")

# Keep only the first paragraph of the block.
df_copy_copy = df_copy_copy[df_copy_copy["par_num"] == 1]
df_copy_copy.to_excel(f"{prefix}/{file_name}/wat2.xlsx")

# Keep lines made of more than two words, then join each line's words.
long_lines = df_copy_copy.groupby("line_num").filter(lambda line: len(line) > 2)
# Columns are selected explicitly (grouping column included) so that apply
# does not rely on implicitly receiving the grouping column.
gbl = long_lines.groupby("line_num")[["page_num", "par_num", "line_num", "text"]].apply(merge_rows)

# NOTE(review): this writes the same frame as wat2.xlsx; possibly `gbl` was
# meant to be exported here - confirm.
df_copy_copy.to_excel(f"{prefix}/{file_name}/wat3.xlsx")


  gbl = gbl.apply(merge_rows)


In [338]:
# Every five consecutive entries of `gbl` form one row of the item table.
item_columns = ["name", "amount", "star", "price_per_unit", "price"]
np_array = gbl.to_numpy()

# Use complete groups only; a trailing partial group is discarded. Reshaping
# also copes with fewer than five entries, which yields an empty table.
n_fields = len(item_columns)
n_items = len(np_array) // n_fields
final_dframe = pd.DataFrame(
    np_array[: n_items * n_fields].reshape(n_items, n_fields),
    columns=item_columns,
)

# Reset the index after dropna so that row labels and row positions agree;
# the cut-offs below are positional.
final_dframe = final_dframe.dropna().reset_index(drop=True)
# Decimal commas become decimal points in every cell.
final_dframe = final_dframe.map(lambda x: x.replace(",", "."))

# Cut the table at the first row whose price string sorts before "0".
# This is a plain string comparison, so it matches values starting with "-"
# (presumably negative amounts - TODO confirm) as well as any other text
# beginning with a character that sorts below "0".
negative_price = (final_dframe["price"] < "0").to_numpy()
index = int(negative_price.argmax()) if negative_price.any() else len(final_dframe)

final_dframe = final_dframe.iloc[:index]
final_dframe = final_dframe.drop(["star"], axis=1)

# Cut again at the first row named "sprzed" (dots removed, case ignored).
# regex=False makes "." a literal dot regardless of the pandas default.
is_tax_row = (
    final_dframe["name"].str.lower().str.replace(".", "", regex=False) == "sprzed"
).to_numpy()
tax_index = int(is_tax_row.argmax()) if is_tax_row.any() else len(final_dframe)

final_dframe = final_dframe.iloc[:tax_index]
final_dframe.to_excel(f"{prefix}/{file_name}/final_dframe.xlsx")