# AH Bonnen parser

In [4]:
import os
import re
import io
import PyPDF2
from PIL import Image
import pandas as pd
from pdf2image import convert_from_path
import pytesseract

In [9]:
def receipt_renamer(folder_path):
    """
    Rename downloaded AH receipt PDFs to a sortable, date-first name.

    Every ``.pdf`` file in ``folder_path`` whose name contains a date written
    as ``DD-MM-YY`` is renamed to ``20YY-MM-DD_AH_kassabon.pdf``.

    Input: A path to a folder with ah receipt pdfs.

    Notes:
        * Files that are not PDFs, or that carry no ``DD-MM-YY`` date, are
          left untouched.
        * Names that were already converted (``YYYY-MM-DD_...``) are not
          matched again, so the function can safely be re-run on a folder.
        * If several receipts share a date, later ones get a numeric suffix
          (``_2``, ``_3``, ...) instead of overwriting the first one.
    """

    # The look-arounds stop the pattern from matching the tail of a
    # four-digit year, e.g. the '23-01-05' inside '2023-01-05'.
    date_pattern = re.compile(r'(?<!\d)(\d{2})-(\d{2})-(\d{2})(?!\d)')

    # Sorted so that suffix numbering is deterministic across runs.
    for file in sorted(os.listdir(folder_path)):
        if not file.lower().endswith('.pdf'):
            continue

        match = date_pattern.search(file)
        if not match:
            continue

        # Convert the date to the desired format (YYYY-MM-DD)
        day, month, year = match.groups()
        new_name = '20' + year + '-' + month + '-' + day + '_AH_kassabon'

        # Never clobber an existing receipt with the same date.
        target = os.path.join(folder_path, new_name + '.pdf')
        counter = 2
        while os.path.exists(target):
            target = os.path.join(folder_path, f'{new_name}_{counter}.pdf')
            counter += 1

        os.rename(os.path.join(folder_path, file), target)

    # Reported once, after the whole folder has been processed.
    print('succesfully renamed all files')

In [11]:
def pdf_to_img(pdf_path, pdf_img_path):
    """
    Render the first page of every PDF in a folder to a JPEG image.

    Input:
        pdf_path: folder containing the receipt PDFs.
        pdf_img_path: folder the images are written to; it is created when
            missing. A trailing path separator is accepted but not required.

    Each ``<name>.pdf`` becomes ``<name>.jpg`` in ``pdf_img_path``. Files
    without a ``.pdf`` extension (any letter case) are skipped.
    """

    os.makedirs(pdf_img_path, exist_ok=True)

    for pdf_file in sorted(os.listdir(pdf_path)):
        # Split off the extension instead of str.replace, which is
        # case-sensitive and would also hit '.pdf' in the middle of a name.
        stem, extension = os.path.splitext(pdf_file)
        if extension.lower() != '.pdf':
            continue

        full_path = os.path.join(pdf_path, pdf_file)
        # Convert the first page of the PDF file to a PIL image
        images = convert_from_path(full_path, first_page=1, last_page=1)
        if not images:
            print(pdf_file, ' did not work because no page could be rendered')
            continue

        # os.path.join works with or without a trailing separator on
        # pdf_img_path, unlike plain string concatenation.
        images[0].save(os.path.join(pdf_img_path, stem + '.jpg'))

    print('succesfully converted all pdf files to images')

In [23]:
def text_to_df(text, img_file):
    """
    Parse the OCR text of one AH receipt into a table of receipt lines.

    Input:
        text: the full OCR output of a receipt image.
        img_file: name of the image the text came from; only used in the
            diagnostic messages that are printed.

    Returns:
        A DataFrame with columns ``Amount``, ``Product``, ``Price`` and
        ``Date``, one row per non-empty line found between the
        ``BONUSKAART ...`` header line and the ``UW VOORDEEL`` line.
        ``Price`` is the last token of the line after any trailing bonus
        markers (``B``/``BB``) are removed. ``Date`` is a ``datetime.date``
        parsed from the first ``D-M-YYYY`` date in the text, or ``None``
        when no valid date is found.

        When the ``BONUSKAART`` header is missing, a message is printed and
        an empty frame with only ``Amount``, ``Product``, ``Price`` is
        returned.
    """

    temp_df = pd.DataFrame(columns=['Amount','Product','Price'])
    try:
        # Match the whole header line so that any bonus-card number works,
        # not just one specific card.
        text_without_start = re.split(r'BONUSKAART[^\n]*', text, maxsplit=1)[1]
        text_without_end = text_without_start.split('UW VOORDEEL', 1)[0]
    except (IndexError, TypeError) as ex:
        print(img_file, ' did not work because', ex)
        return temp_df

    rows = []
    for line in text_without_end.split("\n"):
        parts = line.split()
        # Skips empty and whitespace-only lines.
        if not parts:
            continue

        # Drop trailing bonus markers so the last token is the price.
        while len(parts) > 1 and parts[-1] in ('B', 'BB'):
            parts.pop()

        amount = parts[0]
        price = parts[-1]
        if 'KG' in amount:
            # Weighed items carry a per-kg price before the line total;
            # leave it out of the product name.
            product_name = " ".join(parts[1:-2])
        else:
            product_name = " ".join(parts[1:-1])

        rows.append([amount, product_name, price])

    # Build the frame in one go instead of appending row by row.
    temp_df = pd.DataFrame(rows, columns=['Amount','Product','Price'])

    # The pattern contains no whitespace, so the match is already a bare date.
    receipt_date = None
    match = re.search(r"\d{1,2}-\d{1,2}-\d{4}", text)
    if match:
        try:
            receipt_date = pd.to_datetime(match.group(), format='%d-%m-%Y').date()
        except ValueError as ex:
            # OCR can produce impossible dates.
            print(img_file, ' has an unreadable date because', ex)
    else:
        print(img_file, ' has no date')

    temp_df['Date'] = receipt_date

    return temp_df

In [24]:
def text_from_img(pdf_img_path):
    """
    OCR every receipt image in a folder and combine the parsed lines.

    Input:
        pdf_img_path: folder containing the receipt images.

    Returns:
        A DataFrame with columns ``Amount``, ``Product``, ``Price`` and
        ``Date`` holding the rows that ``text_to_df`` produced for each
        image, with a fresh 0..n-1 index. It is empty when the folder has
        no files or nothing could be parsed.
    """

    columns = ['Amount','Product','Price','Date']
    frames = []

    # Sorted so the row order is the same on every run.
    for img_file in sorted(os.listdir(pdf_img_path)):
        full_path = os.path.join(pdf_img_path, img_file)
        if not os.path.isfile(full_path):
            continue

        # The context manager releases the file handle once OCR is done.
        with Image.open(full_path) as image:
            # Convert the image to grayscale and process it with pytesseract
            text = pytesseract.image_to_string(image.convert('L'))

        temp_df = text_to_df(text, img_file)
        if not temp_df.empty:
            frames.append(temp_df)

    if not frames:
        return pd.DataFrame(columns=columns)

    # Concatenate once at the end rather than re-copying inside the loop.
    return pd.concat(frames, ignore_index=True)


In [1]:
# Folder with the receipt PDFs, and folder with the images rendered from them.
# Note that img_path ends with a path separator while dir_path does not.
dir_path, img_path = (
    'C:\\Users\\rie12\\Documents\\GitHub\\AH_kassabonnen\\ah_bonnen',
    'C:\\Users\\rie12\\Documents\\GitHub\\AH_kassabonnen\\ah_bonnen_img\\',
)

In [2]:
# Run OCR and parsing over every image in the receipt-image folder.
# NOTE(review): the recorded output is a NameError, so this cell was presumably
# run in a kernel where the cell defining text_from_img had not been executed
# yet; re-run the definition cells first.
test = text_from_img('C:\\Users\\rie12\\Documents\\GitHub\\AH_kassabonnen\\ah_bonnen_img')

NameError: name 'text_from_img' is not defined

In [6]:
# Load one receipt image as grayscale and run it through pytesseract so the
# raw OCR text can be inspected.
image = Image.open('ah_bonnen_img\\2023-01-05_AH_kassabon.jpg').convert('L')
text = pytesseract.image_to_string(image)

In [8]:
# Show the raw OCR text of the receipt to inspect its line layout.
print(text)

@

Albert Heijn
Eikenlaan 39
050 - 5778456

AANTAL OMSCHRIJVING PRIS BEDRAG
BONUSKAART xx9644
1 DZH HV MELK 1,19
1 TUC PAPRIKA 0,95
1 AH CHOCO 185
1 DZH CREME FR 0,99
1 LAY'S OVEN B 2,09 BB
1 CUP A SOUP 199 BB
0.121KG GRILLWORST 18,00 2,18 B
1 KIPFILETBLOK 4,75 B
1 AH CABANOSSI 2,69
1 AH HAMBURGER 2,99 BB
1 CUP A SOUP 1,79 BB
1 AH RUCOLA 1,09
a AH ITAL BOL 0,79
1 SCHNITT WIT 0,29
1 PAPRIKA 1,09
1 PREI 0,69
16 SUBTOTAAL 27,41
BBOX LAYS OVEN -0,59
BBOX HAMBURGER2ST -0,74
BBOX UNOXCAS -0,95
BONUS AHDELIGENIET -0,22
BONUS AHSCHKIPFILE -1,19
UW VOORDEEL 3,69
Waarvan
BONUS BOX 2,28
TOTAAL 23,72
SPAARACTIES:
2 OVENZEGEL
BETAALD MET:
PINNEN 23,72
POI: 50271904
KLANTTICKET Terminal SJ8YX6
Merchant 1315641 Perlode 3005
Transactle 02122643 Maestro
(A0000000043060) MAESTRO
Kaart 673336XXXXXXXXX2010 Kaartserlenummer 4
BETALING Datum 05/01/2023 17:12
Autorlsatlecode 014E40 Totaal 23,72 EUR
Contactless Leesmethode CHIP
BIW OVER EUR
9% 21,76 1,96
TOTAAL 21,76 1,96
1093 40 39
17:10 5-1-2023



In [28]:
# Scratch version of the receipt-line parser, run on the single OCR result in
# `text`. Compared with a plain "last token is the price" split, it skips the
# BBOX/BONUS discount lines and steps over trailing bonus markers ('B'/'BB').
temp_df = pd.DataFrame(columns=['Amount','Product','Price'])
try:
    text_without_start = text.split('BONUSKAART xx9644', 1)[1]
    text_without_end = text_without_start.split('UW VOORDEEL',1)[0]
    lines = text_without_end.split("\n")
except IndexError as ex:
    # No image file name exists at notebook scope, so the message does not
    # mention one. Fall back to "no lines" so the rest of the cell still runs.
    print('receipt text did not work because', ex)
    text_without_end = ''
    lines = []

# The pattern contains no whitespace, so the match is already a bare date.
match = re.search(r"\d{1,2}-\d{1,2}-\d{4}", text)
date = match.group() if match else None
date_without_time = date

for line in lines:
    parts = line.split()
    # Skips empty and whitespace-only lines.
    if not parts:
        continue

    amount = parts[0]

    # Ignore the bonus box lines.
    if 'BBOX' in amount or 'BONUS' in amount:
        continue

    # Walk back from the end over bonus markers, never past the first token.
    i = -1
    while -i < len(parts) and parts[i] in ('B', 'BB'):
        i -= 1

    # Weighed items have two prices; step back once more so the product name
    # excludes the per-kg price.
    # NOTE(review): this makes Price the per-kg price (e.g. 18,00), not the
    # line total (2,18) - confirm that this is intended.
    if 'KG' in amount and -i < len(parts):
        i -= 1

    product_name = " ".join(parts[1:i])
    price = parts[i]

    temp_df.loc[len(temp_df)] = [amount, product_name, price]

# Leave Date empty rather than failing when the OCR text holds no date.
temp_df['Date'] = (
    pd.to_datetime(date_without_time, format='%d-%m-%Y').date()
    if date_without_time else None
)

In [29]:
# Inspect the slice of OCR text between the 'BONUSKAART xx9644' header and
# 'UW VOORDEEL' that the parsing cell iterates over.
text_without_end

"\n1 DZH HV MELK 1,19\n1 TUC PAPRIKA 0,95\n1 AH CHOCO 185\n1 DZH CREME FR 0,99\n1 LAY'S OVEN B 2,09 BB\n1 CUP A SOUP 199 BB\n0.121KG GRILLWORST 18,00 2,18 B\n1 KIPFILETBLOK 4,75 B\n1 AH CABANOSSI 2,69\n1 AH HAMBURGER 2,99 BB\n1 CUP A SOUP 1,79 BB\n1 AH RUCOLA 1,09\na AH ITAL BOL 0,79\n1 SCHNITT WIT 0,29\n1 PAPRIKA 1,09\n1 PREI 0,69\n16 SUBTOTAAL 27,41\nBBOX LAYS OVEN -0,59\nBBOX HAMBURGER2ST -0,74\nBBOX UNOXCAS -0,95\nBONUS AHDELIGENIET -0,22\nBONUS AHSCHKIPFILE -1,19\n"

In [30]:
# Display the rows produced by the parsing cell.
temp_df

Unnamed: 0,Amount,Product,Price,Date
0,1,DZH HV MELK,119,2023-01-05
1,1,TUC PAPRIKA,95,2023-01-05
2,1,AH CHOCO,185,2023-01-05
3,1,DZH CREME FR,99,2023-01-05
4,1,LAY'S OVEN B,209,2023-01-05
5,1,CUP A SOUP,199,2023-01-05
6,0.121KG,GRILLWORST,1800,2023-01-05
7,1,KIPFILETBLOK,475,2023-01-05
8,1,AH CABANOSSI,269,2023-01-05
9,1,AH HAMBURGER,299,2023-01-05


In [26]:
# NOTE(review): this cell's execution count (26) is lower than the parsing
# cell's (28), so the output shown presumably comes from an earlier version of
# the parser (product names still contain the price); re-run or remove.
temp_df

Unnamed: 0,Amount,Product,Price,Date
0,1,DZH HV MELK,119,2023-01-05
1,1,TUC PAPRIKA,95,2023-01-05
2,1,AH CHOCO,185,2023-01-05
3,1,DZH CREME FR,99,2023-01-05
4,1,"LAY'S OVEN B 2,09",209,2023-01-05
5,1,CUP A SOUP 199,199,2023-01-05
6,0.121KG,"GRILLWORST 18,00",1800,2023-01-05
7,1,"KIPFILETBLOK 4,75",475,2023-01-05
8,1,AH CABANOSSI,269,2023-01-05
9,1,"AH HAMBURGER 2,99",299,2023-01-05
