# AH Bonnen parser

In [5]:
import os
import re
import io
import PyPDF2
from PIL import Image
import pandas as pd
from pdf2image import convert_from_path
import pytesseract

In [6]:
def receipt_renamer(folder_path):
    """
    Function renames the download ah receipts to a more sortable name
    Input: A path to a folder with ah receipt pdfs. 
    """

    # Get a list of all the files in the directory
    files = os.listdir(folder_path)
    # Loop through each file in the directory
    for file in files:
        # Use regular expressions to extract the date from the filename
        match = re.search(r'\d{2}-\d{2}-\d{2}', file)
        if match:
            # Extract the date from the match object
            date = match.group()
            # Convert the date to the desired format (YYYY-MM-DD)
            new_name = re.sub(r'(\d{2})-(\d{2})-(\d{2})', r'20\3-\2-\1', date) + '_AH_kassabon'
            # Rename the file with the new name
            os.rename(os.path.join(folder_path, file), os.path.join(folder_path, new_name + '.pdf'))
        print('succesfully renamed all files')

In [7]:
def pdf_to_img(pdf_path, pdf_img_path):

    # Get a list of all the files in the directory
    files = os.listdir(pdf_path)
    # Loop through each file in the directory
    for pdf_file in files:

        full_path = os.path.join(pdf_path, pdf_file)
        # Convert the first page of the PDF file to a PIL image
        images = convert_from_path(full_path, first_page=1, last_page=1)
        image = images[0]
        # obtain the pdf name to name the image.
        pdf_file = pdf_file.replace('.pdf','.jpg')
        # Save the image to a file
        image.save(pdf_img_path + pdf_file)
        
    print('succesfully converted all pdf files to images')

In [19]:
images = convert_from_path("C:\\Users\\rie12\\Desktop\\AH_kassabon_24-01-23_1350_1093.pdf", first_page=1, last_page=1)
image = images[0]
# Save the image to a file
image.save('testing_img_24-01-23.jpg')

In [None]:
pdf_to_img("C:\Users\rie12\Desktop\AH_kassabon_24-01-23_1350_1093.pdf")

In [39]:
def text_to_df(text, img_file):

    temp_df = pd.DataFrame(columns=['Amount','Product','Price'])
    try:
        text_without_start = text.split('BONUSKAART xx9644', 1)[1]
        text_without_end = text_without_start.split('UW VOORDEEL',1)[0]
        lines = text_without_end.split("\n")
    except Exception as ex:
        print(img_file, ' did not work because', ex)
        return temp_df

    match = re.search(r"\d{1,2}-\d{1,2}-\d{4}", text)
    if match:
        date = match.group()
        date_without_time = re.sub(r"\s+\d{1,2}:\d{1,2}\s*", " ", date)

    for line in lines:
        # Create an I that decides which part should be the amount.
        i = -1
        if line:
            parts = line.split()
            amount = parts[0]

            # Ignore the bonus box lines.
            if 'BBOX' in amount or 'BONUS' in amount:
                continue
            # The bonus and bonux box lines are the last on receipt.
            # Therefore they have to be ignored to get the price.
            while parts[i] == 'B' or parts[i] == 'BB':
                i -=1
            
            # If the amount of the product is expressed in KG, we want to grab the kg price
            # And remove KG from the line
            if 'KG' in amount:
                i -=1
                product_name = " ".join(parts[1:i])
                amount = amount.replace('KG','')
            else:
                product_name = " ".join(parts[1:i])

            # Assign price and make it a float
            # Also split price if the value is not right
            price = parts[i]
            price = price.replace(',', '.')
            try:
                if float(price) > 100:
                    price = price[:1] + '.' + price[1:]
                price = float(price)
            except Exception as ex2:
                print('Encounterd: ', ex2, 'For ', product_name)
                price = 999

            temp_df.loc[len(temp_df)] = [amount, product_name, price]

    temp_df['Amount'] = pd.to_numeric(temp_df['Amount'], errors='coerce').fillna(1)
    temp_df['Date'] = pd.to_datetime(date_without_time, format='%d-%m-%Y').date()
    # Multiple the amount bought by the price to get the right price
    # Known issue: Messes up subtotaal.
    # If price is not a float it will return None
    temp_df['Price_Amount'] = temp_df.apply(lambda x: round(x['Price'] * x['Amount'], 2) if isinstance(x['Price'], float) else None, axis=1)

    return temp_df 

In [40]:
def text_from_img(pdf_img_path):

    df = pd.DataFrame(columns=['Amount','Product','Price','Date'])

    # Get a list of all the files in the directory
    files = os.listdir(pdf_img_path)
    # Loop through each file in the directory
    for img_file in files:

        full_path = os.path.join(pdf_img_path, img_file)
        image = Image.open(full_path)
        # Convert the image to grayscale
        image = image.convert('L')
        # Process the image with pytesseract
        text = pytesseract.image_to_string(image)
        temp_df = text_to_df(text, img_file)
        df = pd.concat([df, temp_df]).reset_index(drop=True)

    return df


In [41]:
# Set the directory where the files are located
dir_path = 'C:\\Users\\rie12\\Documents\\GitHub\\AH_kassabonnen\\ah_bonnen'

img_path = 'C:\\Users\\rie12\\Documents\\GitHub\\AH_kassabonnen\\ah_bonnen_img\\'

In [42]:
test = text_from_img('C:\\Users\\rie12\\Documents\\GitHub\\AH_kassabonnen\\ah_bonnen_img')

2023-01-24_AH_kassabon.jpg  did not work because list index out of range
2023-02-06_AH_kassabon.jpg  did not work because list index out of range
2023-02-15_AH_kassabon.jpg  did not work because list index out of range
Encounterd:  could not convert string to float: '#B' For  FRIKANDELBR 100
2023-03-11_AH_kassabon.jpg  did not work because list index out of range
2023-03-16_AH_kassabon.jpg  did not work because list index out of range
2023-03-20_AH_kassabon.jpg  did not work because list index out of range
2023-04-05_AH_kassabon.jpg  did not work because list index out of range
2023-04-13_AH_kassabon.jpg  did not work because list index out of range
2023-04-17_AH_kassabon.jpg  did not work because list index out of range
2023-04-19_AH_kassabon.jpg  did not work because list index out of range
Encounterd:  could not convert string to float: '40%' For  STOOMMAALTIS 5,79
Encounterd:  could not convert string to float: 'a.99' For  TUC PAPRIKA
Encounterd:  could not convert string to float:

In [43]:
test.head(50)

Unnamed: 0,Amount,Product,Price,Date,Price_Amount
0,1.0,AH ICETEA,0.89,2023-01-03,0.89
1,1.0,OETKER PIZZA,4.19,2023-01-03,4.19
2,1.0,JELLY BEANS,1.29,2023-01-03,1.29
3,1.0,LAY'S OVEN B,2.09,2023-01-03,2.09
4,1.0,PEPPERMINT,1.39,2023-01-03,1.39
5,1.0,ZAANS BRUIN,0.85,2023-01-03,0.85
6,1.342,BANANEN,1.99,2023-01-03,2.67
7,1.0,FOCACCIAPEPP,1.25,2023-01-03,1.25
8,8.0,SUBTOTAAL,14.62,2023-01-03,116.96
9,1.0,DZH HV MELK,1.19,2023-01-05,1.19


In [45]:
test[test['Product'] == 'SUBTOTAAL']

Unnamed: 0,Amount,Product,Price,Date,Price_Amount
8,8.0,SUBTOTAAL,14.62,2023-01-03,116.96
25,16.0,SUBTOTAAL,27.41,2023-01-05,438.56
40,14.0,SUBTOTAAL,33.71,2023-01-08,471.94
53,12.0,SUBTOTAAL,24.56,2023-01-11,294.72
70,17.0,SUBTOTAAL,39.84,2023-01-17,677.28
84,15.0,SUBTOTAAL,26.79,2023-01-20,401.85
101,16.0,SUBTOTAAL,28.88,2023-01-28,462.08
116,14.0,SUBTOTAAL,34.08,2023-02-01,477.12
128,12.0,SUBTOTAAL,28.95,2023-02-08,347.4
141,12.0,SUBTOTAAL,26.72,2023-02-12,320.64
