# AH Bonnen parser

In [3]:
import os
import re
import io
import PyPDF2
from PIL import Image
import pandas as pd
from pdf2image import convert_from_path
import pytesseract


In [1]:
def receipt_renamer(folder_path):
    """
    Rename downloaded AH receipt files to a name that sorts chronologically.

    Every entry in the folder whose name contains a DD-MM-YY date is renamed
    to '20YY-MM-DD_AH_kassabon.pdf'. Entries without such a date are left
    untouched. When several receipts share a date, the later ones get a
    numeric suffix ('_2', '_3', ...) so no receipt is overwritten.

    Input: A path to a folder with ah receipt pdfs.
    """
    # The look-behind stops the pattern from matching inside an already
    # renamed file (the "23-01-03" in "2023-01-03_AH_kassabon.pdf"), so the
    # function can safely be run more than once on the same folder.
    date_pattern = re.compile(r'(?<!\d)(\d{2})-(\d{2})-(\d{2})')

    # Sorted so that suffixes are handed out in a reproducible order.
    for file in sorted(os.listdir(folder_path)):
        match = date_pattern.search(file)
        if not match:
            continue

        day, month, year = match.groups()
        base_name = f'20{year}-{month}-{day}_AH_kassabon'

        # Pick the first free target name instead of clobbering an existing
        # receipt with the same date.
        new_name = base_name + '.pdf'
        counter = 2
        while os.path.exists(os.path.join(folder_path, new_name)):
            new_name = f'{base_name}_{counter}.pdf'
            counter += 1

        os.rename(os.path.join(folder_path, file),
                  os.path.join(folder_path, new_name))

    # Report once, after the whole folder has been processed.
    print('succesfully renamed all files')

In [116]:
# Folder where the receipt files are located.
# NOTE(review): presumably the input folder for receipt_renamer / pdf_to_img;
# no call using it is visible in this notebook - confirm.
dir_path = 'C:\\Users\\rie12\\Documents\\GitHub\\AH_kassabonnen\\ah_bonnen'

# Folder for the receipt images. The value ends with a path separator, so a
# file name can be appended to it directly as a plain string.
img_path = 'C:\\Users\\rie12\\Documents\\GitHub\\AH_kassabonnen\\ah_bonnen_img\\'

In [117]:
def pdf_to_img(pdf_path, pdf_img_path):
    """
    Save the first page of every PDF in a folder as a JPEG image.

    Each '<name>.pdf' found in pdf_path (extension matched case-insensitively)
    is rendered with pdf2image and written to pdf_img_path as '<name>.jpg'.
    Entries that are not PDF files are skipped. The output folder is created
    when it does not exist yet.

    Input: pdf_path     - folder containing the receipt pdfs.
           pdf_img_path - folder the images are written to, with or without
                          a trailing path separator.
    """
    # An empty string means "current directory", which needs no creation.
    if pdf_img_path:
        os.makedirs(pdf_img_path, exist_ok=True)

    for pdf_file in sorted(os.listdir(pdf_path)):
        stem, extension = os.path.splitext(pdf_file)
        # Anything that is not a PDF cannot be converted, so skip it
        # rather than passing it to the converter.
        if extension.lower() != '.pdf':
            continue

        full_path = os.path.join(pdf_path, pdf_file)
        # Only the first page is rendered.
        images = convert_from_path(full_path, first_page=1, last_page=1)
        if not images:
            continue

        # os.path.join works whether or not pdf_img_path ends in a separator.
        images[0].save(os.path.join(pdf_img_path, stem + '.jpg'))

    print('succesfully converted all pdf files to images')

In [118]:
def text_to_df(text, start_marker='BONUSKAART xx9644', end_marker='UW VOORDEEL'):
    """
    Parse the OCR text of one receipt into a DataFrame of purchased items.

    The item lines are taken from the part of the text between start_marker
    and end_marker (everything after start_marker when end_marker is absent).
    Each non-blank line is split on whitespace: the first token becomes
    'Amount', the last token 'Price' and the tokens in between 'Product'.
    When the amount contains 'KG', the token just before the price is left
    out of the product name as well.

    The first D-M-YYYY date found anywhere in the text is stored in the
    'Date' column of every row; the column holds None when no date is found.

    Input: text         - OCR text of a single receipt.
           start_marker - text that directly precedes the item lines.
           end_marker   - text that directly follows the item lines.
    Output: DataFrame with the columns Amount, Product, Price and Date.
    Raises: ValueError when start_marker does not occur in the text.
    """
    _, marker_found, after_start = text.partition(start_marker)
    if not marker_found:
        raise ValueError(f'start marker {start_marker!r} not found in receipt text')
    item_section = after_start.split(end_marker, 1)[0]

    rows = []
    for line in item_section.split("\n"):
        parts = line.split()
        # Blank and whitespace-only lines carry no item.
        if not parts:
            continue

        amount = parts[0]
        price = parts[-1]
        if 'KG' in amount:
            # Weighed items have one extra token before the total price;
            # keep it out of the product name.
            product_name = " ".join(parts[1:-2])
        else:
            product_name = " ".join(parts[1:-1])
        rows.append([amount, product_name, price])

    # Build the frame in one go instead of growing it row by row.
    temp_df = pd.DataFrame(rows, columns=['Amount', 'Product', 'Price'])

    match = re.search(r"\d{1,2}-\d{1,2}-\d{4}", text)
    if match:
        receipt_date = pd.to_datetime(match.group(), format='%d-%m-%Y').date()
    else:
        receipt_date = None
    temp_df['Date'] = receipt_date

    return temp_df

In [134]:
def text_from_img(pdf_img_path):
    """
    Run OCR on every receipt image in a folder and combine the parsed items.

    Each file in pdf_img_path is opened, converted to grayscale, read with
    pytesseract and the resulting text is parsed by text_to_df. Entries that
    are not regular files (for example sub-folders) are skipped.

    Input: A path to a folder with receipt images.
    Output: One DataFrame with the columns Amount, Product, Price and Date
            holding the rows of all receipts; empty when the folder contains
            no files.
    """
    frames = []

    for img_file in sorted(os.listdir(pdf_img_path)):
        full_path = os.path.join(pdf_img_path, img_file)
        if not os.path.isfile(full_path):
            continue

        # The context manager closes the image file once OCR is done.
        with Image.open(full_path) as image:
            # Convert to grayscale before handing the image to pytesseract.
            text = pytesseract.image_to_string(image.convert('L'))

        frames.append(text_to_df(text))

    if not frames:
        return pd.DataFrame(columns=['Amount', 'Product', 'Price', 'Date'])

    # Concatenate once at the end and renumber the rows from zero.
    return pd.concat(frames, ignore_index=True)


In [135]:
# Parse every receipt image in the image folder into one DataFrame.
test = text_from_img('C:\\Users\\rie12\\Documents\\GitHub\\AH_kassabonnen\\ah_bonnen_img')

In [136]:
# Show the DataFrame returned by text_from_img as the cell output.
test

Unnamed: 0,Amount,Product,Price,Date
0,1,AH ICETEA,089,2023-01-03
1,1,OETKER PIZZA,419,2023-01-03
2,1,JELLY BEANS,129,2023-01-03
3,1,LAY'S OVEN B,209,2023-01-03
4,1,PEPPERMINT,139,2023-01-03
...,...,...,...,...
328,1,PEPPERMINT,139,2023-01-03
329,si,ZAANS BRUIN,085,2023-01-03
330,1.342KG,"BANANEN 1,99",267,2023-01-03
331,1,FOCACCIAPEPP,125,2023-01-03
