In [98]:
pip install PyPDF2 pandas openpyxl



In [99]:
import re
import pandas as pd
from PyPDF2 import PdfReader

In [100]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one string.

    The file at ``pdf_path`` is opened in binary mode and handed to
    ``PdfReader``. Each page's ``extract_text()`` result is followed by a
    newline, and the pieces are concatenated in page order.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The concatenated page texts, one trailing newline per page; an empty
        string when the reader exposes no pages.
    """
    with open(pdf_path, 'rb') as pdf_file:
        # Consume the pages while the file handle is still open.
        return "".join(
            page.extract_text() + "\n"
            for page in PdfReader(pdf_file).pages
        )

In [101]:
def process_transaction_table(text):
    """Pull transaction rows out of raw statement text.

    A row is recognised by a regular expression with four captured fields, in
    this order: a ``DD-Mon-YYYY`` style date, a description that starts with an
    uppercase letter, and two decimal figures (digits/commas, two decimals, an
    optional alphabetic suffix), each separated by whitespace.

    Args:
        text: The text to scan.

    Returns:
        A list of dicts with the keys ``'Date'``, ``'Description'``,
        ``'Amount'`` and ``'Balance'`` (all string values), in the order the
        rows occur in ``text``. Empty when nothing matches.
    """
    row_regex = re.compile(
        r'(\d{2}-[A-Za-z]{3}-\d{4})\s+'
        r'([A-Z].*?)\s+'
        r'([\d,]+\.\d{2}[A-Za-z]*)\s+'
        r'([\d,]+\.\d{2}[A-Za-z]*)'
    )

    return [
        {
            'Date': date,
            'Description': description.strip(),
            'Amount': amount,
            'Balance': balance,
        }
        for date, description, amount, balance in row_regex.findall(text)
    ]

In [102]:
def process_account_info(text):
    """Collect ``label : value`` pairs from raw statement text.

    A label is a run of letters, slashes and spaces that precedes a colon; the
    value is whatever follows the colon up to the end of that line. Both are
    stripped of surrounding whitespace before being stored.

    Args:
        text: The text to scan.

    Returns:
        A dict mapping each stripped label to its stripped value. When a label
        occurs more than once, the last occurrence's value is kept.
    """
    pair_regex = re.compile(
        r'([A-Za-z/ ]+)\s*:\s*([^\n]+)'
    )

    return {
        label.strip(): value.strip()
        for label, value in pair_regex.findall(text)
    }

In [103]:
def extract_data_from_pdf(pdf_path):
    """Read a PDF and return its parsed account details and transactions.

    The text is extracted once with ``extract_text_from_pdf`` and then passed
    to ``process_account_info`` and ``process_transaction_table``, in that
    order.

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        A dict with two keys: ``'account_info'`` (the result of
        ``process_account_info``) and ``'transactions'`` (the result of
        ``process_transaction_table``).
    """
    raw_text = extract_text_from_pdf(pdf_path)

    # Dict displays evaluate their values top to bottom, so account info is
    # parsed before the transaction table.
    return {
        'account_info': process_account_info(raw_text),
        'transactions': process_transaction_table(raw_text),
    }

In [104]:
def save_to_excel(data, output_path):
    """Write extracted statement data to an Excel workbook with two sheets.

    Args:
        data: Mapping with an ``'account_info'`` dict and a ``'transactions'``
            collection of records, as returned by ``extract_data_from_pdf``.
        output_path: Destination handed to ``pd.ExcelWriter``.

    Sheets written:
        ``'Account Info'``: one row per account-info key (keys become the
        index, written alongside a single ``'Value'`` column).
        ``'Transactions'``: the transaction records, written without the
        index.
    """
    with pd.ExcelWriter(output_path) as workbook:
        # Frames are built inside the writer context, one sheet at a time.
        pd.DataFrame.from_dict(
            data['account_info'], orient='index', columns=['Value']
        ).to_excel(workbook, sheet_name='Account Info')

        pd.DataFrame(data['transactions']).to_excel(
            workbook, sheet_name='Transactions', index=False
        )

In [105]:
# Driver cell: run the whole extraction pipeline on one fixed input PDF and
# write the result to one fixed Excel workbook.
if __name__ == "__main__":
    # Relative paths: both are resolved against the current working directory.
    input_pdf = "test3 (1).pdf"
    output_excel = "test3_output.xlsx"

    # Parse the PDF into account info + transactions, then persist both.
    extracted_data = extract_data_from_pdf(input_pdf)
    save_to_excel(extracted_data, output_excel)

    # Only reached when neither step above raised.
    print(f"Successfully extracted data and saved to {output_excel}")

Successfully extracted data and saved to test3_output.xlsx
