In [2]:
import pandas as pd
import re

# Manually extracted text from the PDF
extracted_text = """
11:57 AM14 Jun 24 Transfer out toSecurities DATAGW -16,050.42 152,926.48
3:06 AM14 Jun 24 Auto Deposit of Incoming Transfer Auto +2,198.00 168,976.90
3:05 AM13 Jun 24 Auto Deposit of Incoming Transfer Auto +11,375.10 166,778.90
3:05 AM13 Jun 24 Auto Deposit of Incoming Transfer Auto +2,580.30 155,403.80
1:02 PM12 Jun 24 Transfer out Mobile -22,800.00 152,823.50
11:56 AM12 Jun 24 Transfer out toSecurities DATAGW -43,110.22 175,623.50
3:06 AM12 Jun 24 Auto Deposit of Incoming Transfer Auto +1,134.00 218,733.72
9:02 AM11 Jun 24 Transfer in KTB +1,667.00 217,599.72
9:42 PM10 Jun 24 Transfer in KTB +1,700.00 215,932.72
3:06 PM10 Jun 24 Transfer out Mobile -49,000.00 214,232.72
"""

# Split the text into lines
lines = extracted_text.strip().split('\n')

# Print lines to debug
print("Extracted lines:")
for line in lines:
    print(line)

# Define a pattern to match the relevant lines containing transaction details
pattern = re.compile(r'(\d+:\d+\s[APM]+)\s(\d+\s\w+\s\d+)\s([\w\s]+)\s([\w\s]+)\s([\+\-]\d+,\d+\.\d+)\s(\d+,\d+\.\d+)')

# Parse the transactions into structured data
data = []
for line in lines:
    match = pattern.search(line)
    if match:
        time = match.group(1)
        date = match.group(2)
        transaction = match.group(3)
        channel = match.group(4)
        amount = match.group(5)
        balance = match.group(6)
        
        # Determine transaction type
        transaction_type = 'Transfer in' if '+' in amount else 'Transfer out'
        
        # Clean and convert amount and balance
        amount = float(amount.replace('+', '').replace('-', '').replace(',', ''))
        balance = float(balance.replace(',', ''))
        
        data.append([date, time, transaction_type, channel, amount, balance])

# Print data to debug
print("Parsed data:")
for row in data:
    print(row)

# Verify that data is parsed correctly
if not data:
    print("No data parsed.")
else:
    print(f"Parsed {len(data)} rows of data.")

# Create DataFrame
columns = ['Date', 'Time', 'Transaction', 'Channel', 'Amount', 'Available balance']
df = pd.DataFrame(data, columns=columns)

# Print DataFrame to verify the data
print("DataFrame:")
print(df)

# Save the DataFrame to an Excel file
output_file_path = 'structured_transactions_output.xlsx'
df.to_excel(output_file_path, index=False)

print(f'DataFrame saved to {output_file_path}')




Extracted lines:
11:57 AM14 Jun 24 Transfer out toSecurities DATAGW -16,050.42 152,926.48
3:06 AM14 Jun 24 Auto Deposit of Incoming Transfer Auto +2,198.00 168,976.90
3:05 AM13 Jun 24 Auto Deposit of Incoming Transfer Auto +11,375.10 166,778.90
3:05 AM13 Jun 24 Auto Deposit of Incoming Transfer Auto +2,580.30 155,403.80
1:02 PM12 Jun 24 Transfer out Mobile -22,800.00 152,823.50
11:56 AM12 Jun 24 Transfer out toSecurities DATAGW -43,110.22 175,623.50
3:06 AM12 Jun 24 Auto Deposit of Incoming Transfer Auto +1,134.00 218,733.72
9:02 AM11 Jun 24 Transfer in KTB +1,667.00 217,599.72
9:42 PM10 Jun 24 Transfer in KTB +1,700.00 215,932.72
3:06 PM10 Jun 24 Transfer out Mobile -49,000.00 214,232.72
Parsed data:
No data parsed.
DataFrame:
Empty DataFrame
Columns: [Date, Time, Transaction, Channel, Amount, Available balance]
Index: []
DataFrame saved to structured_transactions_output.xlsx
