# OCR - US Stock
Optical Character Recognition (OCR) for Dime US stock transaction slips

In [None]:
import pandas as pd
import os
# OCR
import pytesseract
pytesseract.pytesseract.tesseract_cmd = '/opt/homebrew/bin/tesseract'  # Path to tesseract executable  # For Apple Silicon Macs
from PIL import Image
import re # Regular expressions

In [None]:
# Locate the repository root, i.e. the parent of the directory the notebook
# runs in. The path is derived from the current working directory instead of
# chdir-ing up and back, so the process-wide working directory is never
# changed (and cannot be left in the wrong place if the cell is interrupted).
original_dir = os.getcwd()
repo_dir = os.path.dirname(original_dir)
# print("Repository path:", repo_dir) # ../Investment-Portfolio

In [None]:
# Decimal amount as printed on the slip, optionally with thousands
# separators (e.g. "1,234.56"). Always used as the first capture group.
_AMOUNT = r'(\d[\d,]*\.\d+)'


def _first_group(pattern, text):
    """Return the stripped first capture group of `pattern` in `text`, or None."""
    match = re.search(pattern, text)
    return match.group(1).strip() if match else None


def _amount(pattern, text):
    """Like `_first_group`, but with thousands separators removed from the number."""
    value = _first_group(pattern, text)
    return value.replace(',', '') if value is not None else None


def extract_data_from_text(text):
    """Extracts data from text for Dime US Stock Transactions

    Every field is located with its own regular expression (first match
    wins). A field whose pattern does not match is returned as None, so
    the function never raises on incomplete or noisy OCR output.

    Args:
        text (str): The OCR text from a transaction slip

    Returns:
        dict: Extracted information. Keys are the column labels
            ("Status", "Position", "Ticker", ...); values are strings,
            or None when the field was not found. Amounts are returned
            as strings without thousands separators (e.g. "1234.56").
    """

    # The status value is read from the second line after the "Status" label.
    status = _first_group(r'Status.*\n.*\n(.*)\n', text)

    # "Buy <ticker> ..." / "Sell <ticker> ...": the ticker is the first
    # whitespace-separated token after the side.
    position, ticker = None, None
    order_match = re.search(r'(Buy|Sell) (.*?)\n', text)
    if order_match:
        position = order_match.group(1).strip()
        tokens = order_match.group(2).split()
        # Guard against a line such as "Buy \n" with nothing after the side.
        ticker = tokens[0] if tokens else None

    market = _first_group(r'(NASDAQ|NYSE)', text)

    # Total amount: the first "<amount> THB" anywhere in the text.
    thb_amount = _amount(_AMOUNT + r'\sTHB', text)
    stock_amount = _amount(r'Stock Amount\s+' + _AMOUNT + r'\sTHB', text)
    commission_fee = _amount(r'Commission Fee\s+' + _AMOUNT + r'\sTHB', text)
    vat = _amount(r'VAT 7%\s+' + _AMOUNT + r'\sTHB', text)
    exchange_rate = _amount(r'Exchange Rate\s+1 USD =\s+' + _AMOUNT + r'\sTHB', text)
    usd_amount = _amount(r'USD Amount\s+' + _AMOUNT + r'\sUSD', text)

    # The capture spans two lines: the line after the label plus the
    # following line up to its HH:MM time.
    submission_date = _first_group(r'Submission Date.*\n(.*\n.*\d{2}:\d{2})', text)

    order_type = _first_group(r'Order Type.*\n(.*)', text)
    portfolio = _first_group(r'Dime! Portfolio\s+(.*)', text)
    offshore_account_no = _first_group(r'Offshore Account No.\s+(.*)', text)

    # The rest of the label line is concatenated with the digits that start
    # the next line (presumably the ID wraps across two lines on the slip).
    order_id_match = re.search(r'Order ID\s+(.*)\n(\d+)', text)
    if order_id_match:
        order_id = order_id_match.group(1).strip() + order_id_match.group(2).strip()
    else:
        order_id = None

    payment_account_name = _first_group(r'Account Name\s+(.*)', text)

    # The lookbehind keeps this from matching inside "Offshore Account No.",
    # which would otherwise report the offshore number as the payment account.
    payment_account_no = _first_group(r'(?<!Offshore\s)Account No.\s+(.*)', text)

    reference_id = _first_group(r'Reference ID\s+(.*)', text)
    payment_ref_id = _first_group(r'Payment Ref ID\s+(.*)', text)

    # Receiving-account fields must directly follow a "Receiving Account" line.
    receiving_account_name = _first_group(r'Receiving Account\nAccount Name\s+(.*)', text)
    receiving_account_no = _first_group(r'Receiving Account\nAccount No.\s+(.*)', text)
    receiving_ref_id = _first_group(r'Receiving Ref ID\s+(.*)', text)

    return {
        "Status": status,
        "Position": position,
        "Ticker": ticker,
        "Market": market,
        "Total Amount (THB)": thb_amount,
        "Stock Amount (THB)": stock_amount,
        "Commission Fee (THB)": commission_fee,
        "VAT (THB)": vat,
        "Exchange Rate (1 USD to THB)": exchange_rate,
        "USD Amount": usd_amount,
        "Submission Date": submission_date,
        "Order Type": order_type,
        "Dime! Portfolio": portfolio,
        "Offshore Account No.": offshore_account_no,
        "Order ID": order_id,
        "Payment Account Name": payment_account_name,
        "Payment Account No.": payment_account_no,
        "Reference ID": reference_id,
        "Payment Ref ID": payment_ref_id,
        "Receiving Account Name": receiving_account_name,
        "Receiving Account No.": receiving_account_no,
        "Receiving Ref ID": receiving_ref_id,
    }


In [None]:
# Run OCR on every receipt image in the folder and collect one row of
# extracted fields per image.
folder_path = os.path.join(repo_dir, "data", "private", "receipt", "dime", "us_stock_test")

# Accepted image extensions, compared against the upper-cased file name.
file_formats = (".PNG", ".JPEG", ".JPG")

data_list = [] # list of dict
texts = [] # raw OCR text per image, kept for inspection
# Loop through all files in the folder (sorted for a stable row order)
for filename in sorted(os.listdir(folder_path)):
    # Match on the file suffix only, so names that merely contain an image
    # extension (e.g. "slip.png.bak") are skipped.
    if not filename.upper().endswith(file_formats):
        continue

    file_path = os.path.join(folder_path, filename)

    # Perform OCR; the context manager closes the image file afterwards
    with Image.open(file_path) as image:
        text = pytesseract.image_to_string(image, lang="eng")
    texts.append(text)

    # Extract data from text
    data = extract_data_from_text(text)

    # Append data to the list
    data_list.append(data)

# Create DataFrame from the list
df = pd.DataFrame(data_list)


In [None]:
# Display the DataFrame built from the extracted slip data as the cell output
df