# OCR - Mutual Funds
Optical Character Recognition (OCR) for Dime Mutual Fund Transactions

In [None]:
import pandas as pd
import os
# OCR
import pytesseract
pytesseract.pytesseract.tesseract_cmd = '/opt/homebrew/bin/tesseract'  # Path to tesseract executable  # For Apple Silicon Macs
from PIL import Image
import re # Regular expressions

In [None]:
# Resolve the repository root, i.e. the parent of the notebook's working
# directory. This is computed from the path instead of os.chdir('../') and
# back, so the process-wide working directory is never modified and cannot be
# left pointing at the wrong folder if something fails in between.
original_dir = os.getcwd()

repo_dir = os.path.dirname(original_dir)
# print("Repository path:", repo_dir) # ../Investment-Portfolio

In [None]:
def _first_group(pattern, text, group=1):
    """Return capture group ``group`` of the first match of ``pattern``, or None."""
    match = re.search(pattern, text)
    return match.group(group) if match else None


def extract_data_from_text(text):
    """Extract the fields of a Dime mutual fund transaction from OCR text.

    Each field is located with a regular expression anchored on its label
    (or, for the amount, on the trailing ``THB``) and on the newline that
    ends the line. A field whose pattern does not match is returned as None.

    Args:
        text (str): Text produced by OCR for a single receipt image.

    Returns:
        dict: The keys "Status", "Position", "Ticker", "Amount (THB)",
            "Submission Date", "Payment Date", "Effective Date",
            "Dime! Portfolio", "Unitholder No.", "Account No.", "Order ID"
            and "Reference ID", each mapped to the captured string or None.
            Values are returned exactly as captured (no type conversion).
    """
    # Status: the rest of the line after the "@..." token and one whitespace.
    status = _first_group(r'@\S*\s([^\n]*)\n', text)

    # Order line: "Buy <ticker>" or "Sell <ticker>".
    order_match = re.search(r'(Buy|Sell) (.*?)\n', text)
    if order_match:
        position, ticker = order_match.group(1), order_match.group(2)
    else:
        position, ticker = None, None

    # Amount: digits with optional thousands separators and an optional
    # decimal part. The thousands comma is NOT required, so "500.00 THB" is
    # captured, and any number of separators is accepted, so "1,234,567.89"
    # is captured whole instead of being truncated to "234,567.89".
    thb_amount = _first_group(r'(\d[\d,]*(?:\.\d+)?) THB\n', text)

    submission_date = _first_group(r'Submission Date (.*?)\n', text)
    payment_date = _first_group(r'Payment Date (.*?)\n', text)
    effective_date = _first_group(r'Effective Date (.*?)\n', text)

    portfolio = _first_group(r'Dime! Portfolio (.*?)\n', text)
    # The "." after "No" is left unescaped, so it matches any one character
    # in that position (not only a literal period).
    unitholder_no = _first_group(r'Unitholder No. (.*?)\n', text)
    account_no = _first_group(r'Account No. (.*?)\n', text)
    order_id = _first_group(r'Order ID (.*?)\n', text)

    # Reference ID: the value on the label line is joined with a run of
    # digits that follows after a blank line. If that digit continuation is
    # absent the pattern does not match and the field is None.
    reference_match = re.search(r'Reference ID (.*?)\n\n(\d+)', text)
    if reference_match:
        reference_id = reference_match.group(1) + reference_match.group(2)
    else:
        reference_id = None

    return {
        "Status": status,
        "Position": position,
        "Ticker": ticker,
        "Amount (THB)": thb_amount,
        "Submission Date": submission_date,
        "Payment Date": payment_date,
        "Effective Date": effective_date,
        "Dime! Portfolio": portfolio,
        "Unitholder No.": unitholder_no,
        "Account No.": account_no,
        "Order ID": order_id,
        "Reference ID": reference_id,
    }


In [None]:
# Folder holding the receipt screenshots to run OCR on.
folder_path = os.path.join(repo_dir, "data", "private", "receipt", "dime", "mutual_funds")

# Accepted image extensions, compared against the upper-cased file name.
# Kept as a tuple because str.endswith accepts a tuple of suffixes.
file_formats = (".PNG", ".JPEG", ".JPG")

data_list = []  # one dict of extracted fields per receipt image

# Loop through all files in the folder (sorted for a reproducible row order)
for filename in sorted(os.listdir(folder_path)):
    # Test the actual suffix: a substring test would also accept names such
    # as "receipt.png.bak" and then fail (or misbehave) when opened as images.
    if not filename.upper().endswith(file_formats):
        continue

    file_path = os.path.join(folder_path, filename)

    # Perform OCR; the context manager closes the image file afterwards
    # instead of leaving one open handle per receipt.
    with Image.open(file_path) as image:
        text = pytesseract.image_to_string(image, lang="eng")

    # Extract data from text and collect it
    data = extract_data_from_text(text)
    data_list.append(data)

# Create DataFrame from the list (one row per image)
df = pd.DataFrame(data_list)

In [None]:
# Convert the extracted text columns to proper dtypes

# Datetime columns with a time part: each value is parsed once as a 24-hour
# timestamp and once as a 12-hour AM/PM timestamp. Values that do not fit a
# format become NaT, and the 24-hour result takes precedence when combining.
for date_col in ("Submission Date", "Payment Date"):
    raw_dates = df[date_col]
    as_24h = pd.to_datetime(raw_dates, errors="coerce", format="%d %b %Y - %H:%M")
    as_12h = pd.to_datetime(raw_dates, errors="coerce", format="%d %b %Y - %I:%M %p")
    df[date_col] = as_24h.combine_first(as_12h)

# Date-only column
df["Effective Date"] = pd.to_datetime(
    df["Effective Date"],
    format="%d %b %Y",
    dayfirst=True,
    errors="coerce",
)

# Amount: strip the thousands separators, then convert to float
amount_text = df['Amount (THB)'].str.replace(',', '')
df['Amount (THB)'] = amount_text.astype(float)

df.info()

In [None]:
# df.to_csv(repo_dir + "/data/private/receipt/dime/mutual_funds_ocr.csv", index=False)