### import

In [None]:
import pandas as pd
import numpy as np
import os
# OCR
import pytesseract
pytesseract.pytesseract.tesseract_cmd = '/opt/homebrew/bin/tesseract'  # Path to tesseract executable  # For Apple Silicon Macs
from PIL import Image
import re # Regular expressions
from tqdm import tqdm

from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_community.llms import Ollama
from pydantic import BaseModel, Field, ConfigDict
from typing import Optional

# import ollama

In [None]:
# --- Path --- #
# Locate the repository root, i.e. the directory one level above the current
# working directory. This is derived from the path itself instead of
# os.chdir('../') + os.chdir(back): changing the process-wide working directory
# is a side effect that is left behind if the cell is interrupted part-way.
original_dir = os.getcwd()
repo_dir = os.path.dirname(original_dir)
# print("Repository path:", repo_dir) # ../Investment-Portfolio

# Folder holding the receipt images to process; exactly one choice is active.
# folder_path = repo_dir + "/data/private/receipt/dime/mutual_funds"
# folder_path = repo_dir + "/data/private/receipt/dime/us_stock"
folder_path = repo_dir + "/data/private/receipt/dime/us_stock_test"

In [None]:
# OCR - Optical Character Recognition
# Run Tesseract over every image in `folder_path` (sorted by file name) and
# collect the recognised text, one list entry per image.
text_data = []  # list of str

# Accepted image extensions; compared case-insensitively against the file suffix.
file_formats = (".PNG", ".JPEG", ".JPG")

for filename in sorted(os.listdir(folder_path)):
    # Test the suffix only: a substring test would also accept names such as
    # "receipt.png.bak" or "notes.jpg.txt", which are not image files.
    if not filename.upper().endswith(file_formats):
        continue

    file_path = os.path.join(folder_path, filename)

    # Load the image inside a context manager so its file handle is released
    # as soon as OCR for this file is done.
    with Image.open(file_path) as image:
        # Perform OCR
        text = pytesseract.image_to_string(image, lang="eng")

    text_data.append(text)



In [None]:
# Show each OCR result followed by a "----" separator line
for ocr_text in text_data:
    print(ocr_text, "----", sep="\n")

In [None]:
# # ollama-python
# from pydantic import BaseModel, Field, ValidationError
# from typing import Optional

# class ExtractedInformation(BaseModel):
#     Status: str
#     Submission_Date: str = Field(..., alias='Submission Date')
#     Payment_Date: str = Field(..., alias='Payment Date')
#     Effective_Date: str = Field(..., alias='Effective Date')

# import json

# def parse_response(response):
#     """Parse the response from the LLaMA model and validate using pydantic."""
#     content = response['message']['content']
#     extracted_info_dict = json.loads(content)

#     try:
#         validated_data = ExtractedInformation(**extracted_info_dict)
#         return validated_data
#     except ValidationError as e:
#         print("Validation error:", e)
#         return None


# for index, row in df_text.iterrows():
#     prompt = row["text"] + " summarize information from above text and return just only a 1-level dictionary startwith { and end with } ,dont include any other text"
#     response = ollama.chat(model='llama3.1', messages=[
#       {
#         'role': 'user',
#         'content': prompt,
#       },
#     ])
#     validated_data = parse_response(response)
#     if validated_data:
#         print(validated_data.json())


In [None]:
# LLM used as the middle stage of the extraction chain: LangChain's Ollama
# wrapper configured with the "llama3.1" model name.
# NOTE(review): presumably requires that model to be available to a running Ollama instance — confirm.
model = Ollama(model="llama3.1")

# Schema of the fields expected from a Dime! mutual-fund receipt.
# Documented with comments rather than a class docstring on purpose: pydantic
# copies a model's docstring into the generated JSON schema, which would change
# any format instructions built from this model.
# NOTE(review): under pydantic v2, `Optional[X]` fields without a default are
# nullable but still required — confirm that is the intended contract.
class DimeMutualFund(BaseModel):
    # Keys that are not declared below are dropped instead of raising.
    model_config = ConfigDict(extra="ignore")

    # --- Order summary ---
    Status: Optional[str] = Field(
        description="Status of the transaction", examples=["Complete", "Matched"]
    )
    Position: Optional[str] = Field(
        description="Buy or Sell", examples=["Buy", "Sell"], min_length=3, max_length=4
    )
    Ticker: Optional[str] = Field(
        description="Mutual Fund Ticker", examples=["K-CHINA-A(D)", "SCBCHEQA"]
    )

    # --- Numeric amounts ---
    NAV_Unit: Optional[float] = Field(
        description="Net Asset Value(NAV) per unit", examples=[4.8332]
    )
    # The example is a number so that it agrees with the declared float type
    # (previously the string '206.9022 Units', contradicting the schema type).
    Units: Optional[float] = Field(
        description="Number of units bought or sold", examples=[206.9022]
    )
    THB_Amount: Optional[float] = Field(
        description="Total amount in THB", examples=[1000.00]
    )

    # --- Dates (kept as raw strings; the alias is the label expected in the data) ---
    Submission_Date: Optional[str] = Field(
        alias="Submission Date"
    )
    Payment_Date: Optional[str] = Field(
        alias="Payment Date"
    )
    Effective_Date: Optional[str] = Field(
        alias="Effective Date"
    )

    # --- Account / order identifiers (strings, so leading digits are preserved) ---
    DimePortfolio: Optional[str] = Field(
        alias="Dime! Portfolio", examples=["Mutual Fund Port"]
    )
    Unitholder_No: Optional[str] = Field(
        alias="Unitholder No.", examples=["530019364771"]
    )
    Account_No: Optional[str] = Field(
        alias="Account No.", examples=["101710737908447"]
    )
    Order_ID: Optional[str] = Field(
        alias="Order ID", examples=["2202405270009583"]
    )


# Schema of the fields expected from a Dime! US-stock receipt; this is the model
# handed to the JSON output parser further down.
# Documented with comments rather than a class docstring so the generated JSON
# schema (and therefore the format instructions) stays unchanged.
class DimeUSStock(BaseModel):
    # Undeclared keys are ignored rather than rejected.
    model_config = ConfigDict(extra="ignore")

    # --- Order summary ---
    Status: Optional[str] = Field(
        description="Status of the order",
        examples=["Complete", "Matched"],
    )
    Position: Optional[str] = Field(
        description="Buy or Sell",
        examples=["Buy", "Sell"],
        min_length=3,
        max_length=4,
    )
    Ticker: Optional[str] = Field(
        description="Stock Ticker",
        examples=["NVDA", "BABA"],
    )
    Market: Optional[str] = Field(
        description="Market",
        examples=["NASDAQ", "NYSE"],
    )

    # --- Amounts (strings, because the value may carry a currency/unit suffix) ---
    # NOTE(review): the attribute name is misspelled ("Exeuted"); the alias has the
    # correct spelling. Left as-is because renaming would change the model's interface.
    Exeuted_Price: Optional[str] = Field(
        alias="Executed Price",
        description="Executed price in THB or USD",
        examples=["902.50", "26.82 USD"],
    )
    Shares: Optional[str] = Field(
        alias="Shares",
        description="Number of shares bought or sold",
        examples=["0.0303711", "0.3318159 Shares"],
    )
    Stock_Amount: Optional[str] = Field(
        alias="Stock Amount",
        description="Amount of stock in THB or USD",
        examples=["999.92 THB", "26.88 USD"],
    )
    Commission_Fee: Optional[str] = Field(
        alias="Commission Fee",
        description="Commission Fee in THB or USD",
        examples=["0.00 THB", "-0.04 USD"],
    )
    VAT: Optional[str] = Field(
        alias="VAT 7%",
        description="VAT 7% in THB or USD",
        examples=["0.00 THB", "-0.0026 USD"],
    )

    # --- Dates (raw strings; converted to datetimes later in the notebook) ---
    Submission_Date: Optional[str] = Field(alias="Submission Date")
    Completion_Date: Optional[str] = Field(alias="Completion Date")


In [None]:
# Empty placeholder; the real text is supplied when the chain is invoked.
query = ""

# JSON parser whose format instructions are generated from the DimeUSStock model
parser = JsonOutputParser(pydantic_object=DimeUSStock)

# Prompt layout: fixed instruction line, then the parser's format instructions
# (bound once here), then the per-call `query` text.
prompt = PromptTemplate(
    input_variables=["query"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
    template=(
        "Extract information and answer in JSON format likes.\n"
        "{format_instructions}\n"
        "{query}\n"
    ),
)

# Pipeline: render the prompt -> call the LLM -> parse the reply as JSON
chain = prompt | model | parser

In [None]:
# Feed every OCR text through the chain (with a progress bar) and keep the
# parsed results in the same order as `text_data`.
extracted_data = []

for ocr_text in tqdm(text_data):
    extracted_data.append(chain.invoke({"query": ocr_text}))

In [None]:
# Tabulate the chain outputs, one row per processed receipt.
# Presumably each item is a dict, so its keys become the column names — verify against the parser output.
df = pd.DataFrame(extracted_data)
df  # bare expression: shown as the notebook cell's output

In [None]:
# DataType

# DateTime
# Timestamp layouts handled for receipt dates: 24-hour first, then 12-hour AM/PM.
_RECEIPT_DATETIME_FORMATS = ("%d %b %Y - %H:%M", "%d %b %Y - %I:%M %p")


def _parse_receipt_datetime(values):
    """Convert a Series of receipt date strings to datetimes.

    Each format in ``_RECEIPT_DATETIME_FORMATS`` is tried in order; a value keeps
    the result of the first format that parses it. Values matching no format
    become ``NaT`` (``errors="coerce"``) instead of raising.

    Args:
        values: pandas Series of raw date strings.

    Returns:
        pandas Series of datetimes aligned with ``values``.
    """
    first_format, *fallback_formats = _RECEIPT_DATETIME_FORMATS
    parsed = pd.to_datetime(values, errors="coerce", format=first_format)
    for fmt in fallback_formats:
        # combine_first only fills positions that are still NaT
        parsed = parsed.combine_first(
            pd.to_datetime(values, errors="coerce", format=fmt)
        )
    return parsed


df["Submission Date"] = _parse_receipt_datetime(df["Submission Date"])

# Completion dates go through the same two-format parsing as submission dates;
# previously only the 24-hour layout was tried, so AM/PM values ended up as NaT.
df["Completion Date"] = _parse_receipt_datetime(df["Completion Date"])

df

In [None]:
# df.to_csv("dime_us_stock.csv", index=False)