In [None]:
from dotenv import load_dotenv

# Load variables from the .env file one directory above the notebook's working
# directory into the process environment.
# NOTE(review): presumably this supplies the OpenAI API key used by ChatOpenAI
# later in the notebook — confirm the expected variable names.
load_dotenv("../.env")

In [2]:
from langchain_community.document_loaders.llmsherpa import LLMSherpaFileLoader

In [3]:
# Input PDF handed to the LLMSherpa loader; the path is relative to the
# notebook's working directory.
FILE_PATH = "../documents/pdf-contoh-hasil-slik-ojk_compress.pdf"

In [4]:
# Configure the LLMSherpa loader for FILE_PATH against a parser endpoint on
# localhost:5010.
# NOTE(review): strategy="text" presumably yields the document as plain text in
# a single Document (later cells only read docs[0]) — confirm against the
# loader documentation.
# NOTE(review): apply_ocr=True presumably enables OCR for scanned pages — confirm.
loader = LLMSherpaFileLoader(
    file_path=FILE_PATH,
    new_indent_parser=True,
    apply_ocr=True,
    strategy="text",
    llmsherpa_api_url="http://localhost:5010/api/parseDocument?renderFormat=all",
)

In [5]:
# Run the loader and keep the resulting documents for the cells below.
# NOTE(review): presumably requires the parser service at llmsherpa_api_url to
# be reachable — confirm before running.
docs = loader.load()

In [None]:
# Dump the full extracted text of the first document for visual inspection.
# NOTE(review): the SLIKReport schema below expects debtor name, ID number and
# address in this text, so clear this cell's output before sharing the notebook.
print(docs[0].page_content)

In [None]:
# Length of the first document's extracted text (a character count, assuming
# page_content is a str).
len(docs[0].page_content)

Reference: LangChain how-to guide, [How to handle long text when doing extraction](https://python.langchain.com/docs/how_to/extraction_long_text/).

In [14]:
from typing import List, Optional
from pydantic import BaseModel, Field
from datetime import date

# Schema for one month's entry in a credit facility's payment history.
# Every field is optional and defaults to None.
# NOTE(review): the docstring and Field descriptions are presumably forwarded
# to the model as part of the structured-output schema — treat them as prompt
# text when editing.
class PaymentHistory(BaseModel):
    """Monthly payment history information."""
    # Free-form month label; no particular format is enforced here.
    month: Optional[str] = Field(default=None, description="Month of payment record")
    # Stored as a string even though the description mentions a 1-5 scale.
    quality: Optional[str] = Field(default=None, description="Credit quality for that month (1-5)")
    days_past_due: Optional[int] = Field(default=None, description="Number of days past due")

# Schema for a single credit/financing facility listed in the report.
# All scalar fields are optional and default to None; payment_history defaults
# to an empty list and nests PaymentHistory entries.
# NOTE(review): the docstring and Field descriptions are presumably forwarded
# to the model as part of the structured-output schema — treat them as prompt
# text when editing.
class CreditFacility(BaseModel):
    """Information about a credit/financing facility."""
    # Identification of the reporting institution and the agreement.
    reporter: Optional[str] = Field(default=None, description="Name of the reporting bank")
    branch: Optional[str] = Field(default=None, description="Branch name")
    agreement_number: Optional[str] = Field(default=None, description="Credit agreement number")
    facility_type: Optional[str] = Field(default=None, description="Type of credit facility")
    # Monetary amounts are plain floats; the descriptions state the currency (IDR).
    plafond: Optional[float] = Field(default=None, description="Credit limit in IDR")
    outstanding: Optional[float] = Field(default=None, description="Current outstanding balance in IDR")
    # Dates are typed as datetime.date.
    start_date: Optional[date] = Field(default=None, description="Start date of credit")
    due_date: Optional[date] = Field(default=None, description="Due date of credit")
    interest_rate: Optional[float] = Field(default=None, description="Interest rate percentage")
    interest_type: Optional[str] = Field(default=None, description="Type of interest rate")
    usage_type: Optional[str] = Field(default=None, description="Usage type (e.g., Konsumsi)")
    # Current status of the facility, as opposed to the per-month history below.
    quality: Optional[str] = Field(default=None, description="Current credit quality")
    days_past_due: Optional[int] = Field(default=None, description="Current days past due")
    # default_factory=list gives every instance its own empty list.
    payment_history: List[PaymentHistory] = Field(default_factory=list, description="12-month payment history")

# Top-level extraction schema: report metadata, debtor identity, credit summary
# and the list of facilities. This is the class passed to
# llm.with_structured_output(...) later in the notebook.
# All scalar fields are optional and default to None; facilities defaults to an
# empty list.
# NOTE(review): the docstring and Field descriptions are presumably forwarded
# to the model as part of the structured-output schema — treat them as prompt
# text when editing.
class SLIKReport(BaseModel):
    """SLIK report information."""
    # Report Metadata
    report_number: Optional[str] = Field(default=None, description="SLIK report number (e.g., 41897/IDEB/0101564/2019)")
    report_date: Optional[date] = Field(default=None, description="Date of the SLIK report")
    reference_number: Optional[str] = Field(default=None, description="Reference number (Kode Ref. Pengguna)")
    operator: Optional[str] = Field(default=None, description="Operator name")
    
    # Debtor Information
    # These fields hold personal data (name, ID number, birth details, address).
    debtor_name: Optional[str] = Field(default=None, description="Name of the debtor")
    debtor_id: Optional[str] = Field(default=None, description="ID number/NIK of the debtor")
    gender: Optional[str] = Field(default=None, description="Gender of debtor")
    birth_place: Optional[str] = Field(default=None, description="Place of birth")
    birth_date: Optional[date] = Field(default=None, description="Date of birth")
    address: Optional[str] = Field(default=None, description="Complete address")
    occupation: Optional[str] = Field(default=None, description="Occupation")
    workplace: Optional[str] = Field(default=None, description="Workplace name")
    
    # Credit Summary
    # Aggregate figures as stated in the report; nothing here recomputes them
    # from the facilities list.
    total_plafond: Optional[float] = Field(default=None, description="Total effective plafond across all facilities")
    total_outstanding: Optional[float] = Field(default=None, description="Total outstanding balance")
    worst_quality: Optional[str] = Field(default=None, description="Worst credit quality")
    
    # Facilities
    # default_factory=list gives every instance its own empty list.
    facilities: List[CreditFacility] = Field(default_factory=list, description="List of credit facilities")

In [9]:
from langchain_openai import ChatOpenAI

# Chat model shared by the smoke-test call and the SLIK extractor below.
llm = ChatOpenAI(
    model="gpt-4o-mini",
    # Extraction should be repeatable: temperature 0 removes sampling randomness
    # so the same report text yields the same structured output as far as the
    # API allows.
    temperature=0,
)

In [10]:
# Two-turn chat used as a quick smoke test of the model connection: a system
# instruction followed by a single human question, as (role, content) pairs.
# This prompt is about SQL generation and is independent of the SLIK schema.
messages = [
    (
        "system",
        "You are an experienced Data Analyst with extensive knowledge in SQL "
        "and database querying. Convert natural language questions into SQL queries.",
    ),
    (
        "human",
        "Show me total sales by product category for last month.",
    ),
]

In [11]:
# Send the smoke-test messages to the chat model and keep the reply.
result = llm.invoke(messages)

In [None]:
# Display the model's reply to the smoke-test prompt.
result

In [16]:
from langchain_core.prompts import ChatPromptTemplate

# Extraction prompt: the system message carries the formatting rules and the
# human message carries the raw report text through the {text} placeholder.
# The system text is assembled from adjacent string literals so that the
# source-code indentation is not sent to the model as part of the prompt.
# NOTE(review): the monetary rule strips ',' separators; Indonesian amounts are
# often written with '.' as the thousands separator — confirm against the PDFs.
# NOTE(review): "decimal numbers" is ambiguous for percentages (12.5 vs 0.125);
# confirm the intended scale for CreditFacility.interest_rate.
prompt = ChatPromptTemplate.from_messages([
    (
        "system",
        "You are an expert financial data extraction algorithm specialized in Indonesian SLIK reports.\n"
        "Extract all relevant information from the SLIK report text into structured data.\n"
        "Follow these guidelines:\n"
        "- Convert all monetary values to numbers (remove 'Rp' and ',' separators)\n"
        "- Convert percentage values to decimal numbers\n"
        "- Format dates as YYYY-MM-DD\n"
        "- If a value is not present in the text, return null\n"
        # SLIKReport only models facilities and their payment history, so the
        # relationship rule names exactly those; there are no collateral or
        # guarantor fields for the model to fill.
        "- Maintain the relationship between each facility and its payment history\n"
        "Be precise and accurate in extracting financial data.",
    ),
    ("human", "{text}"),
])

# Bind the SLIKReport schema to the chat model for structured output.
structured_llm = llm.with_structured_output(schema=SLIKReport)
slik_text = docs[0].page_content  # text extracted from the PDF by the loader cell

# Pipeline: fill the prompt, then call the schema-bound model.
# Invoke it with {"text": ...}.
extractor = prompt | structured_llm

In [17]:
# Run the extraction pipeline on the full report text in a single call.
response = extractor.invoke({"text": slik_text})

In [None]:
# Display the extraction result.
# NOTE(review): this is expected to contain debtor identity fields — clear the
# output before sharing the notebook.
response