<a href="https://colab.research.google.com/github/sanimesa/genai/blob/main/text_extraction_kor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [33]:
!pip install openai kor markdownify python-dotenv pdfplumber



In [34]:
from kor.extraction import create_extraction_chain
from kor.nodes import Object, Text, Number

# LangChain Models
from langchain.chat_models import ChatOpenAI
from langchain.llms import OpenAI

# Standard Helpers
import pandas as pd
import requests
import time
import json
from datetime import datetime

# Text Helpers
from bs4 import BeautifulSoup
from markdownify import markdownify as md

# For token counting
from langchain.callbacks import get_openai_callback

def printOutput(output):
    """Print `output` as JSON text with keys sorted and a 3-space indent.

    Args:
        output: Any JSON-serialisable object (e.g. the extraction result).
    """
    rendered = json.dumps(output, indent=3, sort_keys=True)
    print(rendered)

In [35]:
import os
from dotenv import load_dotenv

# Read secrets from a .env file kept on Google Drive so the key itself never
# appears in the notebook.
# NOTE(review): the path assumes Drive is already mounted at /content/drive;
# no mount call is visible in this notebook -- confirm.
load_dotenv('/content/drive/MyDrive/Colab_Files/myenv.env')

openai_api_key = os.getenv('OPENAI_API_KEY')
if not openai_api_key:
    # Surface the problem here instead of as an opaque error at the first LLM call.
    print('OPENAI_API_KEY is not set; check that the .env file exists and defines it')


In [36]:
import pdfplumber
def get_pdf_chunks(pdf_file_path):
    """Return the text of the first page of a PDF.

    Despite the name, no chunking is performed: only page 0 is read.

    Args:
        pdf_file_path: Path to the PDF file, passed to ``pdfplumber.open``.

    Returns:
        The extracted text of the first page, or an empty string when the file
        cannot be opened or read, has no pages, or the page yields no text.
        Failures are reported with ``print`` rather than raised (best-effort).
    """
    # Bind the result up front so the failure path still has something to
    # return; previously an exception led to an UnboundLocalError below.
    page_text = ""

    try:
        with pdfplumber.open(pdf_file_path) as pdf:
            if pdf.pages:
                # extract_text() may yield None for a page without text in some
                # pdfplumber versions; normalise that to "".
                page_text = pdf.pages[0].extract_text() or ""

    except Exception as e:
        print('an exception occured', str(e))

    return page_text

In [37]:
# Chat model that performs the extraction, authenticated with the key loaded
# from the environment. "gpt-3.5-turbo" is a cheaper but less reliable
# alternative for model_name.
llm = ChatOpenAI(
    openai_api_key=openai_api_key,
    model_name="gpt-4",
    max_tokens=2000,
    temperature=0,
)

In [38]:
bill_schema = Object(
    # Key that the extracted records are nested under in the output.
    # It names a single extracted object, so keep it singular (not plural).
    id="utility_bill",

    # Natural-language description of the object
    description="A utility bill",

    # Fields to capture from the bill text. All are declared as Text, so dates
    # and amounts are extracted as strings exactly as they appear on the bill.
    attributes=[
        Text(
            id="account_number",
            description="The Account Number",
        ),
        Text(
            id="due_date",
            description="Due Date",
        ),
        Text(
            id="statement_date",
            description="Statement Date",
        ),
        Text(
            id="amount_due",
            description="The Amount Due",
        ),
        Text(
            id="service_address",
            description="The service address",
        )
    ],

    # A worked example: sample bill text paired with the expected extraction.
    # All fields of one bill live in ONE record so the model is shown a single
    # object per bill (not one object per field), and every attribute declared
    # above is represented, including service_address.
    examples=[(
        """Account No: 1234567890-1
        ENERGY STATEMENT
        Statement Date: 09/07/2019
        www.pge.com/MyEnergy
        Due Date: 09/28/2019
        Service For: Your Account Summary
        . AmountDueonPreviousStatement $91.57
        SPARKY JOULE
        12345 ENERGY, CT Payment(s)ReceivedSinceLastStatement -91.57
        ... PreviousUnpaidBalance $0.00
        CurrentPG&EElectricDeliveryCharges $55.66
        Questions about your bill? SiliconValleyCleanEnergyElectricGenerationCharges $32.48
        .
        Monday-Friday7a.m.-9p.m.
        Saturday8a.m.-6p.m.
        Total Amount Due by 09/28/2019 $88.14
        """, [{
            "account_number": "1234567890-1",
            "due_date": "09/28/2019",
            "statement_date": "09/07/2019",
            "amount_due": "88.14",
            "service_address": "12345 ENERGY, CT",
        }])
    ]
)

In [39]:
# Build the kor extraction chain from the chat model and the bill schema.
# It is later invoked as chain.run(text=...) and its "data" entry is read.
chain = create_extraction_chain(llm, bill_schema)

In [40]:
# Sample bill stored on the mounted Google Drive
pdfsample = '/content/drive/MyDrive/Colab_Files/sample_utility_bills/psegbill1.pdf'

# Read the first page's text, run it through the extraction chain, and keep
# only the "data" entry of the chain's result.
text = get_pdf_chunks(pdfsample)
output = chain.run(text=text)["data"]

printOutput(output)

{
   "utility_bill": [
      {
         "account_number": "6598655404",
         "amount_due": "99.70",
         "due_date": "Sep27,2023",
         "service_address": "2712FORESTHAVENBLVD EDISONTWPNJ08817-6336",
         "statement_date": "September12,2023"
      }
   ]
}
