In [1]:
import os
from dotenv import load_dotenv

load_dotenv()

os.environ["OPENAI_MODEL_NAME"]="gpt-4o"

In [3]:
from pydantic import BaseModel
from typing import List

class ContractorTable(BaseModel):
    contractor_name:str
    account_number:int
    contractor_email:str
    contractor_phone:int

class InvoiceTable(BaseModel):
    invoice_id:str
    date:str
    invoice_number:str
    invoice_total:float

class LineItemsTable(BaseModel):
    item_description:str
    uom: str
    quantity:int
    unit_cost:float

class DataExtract(BaseModel):
    contractor: List[ContractorTable]
    invoice: List[InvoiceTable]
    lineitems: List[LineItemsTable]


In [7]:
from crewai import Agent

from crewai_tools import DOCXSearchTool, FileReadTool, PDFSearchTool

file_location = './files/invoice_ai/invoice.pdf'

DOCXSearch_tool = DOCXSearchTool(file_path=file_location)
FileRead_tool = FileReadTool(file_path=file_location)
PDFSearch_tool = PDFSearchTool(file_path=file_location)

admin_assistant = Agent(
    role="Administrative Assistant",
    goal="Read and extract data out of invoice",
    backstory="""
    You are a seasoned adminitrative assistant who excels at invoice processing.
    You thoroughly read invoices and accurately extracts data out of invoices with minimal errors.
    """,
    verbose=True,
    allow_delegation=False,
    tools = [
        DOCXSearch_tool,
        FileRead_tool,
        PDFSearch_tool,
    ],
    max_iter = 5,
)

In [41]:
from crewai import Task

class Tasks:
    
    def extract_data(self, agent):
        return Task(
            description="Extract contractor data out of file.",
            agent=admin_assistant,
            expected_output="""
            Your answer extracts the following data but if you can't find specific data, move on and leave comments. The data is following:
            - contractor name: Name of the contractor/company who provides the service/items
            - account number: Account number of the company who received the service/items
            - email address: E-mail address to contact service provider
            - phone number: Phone number to contact service provider
            - invoice_id: Unique ID allocated to invoice for reference purposes
            - date: Date marked on invoice
            - invoice_total: The total expense spent on the invoice
            - item_description: Description of the purchased service/items
            - uom: Unit of Measure (ex. Each, batch, cm, ft)
            - quantity: Quantity of service/items based on unit of measure
            - unit_cost: Cost of each unit of measure
            """,
            output_file="./files/invoice_ai/ExtractedData.md",
        )

'\n3. Line Items\n- item_description: Description of the purchased service/items\n- uom: Unit of Measure (ex. Each, batch, cm, ft)\n- quantity: Quantity of service/items based on unit of measure\n- unit_cost: Cost of each unit of measure\n'

In [42]:
from crewai import Crew

tasks = Tasks()

task = tasks.extract_data(admin_assistant)

crew = Crew(agents=[admin_assistant], tasks=[task], verbose=2,)

result = crew.kickoff()



[1m[95m [DEBUG]: == Working Agent: Administrative Assistant[00m
[1m[95m [INFO]: == Starting Task: Extract contractor data out of file.[00m


[1m> Entering new CrewAgentExecutor chain...[0m
[32;1m[1;3mTo extract the necessary contractor data out of the invoice, I need to read the content of the invoice file and then extract each required piece of information. I will start by reading the PDF file's content.

Action: Read a file's content
Action Input: {}[0m[95m 

Fail to read the file ./files/invoice_ai/invoice.pdf. Error: 'charmap' codec can't decode byte 0x8f in position 145: character maps to <undefined>
[00m
[32;1m[1;3mThought:
It seems there was an error in reading the file directly. Instead of reading the entire content, I will perform semantic searches within the PDF to extract the specific pieces of information required.

Action: Search a PDF's content
Action Input: {"query": "contractor name", "pdf": "./files/invoice_ai/invoice.pdf"}[0m[95m 

Relevant Content:
T