In [1]:
import os
from dotenv import load_dotenv

load_dotenv()

os.environ["OPENAI_MODEL_NAME"]="gpt-4o"

In [58]:
from pydantic import BaseModel
from typing import List

class DataExtract(BaseModel):
    contractor_name:str
    account_number:int
    contractor_email:str
    contractor_phone:int
    invoice_id:str
    date:str
    invoice_number:str
    invoice_total:float
    item_description:str
    uom: str
    quantity:int
    unit_cost:float



In [59]:
from crewai import Agent

from crewai_tools import DOCXSearchTool, FileReadTool, PDFSearchTool

file_location = './files/invoice_ai/invoice.pdf'

DOCXSearch_tool = DOCXSearchTool(file_path=file_location)
FileRead_tool = FileReadTool(file_path=file_location)
PDFSearch_tool = PDFSearchTool(file_path=file_location)

admin_assistant = Agent(
    role="Administrative Assistant",
    goal="Read and extract data out of invoice",
    backstory="""
    You are a seasoned adminitrative assistant who excels at invoice processing.
    You thoroughly read invoices and accurately extracts data out of invoices with minimal errors.
    """,
    verbose=True,
    allow_delegation=False,
    tools = [
        DOCXSearch_tool,
        FileRead_tool,
        PDFSearch_tool,
    ],
    max_iter = 10,
)

In [66]:
from crewai import Task

class Tasks:
    
    def extract_data(self, agent):
        return Task(
            description="Extract contractor data out of file.",
            agent=admin_assistant,
            expected_output="""
            Your answer extracts the following data but if you can't find specific data, move on and leave comments. The data is following:
            - contractor name: Name of the contractor/company who provides the service/items
            - account number: Account number of the company who received the service/items
            - email address: E-mail address to contact service provider
            - phone number: Phone number to contact service provider
            - invoice_id: Unique ID allocated to invoice for reference purposes
            - date: Date marked on invoice
            - invoice_total: The total expense spent on the invoice
            - item_description: Description of the purchased service/items
            - uom: Unit of Measure (ex. Each, batch, cm, ft)
            - quantity: Quantity of service/items based on unit of measure
            - unit_cost: Cost of each unit of measure
            
            """,
            output_pydantic = DataExtract,
            output_file="./files/invoice_ai/ExtractedData.md",
        )

In [67]:
from crewai import Crew

tasks = Tasks()

task = tasks.extract_data(admin_assistant)

crew = Crew(agents=[admin_assistant], tasks=[task], verbose=2,)

result = crew.kickoff()



[1m[95m [DEBUG]: == Working Agent: Administrative Assistant[00m
[1m[95m [INFO]: == Starting Task: Extract contractor data out of file.[00m


[1m> Entering new CrewAgentExecutor chain...[0m
[32;1m[1;3mI need to extract specific data from the invoice. The invoice is in a PDF format, so I will start by reading the content of the PDF to get an overview of the document.

Action: Read a file's content
Action Input: {}[0m[95m 

Fail to read the file ./files/invoice_ai/invoice.pdf. Error: 'charmap' codec can't decode byte 0x8f in position 145: character maps to <undefined>
[00m
[32;1m[1;3mThought: Since there was an error reading the file directly, I will use the "Search a PDF's content" tool to extract specific information from the PDF. I will start by searching for general information such as the contractor name and invoice ID.

Action: Search a PDF's content
Action Input: {"query": "contractor name", "pdf": "./files/invoice_ai/invoice.pdf"}[0m[95m 

Relevant Content:
Tri-Ma