In [27]:
import os
import base64
from dotenv import load_dotenv
from openai import OpenAI
from pydantic import BaseModel, Field
from enum import Enum

# Run python-dotenv first so the API key lookup below can see values it loads.
load_dotenv()


# OpenAI SDK client aimed at the Fireworks inference endpoint; the key is read
# from the FIREWORKS_API_KEY environment variable (None when it is unset).
client = OpenAI(
    api_key=os.environ.get("FIREWORKS_API_KEY"),
    base_url="https://api.fireworks.ai/inference/v1",
)


def encode_image(image_path):
    """Return the contents of the file at *image_path* as base64 text.

    The file is opened in binary mode, its bytes are base64-encoded, and the
    encoded bytes are decoded as UTF-8 so the caller receives a ``str``.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")

In [33]:
# Closed set of labels used for the `document_type` field of
# DocumentClassification. The string values are what appears in the model's
# JSON reply (e.g. "drivers_license" in the recorded sample output).
class DocumentType(Enum):
    PASSPORT = "passport"
    DRIVERS_LICENSE = "drivers_license"
    # Presumably any recognised ID that is neither of the above — confirm intent.
    OTHER_VALID_DOCUMENT = "other_valid_document"
    # The system prompt in identify_document asks for this label when the image
    # does not depict a valid identification document.
    INDECIPHERABLE = "indecipherable"


# Response schema for the document classifier: its JSON schema is passed to the
# model as the requested output format. All three fields are required.
# (Kept as comments rather than a docstring so the generated schema is unchanged.)
class DocumentClassification(BaseModel):
    # Free-text summary of what the image shows.
    image_description: str = Field(
        ...,
        description="A 1 sentence description of the image.",
    )
    # Whether the image shows an identification document at all.
    image_depicts_identification_document: bool = Field(
        ...,
        description="True if the image depicts an identification document, false otherwise.",
    )
    # One of the DocumentType labels.
    document_type: DocumentType = Field(
        ...,
        description="The type of document depicted in the image.",
    )


def identify_document(image_path: str):
    """Ask the Fireworks vision model to classify the image at *image_path*.

    The image is base64-encoded and sent inline as a data URL together with
    the JSON schema of ``DocumentClassification`` as the requested response
    format.

    Args:
        image_path: Path to the image file to classify.

    Returns:
        The message of the first completion choice. In the sample run recorded
        with this code its ``parsed`` attribute was ``None`` and the JSON
        payload was in ``content``; callers that want a typed object can
        validate it with ``DocumentClassification.model_validate_json``.
    """
    # Local import keeps this block self-contained (stdlib only).
    import mimetypes

    # The data URL used to claim "image/jpeg" for every file, which was wrong
    # for e.g. PNG input. Derive the media type from the file extension and
    # fall back to the previous value when it cannot be determined.
    mime_type, _ = mimetypes.guess_type(image_path)
    if mime_type is None or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"

    base64_image = encode_image(image_path)
    completion = client.beta.chat.completions.parse(
        model="accounts/fireworks/models/llama-v3p2-11b-vision-instruct",
        response_format={"type": "json_object", "schema": DocumentClassification.model_json_schema()},
        messages=[
            {"role": "system", "content": "Carefully look at the provided image and determine if it depicts a valid identification document. If it does, please specify the type of document. If it does not, please specify 'indecipherable'."},
            {"role": "user", "content": [
                {"type": "text", "text": "Classify the following image of a document."},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:{mime_type};base64,{base64_image}"
                    },
                },
            ]},
        ],
    )

    return completion.choices[0].message


# Smoke test: classify the sample image at test_images/test-license.png and
# print the raw assistant message returned by identify_document.
print(identify_document("test_images/test-license.png"))


ParsedChatCompletionMessage[NoneType](content='{"image_description": "A red, white, blue, and yellow rectangle in the top-left corner contains the words \'Pennsylvania\' and \'via visitPA.com USA\' in a stylized font. Below this, a white rectangle features a headshot of a woman with long brown hair. The left side of the image contains various pieces of information, including her date of birth, expiration date, and name. On the right side, the birthplace and sex are listed.","image_depicts_identification_document": true,"document_type": "drivers_license"}', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=[], parsed=None)
