In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from google import genai
from google.genai import types
from pathlib import Path

In [3]:
from dotenv import load_dotenv
import os
import json
from pydantic import create_model

In [4]:
client = genai.Client(api_key=os.environ['GEMINI_API_KEY'])

In [5]:
filepath = 'coxbusiness1.pdf'

In [6]:
prompt = """
Understand what this document is about and summarize it using a strict JSON format (key: value). 
value must be a string, except where it makes sense to use nested dicts or lists.
Your output must be a valid JSON.
"""

In [7]:
response = client.models.generate_content(
  model="gemini-2.0-flash",
  contents=[
      types.Part.from_bytes(
        data=Path(filepath).read_bytes(),
        mime_type='application/pdf',
      ),
      prompt
    ]
)


In [8]:
print(response.text)

```json
{
  "documentType": "Cox Business Bill",
  "billDate": "May 13, 2025",
  "accountName": "VILLA NUEVA APARTMENTS",
  "accountNumber": "001 3110 110588801",
  "dueDate": "Jun 2, 2025",
  "totalDue": "$392.94",
  "previousBalance": "$393.36",
  "paymentReceived": "-$393.36",
  "remainingPreviousBalance": "$0.00",
  "newCharges": {
    "period": "May 12, 2025 - Jun 11, 2025",
    "internet": "$272.00",
    "telephone": "$82.80",
    "taxesFeesAndSurcharges": "$38.14",
    "totalNewCharges": "$392.94"
  },
  "serviceAddress": {
    "apt": "APT 215",
    "street": "1901 DEL SUR BLVD",
    "city": "SAN YSIDRO",
    "state": "CA",
    "zip": "92173-1381"
  },
  "contactUs": {
    "website": "www.coxbusiness.com/chat",
    "email": "coxbusiness.com"
  },
  "monthlyServices": {
    "internet": {
      "cbiModem": "$7.00",
      "staticIPAddress": "$10.00",
      "cbi300MbpsX30Mbps": "$255.00",
      "totalInternet": "$272.00"
    },
    "telephone": {
      "6196215268": {
        "direc

In [9]:
clean_text = response.text.replace("```json", "").replace("```", "")

In [33]:
print(clean_text)


{
  "documentType": "Cox Business Bill",
  "billDate": "May 13, 2025",
  "accountName": "VILLA NUEVA APARTMENTS",
  "accountNumber": "001 3110 110588801",
  "dueDate": "Jun 2, 2025",
  "totalDue": "$392.94",
  "previousBalance": "$393.36",
  "paymentReceived": "-$393.36",
  "remainingPreviousBalance": "$0.00",
  "newCharges": {
    "period": "May 12, 2025 - Jun 11, 2025",
    "internet": "$272.00",
    "telephone": "$82.80",
    "taxesFeesAndSurcharges": "$38.14",
    "totalNewCharges": "$392.94"
  },
  "serviceAddress": {
    "apt": "APT 215",
    "street": "1901 DEL SUR BLVD",
    "city": "SAN YSIDRO",
    "state": "CA",
    "zip": "92173-1381"
  },
  "contactUs": {
    "website": "www.coxbusiness.com/chat",
    "email": "coxbusiness.com"
  },
  "monthlyServices": {
    "internet": {
      "cbiModem": "$7.00",
      "staticIPAddress": "$10.00",
      "cbi300MbpsX30Mbps": "$255.00",
      "totalInternet": "$272.00"
    },
    "telephone": {
      "6196215268": {
        "directoryLis

In [11]:
example_dict = json.loads(clean_text)

In [49]:
prompt = f"""
You are given the following sample dict. 
Your task is to create a pydantic BaseModel that has a structure corresponding to that sample dict.
The model should be generic enough to fit other examples.
It should not be specifically tailored to this one sample, but just use it as an example to deduce the structure.

Here is the sample dict:

{example_dict}

For nested fields, you should create smaller BaseModel classes and reuse them in the final one.

You answer should be a valid python definition of a pydantic model. Give the class the name: DynamicModel.

Important: 
- Do not use the `any` type in the field definitions.
- To define root models, use `pydantic.RootModel` rather than a field called '__root__'
- Do not use `fields`
"""

In [50]:
response = client.models.generate_content(
  model="gemini-2.0-flash",
  contents=[
      prompt
    ]
)

In [51]:
print(response.text)

```python
from typing import Dict, Optional, Any, Union
from pydantic import BaseModel, RootModel


class Address(BaseModel):
    apt: Optional[str] = None
    street: str
    city: str
    state: str
    zip: str


class ContactUs(BaseModel):
    website: str
    email: str


class NewCharges(BaseModel):
    period: str
    internet: str
    telephone: str
    taxesFeesAndSurcharges: str
    totalNewCharges: str


class InnerTelephone(BaseModel):
    directoryListingNonPublished: str
    voiceManagerMeasuredLine: str
    voiceManagerEssentialPackage: str


class Internet(BaseModel):
    cbiModem: str
    staticIPAddress: str
    cbi300MbpsX30Mbps: str
    totalInternet: str


class Telephone(RootModel[Dict[str, Union[InnerTelephone, str]]]):
    root: Dict[str, Union[InnerTelephone, str]]


class MonthlyServices(BaseModel):
    internet: Internet
    telephone: Telephone
    totalMonthlyServices: str


class InternetTaxesAndFees(BaseModel):
    countySalesTax: str
    citySalesTax: st

In [52]:
clean_response = response.text.replace("```python", "").replace("```json", "").replace("```", "").lstrip()

In [53]:
with open("dynamic_model.py", "w") as f:
    f.write(clean_response)

In [17]:
import importlib

def dynamic_import(module_name, name):
    module = importlib.import_module(module_name)
    o = getattr(module, name)
    return o

In [54]:
from dynamic_model import DynamicModel

In [55]:
print(DynamicModel.schema_json(indent=2))

{
  "$defs": {
    "Address": {
      "properties": {
        "apt": {
          "anyOf": [
            {
              "type": "string"
            },
            {
              "type": "null"
            }
          ],
          "default": null,
          "title": "Apt"
        },
        "street": {
          "title": "Street",
          "type": "string"
        },
        "city": {
          "title": "City",
          "type": "string"
        },
        "state": {
          "title": "State",
          "type": "string"
        },
        "zip": {
          "title": "Zip",
          "type": "string"
        }
      },
      "required": [
        "street",
        "city",
        "state",
        "zip"
      ],
      "title": "Address",
      "type": "object"
    },
    "ContactUs": {
      "properties": {
        "website": {
          "title": "Website",
          "type": "string"
        },
        "email": {
          "title": "Email",
          "type": "string"
        }
      }

C:\Users\karim\AppData\Local\Temp\ipykernel_36032\2150826100.py:1: PydanticDeprecatedSince20: The `schema_json` method is deprecated; use `model_json_schema` and json.dumps instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  print(DynamicModel.schema_json(indent=2))


In [62]:
prompt = f"""
Understand what this document is about and summarize it using a strict JSON format (key: value). 
Your output must be a valid JSON that adheres to the following pydantic BaseModel:

{json.dumps(DynamicModel.model_json_schema(), indent=2)}
"""

In [63]:
print(prompt)


Understand what this document is about and summarize it using a strict JSON format (key: value). 
Your output must be a valid JSON that adheres to the following pydantic BaseModel:

{
  "$defs": {
    "Address": {
      "properties": {
        "apt": {
          "anyOf": [
            {
              "type": "string"
            },
            {
              "type": "null"
            }
          ],
          "default": null,
          "title": "Apt"
        },
        "street": {
          "title": "Street",
          "type": "string"
        },
        "city": {
          "title": "City",
          "type": "string"
        },
        "state": {
          "title": "State",
          "type": "string"
        },
        "zip": {
          "title": "Zip",
          "type": "string"
        }
      },
      "required": [
        "street",
        "city",
        "state",
        "zip"
      ],
      "title": "Address",
      "type": "object"
    },
    "ContactUs": {
      "properties":

In [64]:
response = client.models.generate_content(
    model="gemini-2.0-flash",
    contents=[
        types.Part.from_bytes(
          data=Path(filepath).read_bytes(),
          mime_type='application/pdf',
        ),
        prompt
      ]
    )


In [65]:
print(response.text)

```json
{
  "documentType": "Cox Business Bill",
  "billDate": "May 13, 2025",
  "accountName": "VILLA NUEVA APARTMENTS",
  "accountNumber": "001 3110 110588801",
  "dueDate": "Jun 2, 2025",
  "totalDue": "$392.94",
  "previousBalance": "$393.36",
  "paymentReceived": "-$393.36",
  "remainingPreviousBalance": "$0.00",
  "newCharges": {
    "period": "May 12, 2025 - Jun 11, 2025",
    "internet": "$272.00",
    "telephone": "$82.80",
    "taxesFeesAndSurcharges": "$38.14",
    "totalNewCharges": "$392.94"
  },
  "serviceAddress": {
    "apt": "APT 215",
    "street": "1901 DEL SUR BLVD",
    "city": "SAN YSIDRO",
    "state": "CA",
    "zip": "92173-1381"
  },
  "contactUs": {
    "website": "www.coxbusiness.com/chat",
    "email": "coxbusiness.com"
  },
  "monthlyServices": {
    "internet": {
      "cbiModem": "$7.00",
      "staticIPAddress": "10.00",
      "cbi300MbpsX30Mbps": "255.00",
      "totalInternet": "$272.00"
    },
    "telephone": {
      "619-621-5268": {
        "direc

In [66]:
filepath2 = 'coxbusiness2.pdf'

In [67]:
response = client.models.generate_content(
    model="gemini-2.0-flash",
    contents=[
        types.Part.from_bytes(
          data=Path(filepath2).read_bytes(),
          mime_type='application/pdf',
        ),
        prompt
      ]
    )


In [None]:
print(response.text)

```json
{
  "documentType": "Cox Business Bill",
  "billDate": "April 26, 2025",
  "accountName": "Bend on Bluebonnet",
  "accountNumber": "001 5711 098188101",
  "dueDate": "May 18, 2025",
  "totalDue": "$1,089.05",
  "previousBalance": "$808.23",
  "paymentReceived": null,
  "remainingPreviousBalance": "$808.23",
  "newCharges": {
    "period": "Apr 26, 2025 - May 25, 2025",
    "internet": null,
    "telephone": "$188.00",
    "taxesFeesAndSurcharges": "$79.46",
    "totalNewCharges": "$280.82"
  },
  "serviceAddress": {
    "apt": null,
    "street": "10221 BLUEBONNET BLVD",
    "city": "BATON ROUGE",
    "state": "LA",
    "zip": "70810-0000"
  },
  "contactUs": {
    "website": "www.coxbusiness.com/chat",
    "email": "coxbusiness.com"
  },
  "monthlyServices": {
    "internet": {
      "cbiModem": null,
      "staticIPAddress": null,
      "cbi300MbpsX30Mbps": null,
      "totalInternet": null
    },
    "telephone": {
      "225-256-2657": {
        "directoryListingNonPublishe

: 