In [83]:
# Sample landmarks paired with the free-form address text to be parsed.
addresses = [
    {"place": landmark, "address": location}
    for landmark, location in (
        ("White House", "1600 Pennsylvania Avenue, Washington DC"),
        ("NYSE", "11 Wall Street New York, NY"),
        ("Empire State Building", "350 Fifth Avenue New York, NY 10118"),
        ("Hollywood sign", "4059 Mt Lee Dr. Hollywood, CA 90068"),
        ("Statue of Liberty", "Statue of Liberty, Liberty Island New York, NY 10004"),
        ("Fatehpur Sikri", "Fatehpur Sikri, UP 283110, Agra"),
    )
]

In [84]:
import httpx
from copy import deepcopy
from dotenv import load_dotenv
import os
import json
from tqdm import tqdm


In [77]:
# Read variables from a local .env file (if one exists) into the environment,
# so the key does not have to be exported in the shell that started the kernel.
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# Report only whether the key is present: cell outputs are saved with the
# notebook, so printing the secret itself would leak it.
print("OPENAI_API_KEY:", "<set>" if OPENAI_API_KEY else None)


OPENAI_API_KEY: None


In [85]:
import json
import requests

def get_address(address, timeout=60):
    """Extract state name, ZIP code and country from a free-form address.

    Posts the address to the chat-completions endpoint (model `gpt-4o-mini`,
    JSON response format) and parses the JSON text of the first choice.

    Args:
        address: Address text, sent verbatim as the user message.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The parsed JSON from the model's reply. The system prompt asks for the
        keys ``state_name``, ``zip_code`` and ``country``; the reply is not
        validated against that shape here.

    Raises:
        requests.HTTPError: If the endpoint responds with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    url = "https://aiproxy.sanand.workers.dev/openai/v1/chat/completions"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {OPENAI_API_KEY}"
    }
    data = {
        "model": "gpt-4o-mini",
        "response_format": { "type": "json_object" },
        "messages": [
            {
                "role": "system",
                "content": """
Extract the state name, ZIP code and country as JSON.
Use {"state_name": ..., "zip_code": ..., "country": 3-letter country code}
"""
            },
            {
                "role": "user",
                "content": address
            }
        ]
    }

    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.post(url, headers=headers, json=data, timeout=timeout)
    # Surface auth/quota/server errors directly instead of as a KeyError on
    # "choices" when the error payload is indexed below.
    response.raise_for_status()
    result = response.json()
    return json.loads(result["choices"][0]["message"]["content"])

# Try the extractor on the first address in the list.
get_address(addresses[0]['address'])



{'state_name': 'Washington DC', 'zip_code': '20500', 'country': 'USA'}

In [86]:
# Enrich a deep copy so the original `addresses` list stays unmodified.
addr = deepcopy(addresses)
for entry in tqdm(addr):
    extracted = get_address(entry["address"])
    # Merge the extracted fields into the copied record.
    entry.update(extracted)

100%|██████████| 6/6 [00:29<00:00,  4.83s/it]


In [None]:
import pandas as pd
# One row per enriched address record.
df = pd.DataFrame(addr)
# A bare trailing expression makes the notebook render the table.
df


Unnamed: 0,place,address,state_name,zip_code,country
0,White House,"1600 Pennsylvania Avenue, Washington DC",Washington DC,20500,USA
1,NYSE,"11 Wall Street New York, NY",New York,10005,USA
2,Empire State Building,"350 Fifth Avenue New York, NY 10118",New York,10118,USA
3,Hollywood sign,"4059 Mt Lee Dr. Hollywood, CA 90068",California,90068,USA
4,Statue of Liberty,"Statue of Liberty, Liberty Island New York, NY...",New York,10004,USA
5,Fatehpur Sikri,"Fatehpur Sikri, UP 283110, Agra",Uttar Pradesh,283110,IND


In [92]:
# Target output (illustrative shape of the structured result):
# {
#     "state": {"name": "Washington DC", "code": "DC"},
#     "country": {"name": "India", "code": "IND"},
#     "zip": {"code": "..."}
# }

def _string_field(description):
    """Build a JSON-Schema string property with the given description."""
    return {"type": "string", "description": description}


def _object_field(description, properties):
    """Build a JSON-Schema object property whose listed properties are all required."""
    return {
        "type": "object",
        "description": description,
        "properties": properties,
        "required": list(properties),
    }


# JSON Schema describing the structured address result.
schema = {
    "type": "object",
    "properties": {
        "state": _object_field(
            "Details about the state",
            {
                "name": _string_field("Official state name"),
                "code": _string_field("Official state code"),
            },
        ),
        "country": _object_field(
            "Details about the country",
            {
                "name": _string_field("Official country name"),
                "code": _string_field("3-letter country code"),
            },
        ),
        "zip": _object_field(
            "Details about the ZIP code",
            {
                "code": _string_field("ZIP code"),
            },
        ),
    },
    "required": ["state", "country", "zip"],
}

In [94]:
def get_address_schema(address, timeout=60):
    """Extract structured address details through a forced function call.

    Posts the address to the chat-completions endpoint (model `gpt-4o-mini`)
    with a single `extract_address` tool whose parameters are the module-level
    `schema`, forces the model to call that tool, and parses the arguments of
    the first tool call.

    Args:
        address: Address text, sent verbatim as the user message.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The parsed JSON arguments of the first tool call. The model is asked
        to follow `schema`; the result is not validated against it here.

    Raises:
        requests.HTTPError: If the endpoint responds with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    url = "https://aiproxy.sanand.workers.dev/openai/v1/chat/completions"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {OPENAI_API_KEY}"
    }
    data = {
        "model": "gpt-4o-mini",
        "response_format": { "type": "json_object" },
        "tools": [
          {"type": "function", "function": {"name": "extract_address", "description": "Extract address details", "parameters": schema}}
        ],
        # Force the model to answer through extract_address rather than free text.
        "tool_choice": {"type": "function", "function": {"name": "extract_address"}},
        "messages": [
            {
                "role": "system",
                "content": "Get address as JSON via extract_address. If unsure, leave fields blank."
            },
            {
                "role": "user",
                "content": address
            }
        ]
    }

    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.post(url, headers=headers, json=data, timeout=timeout)
    # Surface auth/quota/server errors directly instead of as a KeyError on
    # "choices" when the error payload is indexed below.
    response.raise_for_status()
    result = response.json()
    return json.loads(result["choices"][0]["message"]["tool_calls"][0]["function"]["arguments"])

# Try the schema-driven extractor on the first address in the list.
get_address_schema(addresses[0]['address'])

{'state': {'name': 'District of Columbia', 'code': 'DC'},
 'country': {'name': 'United States', 'code': 'USA'},
 'zip': {'code': ''}}

In [95]:
# Another usage: pass an address string directly.
get_address_schema("4059 Mt Lee Dr. Hollywood, CA 90068")

{'state': {'name': 'California', 'code': 'CA'},
 'country': {'name': 'United States', 'code': 'USA'},
 'zip': {'code': '90068'}}