In [1]:
from __future__ import annotations
import os
from typing import List, Optional, Dict, Any, Protocol
from llm.LLMParser import get_default_parser
import json
import openai
from scripts.DataHandler import DataHandler

In [2]:
# ---------- Backend Interface ----------
class ChatBackend(Protocol):
    """Structural interface shared by the chat backends used in this notebook."""

    def start(
        self,
        system_instruction: Optional[str] = None,
        history: Optional[List[Dict[str, Any]]] = None,
    ) -> None:
        """Begin a session, optionally given a system instruction and prior messages."""

    def send(self, message: str) -> str:
        """Submit one message and return the reply as a string."""

    def history(self) -> List[Dict[str, Any]]:
        """Return the session's messages as a list of dicts."""

    def reset(self) -> None:
        """Reset the backend's session state."""

In [None]:
# ---------- Azure OpenAI backend (azure-openai) ----------

class AzureOpenAIBackend(ChatBackend):
    """
    Chat backend backed by an Azure OpenAI chat-completions deployment.

    The conversation is kept as a list of {"role": ..., "content": ...} dicts
    and the whole list is sent with every call to send().

    Requires: pip install openai>=1.0.0
    Env (each is only consulted when the matching constructor argument is empty):
        AZURE_OPENAI_API_KEY       - API key for Azure OpenAI
                                     (AZURE_GPT_35_TURBO_API_KEY is tried next)
        AZURE_OPENAI_ENDPOINT      - Your Azure endpoint (e.g. https://myresource.openai.azure.com/)
        AZURE_OPENAI_DEPLOYMENT    - Deployment name, defaults to "testdelaycategory"
        AZURE_OPENAI_API_VERSION   - Optional, defaults to "2024-12-01-preview"
    """

    def __init__(self,
                 api_key: Optional[str] = None,
                 endpoint: Optional[str] = None,
                 deployment: Optional[str] = None,
                 api_version: Optional[str] = None,
                 generation_config: Optional[Dict[str, Any]] = None):
        """
        Resolve the configuration and create the Azure OpenAI client.

        Args:
            api_key: API key; falls back to the environment (see class docstring).
            endpoint: Azure endpoint URL; falls back to the environment.
            deployment: Deployment name passed as ``model`` on every request.
            api_version: Azure OpenAI API version string.
            generation_config: Extra keyword arguments for every completion
                request. When empty or None, max_tokens=500, temperature=0.7
                and top_p=0.3 are used.

        Raises:
            ValueError: if no API key or no endpoint could be resolved.
        """
        import os
        # Imported lazily so the openai package is only needed when this backend is built.
        from openai import AzureOpenAI

        # --- Get configuration from environment or args ---
        self._api_key = api_key or os.getenv("AZURE_OPENAI_API_KEY") or os.getenv("AZURE_GPT_35_TURBO_API_KEY")
        self._endpoint = endpoint or os.getenv("AZURE_OPENAI_ENDPOINT")
        self._deployment = deployment or os.getenv("AZURE_OPENAI_DEPLOYMENT", "testdelaycategory")
        self._api_version = api_version or os.getenv("AZURE_OPENAI_API_VERSION", "2024-12-01-preview")

        if not self._api_key or not self._endpoint:
            raise ValueError("Missing required Azure OpenAI configuration: API key or endpoint.")

        self._client = AzureOpenAI(
            api_version=self._api_version,
            azure_endpoint=self._endpoint,
            api_key=self._api_key,
        )

        # --- Other settings ---
        self._generation_config = generation_config or {
            "max_tokens": 500,
            "temperature": 0.7,
            "top_p": 0.3
        }
        self._chat_history: List[Dict[str, Any]] = []
        self._system_instruction: Optional[str] = None

    # ---------- Session Management ----------

    def start(self,
              system_instruction: Optional[str] = None,
              history: Optional[List[Dict[str, Any]]] = None) -> None:
        """
        Begin a fresh chat session, discarding any previous messages.

        Args:
            system_instruction: When non-empty, stored as the first message
                with role "system".
            history: Prior messages appended after the system message. The
                dicts are appended as-is (not copied).
        """
        self._system_instruction = system_instruction
        self._chat_history = []

        if system_instruction:
            self._chat_history.append({"role": "system", "content": system_instruction})

        if history:
            self._chat_history.extend(history)

    def send(self, message: str) -> str:
        """
        Send a user message to Azure OpenAI and return the model's reply.

        The user turn and the assistant turn are added to the history together,
        and only after the request succeeded. If the request raises, the
        exception propagates and the history is left exactly as it was, so a
        retry does not resend an unanswered user message.

        Returns:
            The reply text; an empty string when the service returns no
            text content (``message.content`` is None).
        """
        if not self._chat_history:
            self.start()

        user_turn = {"role": "user", "content": message}

        # Send the stored conversation plus the new turn without mutating the
        # stored list yet; it is committed below once a reply is available.
        response = self._client.chat.completions.create(
            model=self._deployment,
            messages=[*self._chat_history, user_turn],
            **self._generation_config
        )

        reply = response.choices[0].message.content
        if reply is None:
            # Keep the declared ``str`` return type and avoid storing a null
            # assistant message that would be sent back on the next request.
            reply = ""

        self._chat_history.append(user_turn)
        self._chat_history.append({"role": "assistant", "content": reply})
        return reply

    def history(self) -> List[Dict[str, Any]]:
        """Return the full chat history (the internal list itself, not a copy)."""
        return self._chat_history

    def reset(self) -> None:
        """Clear chat history and forget the stored system instruction."""
        self._chat_history = []
        self._system_instruction = None


In [4]:
def make_backend(provider: Optional[str] = None,
                 *,
                 deployment: Optional[str] = None,
                 endpoint: Optional[str] = None,
                 api_key: Optional[str] = None,
                 **kwargs) -> ChatBackend:
    """
    Build a chat backend for the requested provider.

    provider: 'gemini' | 'openai' | 'azure' | 'azureopenai' | 'anthropic'
        (case-insensitive, surrounding whitespace ignored). When empty, env
        LLM_PROVIDER is used, then 'gemini'.
    deployment / endpoint / api_key: Azure OpenAI settings, used only for the
        'openai' / 'azure' / 'azureopenai' providers. Empty values fall back to
        env AZURE_OPENAI_DEPLOYMENT (default "testdelaycategory"),
        AZURE_OPENAI_ENDPOINT, and AZURE_OPENAI_API_KEY (then
        AZURE_GPT_35_TURBO_API_KEY).
    **kwargs: forwarded unchanged to the backend constructor.

    For 'gemini' and 'anthropic' the model name is read from env GEMINI_MODEL
    (default "gemini-2.5-flash") or ANTHROPIC_MODEL (default
    "claude-3-5-sonnet-latest") and passed as ``model_name``.

    Raises:
        ValueError: the provider is not one of the names listed above.
        NotImplementedError: 'gemini' or 'anthropic' was requested but no
            GeminiBackend / AnthropicBackend is defined in this namespace.
    """
    p = (provider or os.getenv("LLM_PROVIDER") or "gemini").strip().lower()

    if p in ("openai", "azure", "azureopenai"):
        # Azure settings are resolved only here; other providers never use them.
        return AzureOpenAIBackend(
            api_key=api_key or os.getenv("AZURE_OPENAI_API_KEY") or os.getenv("AZURE_GPT_35_TURBO_API_KEY"),
            endpoint=endpoint or os.getenv("AZURE_OPENAI_ENDPOINT"),
            deployment=deployment or os.getenv("AZURE_OPENAI_DEPLOYMENT", "testdelaycategory"),
            **kwargs
        )

    # provider -> (backend class name, env var holding the model, default model)
    optional_backends = {
        "gemini": ("GeminiBackend", "GEMINI_MODEL", "gemini-2.5-flash"),
        "anthropic": ("AnthropicBackend", "ANTHROPIC_MODEL", "claude-3-5-sonnet-latest"),
    }
    if p in optional_backends:
        class_name, model_env, default_model = optional_backends[p]
        # Looked up at call time: these classes may not be defined in this
        # namespace, and a clear error is more useful than a bare NameError.
        backend_cls = globals().get(class_name)
        if backend_cls is None:
            raise NotImplementedError(
                f"Provider '{p}' requires {class_name}, which is not defined. "
                "Define or import it first, or use provider='openai'."
            )
        return backend_cls(model_name=os.getenv(model_env, default_model), **kwargs)

    raise ValueError(f"Unsupported provider: {p}")

In [5]:
# Azure OpenAI connection settings read from the environment (None when unset).
openai_deployment_name = os.environ.get("AZURE_OPENAI_DEPLOYMENT")
azure_endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT")
azure_openai_api_key = os.environ.get("AZURE_OPENAI_API_KEY")

In [6]:
print("\n------------------------ LLM Examples ------------------------")
print("\n------------------------ Example 1: Environmental & Health Risks ------------------------")

# Question routed in this example.
example_query = "Are there asbestos filings or air quality complaints near 45-10 21st Street in Queens?"

# Build the chat backend from the settings read above, then the parser on top of it.
llm_backend = make_backend(
    provider="openai",
    deployment=openai_deployment_name,
    endpoint=azure_endpoint,
    api_key=azure_openai_api_key,
)
parser = get_default_parser(backend=llm_backend)

# Route the question and print the result as indented JSON.
result = parser.route_query_to_datasets(example_query)
print("\nQuery:", example_query)
print("Router Result:", json.dumps(result, indent=2))


------------------------ LLM Examples ------------------------

------------------------ Example 1: Environmental & Health Risks ------------------------

Query: Are there asbestos filings or air quality complaints near 45-10 21st Street in Queens?
Router Result: {
  "categories": [
    "Construction & Permitting",
    "Environmental & Health Risks"
  ],
  "confidence": 0.85,
  "address": [
    {
      "house_number": "45-10",
      "street_name": "21st Street",
      "borough": "Queens",
      "raw": "45-10 21st Street",
      "notes": ""
    }
  ],
  "dataset_names": [
    "Asbestos Control Program",
    "Clean Air Tracking System (CATS)",
    "DOB Job filings"
  ]
}


In [7]:
def _show_dataset(label, dataset, *, preview=False):
    """Print the name and description of one routed dataset.

    label   -- ordinal used in the heading, e.g. "First" or "Second".
    dataset -- object exposing ``name``, ``description`` and ``df``;
               nothing is printed when it is falsy (e.g. None).
    preview -- when True, also print the frame's shape and display its
               first rows.
    """
    if not dataset:
        return
    print(f"\n{label} Dataset:", dataset.name)
    if preview:
        print("Description:", dataset.description, '\n')
        print(dataset.df.shape)
        display(dataset.df.head())
    else:
        print("Description:", dataset.description)


# ---- Example 1: inspect the datasets selected for the first query ----
handler = DataHandler(result["dataset_names"])

# getattr with a None default: a handler that lacks d1/d2 is skipped instead
# of raising AttributeError before the truthiness check can run.
first_dataset = getattr(handler, "d1", None)
_show_dataset("First", first_dataset, preview=True)

second_dataset = getattr(handler, "d2", None)
_show_dataset("Second", second_dataset, preview=True)

print("\n------------------------ Example 2: Comparative Site Queries ------------------------")
example_query = 'Which location has fewer open permits: Jamaica Avenue in Queens or Broadway in Upper Manhattan?”'
result = parser.route_query_to_datasets(example_query)
print("\nQuery:", example_query)
print("Router Result:", json.dumps(result, indent=2))

# ---- Example 2: names and descriptions only, no data preview ----
handler = DataHandler(result["dataset_names"])

first_dataset = getattr(handler, "d1", None)
_show_dataset("First", first_dataset)

second_dataset = getattr(handler, "d2", None)
_show_dataset("Second", second_dataset)




First Dataset: Asbestos Control Program
Description: ACP7 form is an asbestos project notification. Any time asbestos abatement is perform on quantities greater than a minor project amount, the applicant has to file this form with DEP Asbestos Control Program (ACP). All asbestos documents are filed through the Asbestos Reporting and Tracking System (ARTS) E-file system. This system is web based and entirely paperless. All information on the ACP7 is essential to meet the requirements setforth in the asbestos rules and regulations Title15, Chapter 1 (RCNY). ACP enforcement staff utilizes this form for inspection of asbestos abatement activities. 

(1000, 35)


Unnamed: 0,tru,start_date,end_date,status_description,street_activity,premise_no,street_name,borough,zip_code,facility_type,...,community_board,council_district,census_tract,bbl,nta,facility_aka,section,cross_street_on,cross_street_between,cross_street_and
0,TRU1305BX22,2022-10-31T00:00:00.000,2023-09-20T00:00:00.000,Closed,No,250,West Fordham Rd,Bronx,10468,Commercial,...,7,14,257,2032340080,University Heights-Morris Heights,,,,,
1,TRU1455MN20,2020-10-09T00:00:00.000,2021-08-25T00:00:00.000,Closed,No,880,EAST 6TH STREET,Manhattan,10009,Residence,...,3,2,20,1003560001,Lower East Side,GR1700529 LILLIAN WALD BUILDING # 7,EXTERIOR FACADES,,,
2,TRU2264MN25,2025-10-01T00:00:00.000,2026-09-23T00:00:00.000,Postponed,No,253,WEST 24TH STREET,Manhattan,10011,Commercial,...,4,3,91,1007740008,Hudson Yards-Chelsea-Flatiron-Union Square,,Bathroom,,,
3,TRU2158BK24,2025-04-16T00:00:00.000,2025-10-23T00:00:00.000,Submitted,No,93,Lewis Avenue,Brooklyn,11206,Residence,...,3,36,287,3015950001,Stuyvesant Heights,"NYCHA 2413061 Roosevelt II, Bld. #7","Exterior Facade - Elevations #1,2,3,4,7,8",,,
4,TRU0252QN25,2025-06-27T00:00:00.000,2026-03-27T00:00:00.000,Submitted,No,28-01,41st Ave,Queens,11101,School,...,1,26,33,4004040001,Queensbridge-Ravenswood-Long Island City,Newcomers HS / Q450,1922 A.B - Room 404,,,





Second Dataset: Clean Air Tracking System (CATS)
Description:  

(1000, 21)


Unnamed: 0,requestid,applicationid,requesttype,house,street,borough,bin,block,lot,ownername,...,make,model,burnermake,burnermodel,primaryfuel,secondaryfuel,quantity,issuedate,status,premisename
0,54696,CA059089,REGISTRATION,142,FULTON STREET,MANHATTAN,1001224,79,26,CENTURY REALTY INC.,...,HYDROTHERM MR600,HYDROTHERM MR600,INTEGRAL,INTEGRAL,NATURALGAS,NONE,1,1989-02-17T00:00:00.000,EXPIRED,
1,378827,CA022047,CERTIFICATE TO OPERATE,70,,MANHATTAN,3000000,80,837,64-36 REALTY ASSOC.,...,KEWANEE-538,KEWANEE-538,PETRO WDGAH,PETRO WDGAH,NO6FUEL,NONE,1,2022-11-28T15:27:17.000,CANCELLED,64-36 REALTY ASSOC.
2,394994,CR081417,REGISTRATION,1581,FULTON AVENUE,BRONX,2009652,2929,31,CROTONA PARK WEST HDFC,...,LAARS,PNCH-1500,LAARS,PNCH-1500,NATURALGAS,NONE,1,2023-06-02T14:53:47.000,CURRENT,CROTONA PARK WEST HDFC
3,118102,CA026270,CERTIFICATE TO OPERATE,2419,DAVIDSON AVENUE,BRONX,2014310,3199,31,EVROPA REALTY LLC,...,FEDERAL FST 70 SP,FEDERAL FST 70 SP,RAY AEOR 144 SIZE 70,RAY AEOR 144 SIZE 70,NO4FUEL,NONE,1,1900-01-01T00:00:00.000,CANCELLED,
4,456474,CA062298,REGISTRATION,19-47,80 STREET,QUEENS,4444701,949,7501,GARDEN BAY MANOR CONDO.,...,WEIL MCLAIN,778,POWERFLAME,WJR-30A-10,NATURALGAS,NONE,1,2025-03-07T11:24:57.000,CURRENT,GARDEN BAY MANOR CONDOMINIMUM



------------------------ Example 2: Comparative Site Queries ------------------------

Query: Which location has fewer open permits: Jamaica Avenue in Queens or Broadway in Upper Manhattan?”
Router Result: {
  "categories": [
    "Comparative Site Queries",
    "Construction & Permitting"
  ],
  "confidence": 0.87,
  "address": [
    {
      "house_number": "",
      "street_name": "Jamaica Avenue",
      "borough": "Queens",
      "raw": "Jamaica Avenue in Queens",
      "notes": ""
    },
    {
      "house_number": "",
      "street_name": "Broadway",
      "borough": "Manhattan",
      "raw": "Broadway in Upper Manhattan",
      "notes": "Upper Manhattan"
    }
  ],
  "dataset_names": [
    "DOB permits",
    "Street Construction Permits"
  ]
}

First Dataset: DOB permits
Description: 

Second Dataset: Street Construction Permits
Description: 
