In [2]:
from utils import read_jsonl, measure_accuracy
from oumi.core.types.conversation import Conversation, Message, Role

# Load the test set from a JSONL file using the project helper `read_jsonl`.
# NOTE(review): this is an absolute, user-specific path; consider a configurable data directory.
test_file = '/Users/ryanarman/code/lab/banking77/notebooks/data/banking77_test_v2_1.jsonl'
test_data = read_jsonl(test_file)
# Not the last expression of this cell (a function definition follows), so under
# Jupyter's default display setting this line does not render any output.
test_data[0]

def create_conversations(data):
    """Build one ``Conversation`` per record in ``data``.

    Each record must provide a ``'messages'`` sequence whose first entry holds
    the system prompt and whose second entry holds the user query; only the
    ``'content'`` value of those two entries is read.

    Args:
        data: Iterable of dict records, each with a ``'messages'`` key.

    Returns:
        A list of ``Conversation`` objects (a SYSTEM message followed by a
        USER message), in the same order as ``data``.
    """
    def _to_conversation(record):
        turns = record['messages']
        # Positional lookup: index 0 is the system prompt, index 1 the user query.
        system_text = turns[0]['content']
        user_text = turns[1]['content']
        return Conversation(
            messages=[
                Message(role=Role.SYSTEM, content=system_text),
                Message(role=Role.USER, content=user_text),
            ]
        )

    return [_to_conversation(record) for record in data]


In [None]:
# Run teacher inference with an OpenAI model on the test set.
# Uses the same system prompt as the local model.
import os

from oumi.core.configs import InferenceConfig, ModelParams, GenerationParams
from oumi.core.configs.inference_engine_type import InferenceEngineType
from oumi.builders.inference_engines import build_inference_engine
from oumi.core.configs.params.remote_params import RemoteParams

# Warn early (without aborting the cell) when the API key is missing.
if not os.getenv("OPENAI_API_KEY"):
    print("⚠️  WARNING: OPENAI_API_KEY environment variable not set!")
    print("   Set it with: export OPENAI_API_KEY=your_key_here")
else:
    print("✓ OPENAI_API_KEY is set")

# Inference config for the OpenAI teacher model.
# NOTE(review): earlier comments in this cell referred to GPT-4o-mini, but the
# configured model name is "gpt-5-mini" — confirm which model is intended.
openai_config = InferenceConfig(
    model=ModelParams(
        model_name="gpt-5-mini",  # OpenAI model name
    ),
    generation=GenerationParams(
        max_new_tokens=2048,  # Generous budget to avoid truncation (model may add reasoning despite prompt)
        # Original note: the model only supports the default temperature of 1.0
        # and cannot use 0.0 — NOTE(review): confirm this applies to gpt-5-mini.
        temperature=1.0,
        use_sampling=True,  # Sampling is required when temperature > 0
    ),
    engine=InferenceEngineType.OPENAI,
    remote_params=RemoteParams(
        num_workers=1000,  # Set to requests per minute (RPM) limit
        politeness_policy=60.0,  # 60 seconds (most APIs have per-minute limits)
        use_adaptive_concurrency=True,  # Automatically adjust concurrency based on error rate
    ),
)

print("OpenAI inference config created")

# Build the OpenAI inference engine from the config above.
openai_engine = build_inference_engine(
    engine_type=openai_config.engine or InferenceEngineType.OPENAI,
    model_params=openai_config.model,
    remote_params=openai_config.remote_params,
)


# Turn every test record into a (system, user) Conversation.
openai_conversations = create_conversations(test_data)
print(f"Created {len(openai_conversations)} conversations from test_data")

# Run inference with OpenAI
print(f"Running OpenAI inference on {len(openai_conversations)} conversations...")
openai_responses = openai_engine.infer(
    input=openai_conversations,
    inference_config=openai_config,
)

print(f"Received {len(openai_responses)} responses from OpenAI")

# Responses are later paired with test_data by position, so a count mismatch
# would silently misalign predictions and labels; fail loudly here instead.
assert len(openai_responses) == len(openai_conversations), (
    f"Expected {len(openai_conversations)} responses, got {len(openai_responses)}"
)



✓ OPENAI_API_KEY is set
OpenAI inference config created
Created 3080 conversations from test_data
Running OpenAI inference on 3080 conversations...


 97%|█████████▋| 2995/3080 [02:33<02:04,  1.47s/it]  

In [None]:
# Display the first conversation to inspect the system prompt and user query sent to the model.
openai_conversations[0]

SYSTEM: You are a banking intent classifier. Classify the user's query into one of  77 banking intents (output is a single integer ID).

IDs:

0: activate_my_card
1: age_limit
2: apple_pay_or_google_pay
3: atm_support
4: automatic_top_up
5: balance_not_updated_after_bank_transfer
6: balance_not_updated_after_cheque_or_cash_deposit
7: beneficiary_not_allowed
8: cancel_transfer
9: card_about_to_expire
10: card_acceptance
11: card_arrival
12: card_delivery_estimate
13: card_linking
14: card_not_working
15: card_payment_fee_charged
16: card_payment_not_recognised
17: card_payment_wrong_exchange_rate
18: card_swallowed
19: cash_withdrawal_charge
20: cash_withdrawal_not_recognised
21: change_pin
22: compromised_card
23: contactless_not_working
24: country_support
25: declined_card_payment
26: declined_cash_withdrawal
27: declined_transfer
28: direct_debit_payment_not_recognised
29: disposable_card_limits
30: edit_personal_details
31: exchange_charge
32: exchange_rate
33: exchange_via_app
34:

In [None]:
import json

# Destination JSONL file for the teacher responses.
# NOTE(review): "sytem" in the file name looks like a typo of "system"; it is
# left unchanged here because other code may read this exact path.
teacher_results_path = '/Users/ryanarman/code/lab/banking77/notebooks/data/teacher_results_v2_1_sytem_prompt.jsonl'

# Serialize each response Conversation to a dict and write it as one JSON line.
with open(teacher_results_path, "w", encoding='utf-8') as out_file:
    out_file.writelines(
        json.dumps(conversation.to_dict(), ensure_ascii=False) + "\n"
        for conversation in openai_responses
    )

print(f"Saved {len(openai_responses)} conversations to {teacher_results_path}")

Saved 3080 conversations to /Users/ryanarman/code/lab/banking77/notebooks/data/teacher_results_v3_sytem_prompt.jsonl


In [5]:
# Combine each response with the metadata of its originating test record for
# accuracy measurement.
# zip() stops at the shorter input, so check the counts first: a partial
# response list would otherwise be scored on fewer records without any error.
assert len(openai_responses) == len(test_data), (
    f"Response count ({len(openai_responses)}) does not match "
    f"test set size ({len(test_data)})"
)

inference_data = [
    {
        'messages': response.to_dict()['messages'],
        'metadata': original_item['metadata'],
    }
    for response, original_item in zip(openai_responses, test_data)
]

# measure_accuracy is a project helper; it returns the five values unpacked below.
accuracy, correct, total, errors, incorrect_list = measure_accuracy(inference_data)


Accuracy: 83.08% (2559/3080)


## Basic prompt

In [21]:
# Load the test set for the "basic prompt" run via the project helper `read_jsonl`.
# NOTE(review): absolute, user-specific path; consider a configurable data directory.
test_basic_file = '/Users/ryanarman/code/lab/banking77/notebooks/data/banking77_test_basic.jsonl'
test_basic_data = read_jsonl(test_basic_file)

# Build one Conversation per record with the create_conversations helper.
openai_conversations_basic = create_conversations(test_basic_data)
# NOTE(review): the message says "test_data" although the records come from test_basic_data.
print(f"Created {len(openai_conversations_basic)} conversations from test_data")

# Run inference with OpenAI, reusing the existing openai_engine and openai_config.
print(f"Running OpenAI inference on {len(openai_conversations_basic)} conversations...")
openai_responses_basic = openai_engine.infer(
    input=openai_conversations_basic,
    inference_config=openai_config,
)

print(f"Received {len(openai_responses_basic)} responses from OpenAI")


Created 3080 conversations from test_data
Running OpenAI inference on 3080 conversations...


100%|██████████| 3080/3080 [01:21<00:00, 37.99it/s]  


Received 3080 responses from OpenAI


In [25]:
# Destination JSONL file for the basic-prompt teacher responses.
teacher_results_path_basic = '/Users/ryanarman/code/lab/banking77/notebooks/data/teacher_results_v2_basic_prompt.jsonl'

# Serialize each response Conversation to a dict and write it as one JSON line.
with open(teacher_results_path_basic, "w", encoding='utf-8') as out_file:
    out_file.writelines(
        json.dumps(conversation.to_dict(), ensure_ascii=False) + "\n"
        for conversation in openai_responses_basic
    )


In [27]:
# Combine each basic-prompt response with the metadata of its originating test
# record for accuracy measurement.
# zip() stops at the shorter input, so check the counts first: a partial
# response list would otherwise be scored on fewer records without any error.
assert len(openai_responses_basic) == len(test_basic_data), (
    f"Response count ({len(openai_responses_basic)}) does not match "
    f"test set size ({len(test_basic_data)})"
)

inference_data_basic = [
    {
        'messages': response.to_dict()['messages'],
        'metadata': original_item['metadata'],
    }
    for response, original_item in zip(openai_responses_basic, test_basic_data)
]

# measure_accuracy is a project helper; it returns the five values unpacked below.
accuracy_basic, correct_basic, total_basic, errors_basic, incorrect_list_basic = measure_accuracy(inference_data_basic)


Accuracy: 80.16% (2469/3080)
