# Run inference on the validation set using the trained model checkpoint

## Run inference with LoRA adapter
  
cd notebooks/scripts
./submit_inference_rsync.sh \
    data/banking77_val_no_label.jsonl \
    configs/4b_instruct_vllm_infer.yaml \
    system_prompt_v2_lora_results \
    output/banking77_qwen3_4b_lora_v2_2882 \
    ryan@exun


In [3]:
from utils import measure_accuracy, read_jsonl

# Load the saved inference results. The path is relative to the notebook's
# working directory (the same ./data folder this notebook writes its
# evaluation results to), so the notebook is not tied to one user's machine.
inference_file = './data/system_prompt_v2_lora_results_2972.jsonl'
inference_data = read_jsonl(inference_file)

# incorrect_list holds one (row_index, predicted_label, ground_truth_label)
# entry per misclassified row; the later cells unpack it in that order.
# NOTE(review): measure_accuracy appears to print the "Accuracy: ..." summary
# itself, since this cell has no print of its own; confirm in utils.
accuracy, correct, total, errors, incorrect_list = measure_accuracy(inference_data)


Accuracy: 81.21% (1625/2001)


In [4]:
from collections import Counter

# Tally confusions per unordered label pair. Sorting the two labels means
# (predicted=a, truth=b) and (predicted=b, truth=a) share one "lo-hi" key.
pair_counts = Counter(
    '{}-{}'.format(*sorted([predicted, truth]))
    for _, predicted, truth in incorrect_list
)

# Keep just the keys of the 20 most frequent pairs, most frequent first.
most_common_pairs = [pair_key for pair_key, _ in pair_counts.most_common(20)]

In [5]:
# For every frequently confused label pair, list the row indices where the
# model swapped the two labels (in either direction).
for pair in most_common_pairs:
    id_parts = pair.split('-')
    label_a, label_b = int(id_parts[0]), int(id_parts[1])
    either_direction = ((label_a, label_b), (label_b, label_a))

    # entry is (row_index, predicted, truth); entry[1:] is the label pair.
    rows_for_pair = [
        entry[0] for entry in incorrect_list if entry[1:] in either_direction
    ]

    print(f"rows with incorrect pairs for {pair}")
    print(rows_for_pair)
    print("-" * 100)

rows with incorrect pairs for 48-66
[50, 201, 231, 251, 281, 686, 1172, 1261, 1515, 1527]
----------------------------------------------------------------------------------------------------
rows with incorrect pairs for 66-67
[129, 145, 480, 725, 818, 981, 1148, 1207, 1549, 1995]
----------------------------------------------------------------------------------------------------
rows with incorrect pairs for 5-66
[25, 597, 817, 1055, 1227, 1363, 1544, 1692, 1891]
----------------------------------------------------------------------------------------------------
rows with incorrect pairs for 56-65
[48, 509, 518, 620, 1000, 1367, 1517, 1599, 1666]
----------------------------------------------------------------------------------------------------
rows with incorrect pairs for 7-35
[111, 272, 583, 585, 626, 883, 1023, 1374, 1602]
----------------------------------------------------------------------------------------------------
rows with incorrect pairs for 5-67
[132, 204, 433, 636, 10

In [6]:
# Get label name mapping from the system prompt
import re

def get_label_name(label_id, system_message):
    """Look up the label name listed for ``label_id`` in the system prompt.

    The prompt is searched for the first entry of the form ``<id>: <name>``.

    Args:
        label_id: Label ID to look up (rendered with ``str()``).
        system_message: Full text of the system prompt holding the mapping.

    Returns:
        The text after ``<id>:`` up to the end of that line, stripped of
        surrounding whitespace, or ``"Unknown_<label_id>"`` when the prompt
        has no entry for that ID.
    """
    # The lookbehind stops a short ID from matching the tail of a longer one
    # (e.g. "5:" inside "15:"); re.escape keeps the ID a literal string.
    pattern = rf'(?<!\d){re.escape(str(label_id))}:\s*([^\n]+)'
    match = re.search(pattern, system_message)
    if match:
        return match.group(1).strip()
    return f"Unknown_{label_id}"

# Walk through the five most frequent confusion pairs and print a handful of
# concrete misclassified queries for each of them.
banner = '=' * 120
for pair_no, pair in enumerate(most_common_pairs[:5], start=1):
    id_parts = pair.split('-')
    label1_id = int(id_parts[0])
    label2_id = int(id_parts[1])

    # The first row's system prompt serves as the ID -> name lookup table.
    # NOTE(review): presumably every row shares the same prompt; confirm.
    system_message = inference_data[0]['messages'][0]['content']
    label1_name = get_label_name(label1_id, system_message)
    label2_name = get_label_name(label2_id, system_message)

    print(f"\n{banner}")
    print(f"PAIR #{pair_no}: {label1_id} ({label1_name}) <-> {label2_id} ({label2_name})")
    print(f"{banner}\n")

    # Collect the errors for this pair regardless of which label was predicted.
    either_direction = ((label1_id, label2_id), (label2_id, label1_id))
    pair_errors = [entry for entry in incorrect_list if entry[1:] in either_direction]

    print(f"Total misclassifications: {len(pair_errors)}")
    print("Showing first 5 examples:\n")

    for example_no, (row_idx, predicted_label, gt_label) in enumerate(pair_errors[:5], start=1):
        row = inference_data[row_idx]
        messages = row['messages']
        user_query = messages[1]['content']       # user turn
        model_response = messages[2]['content']   # assistant turn
        gt_label_name = row['metadata']['label_name']
        predicted_label_name = get_label_name(predicted_label, system_message)

        print(f"Example {example_no} (Row {row_idx}):")
        print(f"  Query: {user_query}")
        print(f"  Ground Truth: {gt_label} ({gt_label_name})")
        print(f"  Predicted: {predicted_label} ({predicted_label_name})")
        print(f"  Model Response: '{model_response}'")
        print()

    print("-" * 120)
    print()



PAIR #1: 48 (pending_transfer) <-> 66 (transfer_not_received_by_recipient)

Total misclassifications: 10
Showing first 5 examples:

Example 1 (Row 50):
  Query: I'm still expecting the transaction to be finished
  Ground Truth: 66 (transfer_not_received_by_recipient)
  Predicted: 48 (pending_transfer)
  Model Response: '48'

Example 2 (Row 201):
  Query: Why hasn't the transaction I did arrive yet?
  Ground Truth: 66 (transfer_not_received_by_recipient)
  Predicted: 48 (pending_transfer)
  Model Response: '48'

Example 3 (Row 231):
  Query: why has a transaction to a friend been delayed?
  Ground Truth: 66 (transfer_not_received_by_recipient)
  Predicted: 48 (pending_transfer)
  Model Response: '48'

Example 4 (Row 251):
  Query: What is the length of a transaction completion?
  Ground Truth: 66 (transfer_not_received_by_recipient)
  Predicted: 48 (pending_transfer)
  Model Response: '48'

Example 5 (Row 281):
  Query: The wait for this transfer seems to be taking a very long time.  I

In [8]:
# Number of misclassified rows. In the saved run this is 376, which matches
# total - correct from the accuracy summary (2001 - 1625).
len(incorrect_list)

376

In [None]:
from utils import evaluate_incorrect_classification, evaluate_incorrect_classifications_batch

# Spot-check one misclassified row. Row 50 is the first example listed for the
# most common confusion pair (48 <-> 66) in the saved outputs of this notebook.
sample_row = inference_data[50]
result = evaluate_incorrect_classification(sample_row)
print(result['explanation'])

The model incorrectly classified the user query "I'm still expecting the transaction to be finished" as "pending_transfer" instead of the correct intent "transfer_not_received_by_recipient". Here are the reasons for this misclassification:

1. **Semantic Similarity**: Both intents involve transactions that are not yet completed, which can lead to confusion. "Pending_transfer" suggests a transaction is still in the process of being completed, while "transfer_not_received_by_recipient" implies the transaction is complete from the sender's perspective but not acknowledged by the recipient.

2. **Ambiguous Wording**: The user's wording "expecting the transaction to be finished" can be interpreted in two ways: the transaction is still processing (pending) or it has been sent but not received by the recipient. This ambiguity in the phrasing likely contributed to the model's confusion.

3. **Lack of Specificity**: The user's query lacks specific details that could clarify the intent. For inst

In [10]:
from utils import (
    evaluate_incorrect_classifications_batch, 
    save_evaluation_results
)

# Run the evaluation model over every misclassified row.
# NOTE: `errors` rebinds the name previously returned by measure_accuracy.
evaluation_options = {"model": "gpt-5-mini", "max_workers": 500}
results, errors = evaluate_incorrect_classifications_batch(
    inference_data, incorrect_list, **evaluation_options
)

# Write the evaluation results to a JSONL file.
evaluation_output_path = './data/evaluation_results.jsonl'
save_evaluation_results(results, inference_data, output_path=evaluation_output_path)

Evaluating 376 incorrect classifications with 500 workers...
Using model: gpt-5-mini


Evaluating misclassifications: 100%|██████████| 376/376 [00:49<00:00,  7.67item/s, success=376, errors=0]



✓ Completed: 376 successful, 0 errors
Saved 376 evaluation results to ./data/evaluation_results.jsonl
