# Run inference on base model

```bash
./submit_inference_rsync.sh

# Usage:
./submit_inference_rsync.sh [input_file] [config_file] [output_name] [cluster_host]

# Example:
./submit_inference_rsync.sh data/test.jsonl configs/4b_instruct_vllm_infer.yaml baseline_results ryan@exun
```

Copy the inference output file from the cluster into the local `data` directory:

```bash
scp ryan@exun:/home/ryan/code/oumi/lab/banking77/notebooks/data/output_2870.jsonl /Users/ryanarman/code/lab/banking77/notebooks/data
```

Copied file: `output_2870.jsonl`

In [None]:
# Reusable evaluation function for inference results
import json
import re
from typing import List, Tuple, Optional

import re

def measure_accuracy(inference_data, verbose: bool = True) -> Tuple[float, int, int, List[Tuple[int, str]]]:
    """
    Measure classification accuracy for all items in inference_data.

    The predicted label is the first standalone integer found in the last
    message's content, so responses that append reasoning after the ID are
    still scored. Rows that cannot be scored (missing keys, no integer in the
    response, malformed row) are recorded in ``errors`` and counted as
    incorrect rather than aborting the run.

    Args:
        inference_data: List of dictionaries, each containing:
            - 'messages': List with last message containing predicted label in 'content'
            - 'metadata': Dict with 'label' containing ground truth label
              (an int, or a numeric string such as "4")
        verbose: When True (the default), print one debug line for every
            response longer than 5 characters. The final summary is always
            printed.

    Returns:
        Tuple of (accuracy, correct_count, total_count, errors), where
        ``errors`` is a list of ``(row_index, message)`` tuples and each
        message has the form ``"<ExceptionType>: <details>"``.
    """
    correct = 0
    total = 0
    errors = []

    for idx, row in enumerate(inference_data):
        # Every row counts toward the total; rows that fail are scored as incorrect.
        total += 1
        try:
            # Extract predicted label from the last message content
            content = row['messages'][-1]['content'].strip()
            # Extract first integer from the response (handles cases where model adds reasoning)
            match = re.search(r'\b(\d+)\b', content)
            if match is None:
                raise ValueError(f"No integer found in response: '{content[:50]}...'")
            inference_class_label = int(match.group(1))

            # Extract ground truth label. JSON may carry it as a numeric
            # string; normalise so "4" and 4 compare equal.
            gt_label = row['metadata']['label']
            if isinstance(gt_label, str):
                gt_label = int(gt_label)

            if verbose and len(content) > 5:
                print(f"idx: {idx}, gt_label: {gt_label}, inference_class_label: {inference_class_label}, content: {content}")

            # Compare and count
            if inference_class_label == gt_label:
                correct += 1

        except (KeyError, ValueError, IndexError, TypeError, AttributeError) as e:
            # TypeError/AttributeError cover non-dict rows and None/non-string
            # content. The exception type is included because str(KeyError)
            # alone is just the bare key name.
            errors.append((idx, f"{type(e).__name__}: {e}"))

    accuracy = correct / total if total > 0 else 0.0

    print(f"Accuracy: {accuracy:.2%} ({correct}/{total})")
    if errors:
        print(f"Errors encountered: {len(errors)}")
        for idx, error_msg in errors[:5]:  # Show first 5 errors
            print(f"  Row {idx}: {error_msg}")

    return accuracy, correct, total, errors

Evaluation function defined!


# Evaluate 

In [66]:
import json

# Locations of the three Banking77 dataset splits (relative paths).
train_path, val_path, test_path = (
    "data/banking77_train.jsonl",
    "data/banking77_val.jsonl",
    "data/banking77_test.jsonl",
)


def read_jsonl(file_path: str):
    """
    Load a JSON Lines file into a list of parsed objects.

    Blank and whitespace-only lines are skipped. A line that fails to parse
    is reported on stdout with its 1-based line number and then skipped, so
    one bad line does not abort the load.

    Args:
        file_path: Path to the JSONL file

    Returns:
        List with one parsed JSON value per valid, non-empty line, in file order
    """
    records = []
    with open(file_path, "r", encoding="utf-8") as handle:
        for line_num, raw_line in enumerate(handle, start=1):
            text = raw_line.strip()
            if text:
                try:
                    records.append(json.loads(text))
                except json.JSONDecodeError as e:
                    # Malformed JSON on this line: warn and move on.
                    print(f"Warning: Line {line_num} is not valid JSON: {e}")
                except Exception as e:
                    # Any other failure while handling this line: report and move on.
                    print(f"Error: Line {line_num} processing error: {e}")

    return records

print("JSONL reading function defined!")


# Load the three dataset splits from the paths defined earlier in this cell.
train_data = read_jsonl(train_path)
val_data = read_jsonl(val_path)
test_data = read_jsonl(test_path)

JSONL reading function defined!


In [67]:
# Score the copied inference output. The path is relative to the notebook
# directory, like train_path/val_path/test_path, so the cell is not tied to
# one machine's home directory.
inference_file = 'data/output_2873.jsonl'
inference_data = read_jsonl(inference_file)
accuracy, correct, total, errors = measure_accuracy(inference_data)


idx: 332, gt_label: 4, inference_class_label: 4, content: 4


Note: The query "Is there an auto top-up option?" is asking about the availability of automatic top-up functionality. This matches intent ID 4: automatic_top_up.
idx: 335, gt_label: 4, inference_class_label: 4, content: 4


The query "can you assist me with the auto top up?" is asking about automatic top-up functionality, which directly corresponds to intent ID 4: automatic_top_up. This is distinct from other top-up-related intents like manual top-up or top-up limits.
idx: 338, gt_label: 4, inference_class_label: 4, content: 4


The user is asking about setting up automatic top-ups, which corresponds to intent ID 4: automatic_top_up. This is distinct from other top-up-related intents like manual top-up or top-up limits. The query specifically mentions "automatically," indicating the user wants to set up recurring top-ups.
idx: 344, gt_label: 4, inference_class_label: 4, content: 4


Note: The query asks about an "auto top-up

# Train the model

```bash
./submit_training_rsync.sh

# With custom datasets:
./submit_training_rsync.sh data/train.jsonl data/validation.jsonl configs/qwen4b_train_lora.yaml my_training ryan@exun

# With wandb project name:
./submit_training_rsync.sh data/train.jsonl data/validation.jsonl configs/qwen4b_train_lora.yaml my_training ryan@exun my-wandb-project

# With wandb project and entity (team):
./submit_training_rsync.sh data/train.jsonl data/validation.jsonl configs/qwen4b_train_lora.yaml my_training ryan@exun my-wandb-project my-team

# Or use environment variables:
export WANDB_PROJECT=my-wandb-project
./submit_training_rsync.sh
```

Training output directory (on the cluster):

`/home/ryan/code/oumi/lab/banking77/notebooks/output/banking77_qwen3_4b_lora_v2_2882`

# Run inference on trained model checkpoint

## Run inference with LoRA adapter
cd notebooks/scripts
./submit_inference_rsync.sh \
    data/banking77_test.jsonl \
    configs/4b_instruct_vllm_infer.yaml \
    system_prompt_v2_lora_results \
    output/banking77_qwen3_4b_lora_v2_2882 \
    ryan@exun

In [71]:
# Score the LoRA inference output. The path is relative to the notebook
# directory, like train_path/val_path/test_path, so the cell is not tied to
# one machine's home directory.
# NOTE(review): measure_accuracy reads row['metadata']['label']. The recorded
# run of this cell reported a 'label' error on every row, so confirm this
# file actually carries ground-truth labels before trusting the score.
inference_file = 'data/system_prompt_v2_lora_results_2885.jsonl'
inference_data = read_jsonl(inference_file)
accuracy, correct, total, errors = measure_accuracy(inference_data)


Accuracy: 0.00% (0/3080)
Errors encountered: 3080
  Row 0: 'label'
  Row 1: 'label'
  Row 2: 'label'
  Row 3: 'label'
  Row 4: 'label'


In [72]:
# Display the first loaded inference record to inspect its fields.
inference_data[0]

{'conversation_id': '92a2af53-abe0-577f-88ed-b029cf56f857',
 'messages': [{'content': 'You are a banking intent classifier. Classify the user\'s query into one of  77 banking intents (output is a single integer ID).\n\nIDs:\n\n0: activate_my_card\n1: age_limit\n2: apple_pay_or_google_pay\n3: atm_support\n4: automatic_top_up\n5: balance_not_updated_after_bank_transfer\n6: balance_not_updated_after_cheque_or_cash_deposit\n7: beneficiary_not_allowed\n8: cancel_transfer\n9: card_about_to_expire\n10: card_acceptance\n11: card_arrival\n12: card_delivery_estimate\n13: card_linking\n14: card_not_working\n15: card_payment_fee_charged\n16: card_payment_not_recognised\n17: card_payment_wrong_exchange_rate\n18: card_swallowed\n19: cash_withdrawal_charge\n20: cash_withdrawal_not_recognised\n21: change_pin\n22: compromised_card\n23: contactless_not_working\n24: country_support\n25: declined_card_payment\n26: declined_cash_withdrawal\n27: declined_transfer\n28: direct_debit_payment_not_recognised\n29