# Upload Datasets to HuggingFace

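This notebook uploads the per-domain datasets (banking, healthcare, insurance, investment, telecom) from the `../data` folder to the Hugging Face Hub as `galileo-ai/agent-leaderboard-v2`. Each file type (`adaptive_tool_use`, `personas`, `tools`) is uploaded as a separate config, and each domain becomes a split within that config.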


In [13]:
import os
import json
import datasets
import pandas as pd
from glob import glob
from dotenv import load_dotenv
from pprint import pprint
from tqdm.auto import tqdm as notebook_tqdm

# Load environment variables from ../.env before anything reads them
# (presumably where HF_TOKEN is defined; it is read later via os.getenv — confirm).
load_dotenv("../.env")


True

In [14]:
# Find all domain directories: every sub-folder of ../data except the
# non-domain folders named below.
excluded_dir_names = {'datasets', 'results'}

# normpath strips the trailing separator that the '*/' glob pattern leaves on
# each match, so basename yields the folder name with either '/' or '\\'
# separators (splitting on '/' only works on POSIX). Comparing the exact name
# also avoids excluding a domain whose name merely ends in "datasets"/"results".
domain_dirs = sorted(
    d for d in glob('../data/*/')
    if os.path.basename(os.path.normpath(d)) not in excluded_dir_names
)

print("Found domains:")
for domain_dir in domain_dirs:
    domain_name = os.path.basename(os.path.normpath(domain_dir))
    print(f"  - {domain_name}")

    # Check what files are available in each domain
    json_files = glob(os.path.join(domain_dir, '*.json'))
    if json_files:
        print(f"    Files: {[os.path.basename(f) for f in json_files]}")
    else:
        print("    No JSON files found")
    print()


Found domains:
  - banking
    Files: ['tools.json', 'adaptive_tool_use.json', 'personas.json']

  - healthcare
    Files: ['tools.json', 'adaptive_tool_use.json', 'personas.json']

  - insurance
    Files: ['tools.json', 'adaptive_tool_use.json', 'personas.json']

  - investment
    Files: ['tools.json', 'adaptive_tool_use.json', 'personas.json']

  - telecom
    Files: ['tools.json', 'adaptive_tool_use.json', 'personas.json']



In [15]:
# Expected files for each domain
EXPECTED_FILES = ['adaptive_tool_use.json', 'personas.json', 'tools.json']

# Repository configuration
REPO_NAME = "galileo-ai/agent-leaderboard-v2"
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    # Not fatal: without an explicit token the Hub client falls back to a
    # locally cached login, so only warn instead of stopping the notebook.
    print("Warning: HF_TOKEN is not set; uploads will rely on a locally cached Hugging Face login, if any.")

print("Starting file-type-based upload...")
print(f"Repository: {REPO_NAME}")
print("=" * 50)
print("Note: Using file-type as config and domain as split to handle different schemas")
print()

# Collect data by file type across all domains:
#   {file_type: {domain_name: parsed JSON content}}
# The keys are derived from EXPECTED_FILES so the two can never drift apart.
file_type_data = {os.path.splitext(name)[0]: {} for name in EXPECTED_FILES}

# First, collect all data organized by file type
for domain_dir in domain_dirs:
    # normpath drops the trailing separator left by the directory glob, so
    # basename returns the folder name regardless of the OS path separator.
    domain_name = os.path.basename(os.path.normpath(domain_dir))
    print(f"Scanning domain: {domain_name}")

    for filename in EXPECTED_FILES:
        file_path = os.path.join(domain_dir, filename)
        file_type = os.path.splitext(filename)[0]

        if not os.path.exists(file_path):
            print(f"  ✗ Missing: {filename}")
            continue

        print(f"  ✓ Found: {filename}")
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                file_type_data[file_type][domain_name] = json.load(f)
        except (OSError, ValueError) as e:
            # OSError: unreadable file; ValueError: invalid JSON or invalid UTF-8.
            # The domain is simply left out of this file type's uploads.
            print(f"  ✗ Error reading {filename}: {e}")

print("\n" + "=" * 50)
print("Starting uploads by file type...")


Starting file-type-based upload...
Repository: galileo-ai/agent-leaderboard-v2
Note: Using file-type as config and domain as split to handle different schemas

Scanning domain: banking
  ✓ Found: adaptive_tool_use.json
  ✓ Found: personas.json
  ✓ Found: tools.json
Scanning domain: healthcare
  ✓ Found: adaptive_tool_use.json
  ✓ Found: personas.json
  ✓ Found: tools.json
Scanning domain: insurance
  ✓ Found: adaptive_tool_use.json
  ✓ Found: personas.json
  ✓ Found: tools.json
Scanning domain: investment
  ✓ Found: adaptive_tool_use.json
  ✓ Found: personas.json
  ✓ Found: tools.json
Scanning domain: telecom
  ✓ Found: adaptive_tool_use.json
  ✓ Found: personas.json
  ✓ Found: tools.json

Starting uploads by file type...


In [16]:
# Improved upload with JSON string conversion for tools
def _build_upload_frame(file_type, json_data):
    """Turn one domain's parsed JSON into the DataFrame that gets uploaded.

    Args:
        file_type: Config name of the data being prepared (e.g. 'tools').
        json_data: Parsed JSON content; either a list of records or a single
            object, which is treated as one record.

    Returns:
        A pandas DataFrame with one row per record.
    """
    # A single object is wrapped in a list so it becomes a one-row frame
    records = json_data if isinstance(json_data, list) else [json_data]
    frame = pd.DataFrame(records)

    # Special handling for tools: convert nested dictionaries to JSON strings.
    # Only dict values are serialized; any other value is left as it is.
    if file_type == 'tools':
        print("    Converting nested data to JSON strings for tools...")
        for col in ['properties', 'response_schema']:
            if col in frame.columns:
                frame[col] = frame[col].apply(
                    lambda value: json.dumps(value) if isinstance(value, dict) else value
                )
                print(f"      ✓ Converted {col} to JSON strings")

    # Clean up any unwanted columns
    frame = frame.drop(columns=["index"], errors="ignore")
    # NOTE(review): "id" is dropped whenever more than two columns remain, so
    # it only survives in very narrow frames — confirm this is the intent.
    if "id" in frame.columns and len(frame.columns) > 2:
        frame = frame.drop(columns=["id"])

    return frame


# (file_type, domain_name, error message) for every upload that failed, so the
# final summary cannot claim success when something went wrong.
failed_uploads = []

for file_type, domain_data in file_type_data.items():
    if not domain_data:
        print(f"\nSkipping {file_type} - no data found")
        continue

    print(f"\nProcessing file type: {file_type}")
    print("-" * 30)

    # Upload each domain as a split for this file type
    for domain_name, json_data in domain_data.items():
        print(f"  Uploading {domain_name} domain as split '{domain_name}'...")

        try:
            df = _build_upload_frame(file_type, json_data)

            # Create dataset
            dataset = datasets.Dataset.from_pandas(df)

            # Upload to HuggingFace
            dataset.push_to_hub(
                REPO_NAME,
                config_name=file_type,    # File type becomes the config
                split=domain_name,        # Domain becomes the split
                token=HF_TOKEN
            )

            print(f"    ✓ Successfully uploaded {len(df)} records")

        except Exception as e:
            # Best effort: one failing split must not stop the remaining
            # uploads, but the failure is recorded for the summary below.
            failed_uploads.append((file_type, domain_name, str(e)))
            print(f"    ✗ Error uploading {domain_name}: {str(e)}")

print("\n" + "=" * 50)
if failed_uploads:
    print(f"Upload finished with {len(failed_uploads)} failed split(s):")
    for failed_type, failed_domain, message in failed_uploads:
        print(f"  ✗ {failed_type}/{failed_domain}: {message}")
else:
    print("Upload completed with JSON string conversion for tools!")



Processing file type: adaptive_tool_use
------------------------------
  Uploading banking domain as split 'banking'...


Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 689.63ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.76s/ shards]


    ✓ Successfully uploaded 100 records
  Uploading healthcare domain as split 'healthcare'...


Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 389.99ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.86s/ shards]


    ✓ Successfully uploaded 100 records
  Uploading insurance domain as split 'insurance'...


Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 551.59ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.09s/ shards]


    ✓ Successfully uploaded 100 records
  Uploading investment domain as split 'investment'...


Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 625.36ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.50s/ shards]


    ✓ Successfully uploaded 100 records
  Uploading telecom domain as split 'telecom'...


Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 697.31ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.63s/ shards]


    ✓ Successfully uploaded 100 records

Processing file type: personas
------------------------------
  Uploading banking domain as split 'banking'...


Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 776.29ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.39s/ shards]


    ✓ Successfully uploaded 100 records
  Uploading healthcare domain as split 'healthcare'...


Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 933.10ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.94s/ shards]


    ✓ Successfully uploaded 100 records
  Uploading insurance domain as split 'insurance'...


Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1312.77ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.30s/ shards]


    ✓ Successfully uploaded 100 records
  Uploading investment domain as split 'investment'...


Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1073.81ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.33s/ shards]


    ✓ Successfully uploaded 100 records
  Uploading telecom domain as split 'telecom'...


Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1043.88ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.38s/ shards]


    ✓ Successfully uploaded 100 records

Processing file type: tools
------------------------------
  Uploading banking domain as split 'banking'...
    Converting nested data to JSON strings for tools...
      ✓ Converted properties to JSON strings
      ✓ Converted response_schema to JSON strings


Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 969.11ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.44s/ shards]


    ✓ Successfully uploaded 20 records
  Uploading healthcare domain as split 'healthcare'...
    Converting nested data to JSON strings for tools...
      ✓ Converted properties to JSON strings
      ✓ Converted response_schema to JSON strings


Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 936.23ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.34s/ shards]


    ✓ Successfully uploaded 20 records
  Uploading insurance domain as split 'insurance'...
    Converting nested data to JSON strings for tools...
      ✓ Converted properties to JSON strings
      ✓ Converted response_schema to JSON strings


Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 405.99ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.42s/ shards]


    ✓ Successfully uploaded 20 records
  Uploading investment domain as split 'investment'...
    Converting nested data to JSON strings for tools...
      ✓ Converted properties to JSON strings
      ✓ Converted response_schema to JSON strings


Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 825.16ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.46s/ shards]


    ✓ Successfully uploaded 20 records
  Uploading telecom domain as split 'telecom'...
    Converting nested data to JSON strings for tools...
      ✓ Converted properties to JSON strings
      ✓ Converted response_schema to JSON strings


Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1103.76ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.39s/ shards]


    ✓ Successfully uploaded 20 records

Upload completed with JSON string conversion for tools!


In [17]:
# Show how to download the uploaded data. Nothing is downloaded in this cell:
# it only prints example load_dataset commands for the configs and splits
# created by the upload.
# The import is no longer wrapped in try/except: a failed import used to be
# reported as "setup complete" and only surfaced later as a NameError.
from datasets import load_dataset

print("Testing download functionality...")
print("=" * 40)

print("To test downloads, run these commands after the upload completes:")
print()
print("# Example downloads:")
print(f'banking_tools = load_dataset("{REPO_NAME}", "tools", split="banking")')
print(f'investment_personas = load_dataset("{REPO_NAME}", "personas", split="investment")')
print(f'healthcare_scenarios = load_dataset("{REPO_NAME}", "adaptive_tool_use", split="healthcare")')
print()
print("# Load all domains for a specific file type:")
print(f'all_tools = load_dataset("{REPO_NAME}", "tools")')
print("# This gives access to: all_tools['banking'], all_tools['investment'], etc.")
print()
print("✅ This structure solves the schema mismatch issue!")
print("✅ Each file type maintains its own schema within its config")
# The upload never adds a 'domain' column, so the domain is carried by the
# split name only.
print("✅ Domain information is preserved in the split names")


Testing download functionality...
To test downloads, run these commands after the upload completes:

# Example downloads:
banking_tools = load_dataset("galileo-ai/agent-leaderboard-v2", "tools", split="banking")
investment_personas = load_dataset("galileo-ai/agent-leaderboard-v2", "personas", split="investment")
healthcare_scenarios = load_dataset("galileo-ai/agent-leaderboard-v2", "adaptive_tool_use", split="healthcare")

# Load all domains for a specific file type:
all_tools = load_dataset("galileo-ai/agent-leaderboard-v2", "tools")
# This gives access to: all_tools['banking'], all_tools['investment'], etc.

✅ This structure solves the schema mismatch issue!
✅ Each file type maintains its own schema within its config
✅ Domain information is preserved in both split names and the 'domain' column


In [18]:
# Load one split from each config back from the Hub to check that the upload
# round-trips. REPO_NAME is used instead of repeating the repository id so the
# download always targets the repository that was uploaded to.
banking_tools = load_dataset(REPO_NAME, "tools", split="banking")
investment_personas = load_dataset(REPO_NAME, "personas", split="investment")
healthcare_scenarios = load_dataset(REPO_NAME, "adaptive_tool_use", split="healthcare")

Generating banking split: 100%|██████████| 20/20 [00:00<00:00, 3822.04 examples/s]
Generating healthcare split: 100%|██████████| 20/20 [00:00<00:00, 5304.88 examples/s]
Generating insurance split: 100%|██████████| 20/20 [00:00<00:00, 6377.23 examples/s]
Generating investment split: 100%|██████████| 20/20 [00:00<00:00, 8113.56 examples/s]
Generating telecom split: 100%|██████████| 20/20 [00:00<00:00, 6580.85 examples/s]
Generating banking split: 100%|██████████| 100/100 [00:00<00:00, 20455.03 examples/s]
Generating healthcare split: 100%|██████████| 100/100 [00:00<00:00, 23718.07 examples/s]
Generating insurance split: 100%|██████████| 100/100 [00:00<00:00, 32403.46 examples/s]
Generating investment split: 100%|██████████| 100/100 [00:00<00:00, 40209.99 examples/s]
Generating telecom split: 100%|██████████| 100/100 [00:00<00:00, 51787.92 examples/s]
Generating banking split: 100%|██████████| 100/100 [00:00<00:00, 19688.80 examples/s]
Generating healthcare split: 100%|██████████| 100/100

In [19]:
def convert_tool_json_strings(tool_record):
    """Return a copy of a tool record with its JSON-string fields decoded.

    The 'properties' and 'response_schema' entries are parsed with json.loads
    when they are present and hold a string. Every other entry, and any
    non-string value stored under those two keys, is copied through unchanged.

    Args:
        tool_record: Mapping (or anything dict() accepts) describing one tool.

    Returns:
        A new dict; tool_record itself is not modified.

    Raises:
        json.JSONDecodeError: If one of the string fields is not valid JSON.
    """
    decoded = dict(tool_record)

    # Same order as the fields are listed: properties first, then response_schema
    for field in ('properties', 'response_schema'):
        raw_value = decoded.get(field)
        if isinstance(raw_value, str):
            decoded[field] = json.loads(raw_value)

    return decoded

# Pretty-print the first banking tool record after decoding its JSON-string fields
pprint(convert_tool_json_strings(banking_tools[0]))

{'description': 'Retrieves comprehensive account balance information including '
                'current balance, available balance, pending transactions, and '
                "recent activity summary for a customer's bank account.",
 'properties': {'account_number': {'description': 'The bank account number for '
                                                  'which balance information '
                                                  'is requested.',
                                   'title': 'Account_Number',
                                   'type': 'string'},
                'account_type': {'description': 'The type of bank account to '
                                                'check balance for.',
                                 'enum': ['checking',
                                          'savings',
                                          'credit',
                                          'money_market',
                                          'cd'],
      

In [20]:
# Display the first record of the "investment" split of the personas config
investment_personas[0]

{'name': 'Richard Chen',
 'age': 58,
 'occupation': 'Semi-retired Financial Advisor and Part-time Consultant',
 'personality_traits': ['methodical', 'skeptical', 'detail-oriented'],
 'tone': 'formal',
 'detail_level': 'comprehensive'}

In [21]:
# Display the first record of the "healthcare" split of the adaptive_tool_use config
healthcare_scenarios[0]

{'persona_index': 2,
 'first_message': "I need some help managing my healthcare situation. I was just diagnosed with Type 2 diabetes last week, and I need to find an endocrinologist who accepts Blue Cross insurance and can see me before my conference trip to Boston on May 14th-18th. I also need to get my recent A1C test results from April 28th to bring to the appointment, and set up medication reminders for my new Metformin prescription (500mg twice daily with meals). Could you also find clinical trials for diabetic neuropathy within 25 miles of my home in Cambridge? I'd like to leave feedback about Dr. Patel's dismissive attitude during my diagnosis appointment last Thursday at 2:30pm too. Oh, and my daughter mentioned there's a diabetes management program at the hospital - can you check if it's covered by my insurance plan?",
 'user_goals': ['Find and schedule an appointment with an endocrinologist who accepts Blue Cross insurance before May 14th',
  'Retrieve A1C test results from A