In [1]:
# Script to add models to Hugging Face collection using huggingface_hub
from huggingface_hub import add_collection_item
from huggingface_hub.utils import HfHubHTTPError

# Destination collection on the Hugging Face Hub and the account owning the models
COLLECTION_SLUG = "shikhar-srivastava/tokenizer-study-68716d6709222e7cdfa05de8"
USERNAME = "shikhar-srivastava"

# Axes of the model grid; each combination below yields one repository name
monolingual_datasets = ["eng_latn", "tha_thai", "urd_arab", "amh_ethi", "vie_latn"]
vocab_sizes = [8192, 16384, 32768, 49152, 65536, 81920, 98304, 114688, 262144]
tokenizer_types = ["bpe_unscaled", "unigram_unscaled"]
# Fixed for every model: both are single values interpolated into each name
norm_type = "pre"
learning_rates = "1e-4"

# Expand the grid in one pass: dataset varies slowest, tokenizer type fastest,
# so the resulting order matches a dataset -> vocab size -> tokenizer nesting.
model_names = [
    f"mono_gold_130m_{norm_type}_lr{learning_rates}_{dataset}_{tokenizer_type}_{vocab_size}"
    for dataset in monolingual_datasets
    for vocab_size in vocab_sizes
    for tokenizer_type in tokenizer_types
]

# Echo the full list so the names can be checked before anything is uploaded
print(f"Generated {len(model_names)} model names:")
print("\n".join(f"  - {name}" for name in model_names))

# Function to add model to collection
def add_model_to_collection(model_name):
    """Add ``USERNAME/model_name`` to the collection named by COLLECTION_SLUG.

    The item's note is built from the trailing underscore-separated fields of
    the name, which this script generates in the form
    ``..._<lang>_<script>_<tokenizer>_<scaling>_<vocab_size>``.

    Args:
        model_name: Repository name without the owner prefix.

    Returns:
        True if the item was added or is reported as already present;
        False if the request failed for any other reason (the error is
        printed rather than raised).
    """
    parts = model_name.split("_")
    if len(parts) >= 4:
        # Counting from the end: vocab size, scaling tag, tokenizer, script.
        # The script is the 4th field from the end; the 2nd from the end is
        # the scaling tag (e.g. "unscaled") and must not be reported as script.
        script, tokenizer, vocab_size = parts[-4], parts[-3], parts[-1]
        note = f"Model with {script} script, {tokenizer} tokenizer, vocab size {vocab_size}"
    else:
        # Name does not follow the expected pattern: add the item without a
        # note instead of attaching a mislabelled one.
        note = None

    try:
        add_collection_item(
            collection_slug=COLLECTION_SLUG,
            item_id=f"{USERNAME}/{model_name}",
            item_type="model",
            exists_ok=True,  # Don't fail if already in collection
            note=note,
        )
    except HfHubHTTPError as e:
        # Treat a "duplicate"-style HTTP error as success: the item is present.
        message = str(e).lower()
        if "already exists" in message or "duplicate" in message:
            print(f"⚠ Model {model_name} already exists in collection")
            return True
        print(f"✗ Failed to add {model_name}: {e}")
        return False
    except Exception as e:
        # Report and return False rather than raise, so a caller looping over
        # many names can continue with the next one.
        print(f"✗ Error adding {model_name}: {e}")
        return False

    print(f"✓ Successfully added {model_name} to collection")
    return True

# Try every generated name in order and tally the calls that reported success
print("\nAdding models to collection...")
outcomes = [add_model_to_collection(candidate) for candidate in model_names]
successful_adds = sum(1 for succeeded in outcomes if succeeded)

print(f"\nCompleted: {successful_adds}/{len(model_names)} models added successfully")

  from .autonotebook import tqdm as notebook_tqdm


Generated 90 model names:
  - mono_gold_130m_pre_lr1e-4_eng_latn_bpe_unscaled_8192
  - mono_gold_130m_pre_lr1e-4_eng_latn_unigram_unscaled_8192
  - mono_gold_130m_pre_lr1e-4_eng_latn_bpe_unscaled_16384
  - mono_gold_130m_pre_lr1e-4_eng_latn_unigram_unscaled_16384
  - mono_gold_130m_pre_lr1e-4_eng_latn_bpe_unscaled_32768
  - mono_gold_130m_pre_lr1e-4_eng_latn_unigram_unscaled_32768
  - mono_gold_130m_pre_lr1e-4_eng_latn_bpe_unscaled_49152
  - mono_gold_130m_pre_lr1e-4_eng_latn_unigram_unscaled_49152
  - mono_gold_130m_pre_lr1e-4_eng_latn_bpe_unscaled_65536
  - mono_gold_130m_pre_lr1e-4_eng_latn_unigram_unscaled_65536
  - mono_gold_130m_pre_lr1e-4_eng_latn_bpe_unscaled_81920
  - mono_gold_130m_pre_lr1e-4_eng_latn_unigram_unscaled_81920
  - mono_gold_130m_pre_lr1e-4_eng_latn_bpe_unscaled_98304
  - mono_gold_130m_pre_lr1e-4_eng_latn_unigram_unscaled_98304
  - mono_gold_130m_pre_lr1e-4_eng_latn_bpe_unscaled_114688
  - mono_gold_130m_pre_lr1e-4_eng_latn_unigram_unscaled_114688
  - mono_gold_