In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.1-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (1

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer, AutoConfig, TrainerCallback, EarlyStoppingCallback
from datasets import Dataset, load_dataset
import pandas as pd
import numpy as np
import torch
import os
import random
import gc
import json

In [None]:
# Keep Weights & Biases out of this run: mark it disabled and force offline mode.
os.environ.update({"WANDB_DISABLED": "true", "WANDB_MODE": "offline"})

In [None]:
# CUDA/cuDNN runtime switches are set before the device is selected.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
torch.backends.cudnn.benchmark = True

# Choose the compute device and report what was found.
has_gpu = torch.cuda.is_available()
device = torch.device("cuda" if has_gpu else "cpu")
if not has_gpu:
    print("No GPU available, using CPU instead")
else:
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
    total_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"GPU Memory: {total_bytes / 1e9:.2f} GB")
    gc.collect()

Using GPU: Tesla T4
GPU Memory: 15.83 GB


In [None]:
# Load the tool-calling dataset from the Hugging Face Hub.
ds = load_dataset("BitAgent/tool_calling_shuffle")
# Wrapping the loaded object in a DataFrame yields a single 'train' column whose
# cells are per-example dicts (with 'conversation' and 'tools' entries), so later
# cells must unwrap row['train'] before reading those fields.
df = pd.DataFrame(ds)

In [None]:
# Flip to True to iterate quickly on only the first 100 rows.
is_testing = False

df = df.head(100) if is_testing else df
df

Unnamed: 0,train
0,"{'conversation': '[{""role"": ""user"", ""content"":..."
1,"{'conversation': '[{""role"": ""user"", ""content"":..."
2,"{'conversation': '[{""role"": ""user"", ""content"":..."
3,"{'conversation': '[{""role"": ""user"", ""content"":..."
4,"{'conversation': '[{""role"": ""user"", ""content"":..."
...,...
551280,"{'conversation': '[{""role"": ""user"", ""content"":..."
551281,"{'conversation': '[{""role"": ""user"", ""content"":..."
551282,"{'conversation': '[{""role"": ""user"", ""content"":..."
551283,"{'conversation': '[{""role"": ""user"", ""content"":..."


In [None]:
def extract_content_and_description(conversation, tools):
    """
    Extract user content and tool description from conversation and tools.

    Args:
        conversation: JSON string encoding a list of turns, each a dict with
            'role' and 'content' keys.
        tools: JSON string encoding a list of tool dicts, each with a
            'description' key.

    Returns:
        A (user_content, tool_description) tuple of strings. The content is
        taken from the first turn whose role is 'user'; the description is
        taken from the first tool, or '' when the tool list is empty.
        ('', '') is returned when either input cannot be parsed, is not a
        string at all (e.g. None), or lacks the expected fields.
    """
    try:
        # Parse JSON strings
        conv_data = json.loads(conversation)
        tools_data = json.loads(tools)

        # Get first user message
        user_content = next(
            turn['content'] for turn in conv_data
            if turn['role'] == 'user'
        )

        # Get first tool's description
        tool_description = tools_data[0]['description'] if tools_data else ''

        return user_content, tool_description
    except (json.JSONDecodeError, KeyError, StopIteration, TypeError):
        # TypeError covers non-string inputs (json.loads(None)) and payloads
        # whose turns/tools are not dicts, so malformed rows are skipped
        # instead of aborting the whole processing loop.
        return '', ''

In [None]:
# Build one {content, description} record for every usable dataset row.
processed_data = []
for _, row in df.iterrows():
    try:
        record = row['train']
        extracted = extract_content_and_description(
            record['conversation'],
            record['tools']
        )
    except (json.JSONDecodeError, KeyError):
        print(f"Error processing row: {row}")
        continue

    content, description = extracted
    # Rows missing either the user text or the tool description are dropped.
    if not (content and description):
        continue
    processed_data.append({'content': content, 'description': description})

result_df = pd.DataFrame(processed_data)
result_df

Unnamed: 0,content,description
0,What was the first named storm of the 2022 Atl...,Returns the name of the first named storm of t...
1,Delete a service called 'old-service' in the '...,Deletes a service in a given Kubernetes namesp...
2,Do we have any backorders pending for 'Super D...,Check if there are any backorders for the spec...
3,What's the 52-week high for Amazon's stock?,Returns the 52-week high for a stock given its...
4,Are there any impending failures predicted for...,Predicts any impending failures for the specif...
...,...,...
551280,Please convert this image to grayscale.,Converts the input image to grayscale.
551281,"Execute a command to restart the pod ""back-end...","A function to restart a given pod, useful for ..."
551282,When was the last time 'Olivia Thompson' visited,Returns the date of the last visit for the spe...
551283,How many times can you replay a message before...,Determine if a message replay is an attack or ...


In [None]:
# Give every distinct tool description an integer id, and build the reverse lookup.
unique_descriptions = result_df['description'].unique()
label_to_int = dict(zip(unique_descriptions, range(len(unique_descriptions))))
int_to_label = dict(enumerate(unique_descriptions))
num_labels = len(unique_descriptions)
num_labels

73

In [None]:
# Attach the integer class id for each description, then shuffle reproducibly.
result_df = (
    result_df
    .assign(label=result_df['description'].map(label_to_int))
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# The first 80% of the shuffled rows are used for training; the rest is held out.
train_size = int(0.8 * len(result_df))
train_df = result_df.iloc[:train_size]
test_df = result_df.iloc[train_size:]

# Wrap both splits as Hugging Face Dataset objects.
train_ds = Dataset.from_pandas(train_df)
test_ds = Dataset.from_pandas(test_df)

In [None]:
# Report dataset and split sizes in a single write.
split_report = [
    f"Processed {len(result_df)} examples",
    f"Number of unique categories: {num_labels}",
    f"Training set size: {len(train_df)}",
    f"Test set size: {len(test_df)}",
]
print("\n".join(split_report))

Processed 551285 examples
Number of unique categories: 73
Training set size: 441028
Test set size: 110257


In [None]:
# Start from the MNLI checkpoint's configuration and repurpose it for the
# tool-description classes built above.
model_name = "facebook/bart-large-mnli"
config = AutoConfig.from_pretrained(model_name)
config.num_labels = num_labels  # Set the number of labels
# id2label maps class index -> tool description; label2id is the inverse.
config.id2label = {i: label for label, i in label_to_int.items()}
config.label2id = label_to_int

# Load the pretrained weights under the modified config. The checkpoint's
# classification output layer has a different shape than the new label count,
# so ignore_mismatched_sizes=True is passed; per the load-time warning that layer
# is newly initialized and has to be trained before the model is usable.
nli_model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    config=config,
    ignore_mismatched_sizes=True
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large-mnli and are newly initialized because the shapes did not match:
- classification_head.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([73]) in the model instantiated
- classification_head.out_proj.weight: found shape torch.Size([3, 1024]) in the checkpoint and torch.Size([73, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Load the tokenizer published with the same checkpoint as the model (model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
# label mapping
def get_description_from_tools(tools_json):
    """Return the first tool's description from a JSON-encoded tool list.

    An empty string is returned when the text is not valid JSON, when it does
    not decode to a non-empty list, or when the first tool has no
    'description' entry.
    """
    try:
        parsed = json.loads(tools_json)
    except json.JSONDecodeError:
        return ''

    # Anything other than a non-empty list carries no usable tool.
    if not isinstance(parsed, list) or len(parsed) == 0:
        return ''
    return parsed[0].get('description', '')

def get_tools_from_train(train_data):
    """Return train_data['tools'], or '' when that lookup is not possible."""
    try:
        tools_field = train_data['tools']
    except (KeyError, TypeError):
        # Missing key or a non-subscriptable value: fall back to an empty string.
        tools_field = ''
    return tools_field

# Pull the raw tools JSON out of every record, then the first tool's description.
tools_per_row = df["train"].apply(get_tools_from_train)
descriptions_per_row = tools_per_row.apply(get_description_from_tools)

# Keep the distinct, non-empty descriptions.
unique_descriptions = [
    desc for desc in descriptions_per_row.unique().tolist() if desc
]

label_to_int = {desc: i for i, desc in enumerate(unique_descriptions)}
int_to_label = dict(enumerate(unique_descriptions))

print(f"Number of unique labels: {len(label_to_int)}")
print("\nSample of label mapping:")
# Preview the first five labels, truncating long descriptions to 100 characters.
for idx, desc in enumerate(unique_descriptions[:5]):
    print(f"{idx}: {desc[:100]}...")

Number of unique labels: 73

Sample of label mapping:
0: Returns the name of the first named storm of the 2022 Atlantic hurricane season....
1: Deletes a service in a given Kubernetes namespace. Useful for removing old or unused services....
2: Check if there are any backorders for the specified product....
3: Returns the 52-week high for a stock given its symbol....
4: Predicts any impending failures for the specified engine on the production line....


In [None]:
# Split data into train and test sets
# ----------------------

def prepare_dataset_row(row, label_map=None):
    """Prepare a single row of the dataset.

    Args:
        row: Either a flat record with 'conversation' and 'tools' JSON strings,
            or a wrapper (dict or DataFrame row) that stores that record under
            a 'train' key, which is how the rows of ``df`` are laid out.
        label_map: Optional mapping from tool description to integer label.
            Defaults to the notebook-level ``label_to_int``.

    Returns:
        A dict with 'content' (first user message), 'description' (first
        tool's description) and 'label' (its integer id), or None when the
        row cannot be parsed, lacks either field, or has an unknown label.
    """
    if label_map is None:
        label_map = label_to_int

    try:
        # Unwrap rows that nest the record under 'train'. Without this every
        # DataFrame row failed with a swallowed KeyError and was dropped.
        record = row['train'] if 'train' in row else row

        # Parse conversation to get user content
        conversation = json.loads(record['conversation'])
        user_content = next(
            turn['content'] for turn in conversation
            if turn['role'] == 'user'
        )

        # Parse tools to get description
        tools = json.loads(record['tools'])
        description = tools[0]['description'] if tools else ''

        # Only return if both content and description are valid
        if user_content and description:
            return {
                'content': user_content,
                'description': description,
                'label': label_map[description]
            }
    except (json.JSONDecodeError, StopIteration, KeyError, TypeError):
        # TypeError covers None / non-string fields and non-dict turns or tools.
        pass
    return None

# Prepare clean dataset
# Each DataFrame row keeps the actual record (a dict with 'conversation' and
# 'tools') in its single 'train' column, so iterate over that column directly.
# Passing whole rows made every lookup fail silently and left zero clean rows.
clean_data = []
for record in df['train']:
    prepared_row = prepare_dataset_row(record)
    if prepared_row:
        clean_data.append(prepared_row)

# Convert to DataFrame
clean_df = pd.DataFrame(clean_data)

# Every row being rejected means the extraction is broken, so stop here
# rather than continue with an empty split.
if clean_df.empty:
    raise ValueError(
        "No usable rows were extracted; check that each record has "
        "'conversation' and 'tools' fields"
    )

# Shuffle and split the data
train_size = int(0.8 * len(clean_df))
clean_df = clean_df.sample(frac=1, random_state=42).reset_index(drop=True)
df_train = clean_df.iloc[:train_size]
df_test = clean_df.iloc[train_size:]

print(f"Original dataset size: {len(df)}")
print(f"Clean dataset size: {len(clean_df)}")
print(f"Data split: {len(df_train)} training samples, {len(df_test)} test samples")
print("\nSample training data:")
df_train.head()

Original dataset size: 551285
Clean dataset size: 0
Data split: 0 training samples, 0 test samples

Sample training data:


In [None]:
# Configure training parameters
# ----------------------
print("Configuring training parameters...")

# Where results and logs go, how often to log/checkpoint, and no external reporting.
# NOTE(review): eval_steps presumably only takes effect when an evaluation
# strategy is enabled, and none is set here. Confirm whether periodic
# evaluation was intended.
io_options = dict(
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",
    save_steps=500,
    eval_steps=500,
)

# Optimisation schedule and batch sizes.
optimization_options = dict(
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    gradient_accumulation_steps=1,
    max_grad_norm=1.0,
)

# Throughput / memory settings.
performance_options = dict(
    fp16=True,
    gradient_checkpointing=True,
    dataloader_num_workers=4,
)

training_args = TrainingArguments(
    **io_options,
    **optimization_options,
    **performance_options,
)


Configuring training parameters...


In [None]:
def preprocess_function(examples):
    """Tokenize the 'content' texts of a batch and attach their labels.

    Reads examples["content"] and examples["label"]; returns the tokenizer
    output with an added 'labels' entry. Sequences are padded or truncated
    to a fixed length of 128 tokens and the attention mask is included.
    """
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
        "return_tensors": None,
        "return_attention_mask": True,
    }
    encoded = tokenizer(examples["content"], **tokenizer_options)

    # The label column is carried over under the 'labels' key.
    encoded['labels'] = examples['label']
    return encoded


In [None]:
print("Processing training dataset...")

# Tokenize in batches across 4 worker processes and drop the raw columns afterwards.
train_map_options = dict(
    batched=True,
    num_proc=4,
    remove_columns=['content', 'description', 'label'],
    load_from_cache_file=True,
    desc="Tokenizing training data",
)
train_ds = train_ds.map(preprocess_function, **train_map_options)

Processing training dataset...


Tokenizing training data (num_proc=4):   0%|          | 0/441028 [00:00<?, ? examples/s]

In [None]:
print("Processing test dataset...")

# Same tokenization settings as the training split, applied to the held-out rows.
test_map_options = dict(
    batched=True,
    num_proc=4,
    remove_columns=['content', 'description', 'label'],
    load_from_cache_file=True,
    desc="Tokenizing test data",
)
test_ds = test_ds.map(preprocess_function, **test_map_options)

Processing test dataset...


Tokenizing test data (num_proc=4):   0%|          | 0/110257 [00:00<?, ? examples/s]

In [None]:

print("Initializing Trainer...")

# Wire the model, the training arguments and the two tokenized splits together.
trainer_options = dict(
    model=nli_model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
)
trainer = Trainer(**trainer_options)

Initializing Trainer...


In [None]:
# Place the model on the selected device when CUDA is present.
cuda_ready = torch.cuda.is_available()
if cuda_ready:
    nli_model.to(device)

print("Starting training...")
trainer.train()

Starting training...




Step,Training Loss
10,2.7994
20,0.8221
30,0.3546
40,0.1225
50,0.1599
60,0.0939
70,0.1085
80,0.0713
90,0.0569
100,0.0702


KeyboardInterrupt: 

In [None]:
print("Saving model...")
# Persist the trainer's current model state to a local directory.
final_model_dir = "./final_model"
trainer.save_model(final_model_dir)
print("Training complete!")

Saving model...
Training complete!


In [None]:
# Persist both lookup directions in one JSON document.
# JSON object keys are always strings, so int_to_label's integer keys are
# written (and later read back) as strings.
label_mapping = {"label_to_int": label_to_int, "int_to_label": int_to_label}
with open("./label_mapping.json", "w") as mapping_file:
    mapping_file.write(json.dumps(label_mapping))
print("Label mapping saved to label_mapping.json")

Label mapping saved to label_mapping.json


In [None]:
# Install required libraries (if not already installed)
!pip install huggingface_hub transformers

repo_id = f"ru4en/bart-large-mnli-tool-router"

# Import necessary libraries
from huggingface_hub import HfApi, create_repo
from getpass import getpass
import os

# 1. Login to Hugging Face
from huggingface_hub import login
token = getpass("Enter your Hugging Face token (Get from https://huggingface.co/settings/tokens): ")
login(token=token)

# 2. Create repository (if it doesn't exist)
try:
    create_repo(repo_id=repo_id, exist_ok=True)
    print(f"Repository {repo_id} is ready")
except Exception as e:
    print(f"Repository already exists or error: {e}")

# 3. Upload model files using the API
api = HfApi()

# Path to your model files
model_path = "./final_model"  # Adjust if your path is different

# Upload all model files
for filename in os.listdir(model_path):
    file_path = os.path.join(model_path, filename)
    if os.path.isfile(file_path):
        print(f"Uploading {filename}...")
        api.upload_file(
            path_or_fileobj=file_path,
            path_in_repo=filename,
            repo_id=repo_id,
        )

print("Model uploaded successfully!")

# 4. Add a model card with metadata
model_card = """---
language: en
license: mit
datasets:
  - SoftAge-AI/prompt-eng_dataset
tags:
  - text-classification
  - prompt-classification
  - bart
  - bart-large-mnli
---

# BART-Large-MNLI Prompt Classification Model

This model is fine-tuned from `facebook/bart-large-mnli` on the prompt engineering dataset for classifying different types of prompts.

## Model description

This model classifies prompts into different categories based on their type.

## Intended uses & limitations

This model is intended for classifying text prompts for AI systems. It should be used for understanding the intent or type of a given prompt.

## Training data

The model was trained on the [SoftAge-AI/prompt-eng_dataset](https://huggingface.co/datasets/SoftAge-AI/prompt-eng_dataset).

## Training procedure

The model was trained for 3 epochs with a batch size of 4 and gradient accumulation steps of 8.

"""

# Create a README.md in the repo
with open("README.md", "w") as f:
    f.write(model_card)

api.upload_file(
    path_or_fileobj="README.md",
    path_in_repo="README.md",
    repo_id=repo_id,
)

print("Model card uploaded!")

Enter your Hugging Face token (Get from https://huggingface.co/settings/tokens): ··········
Repository ru4en/bart-large-mnli-tool-router is ready
Uploading config.json...


No files have been modified since last commit. Skipping to prevent empty commit.


Uploading training_args.bin...


training_args.bin:   0%|          | 0.00/5.24k [00:00<?, ?B/s]

Uploading model.safetensors...


model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Model uploaded successfully!


No files have been modified since last commit. Skipping to prevent empty commit.


Model card uploaded!
