# 

# Tool2vec embeddings example
# Showcasing the tool retrieval capabilities using only the query embedding.


In [1]:
import json
import os

from datasets import load_dataset

# Load datasets

In [2]:


# Load the ToolBank repository without naming a split.
# NOTE(review): `ds` is not referenced by any other cell visible in this notebook — confirm it is still needed.
ds = load_dataset("squeeze-ai-lab/ToolBank")

# Load the three NumpyBank splits one by one, in train / val / test order.
train_dataset, valid_dataset, test_dataset = (
    load_dataset("squeeze-ai-lab/ToolBank", split=split_name)
    for split_name in ("NumpyBank_train", "NumpyBank_val", "NumpyBank_test")
)


In [3]:
# Save directory. The trailing slash is kept so the status message printed
# at the end of this cell stays exactly the same as before.
save_dir = "tool2vec/libs/libraries/"

# open(..., "w") does not create missing parent directories, so make sure the
# target directory exists before writing (no-op when it is already there).
os.makedirs(save_dir, exist_ok=True)

# Output file name -> dataset split to dump into it.
# Note the validation file is named "valid" (not "val"); the embedding cell
# further down reads it under that exact name.
splits_to_save = {
    "NumpyBank_train.json": train_dataset,
    "NumpyBank_valid.json": valid_dataset,
    "NumpyBank_test.json": test_dataset,
}

# Save each split to disk as an indented JSON list of records.
for file_name, split in splits_to_save.items():
    with open(os.path.join(save_dir, file_name), "w") as f:
        json.dump(split.to_list(), f, indent=2)

print(f"Datasets saved successfully to {save_dir}")

Datasets saved successfully to tool2vec/libs/libraries/


## Creating the train embeddings
### NumpyBank_train_embedded.json
### NumpyBank_train_embedded.pkl


In [4]:
# Run the embedding generator script on the saved train split with the e5-small model.
# Per the logged output, it writes output/NumpyBank_train_embedded.json and a matching .pkl.
!python tool2vec/libs/libraries/toolrag/tool2vec/embedding_generator.py  \
    --data_path tool2vec/libs/libraries/NumpyBank_train.json \
    --output_path output \
    --output_file_name NumpyBank_train_embedded.json \
    --model e5-small

Loading data from tool2vec/libs/libraries/NumpyBank_train.json
Loading e5-small model
  0%|                                                 | 0/15994 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
100%|████████████████████████████████████| 15994/15994 [01:29<00:00, 179.58it/s]
Saving full data to output/NumpyBank_train_embedded.json
Converted 511 unique tools to embeddings.
Saved: output/NumpyBank_train_embedded.pkl


## Creating the val embeddings
### NumpyBank_val_embedded.json
### NumpyBank_val_embedded.pkl

In [5]:
# Run the embedding generator script on the saved validation split with the e5-small model.
# The input file is named "..._valid.json" while the output uses "..._val_embedded.json".
!python tool2vec/libs/libraries/toolrag/tool2vec/embedding_generator.py  \
    --data_path tool2vec/libs/libraries/NumpyBank_valid.json \
    --output_path output \
    --output_file_name NumpyBank_val_embedded.json \
    --model e5-small

Loading data from tool2vec/libs/libraries/NumpyBank_valid.json
Loading e5-small model
  0%|                                                  | 0/1998 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
100%|██████████████████████████████████████| 1998/1998 [00:10<00:00, 189.49it/s]
Saving full data to output/NumpyBank_val_embedded.json
Converted 504 unique tools to embeddings.
Saved: output/NumpyBank_val_embedded.pkl


## Creating the Test embeddings
### NumpyBank_test_embedded.json
### NumpyBank_test_embedded.pkl

In [6]:
# Run the embedding generator script on the saved test split with the e5-small model.
# Per the logged output, it writes output/NumpyBank_test_embedded.json and a matching .pkl.
!python tool2vec/libs/libraries/toolrag/tool2vec/embedding_generator.py  \
    --data_path tool2vec/libs/libraries/NumpyBank_test.json \
    --output_path output \
    --output_file_name NumpyBank_test_embedded.json \
    --model e5-small


Loading data from tool2vec/libs/libraries/NumpyBank_test.json
Loading e5-small model
  0%|                                                  | 0/2000 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
100%|██████████████████████████████████████| 2000/2000 [00:09<00:00, 204.52it/s]
Saving full data to output/NumpyBank_test_embedded.json
Converted 506 unique tools to embeddings.
Saved: output/NumpyBank_test_embedded.pkl


# Evaluating the embeddings
### Recall@3 means the recall score if only the top 3 recommended tools are used.
### NDCG@k means Normalized Discounted Cumulative Gain at rank k, a more sophisticated metric than Recall because it takes into account not just the presence of relevant items, but also their position (rank) in the list and their relevance levels.

In [7]:
# Run the evaluation script on the embedded test queries and write the metrics to output/evaluation_results.txt.
# NOTE(review): the tool embeddings passed here (.pkl) were generated from the test split itself;
# presumably they should come from the train split (output/NumpyBank_train_embedded.pkl) to avoid
# leaking test queries into the tool representations — confirm before relying on these numbers.
!python ./tool2vec/libs/libraries/toolrag/tool2vec/evaluate_t2v_embedding.py \
  --valid_data_path output/NumpyBank_test_embedded.json \
  --t2v_embedding_path output/NumpyBank_test_embedded.pkl \
  --output_file_name output/evaluation_results.txt

Loading data...
Data loaded
Valid/test data length: 2000
The number t2v embeddings: 506
Recall@3: 0.205041666666667
NDCG@3: 0.22895411132583404
Recall@5: 0.25299999999999967
NDCG@5: 0.2472805799983325
Recall@7: 0.2895833333333323
NDCG@7: 0.2645010906822132
Recall@10: 0.33166666666666533
NDCG@10: 0.28196382276237664
Recall@12: 0.35674166666666585
NDCG@12: 0.2914412592753431
Recall@32: 0.5052749999999996
NDCG@32: 0.33789596672139005
Recall@64: 0.626816666666667
NDCG@64: 0.3682432165035977
Recall@128: 0.7638333333333344
NDCG@128: 0.3974330336864197
Recall@256: 0.898475000000001
NDCG@256: 0.4226228550701565
