In [250]:
import json,pickle
import os

# Function to load a JSON file
def load_json(file_path):
    with open(file_path, 'r') as file:
        data = json.load(file)
    return data
    
# Function to dump data into a JSON file
def dump_json(file_path, data):
    with open(file_path, 'w') as file:
        json.dump(data, file, indent=2)
        
# Function to load a pickle file
def load_pickle(file_path):
    with open(file_path, 'rb') as file:
        data = pickle.load(file)
    return data

# Function to dump data into a pickle file
def dump_pickle(file_path, data):
    with open(file_path, 'wb') as file:
        pickle.dump(data, file)

# File paths
tools_data_path = '/kaggle/working/tools.json'
index_to_tool_name_path = 'index_to_tool_name.json'
retriever_embeddings_path = 'retriever_embeddings.pkl'
ranker_embeddings_path = 'ranker_embeddings.pkl'

# Load existing data from files
tools_data = load_json(tools_data_path)
#index_to_tool_name = load_json(index_to_tool_name_path)
#retriever_embeddings = load_json(retriever_embeddings_path)


In [251]:
import json

with open('/kaggle/input/colbert6/training_tools.json', 'r') as file:
    tools_data = json.load(file)

tool_names = []
collection = []

# Extract tool names and descriptions
for tool in tools_data:
    tool_names.append(tool['tool_name'])
    tool_desc = f"{tool['tool_description']}. "
    argument_desc = ""
    for argument in tool['arguments']:
        argument_desc += f"{argument['name']} - {argument['description']} "
    tool_desc += argument_desc.strip()
    collection.append(tool_desc)


# Load JSON data from file
file_path = '/kaggle/input/colbert6/train_split_with_ids_without_zero_shot.json'
with open(file_path, 'r') as file:
    queries_data = json.load(file)

# Initialize lists to store queries and tool names used in each query
queries = []
tool_names_list = []

# Extract queries and tool names
for query_info in queries_data:
    question = query_info['question']
    answer = query_info['answer']

    tools_used = [answer_item['tool_name'] for answer_item in answer]

    queries.append(question)
    tool_names_list.append(tools_used)
for i in range(len(collection)):
  collection[i]=tool_names[i]+':'+collection[i]
#collection
#queries
#tool_names_list
# Creating a TSV file for training documents (collection)
with open("collection.tsv", "w", encoding="utf-8") as collection_file:
    for i, item in enumerate(collection):
        collection_file.write(f"{i}\t{item}\n")

# Creating a TSV file for queries
with open("queries.tsv", "w", encoding="utf-8") as queries_file:
    for i, query in enumerate(queries):
        queries_file.write(f"{i}\t{query}\n")

import random

with open("triples.jsonl", "w", encoding="utf-8") as triples_file:
    for i, query in enumerate(queries):
        query_id = i

        # Extracting relevant document indices from the 'i'th index of tool_names_list
        rel_doc_indices = [tool_names.index(doc_id) for doc_id in tool_names_list[i]]

        # Creating triples for each relevant document
        for rel_doc_index in rel_doc_indices:
            # Searching for a non-relevant document index not present in tool_names_list[i]
            irrel_doc_index = random.choice([idx for idx, doc_id in enumerate(tool_names) if doc_id not in tool_names_list[i]])

            # Writing the triple to the triples.jsonl file
            triples_file.write(json.dumps([query_id, rel_doc_index, irrel_doc_index]) + '\n')
triples_data = []
with open("triples.jsonl", "r", encoding="utf-8") as triples_file:
    for line in triples_file:
        triple = json.loads(line.strip())
        triples_data.append(triple)
# Create a dictionary to map document indices to document text
document_index_to_text = {i: text for i, text in enumerate(collection)}

# Retrieve information in the form similar to X
retrieved_info = [
    (queries_data[query_id]['question'], document_index_to_text[rel_doc_index], document_index_to_text[irrel_doc_id])
    for query_id, rel_doc_index, irrel_doc_id in triples_data
]
X=retrieved_info


In [252]:
from neural_cherche import models
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

model = models.ColBERT(
    model_name_or_path="/kaggle/input/colbert5/current_checkpoint",
    device=device
)


In [253]:
file_path = '/kaggle/input/colbert6/test_split_with_ids.json'
with open(file_path, 'r') as file:
    queries_data = json.load(file)

# Initialize lists to store queries and tool names used in each query
queries = []
tool_names_list = []

for query_info in queries_data:
    question = query_info['question']
    answer = query_info['answer']

    tools_used = [answer_item['tool_name'] for answer_item in answer]

    queries.append(question)
    tool_names_list.append(tools_used)


#queries
#tool_names_list
with open('/kaggle/input/colbert6/test_tols.json', 'r') as file:
    tools_data = json.load(file)

tool_names = []
collection = []

# Extract tool names and descriptions
for tool in tools_data:
    tool_names.append(tool['tool_name'])
    tool_desc = f"{tool['tool_description']}. "
    argument_desc = ""
    for argument in tool['arguments']:
        argument_desc += f"{argument['description']} "
    tool_desc += argument_desc.strip()
    collection.append(tool_desc)
for i in range(len(collection)):
  collection[i]=tool_names[i]+':'+collection[i]


In [254]:
dump_json("tools.json",tools_data)

In [255]:
# Modify the existing collection to include the index
collection = [
    {"id": i, "text": text}
    for i, text in enumerate(collection)
]


In [256]:
from neural_cherche import models, rank, retrieve
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
batch_size = 32

documents = collection

retriever = retrieve.TfIdf(
    key="id",
    on=["text"],
)

model = models.ColBERT(
    model_name_or_path="/kaggle/input/colbert5/current_checkpoint",
    device=device,
)

ranker = rank.ColBERT(
    key="id",
    on=["text"],
    model=model
)

retriever_documents_embeddings = retriever.encode_documents(
    documents=documents,
)



ranker_documents_embeddings = ranker.encode_documents(
    documents=documents,
    batch_size=batch_size,
)
retriever.add(
    documents_embeddings=retriever_documents_embeddings,
)
retriever_queries_embeddings = retriever.encode_queries(
    queries=queries,
)

ranker_queries_embeddings = ranker.encode_queries(
    queries=queries,
    batch_size=batch_size,
)

candidates = retriever(
    queries_embeddings=retriever_queries_embeddings,
    k=1000,
)

scores = ranker(
    documents=candidates,
    queries_embeddings=ranker_queries_embeddings,
    documents_embeddings=ranker_documents_embeddings,
    k=100,
    batch_size=32,
)




100%|██████████| 3/3 [00:02<00:00,  1.39it/s]
100%|██████████| 8/8 [00:00<00:00, 14.81it/s]
TfIdf retriever: 100%|██████████| 1/1 [00:00<00:00, 15.90it/s]
100%|██████████| 3/3 [00:00<00:00, 324.01it/s]
100%|██████████| 3/3 [00:00<00:00, 361.25it/s]
100%|██████████| 3/3 [00:00<00:00, 370.71it/s]
100%|██████████| 3/3 [00:00<00:00, 332.66it/s]
100%|██████████| 3/3 [00:00<00:00, 351.60it/s]
100%|██████████| 3/3 [00:00<00:00, 358.87it/s]
100%|██████████| 3/3 [00:00<00:00, 341.59it/s]
100%|██████████| 3/3 [00:00<00:00, 363.92it/s]
100%|██████████| 3/3 [00:00<00:00, 365.67it/s]
100%|██████████| 3/3 [00:00<00:00, 344.28it/s]
100%|██████████| 3/3 [00:00<00:00, 385.98it/s]
100%|██████████| 3/3 [00:00<00:00, 363.79it/s]
100%|██████████| 3/3 [00:00<00:00, 340.97it/s]
100%|██████████| 3/3 [00:00<00:00, 379.15it/s]
100%|██████████| 3/3 [00:00<00:00, 384.96it/s]
100%|██████████| 2/2 [00:00<00:00, 576.97it/s]
100%|██████████| 3/3 [00:00<00:00, 385.69it/s]
100%|██████████| 2/2 [00:00<00:00, 423.30it/s]

In [257]:
tools_collection=retriever_documents_embeddings
dump_pickle(retriever_embeddings_path, tools_collection)

In [258]:
name_to_index_mapping = {}
name_to_index_mapping = {tool_names[i]: i for i in range(len(tool_names))}
dump_json(index_to_tool_name_path, name_to_index_mapping)

In [259]:
def add_tool(new_tool):
    global tools_collection, name_to_index_mapping, tools_data,ranker_documents_embeddings
    tool_name = new_tool['tool_name']

    # Generate the new tool entry
    tool_desc = f"{new_tool['tool_description']}. "
    argument_desc = ""
    for argument in new_tool['arguments']:
        argument_desc += f"{argument['name']} - {argument['description']} "
    tool_desc += argument_desc.strip()
    
    # Assign the new ID based on the index of the last element in the collection + 1
    last_tool = list(tools_collection.items())[-1]
    last_tool_id = last_tool[0]
    new_tool_id = last_tool_id + 1 if tools_collection else 0
    collection.append({"id": new_tool_id, "text": f"{tool_name}:{tool_desc}"})
    tool_names.append(tool_name)
    print(f"Tool '{tool_name}' has been added with index '{new_tool_id}'")
    
def delete_tool(tool_name):
    global tools_collection, name_to_index_mapping,tools_data, ranker_documents_embeddings,collection
    # Find and remove the tool with the specified name
    if tool_name in name_to_index_mapping:
        tool_index = name_to_index_mapping.pop(tool_name)
        removed_tool = tools_collection.pop(tool_index)
        collection = [item for item in collection if item['id'] != tool_index]
        print(f"Tool '{tool_name}' has been deleted with index '{tool_index}'")
    else:
        print(f"Tool '{tool_name}' not found")

def modify_tool(modified_tool):
    global tools_collection, name_to_index_mapping,tools_data
    tool_name = modified_tool['tool_name']

    # Find the tool with the specified name
    if tool_name in name_to_index_mapping:
        tool_index = name_to_index_mapping[tool_name]

        # Modify the content of the found tool
        tool_desc = f"{modified_tool['tool_description']}. "
        argument_desc = ""
        for argument in modified_tool['arguments']:
            argument_desc += f"{argument['name']} - {argument['description']} "
        tool_desc += argument_desc.strip()
        for item in collection:
            if item['id'] == tool_index:
                item['text']=tool_desc
                break
        print(f"Tool '{tool_name}' has been modified")
    else:
        print(f"Tool '{tool_name}' not found")



In [260]:
'''
# Add a new tool
new_tool = {
  "tool_name": "language_translation",
  "tool_description": "Translate text from one language to another using this versatile API.",
  "arguments": [
    {
      "name": "text",
      "type": "string",
      "description": "The text to be translated."
    },
    {
      "name": "source_language",
      "type": "string",
      "description": "The source language of the text."
    },
    {
      "name": "target_language",
      "type": "string",
      "description": "The target language for the translation."
    }
  ]
}
add_tool(new_tool)
'''
# Modify an existing tool
modified_tool = {
    "tool_name": "prioritize_objects",
    "tool_description": "Modified description",
    "arguments": [
        {"name": "ModifiedArgument1", "description": "Modified description", "type": "string", "example": ""}
    ]
}
modify_tool(modified_tool)

# Delete a tool
delete_tool("image_caption")

# Add a new tool1
new_tool1 = {
    "tool_name": "NewTool1",
    "tool_description": "Description of NewTool1",
    "arguments": [
        {"name": "Argument1", "description": "Description of Argument1", "type": "string", "example": ""},
        {"name": "Argument2", "description": "Description of Argument2", "type": "int", "example": ""}
    ]
}
add_tool(new_tool1)


Tool 'prioritize_objects' has been modified
Tool 'image_caption' has been deleted with index '14'
Tool 'NewTool1' has been added with index '87'


In [261]:
from neural_cherche import models, rank, retrieve
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
batch_size = 32

documents = collection

retriever = retrieve.TfIdf(
    key="id",
    on=["text"],
)

model = models.ColBERT(
    model_name_or_path="/kaggle/input/colbert5/current_checkpoint",
    device=device,
)

ranker = rank.ColBERT(
    key="id",
    on=["text"],
    model=model
)

retriever_documents_embeddings = retriever.encode_documents(
    documents=documents,
)


ranker_documents_embeddings = ranker.encode_documents(
    documents=documents,
    batch_size=batch_size,
)
retriever.add(
    documents_embeddings=retriever_documents_embeddings,
)
retriever_queries_embeddings = retriever.encode_queries(
    queries=queries,
)

ranker_queries_embeddings = ranker.encode_queries(
    queries=queries,
    batch_size=batch_size,
)

candidates = retriever(
    queries_embeddings=retriever_queries_embeddings,
    k=1000,
)

scores = ranker(
    documents=candidates,
    queries_embeddings=ranker_queries_embeddings,
    documents_embeddings=ranker_documents_embeddings,
    k=100,
    batch_size=32,
)




100%|██████████| 3/3 [00:02<00:00,  1.37it/s]
100%|██████████| 8/8 [00:00<00:00, 14.65it/s]
TfIdf retriever: 100%|██████████| 1/1 [00:00<00:00, 16.15it/s]
100%|██████████| 3/3 [00:00<00:00, 356.19it/s]
100%|██████████| 3/3 [00:00<00:00, 350.66it/s]
100%|██████████| 3/3 [00:00<00:00, 385.61it/s]
100%|██████████| 3/3 [00:00<00:00, 374.36it/s]
100%|██████████| 3/3 [00:00<00:00, 387.33it/s]
100%|██████████| 3/3 [00:00<00:00, 358.09it/s]
100%|██████████| 3/3 [00:00<00:00, 340.51it/s]
100%|██████████| 3/3 [00:00<00:00, 380.56it/s]
100%|██████████| 3/3 [00:00<00:00, 350.20it/s]
100%|██████████| 3/3 [00:00<00:00, 372.77it/s]
100%|██████████| 3/3 [00:00<00:00, 353.07it/s]
100%|██████████| 3/3 [00:00<00:00, 383.54it/s]
100%|██████████| 3/3 [00:00<00:00, 378.70it/s]
100%|██████████| 3/3 [00:00<00:00, 387.66it/s]
100%|██████████| 3/3 [00:00<00:00, 350.10it/s]
 50%|█████     | 1/2 [00:00<00:00, 245.87it/s]
100%|██████████| 3/3 [00:00<00:00, 379.63it/s]
100%|██████████| 2/2 [00:00<00:00, 412.62it/s]

In [262]:
len(tool_names)

88

In [263]:
# The output you provided
results = scores
correct = 0
# List to store results for each inner list
all_results = []

# Iterate through each inner list
for inner_list in results:
    # Sort the inner list by similarity in descending order
    sorted_inner_list = sorted(inner_list, key=lambda x: x['similarity'], reverse=True)

    # Extract the top 10 elements
    top_10_elements = sorted_inner_list[:10]

    # List to store results for the current inner list
    current_results = []

    # Print the corresponding elements from the collection using the 'id'
    for element in top_10_elements:
        element_id = element['id']
        text_value = tool_names[element_id]  # Extract value before colon
        corresponding_element = {'id': element_id, 'text': text_value}
        current_results.append(corresponding_element)

    # Add the list for the current inner list to the overall results list
    all_results.append(current_results)

# Check if all elements in tool_names_list[I] are present in the respective top_10_tool[I]
for i, tool_names_inner in enumerate(tool_names_list):
    top_10_tool = all_results[i]

    # Check if all elements in tool_names_inner are present in top_10_tool
    all_present = all(tool_name in [element['text'] for element in top_10_tool] for tool_name in tool_names_inner)

    if all_present:
        correct = correct + 1
        #print(f"All elements in tool_names_list[{i}] are present in the respective top_10_tool[{i}]")
    else:
        print(top_10_tool)
        print(tool_names_inner)
        print()

print(len(tool_names_list))
print(correct)


[{'id': 43, 'text': 'dictionary'}, {'id': 20, 'text': 'document_q_a'}, {'id': 19, 'text': 'search_engine'}, {'id': 46, 'text': 'wiki'}, {'id': 71, 'text': 'get_similar_work_items'}, {'id': 8, 'text': 'translate'}, {'id': 7, 'text': 'query_scene'}, {'id': 57, 'text': 'geocoding'}, {'id': 72, 'text': 'search_object_by_name'}, {'id': 58, 'text': 'nearby_restaurants'}]
['image_caption']

[{'id': 47, 'text': 'get_user_token'}, {'id': 10, 'text': 'appointment_registration'}, {'id': 30, 'text': 'query_agenda'}, {'id': 6, 'text': 'query_meeting'}, {'id': 27, 'text': 'add_meeting'}, {'id': 48, 'text': 'addagenda'}, {'id': 36, 'text': 'get_today'}, {'id': 55, 'text': 'hotel_availability'}, {'id': 5, 'text': 'check_token'}, {'id': 1, 'text': 'emergency_knowledge'}]
['get_user_token', 'addagenda', 'addalarm']

[{'id': 71, 'text': 'get_similar_work_items'}, {'id': 20, 'text': 'document_q_a'}, {'id': 19, 'text': 'search_engine'}, {'id': 46, 'text': 'wiki'}, {'id': 8, 'text': 'translate'}, {'id': 43,

In [264]:
(tool_names)

['query_stock',
 'emergency_knowledge',
 'delete_alarm',
 'register_user',
 'play_music',
 'check_token',
 'query_meeting',
 'query_scene',
 'translate',
 'timed_switch',
 'appointment_registration',
 'addalarm',
 'query_alarm',
 'add_scene',
 'image_caption',
 'speech_recognition',
 'delete_agenda',
 'symptomsearch',
 'delete_scene',
 'search_engine',
 'document_q_a',
 'query_balance',
 'record_health_data',
 'open_bank_account',
 'query_registration',
 'modify_reminder',
 'modify_registration',
 'add_meeting',
 'forgot_password',
 'cancel_registration',
 'query_agenda',
 'cancel_timed_switch',
 'book_hotel',
 'calculator',
 'query_history_today',
 'delete_meeting',
 'get_today',
 'modifymeeting',
 'delete_reminder',
 'modify_agenda',
 'query_reminder',
 'modify_alarm',
 'add_reminder',
 'dictionary',
 'send_email',
 'query_health_data',
 'wiki',
 'get_user_token',
 'addagenda',
 'delete_account',
 'organization_members',
 'travel_status',
 'user_movie_preferences',
 'account_info',
 

In [265]:
import json

# Assuming your JSON file is named "tool_info.json"
json_file_path = "/kaggle/input/colbert6/test_tols.json"

# Read the JSON file and store the tool information in a list
with open(json_file_path, "r", encoding="utf-8") as file:
    tool_info = json.load(file)

# List to store results for each inner list
all_results = {}

# Iterate through each inner list
for i, inner_list in enumerate(results, start=1):
    # Sort the inner list by similarity in descending order
    sorted_inner_list = sorted(inner_list, key=lambda x: x['similarity'], reverse=True)
    
    # Extract the top 10 elements
    top_10_elements = sorted_inner_list[:10]

    # List to store results for the current inner list
    current_results = []

    # Print the corresponding elements from the tool_info using the 'id'
    for j, element in enumerate(top_10_elements, start=1):
        element_id = element['id']
        # Corrected line to extract element_name
        element_name = next(x['text'].split(":")[0] for x in collection if x['id'] == element_id)
        tool_details = next((tool for tool in tool_info if tool['tool_name'] == element_name), None)
        if tool_details:
            current_results.append(tool_details)
    
    # Add the list for the current inner list to the overall results dictionary
    all_results[f"question_{i}"] = current_results

# Print the top 10 tools in JSON format with subheadings
top_10_tools_json = json.dumps(all_results, indent=2)

# Write the top 10 tools to a new JSON file
output_file_path = "top_10_tools.json"
with open(output_file_path, "w", encoding="utf-8") as output_file:
    output_file.write(top_10_tools_json)

print(f"Top 10 tools written to '{output_file_path}'.")


Top 10 tools written to 'top_10_tools.json'.
