In [19]:
import json
import re

import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="7"
import sys
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaForCausalLM, LlamaTokenizer
import json
import torch
import re
from tqdm import tqdm
from collections import Counter
from copy import deepcopy

In [2]:
base_model = 'data/Llama-2-7b-chat-hf'

# Tokenizer and model are both read from the same local checkpoint directory
# (local_files_only=True on each call).
tokenizer = AutoTokenizer.from_pretrained(base_model, local_files_only=True)

model = AutoModelForCausalLM.from_pretrained(
    base_model,
    device_map='auto',
    local_files_only=True,
    load_in_4bit=True,
)
# The KV cache is switched off on the loaded model's config.
model.config.use_cache = False

In [3]:
# Load the source dialogues. NOTE(review): the variable is called `data_no_errors`
# although the file name says "errors_kept" — the error filtering happens in the
# cells below, not here.
with open('data/toolbench_new_1311/cleaned_toolllama_G123_dfs_train_downloaded1311_errors_kept.json', 'r', encoding='utf-8') as source_file:
    data_no_errors = json.load(source_file)
len(data_no_errors)

In [4]:
def detect_error(content: str):
    """Classify the error carried by a serialized tool/API response.

    The error text is extracted from ``content`` with a regex (first the
    ``{"error": "...", "response"`` form, then the unquoted-key form
    ``{error: "..."``) and matched against known substrings. If none of them
    match, ``content`` is parsed as JSON and inspected.

    Args:
        content: Raw response string. It must contain an ``error`` field; a
            payload that mentions ``response`` but does not end with a closing
            brace (and quote) is patched up before JSON parsing.

    Returns:
        One of the labels ``"Request error"``, ``"Wrong API error"``,
        ``"Rate limit error"``, ``"Invalid format error"``, ``"Empty output"``;
        or the parsed ``response`` value when the error text contains
        ``"Message error..."``; or the non-blank ``error`` text from the parsed
        JSON; or ``False`` when no error is detected (this includes payloads
        that cannot be parsed as JSON).

    Raises:
        AssertionError: if the regexes do not find exactly one error field.
    """
    # Raw strings: "\s" / "\}" are invalid escapes in ordinary string literals.
    error_message = re.findall(r'{"error": ?"([\s\S]*)", "response"', content)
    if len(error_message) == 0:
        error_message = re.findall(r'{error: ?"([\s\S]*)"', content)
    assert len(error_message) == 1, (
        f"expected exactly one error field, found {len(error_message)}"
    )
    error_message = error_message[0]

    # Close a payload that was cut off: add the missing brace, and the missing
    # closing quote in front of it if the value was cut mid-string.
    if "response" in content and not re.search(r"\}\s?$", content):
        content = content + "}"
        if not re.search(r"""["']\}$""", content):
            content = content[:-1] + "\"" + "}"

    request_error_markers = (
        "API not working",
        "Unauthorized",
        "Unsubscribed",
        "Request timeout",
        "Blocked",
    )
    if (
        any(marker in error_message for marker in request_error_markers)
        or "unavailable" in error_message.lower()
        or "request invalid, data error. status_code=5" in error_message
        or "'status': 5" in content
    ):
        return "Request error"

    if "No such function name:" in error_message:
        return "Wrong API error"

    if "Rate limit per minute" in error_message:
        return "Rate limit error"

    if "request invalid, data error" in error_message and "status_code=" not in error_message:
        return "Invalid format error"

    try:
        json_response = json.loads(content)
        response = json_response.get("response")

        if "Message error..." in error_message and response is not None:
            return response

        # `not response` already covers None, "" and empty containers.
        if (not response or response == "[]") and not json_response.get("error"):
            return "Empty output"

        if json_response["error"].strip():
            return json_response["error"]
    except (ValueError, KeyError, AttributeError, TypeError):
        # Best effort: unparseable JSON (JSONDecodeError is a ValueError), a
        # non-dict payload, or a missing / non-string "error" field all mean
        # "no error detected" rather than a crash.
        pass

    return False

Чистка примеров с "This is not the first time"

In [5]:
# Keep only the dialogues in which no user message contains the phrase
# 'This is not the first time'.
data_no_errors_no_this_is_not_the_first_time = []

for item in data_no_errors:
    user_texts = (step['value'] for step in item['conversations'] if step['from'] == 'user')
    if not any('This is not the first time' in text for text in user_texts):
        data_no_errors_no_this_is_not_the_first_time.append(item)

len(data_no_errors_no_this_is_not_the_first_time)

Чистка примеров с детектируемыми ошибками

In [6]:
# Keep only the dialogues in which detect_error() returns a falsy value for
# every message coming from a function.
data_no_detectable_errors = []
for item in data_no_errors_no_this_is_not_the_first_time:
    # A list (not a generator) on purpose: every function message is checked,
    # with no short-circuit after the first detected error.
    verdicts = [
        detect_error(step['value'])
        for step in item['conversations']
        if step['from'] == 'function'
    ]
    if not any(verdicts):
        data_no_detectable_errors.append(item)
len(data_no_detectable_errors)

Чистка примеров с недетектируемыми ошибками

In [7]:
# Few-shot prompt for the LLM-based filter: instructions, five labelled
# examples (API call, API response, "Yes"/"No" result) and a final slot with the
# `<api_call>` / `<api_response>` placeholders, which the filtering loop fills in
# with str.replace.
# NOTE(review): the wording contains typos ("You are you will be given",
# "shure"). They are intentionally left as-is here because this string is
# runtime input to the model; changing it would change the predictions.
prompt = '''
You are you will be given api calls from some model and api outputs from rapidapi apis. 
Sometimes api return errors. 
It is hard to parse them using rule-based methods. 
An api may respond with an empty output (it could be empty list, or an empty dict), contain a field that states it is an error or etc.
You will be given 5 examples. You should respond only "Yes" if the response does not contain errors from an api and "No" if not.
Please make shure that api returned what the api call requested. If not, return "No"
Examples:
API Call:
Thought: To fulfill the user's request for gathering user information and recent media posts from Instagram for a specific username 'nike', I previously called the 'userinfo_for_instagram_cheapest' function with the argument {"username": "nike"}. By doing so, I efficiently retrieved the desired user data, including their biography, bio links, biographical information, and other relevant details. Now, to proceed with accessing the user's recent media posts, I will call the 'user_medias_for_instagram_cheapest' function with the argument {"user_id": "113294420064920"}. This will allow me to retrieve the most recent media posts of the user 'nike' and provide the user with the requested data effectively.
Action: user_medias_for_instagram_cheapest
Action Input: {
  "user_id": "113294420064920"
}
API Response:
{"error": "", "response": "{'items': [], 'num_results': 0, 'status': 'ok'}"}
Result:
No
###
API Call:
Thought: The user requested the list of languages supported by the LanguageTool API for grammar and writing assistance. To fulfill this request, I need to call the "list_of_languages_for_languagetool" function without any arguments. By doing so, I expect to receive the complete list of supported languages. This information will be crucial for implementing multilingual support in the project, ensuring accurate grammar and writing assistance for different languages.
Action: list_of_languages_for_languagetool
Action Input: {}
API Response:
{"error": "", "response": "[{'name': 'Arabic', 'code': 'ar', 'longCode': 'ar'}, {'name': 'Asturian', 'code': 'ast', 'longCode': 'ast-ES'}, {'name': 'Belarusian', 'code': 'be', 'longCode': 'be-BY'}, {'name': 'Breton', 'code': 'br', 'longCode': 'br-FR'}, {'name': 'Catalan', 'code': 'ca', 'longCode': 'ca-ES'}, {'name': 'Catalan (Valencian)', 'code': 'ca', 'longCode': 'ca-ES-valencia'}, {'name': 'Catalan (Balearic)', 'code': 'ca', 'longCode': 'ca-ES-balear'}, {'name': 'Danish', 'code': 'da', 'longCode': 'da-DK'}, {'name': 'German', 'code': 'de', 'longCode': 'de'}, {'name': 'German (Germany)', 'code': 'de', 'longCode': 'de-DE'}, {'name': 'German (Austria)', 'code': 'de', 'longCode': 'de-AT'}, {'name': 'German (Swiss)', 'code': 'de', 'longCode': 'de-CH'}, {'name': 'Simple German', 'code': 'de-DE-x-simple-language', 'longCode': 'de-DE-x-simple-language'}, {'name': 'Greek', 'code': 'el', 'longCode': 'el-GR'}, {'name': 'English', 'code': 'en', 'longCode': 'en'}, {'name': 'English (US)', 'code': 'en', 'longCode': 'en-U..
Result:
Yes
###
API Call:
Thought: I will now call the "dividends_for_freedom_finance" function to retrieve information about dividends for the symbol "AMZN". By using the argument {"symbol": "AMZN"}, I can focus specifically on the dividends related to the stock "AMZN". This function will help me analyze the dividend history for "AMZN" and provide the user with the requested information.
Action: dividends_for_freedom_finance
Action Input: {
"symbol": "AMZN"
}
API Response:
{"error": "", "response": "{'total': 0, 'offset': 0, 'results': [], 'responseStatus': None}"}
Result:
No
###
API Call:
Thought: To gather the required information for integrating APIs into the company's website, I can call the "list_all_api_for_testing_for_my_use" function. This function will provide me with descriptions, methods, required parameters, and optional parameters for each API. With this information, I can analyze and compare the APIs to make an informed decision.
Action: list_all_api_for_testing_for_my_use
Action Input: {}
API Response:
{"error": "Unauthorized error...", "response": "{'1forge.com': {'added': '2017-05-30T08:34:14.000Z', 'preferred': '0.0.1', 'versions': {'0.0.1': {'added': '2017-05-30T08:34:14.000Z', 'info': {'contact': {'email': 'contact@1forge.com', 'name': '1Forge', 'url': 'http://1forge.com'}, 'description': 'Stock and Forex Data and Realtime Quotes', 'title': '1Forge Finance APIs', 'version': '0.0.1', 'x-apisguru-categories': ['financial'], 'x-logo': {'backgroundColor': '#24292e', 'url': 'https://api.apis.guru/v2/cache/logo/https_1forge.com_assets_images_f-blue.svg'}, 'x-origin': [{'format': 'swagger', 'url': 'http://1forge.com/openapi.json', 'version': '2.0'}], 'x-providerName': '1forge.com'}, 'updated': '2017-06-27T16:49:57.000Z', 'swaggerUrl': 'https://api.apis.guru/v2/specs/1forge.com/0.0.1/swagger.json', 'swaggerYamlUrl': 'https://api.apis.guru/v2/specs/1forge.com/0.0.1/swagger.yaml', 'openapiVer': '2.0', 'link': 'https://api.apis.guru/v2/specs/1forge.com/0.0.1.json'}}}, '1password.com:events': {'added': '2021-07-19...
Result:
No
###
API Call:
Thought: I need to gather user information for a specific Instagram username, 'nike', as requested by the user. They also want access to the user's recent media posts. To achieve this, I will call the 'userinfo_for_instagram_cheapest' function with the argument of {"username": "nike"}. This will help me retrieve the desired data efficiently and fulfill the user's request effectively.
Action: userinfo_for_instagram_cheapest
Action Input: {
  "username": "nike"
}
API Response:
{"error": "", "response": "{'data': {'user': {'biography': 'Spotlighting athlete* and\\xa0\ud83d\udc5f\\xa0stories\\n#BlackLivesMatter, #StopAsianHate, and #NoPrideNoSport', 'bio_links': [{'title': '', 'lynx_url': 'https://l.instagram.com/?u=https%3A%2F%2Flinkin.bio%2Fnike&e=AT063fG4N158ZbZOaZ_A32batiXwR-0Yih5XNa5wfSNPCiDo546PSV0a8mm72ElVxWgNJdgYeHNOTMMTOe_kn7R-C-f2N9ym', 'url': 'https://linkin.bio/nike', 'link_type': 'external'}], 'biography_with_entities': {'raw_text': 'Spotlighting athlete* and\\xa0\ud83d\udc5f\\xa0stories\\n#BlackLivesMatter, #StopAsianHate, and #NoPrideNoSport', 'entities': [{'user': None, 'hashtag': {'name': 'nopridenosport'}}, {'user': None, 'hashtag': {'name': 'stopasianhate'}}, {'user': None, 'hashtag': {'name': 'blacklivesmatter'}}]}, 'blocked_by_viewer': False, 'restricted_by_viewer': None, 'country_block': False, 'eimu_id': '113294420064920', 'external_url': 'https://linkin.bio/nike', 'external_url_linkshimmed': 'https://l.instagram.com/?u=https%3A%2F%2Flinkin.bio%2Fnike&e=AT04S69...
Result:
Yes
###
API Call: 
<api_call>
API Response:
<api_response>
Result:
'''

In [8]:
# Peek at the first dialogue that passed the rule-based (detect_error) filter.
data_no_detectable_errors[0]

In [9]:
# LLM-based filter: for every assistant message that is followed by another
# message, ask the model whether the following message (taken to be the API
# response) is error-free. A dialogue is kept only if every such check answers
# exactly "Yes".

# Index in `data_no_detectable_errors` to start from.
# NOTE(review): presumably the items before this offset were processed in an
# earlier session and saved separately as "part 1" — confirm before re-running,
# and set to 0 for a run from scratch.
RESUME_FROM = 8839

data_no_undetectable_errors = []
with torch.no_grad():
    for item in tqdm(data_no_detectable_errors[RESUME_FROM:]):
        conversations = item['conversations']
        flag = True
        for i, step in enumerate(conversations):
            # The last message has no following response to judge.
            if step['from'] != 'assistant' or i == len(conversations) - 1:
                continue

            api_call = str(step['value'])
            api_response = str(conversations[i + 1]['value'])
            curr_prompt = prompt.replace('<api_call>', api_call)
            curr_prompt = curr_prompt.replace('<api_response>', api_response)

            # Inputs must live on the same device as the model.
            input_ids = tokenizer(curr_prompt, return_tensors="pt").input_ids.to(model.device)
            # Exactly one new token is needed for the Yes/No verdict.
            output_ids = model.generate(input_ids, max_new_tokens=1)
            # Decode only the generated token instead of decoding the whole
            # sequence and stripping the prompt text, which breaks whenever the
            # two decodes differ at the boundary.
            pred = tokenizer.decode(
                output_ids[0, input_ids.shape[1]:],
                skip_special_tokens=True,
            ).strip()

            print(api_response)
            print(pred)
            print('#' * 100)
            if pred != 'Yes':
                flag = False
                break
        if flag:
            data_no_undetectable_errors.append(item)
    

In [10]:
# Show whatever dialogue the loop variable `item` currently points to.
# NOTE(review): this relies on loop state leaked from a previously executed
# cell, so the result depends on which loop ran last and where it stopped.
item

In [11]:
# Print the position of the first dialogue in `data_no_detectable_errors` that
# compares equal to the current `item`; print nothing if there is none.
found_at = next(
    (
        position
        for position, candidate in enumerate(tqdm(data_no_detectable_errors))
        if candidate == item
    ),
    None,
)
if found_at is not None:
    print(found_at)

In [54]:
# Scratch arithmetic. NOTE(review): 8839 equals the slice offset used in the
# LLM filtering loop; presumably this is a kept/processed ratio — confirm or
# delete this cell.
5852 / 8839

In [51]:
# Scratch arithmetic; the meaning of the operands is not recorded.
# NOTE(review): presumably a time/throughput estimate — confirm or delete.
1111 * 60 / 5

In [27]:
# Scratch arithmetic; the meaning of the operands is not recorded.
# NOTE(review): presumably a time/throughput estimate — confirm or delete.
156 * 60 / 5

In [49]:
# Number of dialogues currently held in `data_no_undetectable_errors`.
len(data_no_undetectable_errors)

In [47]:
# Spot-check a single kept dialogue (index 500).
data_no_undetectable_errors[500]

In [12]:
# Number of dialogues currently held in `data_no_undetectable_errors`.
len(data_no_undetectable_errors)

Сохраним данные для дальнейшего использования 

In [14]:
# Load the previously saved first part of the filtered dialogues; the context
# manager closes the file as soon as parsing is done.
with open('data/toolbench_new_1311/cleaned_toolllama_G123_dfs_train_downloaded1311_no_undetectable_errors.json', 'r', encoding='utf-8') as part_1_file:
    data_no_undetectable_errors_part_1 = json.load(part_1_file)
len(data_no_undetectable_errors_part_1)

In [15]:
# Save the dialogues kept by the current filtering run as "part 2". The context
# manager guarantees the file is flushed and closed.
with open('data/toolbench_new_1311/cleaned_toolllama_G123_dfs_train_downloaded1311_no_undetectable_errors_part_2.json', 'w', encoding='utf-8') as part_2_file:
    json.dump(data_no_undetectable_errors, part_2_file)

In [16]:
# Part 1 followed by the dialogues kept in the current run, as one new list.
data_no_undetectable_errors_all = [
    *data_no_undetectable_errors_part_1,
    *data_no_undetectable_errors,
]

In [18]:
# Save the merged list. The context manager guarantees the file is flushed and
# closed.
with open('data/toolbench_new_1311/cleaned_toolllama_G123_dfs_train_downloaded1311_no_undetectable_errors_final.json', 'w', encoding='utf-8') as final_file:
    json.dump(data_no_undetectable_errors_all, final_file)

In [4]:
# Reload the merged "final" file. NOTE(review): this rebinds
# `data_no_undetectable_errors`, which earlier held only the current run's
# results.
with open('data/toolbench_new_1311/cleaned_toolllama_G123_dfs_train_downloaded1311_no_undetectable_errors_final.json', 'r', encoding='utf-8') as final_file:
    data_no_undetectable_errors = json.load(final_file)

In [5]:
# Load the saved first part of the filtered dialogues; the context manager
# closes the file as soon as parsing is done.
with open('data/toolbench_new_1311/cleaned_toolllama_G123_dfs_train_downloaded1311_no_undetectable_errors.json', 'r', encoding='utf-8') as part_1_file:
    data_no_undetectable_errors_part_1 = json.load(part_1_file)
len(data_no_undetectable_errors_part_1)

In [6]:
# Load the saved second part of the filtered dialogues; the context manager
# closes the file as soon as parsing is done.
with open('data/toolbench_new_1311/cleaned_toolllama_G123_dfs_train_downloaded1311_no_undetectable_errors_part_2.json', 'r', encoding='utf-8') as part_2_file:
    data_no_undetectable_errors_part_2 = json.load(part_2_file)

In [7]:
# Number of dialogues in the second saved part.
len(data_no_undetectable_errors_part_2)

In [8]:
# Part 1 followed by part 2, as one new list.
data_no_undetectable_errors_final = [
    *data_no_undetectable_errors_part_1,
    *data_no_undetectable_errors_part_2,
]

Разделим данные на assistant messages, потому что в оригинале модель обучается на последнем сообщении

In [18]:
# Preview the first ten merged dialogues.
data_no_undetectable_errors_final[:10]

In [22]:
# Expand every dialogue into one sample per assistant message: sample k holds
# the conversation up to and including the k-th assistant message, so each
# assistant turn becomes the last message of some sample.
data_no_undetectable_errors_final_splitted = []
for item in data_no_undetectable_errors_final:
    conversations = item['conversations']
    for i, message in enumerate(conversations):
        if message['from'] != 'assistant':
            continue
        # Swap in the truncated prefix *before* deep-copying, so only the
        # messages that are kept get duplicated (the original copied the whole
        # dialogue for every turn and then threw the tail away).
        prefix_item = dict(item)
        prefix_item['conversations'] = conversations[:i + 1]
        data_no_undetectable_errors_final_splitted.append(deepcopy(prefix_item))

In [24]:
# Number of per-assistant-message samples produced by the split.
len(data_no_undetectable_errors_final_splitted)

In [25]:
# Save the split samples. The context manager guarantees the file is flushed
# and closed.
with open('data/toolbench_new_1311/cleaned_toolllama_G123_dfs_train_downloaded1311_no_undetectable_errors_final_splitted.json', 'w', encoding='utf-8') as splitted_file:
    json.dump(data_no_undetectable_errors_final_splitted, splitted_file)

In [33]:
# Load the original training set for comparison; the context manager closes
# the file as soon as parsing is done.
# NOTE(review): absolute, user-specific path — unlike the relative `data/...`
# paths used elsewhere in this notebook; consider moving it under a
# configurable data directory.
with open('/data5/okhotnikov/alpaca-lora/ToolBench/toolllama_G123_dfs_train_downloaded1311.json', 'r', encoding='utf-8') as original_file:
    original_train = json.load(original_file)

In [34]:
# Number of records in the original training file.
len(original_train)