### Prompting and generating the data

In [None]:
# Few-shot prompt sent to the LLM: asks for 30 synthetic tool-calling rows about
# looking up a user's order history by user ID, followed by three worked examples
# (each laid out as "query ... answers ... tools ...").
# NOTE(review): the parsing cell further down splits the reply on "Row <n>", "Answer:"
# and "Tools:", but this prompt never asks for that layout — confirm the model emits it.
prompt = '''Generate 30 rows of synthetic data for tool calling where a user provides their user ID, and based on the user ID, the system retrieves all 
the orders the user has previously placed in the database. The generated data should include sample questions and corresponding answers,
demonstrating the interaction between the user and the tool. Try to create as different scenarios as possible, Here are some examples:
example1:
query Perform a database query on the 'orders' table to retrieve orders placed in the last week. answers [{"name": "simulate_query_database", "arguments": {"table": "orders", "conditions": [{"date": {"$gt": "2023-09-20"}}]}}] tools [{"name": "simulate_query_database", "description": "Simulates querying a database based on certain conditions.", "parameters": {"table": {"description": "Name of the table to query.", "type": "str"}, "conditions": {"description": "Conditions for the query, each condition is a dictionary.", "type": "list"}}}, {"name": "get_city_from_zipcode", "description": "Retrieves the city name for a given ZIP code using the Ziptastic API.", "parameters": {"zipcode": {"description": "The ZIP code to look up.", "type": "str"}}}]
example2:
query Search for zip codes in Mexico related to the query 'Mexico City'. answers [{"name": "search_zip_codes_in_mexico", "arguments": {"q": "Mexico City"}}] tools [{"name": "get_vector_tile", "description": "Fetches vector tiles based on the OpenMapTiles schema using the provided x, y, and z coordinates.", "parameters": {"x": {"description": "The X coordinate of the tile.", "type": "int", "default": "0"}, "y": {"description": "The Y coordinate of the tile.", "type": "int", "default": "0"}, "z": {"description": "The zoom level of the tile.", "type": "int", "default": "0"}}}, {"name": "geocode", "description": "Fetch geocoordinates for a given address using the TrueWay Geocoding API.", "parameters": {"address": {"description": "The address that you want to geocode.", "type": "str", "default": "505 Howard St, San Francisco"}, "language": {"description": "The language in which to return results. Default is 'en'.", "type": "str, optional", "default": "en"}, "country": {"description": "The country code to narrow the search results.", "type": "str, optional", "default": ""}, "bounds": {"description": "The bounding box to narrow the search results.", "type": "str, optional", "default": ""}}}, {"name": "reversegeocode", "description": "Obtain the human-readable address for a given geographical location.", "parameters": {"location": {"description": "The latitude and longitude coordinates (comma-separated) for which to obtain the address.", "type": "str", "default": "37.7879493,-122.3961974"}, "language": {"description": "The language in which to return results. 
Defaults to 'en'.", "type": "str, optional", "default": "en"}}}, {"name": "lookup_coordinates", "description": "Converts US latitude and longitude coordinates into local city information by querying the Reverse Geocode Locator API.", "parameters": {"lat": {"description": "The latitude coordinate.", "type": "int", "default": "40.785091"}, "long": {"description": "The longitude coordinate.", "type": "str", "default": "-73.968285"}}}, {"name": "local_osm_v1_z_x_y_png", "description": "Downloads an OpenStreetMap standard tile image for specified coordinates and zoom level.", "parameters": {"y": {"description": "y tile number.", "type": "int", "default": "3"}, "z": {"description": "Zoom factor between 0 and 19.", "type": "int", "default": "3"}, "x": {"description": "x tile number.", "type": "int", "default": "6"}}}, {"name": "search_zip_codes_in_mexico", "description": "Performs a text search for zip codes in Mexico using the provided query string.", "parameters": {"q": {"description": "The search query string to look up zip codes.", "type": "str", "default": "cerrada san mibuel"}}}]
example3:
query Simulate a database query for the 'Users' table with conditions {'age': 30, 'city': 'Berlin'} and determine if 2024 is a leap year. answers [{"name": "simulate_query_database", "arguments": {"table": "Users", "conditions": [{"age": 30, "city": "Berlin"}]}}, {"name": "is_leap_year", "arguments": {"year": 2024}}] tools [{"name": "is_leap_year", "description": "Checks if a year is a leap year.", "parameters": {"year": {"description": "The year to check.", "type": "int"}}}, {"name": "is_hotel_available", "description": "Checks the availability of a hotel for a given date range.", "parameters": {"hotel": {"description": "The name of the hotel.", "type": "str"}, "city": {"description": "The city where the hotel is located.", "type": "str"}, "checkin": {"description": "The check-in date in the format \"YYYY-MM-DD\".", "type": "str"}, "checkout": {"description": "The check-out date in the format \"YYYY-MM-DD\".", "type": "str"}}}, {"name": "simulate_query_database", "description": "Simulates querying a database based on certain conditions.", "parameters": {"table": {"description": "Name of the table to query.", "type": "str"}, "conditions": {"description": "Conditions for the query, each condition is a dictionary.", "type": "list"}}}] 
Use these examples as a reference to generate synthetic data for tool calling, 
focusing on how users provide their user ID and the system retrieves their order history from the database.'''

# Raw LLM replies are accumulated here, one string per API call.
messages = []

In [None]:
from openai import OpenAI
import re
import numpy as np
import pandas as pd

import os

# Read the API key from the environment so that a real key never has to be typed
# into (and committed with) the notebook. The literal "api_key" placeholder is kept
# as the fallback so the cell behaves as before when AIML_API_KEY is not set.
client = OpenAI(
    api_key=os.environ.get("AIML_API_KEY", "api_key"),
    base_url="https://api.aimlapi.com",
)

# Single, deterministic (temperature=0) request asking the model to produce the
# synthetic tool-calling rows described by `prompt`.
response = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
    messages=[
        {
            "role": "system",
            "content": "You are an AI assistant who creates synthetic data for tool calling. "
        },
        {
            "role": "user",
            "content": prompt
        },
    ],
    max_tokens=16000,
    temperature=0
)

# `content` can be None (e.g. when the reply was filtered); store an empty string
# in that case so the downstream text parsing does not crash on a None entry.
message = response.choices[0].message.content or ""

#print(f"Assistant: {message}")
messages.append(message)

In [None]:
def split_string(text):
    """Split an LLM reply into per-row chunks.

    The text is cut at every "Row <number>" marker (plus any whitespace that
    follows it). Whatever precedes the first marker is discarded; the remaining
    chunks are returned in order.
    """
    row_marker = re.compile(r'Row \d+\s*')
    _preamble, *row_chunks = row_marker.split(text)
    return row_chunks

# Flatten every raw reply into one list of per-row text chunks. A new name is used
# (instead of rebinding `messages`) so the cell can be re-run safely; the previous
# version produced a list of lists, so `.strip()` failed on every item and the bare
# `except` silently turned every row into NaN.
row_sections = [section for raw_reply in messages for section in split_string(raw_reply)]

que = []
ans = []
tool = []
for section in row_sections:
    try:
        data = section.strip()
        # Expected layout per row: "<question> Answer: <calls> Tools: <tool specs>".
        # NOTE(review): the question text is kept verbatim, including any leading
        # label the model may emit — confirm against a real reply.
        Question = data.split('Answer:')[0].strip()
        Answer = data.split('Answer:')[1].split('Tools:')[0].strip()
        Tools = data.split('Tools:')[1].strip()
    except IndexError:
        # A marker is missing: record the row as NaN so dropna() removes it below.
        Question = Answer = Tools = np.nan
    que.append(Question)
    ans.append(Answer)
    tool.append(Tools)

df_syn_2 = pd.DataFrame({'query': que, 'answers': ans, 'tools': tool})
df_final = df_syn_2.dropna()
# reset_index() keeps the original row position in an 'index' column, which the
# later cells select by name. (The former drop('level_0') raised KeyError because
# that column is never created here.)
data_tt_df = df_final.reset_index()
data_tt_df.to_csv('final_fintune_data.csv')

### Generated data and xlam data checking

In [None]:
import pandas as pd
import os
import json
import torch 
import random

# NOTE(review): huggingface_hub documents HF_HOME / HF_HUB_CACHE for the cache
# location; confirm that this variable name is actually honoured.
os.environ["HUGGINGFACE_HUB_CACHE_DIR"] = "huggingface_hub_cache"

# 'Unnamed: 0' is the unnamed index column written by DataFrame.to_csv().
df = pd.read_csv('final_fintune_data.csv').drop(columns='Unnamed: 0')

# Keep only rows whose question mentions "query" (case-insensitive). Missing
# questions are treated as non-matching instead of raising on `.lower()`.
mentions_query = df['query'].fillna('').astype(str).str.lower().str.contains('query', regex=False)

# Filtering with a mask keeps the column layout even when nothing matches, so the
# rename below no longer fails on an empty result.
data_tt_df = df.loc[mentions_query].copy()
data_tt_df.columns = ['index', 'query', 'answers', 'tools']

# Show up to three matching rows in the same "query ... answers ... tools ..." form
# that the prompt examples use.
for _, row in data_tt_df.head(3).iterrows():
    print(
        'query {}'.format(row['query']).strip(),
        'answers {}'.format(row['answers']).strip(),
        'tools {}'.format(row['tools']).strip(),
    )

### Code to finetune XLAM with llama3.1 generated data

In [None]:
import pandas as pd
import os
import multiprocessing
import json
import torch 
from datasets import load_dataset
from datasets import Dataset, DatasetDict
import ast
import random
from peft import LoraConfig, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig
)
from trl import SFTTrainer, SFTConfig

# SECURITY: a Hugging Face access token used to be hardcoded on this line. Treat
# that token as leaked and revoke it. `!export ...` also had no effect, because the
# shell spawned by `!` exits immediately and never changes the kernel's environment.
# Provide HF_TOKEN through the environment before starting Jupyter (or use
# `huggingface-cli login`) if authenticated access is needed.
# NOTE(review): huggingface_hub documents HF_HOME / HF_HUB_CACHE for the cache
# location; confirm that this variable name is actually honoured.
os.environ["HUGGINGFACE_HUB_CACHE_DIR"] = "huggingface_hub_cache"
model_name = "Salesforce/xLAM-1b-fc-r"
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype="auto", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the synthetic rows and keep only the columns used to build training text.
df = pd.read_csv('final_fintune_data.csv')
df = df[['index', 'query', 'answers', 'tools']]
ds = Dataset.from_pandas(df)

def process(row):
    """Turn one dataset row into a single tagged training string.

    The `tools` and `answers` columns are parsed as JSON lists; each element is
    rendered with str() on its own line. The row's `query`, `tools` and `answers`
    fields are overwritten with their <user>/<tools>/<calls>-wrapped forms, and
    their concatenation plus the tokenizer's EOS token is stored in `text`.
    Returns the same (mutated) row.
    """
    row["query"] = "<user>" + row["query"] + "</user>\n\n"

    tool_lines = [str(spec) for spec in json.loads(row["tools"])]
    call_lines = [str(call) for call in json.loads(row["answers"])]

    row["tools"] = "<tools>" + "\n".join(tool_lines) + "</tools>\n\n"
    row["answers"] = "<calls>" + "\n".join(call_lines) + "</calls>"
    row["text"] = row["query"] + row["tools"] + row["answers"] + tokenizer.eos_token
    return row
# Build the `text` column for every row, one worker per CPU core, bypassing the
# datasets cache so edits to `process` always take effect.
ds = ds.map(process, num_proc=multiprocessing.cpu_count(), load_from_cache_file=False)

# Choose precision and attention backend from the GPU's bf16 support.
if torch.cuda.is_bf16_supported():
    os.system('pip install flash_attn')
    compute_dtype, attn_implementation = torch.bfloat16, 'flash_attention_2'
else:
    compute_dtype, attn_implementation = torch.float16, 'sdpa'

def QLoRA(ds):
    """Fine-tune the base model with LoRA adapters on top of 8-bit quantized weights.

    Reads the module-level `model_name`, `tokenizer`, `compute_dtype` and
    `attn_implementation`. Checkpoints are written under ./xLAM.

    Args:
        ds: dataset with a "text" column holding the formatted training strings.

    Returns:
        The SFTTrainer after training (previously nothing was returned, so the
        trained adapter could only be recovered from the checkpoints on disk).
    """
    # The former bnb_8bit_quant_type / bnb_8bit_compute_dtype / bnb_8bit_use_double_quant
    # keywords are not BitsAndBytesConfig parameters and were ignored, so the effective
    # configuration was plain 8-bit loading. That is what is spelled out here.
    # NOTE(review): if 4-bit NF4 QLoRA was intended, switch to load_in_4bit=True with
    # the bnb_4bit_* options and quantize the inference model the same way.
    bnb_config = BitsAndBytesConfig(load_in_8bit=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map={"": 0},
        attn_implementation=attn_implementation,
    )
    model = prepare_model_for_kbit_training(model, gradient_checkpointing_kwargs={'use_reentrant': True})
    model.config.pad_token_id = tokenizer.pad_token_id
    # The KV cache is not used during training.
    model.config.use_cache = False

    peft_config = LoraConfig(
        lora_alpha=16,
        lora_dropout=0.05,
        r=64,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=['k_proj', 'q_proj', 'v_proj', 'o_proj', "gate_proj", "down_proj", "up_proj"],
    )

    bf16_ok = torch.cuda.is_bf16_supported()
    training_arguments = SFTConfig(
        output_dir="./xLAM",
        optim="adamw_8bit",
        per_device_train_batch_size=2,
        gradient_accumulation_steps=1,
        log_level="debug",
        save_steps=500,
        logging_steps=10,
        learning_rate=1e-4,
        fp16=not bf16_ok,
        bf16=bf16_ok,
        max_steps=1000,
        warmup_ratio=0.1,
        lr_scheduler_type="linear",
        dataset_text_field="text",
        max_seq_length=512,
    )
    trainer = SFTTrainer(
        model=model,
        train_dataset=ds,
        peft_config=peft_config,
        tokenizer=tokenizer,
        args=training_arguments,
    )
    # Kept from the original: training runs inside an explicit CUDA autocast context.
    with torch.cuda.amp.autocast():
        trainer.train()
    return trainer

### Inserting some customer data into DB 

### (THIS IS DUMMY DATA JUST PRESENTED TO CHECK CHATBOT PERFORMANCE)

In [None]:
import pandas as pd
import numpy as np
import sqlalchemy
import random
from sqlalchemy import create_engine, text

def connection_to_db():
    """Open and return a new SQLAlchemy connection to the PostgreSQL database.

    Connection settings are read from the standard libpq environment variables
    (PGUSER, PGPASSWORD, PGHOST, PGPORT, PGDATABASE). The former hardcoded values
    remain as defaults, so behaviour is unchanged when none of them is set.

    Returns:
        An open connection; the caller is responsible for closing it.
    """
    import os
    from urllib.parse import quote_plus

    username = os.environ.get('PGUSER', 'postgres')
    password = os.environ.get('PGPASSWORD', '1234')
    host = os.environ.get('PGHOST', 'localhost')
    port = os.environ.get('PGPORT', '5432')
    dbname = os.environ.get('PGDATABASE', 'postgres')

    # Escape user and password so characters such as '@' or ':' cannot corrupt the URL.
    url = f'postgresql://{quote_plus(username)}:{quote_plus(password)}@{host}:{port}/{dbname}'
    engine = create_engine(url)
    return engine.connect()

# Column types used when (re)creating the table.
dtype = {
    'invoice_no': sqlalchemy.types.String,
    'items': sqlalchemy.types.String,
    'quantity': sqlalchemy.types.Integer,
    'price': sqlalchemy.types.Float,
    'customer_id': sqlalchemy.types.String,
    'order_date': sqlalchemy.types.DateTime
}

# Build the demo table in one chain, without mutating intermediate frames:
#   * replace the original customer_id with a random 'C1001'..'C4000' identifier
#     (unseeded, so the ids differ on every run);
#   * keep only the columns the chatbot tools query;
#   * parse order_date from its day.month.year text form.
df_db = (
    pd.read_csv('customers_shopping_data_db.csv')
    .drop(columns='customer_id')
    .assign(customer_id=lambda frame: ['C' + str(random.randint(1001, 4000)) for _ in range(len(frame))])
    [['invoice_no', 'items', 'quantity', 'price', 'customer_id', 'order_date']]
    .assign(order_date=lambda frame: pd.to_datetime(frame['order_date'], format='%d.%m.%Y'))
)

conn = connection_to_db()
try:
    # Replaces any existing table with the freshly generated data.
    df_db.to_sql("customers_shopping_data", conn, if_exists="replace", index=False, dtype=dtype)
finally:
    # Always release the connection, even when the insert fails.
    conn.close()

### Running the model

In [None]:
import torch,os
from peft import PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig
)
import json
import torch 
from transformers import AutoModelForCausalLM, AutoTokenizer
import ast
from sklearn.neighbors import NearestNeighbors
import pandas as pd
import pandas as  pd
import numpy as np
import os
import sqlalchemy
import random
from sqlalchemy import create_engine, text

def connection_to_db():
    """Open and return a new SQLAlchemy connection to the PostgreSQL database.

    Connection settings are read from the standard libpq environment variables
    (PGUSER, PGPASSWORD, PGHOST, PGPORT, PGDATABASE). The former hardcoded values
    remain as defaults, so behaviour is unchanged when none of them is set.

    Returns:
        An open connection; the caller is responsible for closing it.
    """
    import os
    from urllib.parse import quote_plus

    username = os.environ.get('PGUSER', 'postgres')
    password = os.environ.get('PGPASSWORD', '1234')
    host = os.environ.get('PGHOST', 'localhost')
    port = os.environ.get('PGPORT', '5432')
    dbname = os.environ.get('PGDATABASE', 'postgres')

    # Escape user and password so characters such as '@' or ':' cannot corrupt the URL.
    url = f'postgresql://{quote_plus(username)}:{quote_plus(password)}@{host}:{port}/{dbname}'
    engine = create_engine(url)
    return engine.connect()

# Fix the torch RNG so sampled generations are repeatable within a fresh kernel.
torch.random.manual_seed(0)

compute_dtype = torch.float16
attn_implementation = 'sdpa'
# The former bnb_8bit_compute_dtype / bnb_8bit_use_double_quant / bnb_8bit_quant_type
# keywords are not BitsAndBytesConfig parameters ("nf8" is not a quantization type) and
# were ignored, so the effective configuration was plain 8-bit loading.
quantization_config = BitsAndBytesConfig(load_in_8bit=True)

# NOTE(review): the training code writes checkpoints under ./xLAM — confirm that the
# adapter was copied to this path.
adapter = "./checkpoint-1000"
model_name = "Salesforce/xLAM-1b-fc-r"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
print(f"Starting to load the model {model_name} into memory")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    torch_dtype=compute_dtype,
    device_map={"": 0},
    attn_implementation=attn_implementation,
)
# Attach the fine-tuned LoRA adapter to the quantized base model.
model = PeftModel.from_pretrained(model, adapter)

In [None]:
def question(query, model, tokenizer):
    """Ask the function-calling model which tool(s) to invoke for a user query.

    Builds an xLAM-style prompt (task instruction, available tools, format
    instruction, query), runs generation and returns only the newly generated
    text, which is expected to be a JSON object with a "tool_calls" list.

    Args:
        query: the user's natural-language request.
        model: causal LM exposing `.device` and `.generate(...)`.
        tokenizer: tokenizer exposing `apply_chat_template`, `decode` and
            `eos_token_id`.

    Returns:
        The decoded completion as a string (prompt tokens removed).
    """
    task_instruction = """
    You are an expert in composing functions. You are given a question and a set of possible functions. 
    Based on the question, you will need to make one or more function/tool calls to achieve the purpose. 
    If none of the functions can be used, point it out and refuse to answer. 
    If the given question lacks the parameters required by the function, also point it out.
    """.strip()

    format_instruction = """
    The output MUST strictly adhere to the following JSON format, and NO other text MUST be included.
    The example format is as follows. Please make sure the parameter type is correct. If no function call is needed, please make tool_calls an empty list '[]'.
    ```
    {
        "tool_calls": [
        {"name": "func_name1", "arguments": {"argument1": "value1", "argument2": "value2"}},
        ... (more tool calls as required)
        ]
    }
    ```
    """.strip()

    def customer_id_param():
        # Fresh dict per tool so the specs never share mutable state.
        return {
            "type": "string",
            "description": "The unique identifier for the customer"
        }

    get_customer_orders_api = {
        "name": "get_customer_orders",
        "description": "Retrieve 'N' prior orders for a customer using their customer ID",
        "parameters": {
            "type": "object",
            "properties": {
                "customer_id": customer_id_param(),
                # Previously "limit" was listed as required but never declared, so
                # the model had no way to pass the requested number of orders.
                "limit": {
                    "type": "integer",
                    "description": "Maximum number of most recent orders to return",
                    "default": 5
                }
            },
            "required": ["customer_id"]
        }
    }

    suggest_orders_api = {
        "name": "suggest_orders",
        "description": "Suggest additional orders based on a customer's previous purchases, USE ONLY WHEN CUSTOMERS ARE REQUESTING FOR RECOMMENDATIONS IN THE QUESTION",
        "parameters": {
            "type": "object",
            "properties": {
                "customer_id": customer_id_param()
            },
        }
    }

    get_order_details_api = {
        "name": "get_order_details",
        "description": "Retrieve details of a specific order using the invoice number, USE ONLY WHEN INVOICE IS MENTIONED IN THE QUESTION",
        "parameters": {
            "type": "object",
            "properties": {
                "invoice_no": {
                    "type": "string",
                    "description": "The invoice number of the order"
                }
            },
            "required": ["invoice_no"]
        }
    }

    get_order_history_by_items_api = {
        "name": "get_order_history_by_items",
        "description": "Retrieve all orders of a specific items for a customer, USE ONLY WHEN ITEMS IS MENTIONED IN THE QUESTION",
        "parameters": {
            "type": "object",
            "properties": {
                "customer_id": customer_id_param(),
                "items": {
                    "type": "string",
                    "description": "The particular item purchased by the customer"
                }
            },
            "required": ["items"]
        }
    }

    get_frequent_purchases_api = {
        "name": "get_frequent_purchases",
        "description": "Get frequently purchased items by a customer",
        "parameters": {
            "type": "object",
            "properties": {
                "customer_id": customer_id_param()
            },
        }
    }

    calculate_total_spend_api = {
        "name": "calculate_total_spend",
        "description": "Calculate the total amount spent by a customer.",
        "parameters": {
            "type": "object",
            "properties": {
                "customer_id": customer_id_param()
            },
        }
    }

    get_customer_profile_api = {
        "name": "get_customer_profile",
        "description": "Retrieve the profile information of a customer",
        "parameters": {
            "type": "object",
            "properties": {
                "customer_id": customer_id_param()
            },
        }
    }

    openai_format_tools = [
        get_customer_orders_api,
        suggest_orders_api,
        get_order_details_api,
        get_order_history_by_items_api,
        get_frequent_purchases_api,
        calculate_total_spend_api,
        get_customer_profile_api,
    ]

    def convert_to_xlam_tool(tools):
        """Convert OpenAI-style tool spec(s) to the flatter xLAM layout.

        Only name, description and the per-parameter properties are kept; the
        "required" list is dropped. Lists are converted element-wise; any other
        value is returned unchanged.
        """
        if isinstance(tools, dict):
            return {
                "name": tools["name"],
                "description": tools["description"],
                "parameters": dict(tools["parameters"].get("properties", {}))
            }
        if isinstance(tools, list):
            return [convert_to_xlam_tool(tool) for tool in tools]
        return tools

    def build_prompt(task_instruction: str, format_instruction: str, tools: list, query: str):
        """Assemble the four bracket-delimited prompt sections in xLAM order."""
        # Uses the `tools` argument; the previous version ignored it and read the
        # enclosing function's variable instead.
        prompt = f"[BEGIN OF TASK INSTRUCTION]\n{task_instruction}\n[END OF TASK INSTRUCTION]\n\n"
        prompt += f"[BEGIN OF AVAILABLE TOOLS]\n{json.dumps(tools)}\n[END OF AVAILABLE TOOLS]\n\n"
        prompt += f"[BEGIN OF FORMAT INSTRUCTION]\n{format_instruction}\n[END OF FORMAT INSTRUCTION]\n\n"
        prompt += f"[BEGIN OF QUERY]\n{query}\n[END OF QUERY]\n\n"
        return prompt

    # Build the input and start the inference
    xlam_format_tools = convert_to_xlam_tool(openai_format_tools)
    content = build_prompt(task_instruction, format_instruction, xlam_format_tools, query)

    messages = [
        {'role': 'user', 'content': content}
    ]
    inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(model.device)

    # Generation stops at tokenizer.eos_token_id.
    outputs = model.generate(inputs, max_new_tokens=512, temperature=1, do_sample=True, num_return_sequences=1, eos_token_id=tokenizer.eos_token_id)
    # Drop the prompt tokens so that only the completion is decoded.
    return tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)


In [None]:
def _fetch_rows(sql, params=None):
    """Run a parameterised SELECT and return every row as a list.

    Rows are read before the connection is released, and the connection is
    closed even when the query raises.
    """
    conn = connection_to_db()
    try:
        return list(conn.execute(text(sql), params or {}))
    finally:
        conn.close()


def get_customer_orders_def(customer_id='C3628', limit=5, **kwargs):
    """Return the customer's `limit` most recent orders, newest first.

    Extra keyword arguments produced by the model are accepted and ignored.
    """
    # Values come from model output, so they are bound as parameters rather than
    # interpolated into the SQL text. int() also accepts a numeric string.
    return _fetch_rows(
        "SELECT * FROM customers_shopping_data "
        "WHERE customer_id = :customer_id ORDER BY order_date DESC LIMIT :limit",
        {"customer_id": customer_id, "limit": int(limit)},
    )


def suggest_orders_def(customer_id='C3628', limit=5, similarity_threshold=0.8, **kwargs):
    """Recommend items bought by similar customers that this customer has not bought.

    Similarity is the cosine distance between per-customer item-quantity vectors.
    Customers within `1 - similarity_threshold` are used; if fewer than 10 qualify,
    the 10 nearest other customers are used instead.

    Returns:
        Up to `limit` item names, most purchased among similar customers first;
        an empty list when the customer has no purchase history.
    """
    query = """
    SELECT customer_id, items, SUM(quantity) as total_quantity
    FROM customers_shopping_data
    GROUP BY customer_id, items
    """
    conn = connection_to_db()
    try:
        purchase_history = pd.read_sql_query(query, conn)
    finally:
        conn.close()

    user_item_matrix = purchase_history.pivot(index='customer_id', columns='items', values='total_quantity').fillna(0)

    if customer_id not in user_item_matrix.index:
        return []

    neighbours = NearestNeighbors(metric='cosine', algorithm='brute')
    neighbours.fit(user_item_matrix)

    customer_vector = user_item_matrix.loc[customer_id].values.reshape(1, -1)
    distances, indices = neighbours.kneighbors(customer_vector, n_neighbors=len(user_item_matrix))

    # Nearest-first list of (customer, distance), excluding the customer explicitly.
    # The previous fallback skipped position 0 on the assumption that it is always
    # the customer, which is not guaranteed when another customer is at distance 0.
    ranked = [
        (user_item_matrix.index[i], dist)
        for i, dist in zip(indices.flatten(), distances.flatten())
        if user_item_matrix.index[i] != customer_id
    ]
    similar_customers = [cid for cid, dist in ranked if dist <= 1 - similarity_threshold]
    if len(similar_customers) < 10:
        similar_customers = [cid for cid, _ in ranked[:10]]

    similar_customers_purchases = user_item_matrix.loc[similar_customers].sum(axis=0).sort_values(ascending=False)

    target_customer_purchases = user_item_matrix.loc[customer_id]
    already_bought = target_customer_purchases[target_customer_purchases > 0].index
    recommended_products = similar_customers_purchases[~similar_customers_purchases.index.isin(already_bought)]

    return recommended_products.head(int(limit)).index.tolist()


def get_order_details_def(invoice_no, **kwargs):
    """Return every row belonging to the given invoice number."""
    return _fetch_rows(
        "SELECT * FROM customers_shopping_data WHERE invoice_no = :invoice_no",
        {"invoice_no": invoice_no},
    )


def get_order_history_by_items_def(customer_id='C3628', items=None, **kwargs):
    """Return the customer's orders of one item, or per-item totals when no item is given.

    With `items` set, the customer's rows for that item are returned (matched
    case-insensitively), newest first. Previously the `items` argument supplied by
    the model was silently ignored. Without `items`, the original behaviour is
    kept: one (items, total_quantity) row per item, largest total first.
    """
    if items is not None:
        return _fetch_rows(
            "SELECT * FROM customers_shopping_data "
            "WHERE customer_id = :customer_id AND LOWER(items) = LOWER(:items) "
            "ORDER BY order_date DESC",
            {"customer_id": customer_id, "items": items},
        )
    return _fetch_rows(
        "SELECT items, SUM(quantity) as total_quantity FROM customers_shopping_data "
        "WHERE customer_id = :customer_id GROUP BY items ORDER BY total_quantity DESC",
        {"customer_id": customer_id},
    )


def get_frequent_purchases_def(customer_id='C3628', **kwargs):
    """Return (items, total_quantity) rows for the customer, largest total first."""
    return _fetch_rows(
        "SELECT items, SUM(quantity) as total_quantity FROM customers_shopping_data "
        "WHERE customer_id = :customer_id GROUP BY items ORDER BY total_quantity DESC",
        {"customer_id": customer_id},
    )


def calculate_total_spend_def(customer_id='C3628', **kwargs):
    """Return a single-row list holding SUM(price) over the customer's rows."""
    return _fetch_rows(
        "SELECT SUM(price) FROM customers_shopping_data WHERE customer_id = :customer_id",
        {"customer_id": customer_id},
    )


def get_top_categories_def(customer_id='C3628', **kwargs):
    """Return the customer's five most-purchased items with their total quantity."""
    return _fetch_rows(
        "SELECT items, SUM(quantity) as total_quantity FROM customers_shopping_data "
        "WHERE customer_id = :customer_id GROUP BY items ORDER BY total_quantity DESC LIMIT 5",
        {"customer_id": customer_id},
    )


def get_customer_profile_def(customer_id='C3628', **kwargs):
    """Return every stored row for the customer."""
    return _fetch_rows(
        "SELECT * FROM customers_shopping_data WHERE customer_id = :customer_id",
        {"customer_id": customer_id},
    )


# Maps the tool names the model may emit to the Python callables that serve them.
functions_list = {
    "get_customer_orders": get_customer_orders_def,
    "suggest_orders": suggest_orders_def,
    "get_order_details": get_order_details_def,
    "get_order_history_by_items": get_order_history_by_items_def,
    "get_frequent_purchases": get_frequent_purchases_def,
    "calculate_total_spend": calculate_total_spend_def,
    "get_top_categories": get_top_categories_def,
    "get_customer_profile": get_customer_profile_def
}

In [None]:
customer_id = 'C2214'
query = f"for customer_id: {customer_id} give me last 7 orders and also recommend me some products for customer_id:{customer_id}"
out = question(query, model, tokenizer)

# The model is instructed to answer in JSON, so parse it as JSON first (this handles
# true/false/null, which ast.literal_eval rejects). Fall back to a Python-literal
# parse for replies written with single quotes.
try:
    output_cleaned = json.loads(out)
except json.JSONDecodeError:
    output_cleaned = ast.literal_eval(out)

In [None]:
# Display the parsed model reply (expected to hold a 'tool_calls' list).
output_cleaned

In [None]:
# Execute each tool call requested by the model and print its result.
for tool in output_cleaned['tool_calls']:
    handler = functions_list.get(tool['name'])
    if handler is None:
        # The model can invent tool names; report and skip instead of raising KeyError.
        print(f"Unknown tool requested by the model: {tool['name']}")
        continue
    print(handler(**tool.get('arguments', {})))