In [None]:
# Imports
from appworld import AppWorld, load_task_ids
import re
from appworld.task import Task
import json
import requests
import os
from typing import List, Dict, Optional, Tuple
import torch
from transformers import (
    AutoModelForCausalLM, 
    AutoTokenizer, 
    TrainingArguments, 
    Trainer, 
    DataCollatorForLanguageModeling
)
from transformers.pipelines import pipeline
from jinja2 import Template
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
import subprocess
from pathlib import Path
from tqdm.auto import tqdm
from dataclasses import dataclass 
import gc
import tempfile
from peft import PeftModel, PeftConfig, LoraConfig, get_peft_model
from datasets import Dataset

In [None]:
! huggingface-cli login --token os.getenv("HF_TOKEN")
! appworld install
! appworld download data

In [3]:
# Prompts are modified from AppWorld's prompts
# ONE_SHOT_PROMPT is a Jinja2 template (placeholders: `supervisor`, `instruction`) containing one
# worked USER/ASSISTANT dialogue followed by the key instructions and the actual task.
# NOTE(review): the "IMPORTANT CODE GENERATION RULES" section (no variables carry over between
# blocks) contradicts key instruction 3 below (variables can be reused) — confirm which is intended.
ONE_SHOT_PROMPT = """
USER:
I am your supervisor and you are a super intelligent AI Assistant whose job is to achieve my day-to-day tasks completely autonomously.

To do this, you will need to interact with app/s (e.g., spotify, venmo, etc) using their associated APIs on my behalf. For this you will undertake a *multi-step conversation* using a python REPL environment. That is, you will write the python code and the environment will execute it and show you the result, based on which, you will write python code for the next step and so on, until you've achieved the goal. This environment will let you interact with app/s using their associated APIs on my behalf.

IMPORTANT CODE GENERATION RULES:
1. Each code block must be completely self-contained - variables and functions from previous blocks are NOT available
2. DO NOT reference or try to use variables or functions defined in previous code blocks
3. Only generate valid Python code and comments
4. DO NOT include conversation markers (USER:, ASSISTANT:) or any other text
5. If you need data from a previous step, you must regenerate it or fetch it again
6. Each code block starts fresh with no access to previous variables or functions


Here are three key APIs that you need to know to get more information

# To get a list of apps that are available to you.
print(apis.api_docs.show_app_descriptions())

# To get the list of apis under any app listed above, e.g. supervisor
print(apis.api_docs.show_api_descriptions(app_name='supervisor'))

# To get the specification of a particular api, e.g. supervisor app's show_account_passwords
print(apis.api_docs.show_api_doc(app_name='supervisor', api_name='show_account_passwords'))

Each code execution will produce an output that you can use in subsequent calls. Using these PROVIDED APIs, you can now generate code, that the environment will execute, to solve the task.

For example, consider the task:

My name is: {{ supervisor.first_name }} {{ supervisor.last_name }}. My personal email is {{ supervisor.email }} and phone number is {{ supervisor.phone_number }}.

Task:

What is the password for my Spotify account?

ASSISTANT:
# Okay. Lets first find which apps are available to get the password by looking at the app descriptions.
print(apis.api_docs.show_app_descriptions())

USER:
[
  {
    "name": "api_docs",
    "description": "An app to search and explore API documentation."
  },
  {
    "name": "supervisor",
    "description": "An app to access supervisor's personal information, account credentials, addresses, payment cards, and manage the assigned task."
  },
  ...
  {
    "name": "spotify",
    "description": "A music streaming app to stream songs and manage song, album and playlist libraries."
  },
  {
    "name": "venmo",
    "description": "A social payment app to send, receive and request money to and from others."
  },
  ...
]


ASSISTANT:
# Looks like the supervisor app could help me with that. Lets see what apis are available under this app.
print(apis.api_docs.show_api_descriptions(app_name='supervisor'))


USER:
[
  ...
  "show_account_passwords : Show your supervisor's account passwords."
  ...
]


ASSISTANT:
# I can use `show_account_passwords` to get the passwords. Let me see its detailed specification to understand its arguments and output structure.
print(apis.api_docs.show_api_doc(app_name='supervisor', api_name='show_account_passwords'))

USER:
{
  'app_name': 'supervisor',
  'api_name': 'show_account_passwords',
  'path': '/account_passwords',
  'method': 'GET',
  'description': "Show your supervisor's app account passwords.",
  'parameters': [],
  'response_schemas': {
    'success': [{'account_name': 'string', 'password': 'string'}],
    'failure': {'message': 'string'}
  }
}

ASSISTANT:
# Okay, it requires no arguments. So I can just call it directly.
print(apis.supervisor.show_account_passwords())

USER:
[
  {
    "account_name": "spotify",
    "password": "dummy_spotify_pass"
  },
  {
    "account_name": "file_system",
    "password": "dummy_fs_pass"
  },
  ...
]


ASSISTANT:
# So the Spotify password is an entry in the `passwords` list with the account_name=spotify.
passwords = apis.supervisor.show_account_passwords()
spotify_password = [account_password for account_password in passwords if account_password["account_name"] == "spotify"][0]["password"]
print(spotify_password)


USER:
dummy_spotify_pass

ASSISTANT:
# When the task is completed, I need to call apis.supervisor.complete_task(). If there is an answer, I need to pass it as an argument `answer`. I will pass the spotify_password as an answer.
apis.supervisor.complete_task(answer=spotify_password)


USER:
Marked the active task complete.


----------------------------------------------

USER:
**Key instructions and disclaimers**:

1. The email addresses, access tokens and variables (e.g. spotify_password) in the example above were only for demonstration. Obtain the correct information by calling relevant APIs yourself.
2. Only generate valid code blocks, i.e., do not put them in ```...``` or add any extra formatting. Any thoughts should be put as code comments.
3. You can use the variables from the previous code blocks in the subsequent code blocks.
4. Write small chunks of code and only one chunk of code in every step. Make sure everything is working correctly before making any irreversible change.
5. The provided Python environment has access to its standard library. But modules and functions that have a risk of affecting the underlying OS, file system or process are disabled. You will get an error if do call them.
6. Any reference to a file system in the task instructions means the file system *app*, operable via given APIs, and not the actual file system the code is running on. So do not write code making calls to os-level modules and functions.
7. To interact with apps, only use the provided APIs, and not the corresponding Python packages. E.g., do NOT use `spotipy` for Spotify. Remember, the environment only has the standard library.
8. The provided API documentation has both the input arguments and the output JSON schemas. All calls to APIs and parsing its outputs must be as per this documentation.
9. For APIs that return results in "pages", make sure to consider all pages.
10. To obtain current data or time, use Python functions like `datetime.now()` or obtain it from the phone app. Do not rely on your existing knowledge of what the current date or time is.
11. For all temporal requests, use proper time boundaries, e.g., if I ask for something that happened yesterday, make sure to consider the time between 00:00:00 and 23:59:59. All requests are concerning a single, default (no) time zone.
12. Any reference to my friends, family or any other person or relation refers to the people in my phone's contacts list.
13. All my personal information, and information about my app account credentials, physical addresses and owned payment cards are stored in the "supervisor" app. You can access them via the APIs provided by the supervisor app.
14. Once you have completed the task, call `apis.supervisor.complete_task()`. If the task asks for some information, return it as the answer argument, i.e. call `apis.supervisor.complete_task(answer=<answer>)`. For tasks that do not require an answer, just skip the answer argument or pass it as None.
15. The answers, when given, should be just entity or number, not full sentences, e.g., `answer=10` for "How many songs are in the Spotify queue?". When an answer is a number, it should be in numbers, not in words, e.g., "10" and not "ten".
16. You can also pass `status="fail"` in the complete_task API if you are sure you cannot solve it and want to exit.
17. You must make all decisions completely autonomously and not ask for any clarifications or confirmations from me or anyone else.

USER:
Using these APIs, now generate code to solve the actual task:

My name is: {{ supervisor.first_name }} {{ supervisor.last_name }}. My personal email is {{ supervisor.email }} and phone number is {{ supervisor.phone_number }}.
Task:

{{ instruction }}
"""
# ZERO_SHOT_PROMPT is the same Jinja2 template as the one-shot prompt but without the worked
# dialogue. Instruction 1 therefore must not refer to "the example above".
# NOTE(review): the "IMPORTANT CODE GENERATION RULES" section (no variables carry over between
# blocks) contradicts key instruction 3 below (variables can be reused) — confirm which is intended.
ZERO_SHOT_PROMPT = """
USER:
I am your supervisor and you are a super intelligent AI Assistant whose job is to achieve my day-to-day tasks completely autonomously.

To do this, you will need to interact with app/s (e.g., spotify, venmo, etc) using their associated APIs on my behalf. For this you will undertake a *multi-step conversation* using a python REPL environment. That is, you will write the python code and the environment will execute it and show you the result, based on which, you will write python code for the next step and so on, until you've achieved the goal. This environment will let you interact with app/s using their associated APIs on my behalf.

IMPORTANT CODE GENERATION RULES:
1. Each code block must be completely self-contained - variables and functions from previous blocks are NOT available
2. DO NOT reference or try to use variables or functions defined in previous code blocks
3. Only generate valid Python code and comments
4. DO NOT include conversation markers (USER:, ASSISTANT:) or any other text
5. If you need data from a previous step, you must regenerate it or fetch it again
6. Each code block starts fresh with no access to previous variables or functions


Here are three key APIs that you need to know to get more information

# To get a list of apps that are available to you.
print(apis.api_docs.show_app_descriptions())

# To get the list of apis under any app listed above, e.g. supervisor
print(apis.api_docs.show_api_descriptions(app_name='supervisor'))

# To get the specification of a particular api, e.g. supervisor app's show_account_passwords
print(apis.api_docs.show_api_doc(app_name='supervisor', api_name='show_account_passwords'))

Each code execution will produce an output that you can use in subsequent calls. Using these PROVIDED APIs, you can now generate code, that the environment will execute, to solve the task.

----------------------------------------------

USER:
**Key instructions and disclaimers**:

1. Obtain all concrete information (e.g. email addresses, access tokens, passwords) by calling the relevant APIs yourself. Never guess or make up such values.
2. Only generate valid code blocks, i.e., do not put them in ```...``` or add any extra formatting. Any thoughts should be put as code comments.
3. You can use the variables from the previous code blocks in the subsequent code blocks.
4. Write small chunks of code and only one chunk of code in every step. Make sure everything is working correctly before making any irreversible change.
5. The provided Python environment has access to its standard library. But modules and functions that have a risk of affecting the underlying OS, file system or process are disabled. You will get an error if do call them.
6. Any reference to a file system in the task instructions means the file system *app*, operable via given APIs, and not the actual file system the code is running on. So do not write code making calls to os-level modules and functions.
7. To interact with apps, only use the provided APIs, and not the corresponding Python packages. E.g., do NOT use `spotipy` for Spotify. Remember, the environment only has the standard library.
8. The provided API documentation has both the input arguments and the output JSON schemas. All calls to APIs and parsing its outputs must be as per this documentation.
9. For APIs that return results in "pages", make sure to consider all pages.
10. To obtain current data or time, use Python functions like `datetime.now()` or obtain it from the phone app. Do not rely on your existing knowledge of what the current date or time is.
11. For all temporal requests, use proper time boundaries, e.g., if I ask for something that happened yesterday, make sure to consider the time between 00:00:00 and 23:59:59. All requests are concerning a single, default (no) time zone.
12. Any reference to my friends, family or any other person or relation refers to the people in my phone's contacts list.
13. All my personal information, and information about my app account credentials, physical addresses and owned payment cards are stored in the "supervisor" app. You can access them via the APIs provided by the supervisor app.
14. Once you have completed the task, call `apis.supervisor.complete_task()`. If the task asks for some information, return it as the answer argument, i.e. call `apis.supervisor.complete_task(answer=<answer>)`. For tasks that do not require an answer, just skip the answer argument or pass it as None.
15. The answers, when given, should be just entity or number, not full sentences, e.g., `answer=10` for "How many songs are in the Spotify queue?". When an answer is a number, it should be in numbers, not in words, e.g., "10" and not "ten".
16. You can also pass `status="fail"` in the complete_task API if you are sure you cannot solve it and want to exit.
17. You must make all decisions completely autonomously and not ask for any clarifications or confirmations from me or anyone else.

USER:
Using these APIs, now generate code to solve the actual task:

My name is: {{ supervisor.first_name }} {{ supervisor.last_name }}. My personal email is {{ supervisor.email }} and phone number is {{ supervisor.phone_number }}.
Task:

{{ instruction }}
"""

In [4]:
# Download the base model, cache a copy under ./models, and serve the cached copy from the GPU.
save_directory = os.getcwd() + "/models"

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")

tokenizer.save_pretrained(save_directory)
model.save_pretrained(save_directory)

tokenizer = AutoTokenizer.from_pretrained(save_directory)
model = AutoModelForCausalLM.from_pretrained(save_directory)
model = model.to("cuda")

# Hand the already-loaded model and tokenizer to the pipeline. Passing the hub id instead
# would make the pipeline load a second full copy of the weights onto the GPU.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device="cuda",
    pad_token_id=tokenizer.eos_token_id
)

In [5]:
def call_llm(messages: List[Dict[str, str]], pipe: "transformers.Pipeline") -> str:
    """Generate the assistant's next turn for a role-tagged conversation.

    Args:
        messages: Conversation so far; each item has a "role" and a "content" key.
        pipe: A text-generation pipeline (any callable with the same call convention works).

    Returns:
        The newly generated text only (the prompt is not echoed back), stripped of
        surrounding whitespace.

    Raises:
        Exception: Whatever the pipeline raises; the error is logged and re-raised.
    """
    # Flatten the history into the plain "ROLE:\ncontent\n\n" transcript the prompts use.
    prompt = "".join(
        f"{msg['role'].upper()}:\n{msg['content']}\n\n" for msg in messages
    )

    try:
        with torch.inference_mode():
            outputs = pipe(
                prompt,
                # temperature/top_p only take effect when sampling is enabled; make it
                # explicit instead of relying on each model's generation config.
                do_sample=True,
                temperature=0.3,
                max_new_tokens=512,
                top_p=0.9,
                repetition_penalty=1.1,
                # Let the pipeline drop the prompt itself; slicing by len(prompt) breaks
                # whenever detokenization does not reproduce the prompt character-for-character.
                return_full_text=False,
            )
    except Exception as e:
        print(f"Generation failed: {e}")
        raise

    return outputs[0]["generated_text"].strip()

In [6]:
# ReActAgent is modified from AppWorld's ReActAgent
class ReActAgent:
    """ReAct-style agent that keeps a chat history and asks a local pipeline for code."""

    def __init__(self, task: Task, prompt_type: str):
        self.task = task
        self.prompt_type = prompt_type
        # The history starts as the rendered prompt split into role-tagged messages.
        self.history: list[dict] = self.prompt_messages()

    def prompt_messages(self) -> list[dict]:
        """Render the selected prompt template and split it into role/content messages.

        Raises:
            ValueError: If `prompt_type` is unknown, or if the rendered prompt has text
                before its first role marker.
        """
        templates = {"one_shot": ONE_SHOT_PROMPT, "zero_shot": ZERO_SHOT_PROMPT}
        if self.prompt_type not in templates:
            raise ValueError(f"Invalid prompt type: {self.prompt_type}")
        render_context = {
            "supervisor": self.task.supervisor,
            "instruction": self.task.instruction,
        }
        prompt = Template(templates[self.prompt_type].lstrip()).render(render_context)

        messages: list[dict] = []
        cursor = 0  # index just past the most recent role marker
        for marker in re.finditer("(USER|ASSISTANT|SYSTEM):\n", prompt):
            marker_start, marker_end = marker.span()
            if messages:
                # Everything between the previous marker and this one is the previous turn.
                messages[-1]["content"] = prompt[cursor:marker_start]
            elif marker_start != 0:
                raise ValueError(
                    f"Start of the prompt has no assigned role: {prompt[:marker_start]}"
                )
            messages.append({"role": marker.group(1).lower(), "content": None})
            cursor = marker_end
        # The final turn runs to the end of the prompt.
        messages[-1]["content"] = prompt[cursor:]
        return messages

    def next_code_block(self, pipe, last_execution_output: str | None = None) -> str:
        """Record the last execution output (if any) and return the next generated code."""
        if last_execution_output is not None:
            self.history.append(dict(role="user", content=last_execution_output))
        # Strip any role marker the model echoed into its own turn.
        code = call_llm(self.history, pipe).replace("ASSISTANT:", "")
        self.history.append(dict(role="assistant", content=code))
        return code

In [7]:
@dataclass
class ExperimentConfig:
    """Settings for one benchmark run executed by ExperimentRunner.

    Field order defines the positional constructor signature, so do not reorder.
    """
    # Experiment name; used as the AppWorld experiment name and as the results sub-directory.
    name: str
    # Which prompt template the agent uses: "one_shot" or "zero_shot".
    prompt_type: str
    # Loaded model and tokenizer handed to the text-generation pipeline.
    model: AutoModelForCausalLM
    tokenizer: AutoTokenizer
    # Upper bound on agent/environment round trips per task.
    max_interactions: int = 10
    # Name of the AppWorld task split passed to load_task_ids.
    dataset_name: str = "train"
    # If set, only the first `num_tasks` task ids are run; None runs the whole split.
    num_tasks: Optional[int] = None
    # Root directory under which per-experiment results are written.
    output_dir: str = "experiment_results"

class ExperimentRunner:
    """Runs a ReAct agent over a list of AppWorld tasks and collects results and statistics.

    Typical use: ``runner = ExperimentRunner(config); stats = runner.run_experiment();
    runner.generate_plots()``.
    """

    # Message fed back to the model when it writes a "USER:" turn instead of code.
    _SIMULATED_USER_ERROR = "ERROR: DO NOT SIMULATE USER, YOU ARE ASSISTANT. YOU MUST ONLY GENERATE VALID PYTHON CODE. DO NOT SIMULATE THE OUTPUT OF YOUR PYTHON CODE."

    def __init__(self, config: "ExperimentConfig"):
        self.config = config
        self.results = []
        self.stats = {}
        # Number of tasks in the current run; set by run_experiment for progress output.
        self._total_tasks = None
        # parents=True so a nested output_dir (e.g. "runs/exp1") does not fail.
        Path(config.output_dir).mkdir(parents=True, exist_ok=True)

    def setup_model(self):
        """Build the text-generation pipeline from the configured model and tokenizer.

        Returns:
            A ``(pipeline, tokenizer)`` tuple.
        """
        pipe = pipeline(
            "text-generation",
            model=self.config.model,
            tokenizer=self.config.tokenizer,
            pad_token_id=self.config.tokenizer.eos_token_id,
            max_new_tokens=512
        )
        return pipe, self.config.tokenizer

    def run_experiment(self) -> Dict:
        """Run every selected task, then compute and save the statistics.

        Returns:
            The statistics dictionary produced by `_calculate_statistics`.
        """
        pipe, tokenizer = self.setup_model()
        task_ids = load_task_ids(self.config.dataset_name)
        if self.config.num_tasks:
            task_ids = task_ids[:self.config.num_tasks]
        # Start from a clean slate so re-running the cell does not double-count tasks.
        self.results = []
        self._total_tasks = len(task_ids)
        for index, task_id in enumerate(task_ids):
            with AppWorld(task_id=task_id, experiment_name=self.config.name) as world:
                result = self._run_single_task(world, pipe, tokenizer, task_id, index)
                self.results.append(result)
        self.stats = self._calculate_statistics()
        self._save_results()
        return self.stats

    def _run_single_task(self, world: "AppWorld", pipe, tokenizer, task_id, index) -> Dict:
        """Drive one task until it completes or `max_interactions` is reached.

        Returns:
            A dict with the task id, the per-iteration log, the completion flag,
            the total generated-token count and the number of iterations.
        """
        agent = ReActAgent(task=world.task, prompt_type=self.config.prompt_type)
        output = None
        iterations = []
        total_tokens = 0
        # Prefer the real number of tasks; config.num_tasks may be None (whole split).
        total = self._total_tasks if self._total_tasks is not None else self.config.num_tasks
        print("\n\n" + "*" * 20 + f" Task {index+1}/{total} ({task_id})  " + "*" * 20)
        for i in range(self.config.max_interactions):
            code = agent.next_code_block(pipe, output)
            tokens = len(tokenizer.encode(code))
            total_tokens += tokens
            if "USER:" in code:
                # Added due to high frequency of LLM hallucinating USER output in the code.
                # Such a block is rejected WITHOUT being executed, so none of it can
                # have side effects in the environment.
                output = self._SIMULATED_USER_ERROR
            else:
                output = world.execute(code)
            completed = world.task_completed()
            iterations.append({
                "iteration": i,
                "code": code,
                "output": output,
                "tokens": tokens,
                "completed": completed
            })

            if completed:
                break

        return {
            "task_id": task_id,
            "iterations": iterations,
            "completed": world.task_completed(),
            "total_tokens": total_tokens,
            "num_iterations": len(iterations)
        }

    def _calculate_statistics(self) -> Dict:
        """Aggregate `self.results` into completion, iteration, error and token statistics."""
        completed_tasks = [r for r in self.results if r["completed"]]
        all_iterations = [it for r in self.results for it in r["iterations"]]
        # NOTE(review): only outputs containing "INVALID" count as errors; the rejection
        # message used above does not contain that word — confirm this marker is intended.
        error_iterations = [
            it for it in all_iterations if "INVALID" in (it.get("output") or "")
        ]
        num_tasks = len(self.results)
        # Plain floats keep the statistics JSON-serialisable regardless of numpy's types.
        stats = {
            "total_tasks": num_tasks,
            "completed_tasks": len(completed_tasks),
            "completion_rate": len(completed_tasks) / num_tasks if num_tasks else 0,
            "avg_iterations": float(np.mean([r["num_iterations"] for r in self.results])) if num_tasks else 0,
            "error_rate": len(error_iterations) / len(all_iterations) if all_iterations else 0,
            "avg_tokens_per_task": float(np.mean([r["total_tokens"] for r in self.results])) if num_tasks else 0,
        }
        return stats

    def _save_results(self):
        """Write results.json and stats.json under ``<output_dir>/<name>``."""
        output_dir = Path(self.config.output_dir) / self.config.name
        output_dir.mkdir(parents=True, exist_ok=True)
        with open(output_dir / "results.json", "w") as f:
            json.dump(self.results, f, indent=2)
        with open(output_dir / "stats.json", "w") as f:
            json.dump(self.stats, f, indent=2)

    def generate_plots(self):
        """Save completion, iteration and token plots as PNG files next to the results."""
        output_dir = Path(self.config.output_dir) / self.config.name
        # The directory may not exist yet if plots are requested before results are saved.
        output_dir.mkdir(parents=True, exist_ok=True)
        plt.bar(["Completed", "Failed"],
                [self.stats["completed_tasks"],
                 self.stats["total_tasks"] - self.stats["completed_tasks"]])
        plt.title("Task Completion Rate")
        plt.savefig(output_dir / "completion_rate.png")
        plt.close()

        iterations = [r["num_iterations"] for r in self.results]
        sns.histplot(iterations)
        plt.title("Distribution of Iterations per Task")
        plt.xlabel("Number of Iterations")
        plt.savefig(output_dir / "iterations_dist.png")
        plt.close()

        tokens = [r["total_tokens"] for r in self.results]
        sns.histplot(tokens)
        plt.title("Distribution of Tokens per Task")
        plt.xlabel("Number of Tokens")
        plt.savefig(output_dir / "tokens_dist.png")
        plt.close()

In [7]:
# Clean up memory
# Drop the notebook-level references first: while `model` / `pipe` still point at the
# weights, neither the garbage collector nor the CUDA cache can release them.
for _stale_name in ("pipe", "model"):
    globals().pop(_stale_name, None)
del _stale_name
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Download AgentLM-7B, cache a copy on disk, and load the cached copy onto the GPU.
# A model-specific sub-directory keeps this checkpoint from overwriting / mixing with the
# Llama files that an earlier cell saved directly under ./models.
save_directory = os.path.join(os.getcwd(), "models", "agentlm-7b")

tokenizer = AutoTokenizer.from_pretrained("THUDM/agentlm-7b")
model = AutoModelForCausalLM.from_pretrained("THUDM/agentlm-7b")

tokenizer.save_pretrained(save_directory)
model.save_pretrained(save_directory)

tokenizer = AutoTokenizer.from_pretrained(save_directory)
model = AutoModelForCausalLM.from_pretrained(save_directory)
model = model.to("cuda")

In [None]:
# AGENTLM BENCHMARK
# Loads AgentLM-7B in bfloat16 and defines the one-shot benchmark configuration.
# NOTE(review): `config`, `model` and `tokenizer` are rebound by the later benchmark cells;
# run the experiment cell right after this one to benchmark this model.
save_directory = f"{os.getcwd()}/models"
tokenizer = AutoTokenizer.from_pretrained("THUDM/agentlm-7b")
model = AutoModelForCausalLM.from_pretrained(
    "THUDM/agentlm-7b",
    low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

config = ExperimentConfig(
    model=model,
    tokenizer=tokenizer,
    name="agentlm-7b_one_shot",
    prompt_type="one_shot",
    dataset_name="test_normal",
    num_tasks=15,
    max_interactions=10,
)
print("Model Loaded")

In [None]:
# ONE-SHOT BENCHMARK
# Loads Llama-3.1-8B-Instruct in bfloat16 and defines the one-shot benchmark configuration.
# NOTE(review): `config`, `model` and `tokenizer` are rebound by the other benchmark cells;
# run the experiment cell right after this one to benchmark this setup.
save_directory = f"{os.getcwd()}/models"
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-8B-Instruct",
    low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

config = ExperimentConfig(
    model=model,
    tokenizer=tokenizer,
    name="Llama-3.1-8B-ReAct-OneShot",
    prompt_type="one_shot",
    dataset_name="test_normal",
    num_tasks=15,
    max_interactions=10,
)
print("Model Loaded")

In [None]:
# ZERO-SHOT BENCHMARK
# Loads Llama-3.1-8B-Instruct in bfloat16 and defines the zero-shot benchmark configuration.
# NOTE(review): `config`, `model` and `tokenizer` are rebound by the other benchmark cells;
# run the experiment cell right after this one to benchmark this setup.
save_directory = f"{os.getcwd()}/models"
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-8B-Instruct",
    low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

config = ExperimentConfig(
    model=model,
    tokenizer=tokenizer,
    name="Llama-3.1-8B-ReAct-ZeroShot",
    prompt_type="zero_shot",
    dataset_name="test_normal",
    num_tasks=15,
    max_interactions=10,
)
print("Model Loaded")

In [None]:
# Run the benchmark described by the most recently defined `config`, save the plots,
# and print the aggregate statistics as JSON.
runner = ExperimentRunner(config)
stats = runner.run_experiment()
runner.generate_plots()

print("\nExperiment Statistics:", json.dumps(stats, indent=2), sep="\n")

In [None]:
# Evaluate each benchmark run with the AppWorld CLI on the test_normal split.
# The first argument is the experiment name, i.e. the `name` given to ExperimentConfig
# in the corresponding benchmark cell; that experiment must have been run beforehand.
!appworld evaluate agentlm-7b_one_shot test_normal
!appworld evaluate Llama-3.1-8B-ReAct-OneShot test_normal
!appworld evaluate Llama-3.1-8B-ReAct-ZeroShot test_normal

In [11]:
# Generate ReActAgent with Together API for synthetic data generation
def call_together_api(prompt: str, api_key: str, model_name: str,
                          temperature: float = 0.3, max_tokens: int = 512, top_p: float = 0.9) -> str:
    """Request a text completion from the Together completions endpoint.

    Args:
        prompt: Full prompt text to complete.
        api_key: Together API key, sent as a bearer token.
        model_name: Name of the hosted model to query.
        temperature: Sampling temperature.
        max_tokens: Maximum number of tokens to generate.
        top_p: Nucleus-sampling probability mass.

    Returns:
        The generated text, cut off before any "USER:" marker. An empty string is
        returned when the response carries no completion text.

    Raises:
        requests.RequestException: On network failures, timeouts, or a non-2xx HTTP
            status (so API errors are not silently recorded as empty completions).
    """
    # Crude rate limiting: at most one request per second.
    time.sleep(1)
    request_body = {
        "model": model_name,
        "prompt": prompt,
        "temperature": temperature,
        "max_tokens": max_tokens,
        "top_p": top_p,
        # Added to stop the LLM from hallucinating USER output in the code.
        "stop": ["USER:", "USER\n"]
    }

    # Use the HTTP client directly rather than shelling out to curl: the API key no longer
    # appears on a process command line, no temp file can be leaked, and a timeout applies.
    response = requests.post(
        "https://api.together.xyz/v1/completions",
        headers={"Authorization": f"Bearer {api_key}"},
        json=request_body,
        timeout=120,
    )
    response.raise_for_status()

    # `or [{}]` also covers an empty "choices" list, which would otherwise raise IndexError.
    choices = response.json().get("choices") or [{}]
    generated_text = choices[0].get("text", "") or ""
    if "USER:" in generated_text:
        generated_text = generated_text.split("USER:")[0].strip()

    return generated_text


def format(messages: List[Dict[str, str]]) -> str:
    """Flatten role/content messages into one "ROLE:\\ncontent\\n\\n" transcript string.

    NOTE(review): this name shadows the built-in `format` for the rest of the notebook.
    """
    return "".join(
        f"{message['role'].upper()}:\n{message['content']}\n\n" for message in messages
    )

class TogetherReActAgent:
    """ReAct agent variant that generates code through the Together API.

    It has its own name on purpose: defining it as `ReActAgent` would replace the
    pipeline-based agent that ExperimentRunner instantiates, whose `next_code_block`
    takes different arguments.
    """

    def __init__(self, task: Task, prompt_type: str):
        self.task = task
        self.prompt_type = prompt_type
        self.history: list[dict] = self.prompt_messages()

    def prompt_messages(self) -> list[dict]:
        """Render the selected prompt template and split it into role/content messages.

        Raises:
            ValueError: If `prompt_type` is unknown, or if the rendered prompt has text
                before its first role marker.
        """
        dictionary = {"supervisor": self.task.supervisor, "instruction": self.task.instruction}
        if self.prompt_type == "one_shot":
            prompt = Template(ONE_SHOT_PROMPT.lstrip()).render(dictionary)
        elif self.prompt_type == "zero_shot":
            prompt = Template(ZERO_SHOT_PROMPT.lstrip()).render(dictionary)
        else:
            raise ValueError(f"Invalid prompt type: {self.prompt_type}")
        messages: list[dict] = []
        last_start = 0
        for match in re.finditer("(USER|ASSISTANT|SYSTEM):\n", prompt):
            last_end = match.span()[0]
            if len(messages) == 0:
                if last_end != 0:
                    raise ValueError(
                        f"Start of the prompt has no assigned role: {prompt[:last_end]}"
                    )
            else:
                # Text between the previous marker and this one is the previous turn.
                messages[-1]["content"] = prompt[last_start:last_end]
            mesg_type = match.group(1).lower()
            messages.append({"role": mesg_type, "content": None})
            last_start = match.span()[1]
        messages[-1]["content"] = prompt[last_start:]
        return messages

    def next_code_block(self, api_key: str, model_name: str, last_execution_output: str | None = None) -> str:
        """Record the last execution output (if any) and fetch the next code block."""
        if last_execution_output is not None:
            self.history.append({"role": "user", "content": last_execution_output})
        prompt = format(self.history)
        code = call_together_api(prompt, api_key, model_name)
        code = code.replace("ASSISTANT:", "")
        self.history.append({"role": "assistant", "content": code})
        return code


def _save_peft_examples(peft_data, output_dir):
    """Write the flattened (context, response) training examples to peft_data.json."""
    with open(os.path.join(output_dir, "peft_data.json"), "w") as f:
        json.dump(peft_data, f, indent=2)


def generate_synthetic_data(api_key: str, model_name: str,
                           dataset_name: str, max_interactions: int, num_tasks: Optional[int] = None, output_dir: str = "synthetic_data"):
    """Run the Together-backed agent over AppWorld tasks and record its trajectories.

    Args:
        api_key: Together API key.
        model_name: Hosted model used to generate code.
        dataset_name: AppWorld task split passed to load_task_ids.
        max_interactions: Maximum agent/environment round trips per task.
        num_tasks: If set, only the first `num_tasks` task ids are used.
        output_dir: Directory that receives synthetic_data.json and peft_data.json.

    Returns:
        ``(all_results, peft_data)``: the per-task trajectories, and one
        context/response example per agent turn.
    """
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    task_ids = load_task_ids(dataset_name)
    if num_tasks:
        task_ids = task_ids[:num_tasks]
    all_results = []
    peft_data = []

    for task_id in tqdm(task_ids, desc="Generating synthetic data"):
        with AppWorld(task_id=task_id, experiment_name="synthetic_data_gen") as world:
            agent = TogetherReActAgent(task=world.task, prompt_type="one_shot")  # Adjust prompt type as needed
            output = None
            iterations = []
            for i in range(max_interactions):
                print(f"Iteration: {i}")
                code = agent.next_code_block(api_key, model_name, output)

                if "USER:" in code:
                    # Reject simulated user turns without executing them.
                    output = "ERROR: DO NOT SIMULATE USER, YOU ARE ASSISTANT. YOU MUST ONLY GENERATE VALID PYTHON CODE."
                else:
                    output = world.execute(code)
                completed = world.task_completed()
                iterations.append({
                    "iteration": i,
                    "code": code,
                    "output": output,
                    "completed": completed
                })
                if completed:
                    break

            task_completed = world.task_completed()
            result = {
                "task_id": task_id,
                "instruction": world.task.instruction,
                "iterations": iterations,
                "completed": task_completed
            }
            all_results.append(result)

            # Build one training example per turn: the context is the instruction plus
            # every earlier (code, output) exchange, the response is that turn's code.
            context = f"USER: {world.task.instruction}\n\n"
            for i, iteration in enumerate(iterations):
                if i > 0:
                    prev_output = iterations[i-1]["output"]
                    context += f"USER: {prev_output}\n\n"
                code = iteration["code"]
                peft_data.append({
                    "context": context,
                    "response": code,
                    "task_id": task_id,
                    "turn": i + 1,
                    "completed": task_completed
                })

                context += f"ASSISTANT: {code}\n\n"

            # Checkpoint after every task so an interrupted run keeps its progress.
            save_results(all_results, output_dir)
            _save_peft_examples(peft_data, output_dir)
    save_results(all_results, output_dir)
    # The examples were previously only returned, although the message below reports
    # them as saved; persist them alongside the trajectories.
    _save_peft_examples(peft_data, output_dir)
    print(f"Saved {len(all_results)} tasks with {len(peft_data)} examples to {output_dir}")
    return all_results, peft_data

def save_results(all_results, output_dir):
    """Serialise `all_results` as indented JSON into ``<output_dir>/synthetic_data.json``.

    The file is opened in write mode, so any existing file is replaced.
    """
    target_path = os.path.join(output_dir, "synthetic_data.json")
    with open(target_path, "w") as handle:
        handle.write(json.dumps(all_results, indent=2))


In [None]:
# (train) Multiple runs to achieve larger N
# The Together API key is read from the TOGETHER_API_KEY environment variable so that it
# never lands in the notebook. Any key that was previously committed here should be revoked.
# The result is bound to names so the (large) returned tuple is not dumped as cell output.
train_results, train_peft_data = generate_synthetic_data(
    api_key=os.environ["TOGETHER_API_KEY"],
    model_name="meta-llama/Llama-3.3-70B-Instruct-Turbo",
    dataset_name="train",
    max_interactions=20,
    output_dir="synthetic_data"
)

In [None]:
# (dev) Multiple runs to achieve larger N
# The Together API key is read from the TOGETHER_API_KEY environment variable so that it
# never lands in the notebook. Any key that was previously committed here should be revoked.
# The dev split gets its own output directory: the results file is opened in write mode, so
# sharing "synthetic_data" with the train run would overwrite the train trajectories.
dev_results, dev_peft_data = generate_synthetic_data(
    api_key=os.environ["TOGETHER_API_KEY"],
    model_name="meta-llama/Llama-3.3-70B-Instruct-Turbo",
    dataset_name="dev",
    max_interactions=20,
    output_dir="synthetic_data_dev"
)

In [8]:
# Fine-tuning configuration.
DATA_PATH = "fine_tuning_dataset.json"
OUTPUT_DIR = "./llama-finetuned"
MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"
NUM_EPOCHS = 3
LEARNING_RATE = 2e-4
BATCH_SIZE = 4
USE_4BIT = True  # NOTE(review): not read by any cell visible in this notebook
SEED = 42

with open(DATA_PATH, 'r') as f:
    all_examples = json.load(f)

dataset = Dataset.from_list(all_examples)
# A fixed seed makes the train/eval split (and therefore the eval loss) reproducible
# across kernel restarts.
dataset_splits = dataset.train_test_split(test_size=0.1, seed=SEED)
train_dataset = dataset_splits["train"]
eval_dataset = dataset_splits["test"]

In [9]:
# Load the tokenizer for the model being fine-tuned.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Reuse the end-of-sequence token for padding so padded batches can be built.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
def tokenize(examples):
    """Tokenize a batch of instruction/response pairs for causal-LM fine-tuning.

    Each pair is rendered as "User: <instruction>\\n\\nAssistant: <response>" and then
    padded or truncated to 2048 tokens using the notebook-level `tokenizer`.
    """
    texts = []
    for instruction, response in zip(examples["instruction"], examples["response"]):
        texts.append(f"User: {instruction}\n\nAssistant: {response}")

    encode_options = dict(
        padding="max_length",
        max_length=2048,
        truncation=True,
        return_tensors="pt",
    )
    return tokenizer(texts, **encode_options)

# Tokenize both splits the same way (train first, then eval), dropping the raw text columns.
tokenized_train_dataset, tokenized_eval_dataset = (
    split.map(tokenize, batched=True, remove_columns=["instruction", "response"])
    for split in (train_dataset, eval_dataset)
)
print("Tokenization Complete")

In [None]:
# Load the base model in bfloat16 and let `device_map="auto"` place it on the available devices.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-8B-Instruct",
    low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

In [None]:
# LoRA adapter on the attention projections only.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=4,
    learning_rate=LEARNING_RATE,
    warmup_steps=100,
    logging_steps=20,
    save_steps=200,
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy` argument.
    eval_strategy="steps",
    eval_steps=200,
    save_total_limit=3,
    load_best_model_at_end=True,
    report_to="tensorboard",
    remove_unused_columns=False,
    push_to_hub=False,
    # The collator below puts the targets in the "labels" key. Listing input columns here
    # (as before) makes the Trainer treat the model inputs as labels during evaluation.
    label_names=["labels"]
)

# mlm=False: causal-LM objective; the collator copies input_ids into labels and masks padding.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    data_collator=data_collator
)

In [None]:
# Run fine-tuning with the Trainer configured above, then save the final model
# to OUTPUT_DIR.
print("TRAINING STARTED")
trainer.train()   
print(f"Saving final model to {OUTPUT_DIR}")
trainer.save_model(OUTPUT_DIR)

In [None]:

# Load the fine-tuned LoRA adapter, merge it into the base model, and define the benchmark.
# The adapter location defaults to the directory the training cell saves to
# ("./llama-finetuned") and can be overridden with FINETUNED_MODEL_DIR, instead of
# hardcoding an absolute path that only exists on one machine.
save_directory = os.environ.get("FINETUNED_MODEL_DIR", "./llama-finetuned")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
# Reading the adapter config first fails fast if the directory holds no adapter.
peft_config = PeftConfig.from_pretrained(save_directory)
base_model = AutoModelForCausalLM.from_pretrained(
            "meta-llama/Llama-3.1-8B-Instruct",
            device_map="auto",
            torch_dtype=torch.bfloat16,
            low_cpu_mem_usage=True
        )

peft_model = PeftModel.from_pretrained(base_model, save_directory)
# Fold the adapter weights into the base model so inference uses a plain model.
merged_model = peft_model.merge_and_unload()
config = ExperimentConfig(
        name="Llama-3.1-8B-Finetuned-ReAct-ZeroShot",
        prompt_type="zero_shot",
        model=merged_model,
        tokenizer=tokenizer,
        max_interactions=10,
        dataset_name="test_normal",
        num_tasks=15
    )
print("Model Loaded")

In [None]:
# Run the benchmark described by the most recently defined `config`, save the plots,
# and print the aggregate statistics as JSON.
runner = ExperimentRunner(config)
stats = runner.run_experiment()
runner.generate_plots()

print("\nExperiment Statistics:", json.dumps(stats, indent=2), sep="\n")

# References:
- https://huggingface.co/docs (Used for training, loading models, inference, etc.)
- https://appworld.dev/ (Used for prompts and ReAct Agent Framework)
- Claude 3.5 (Used to help with debugging, File Reading/Writing, and chart/graph generation)