## Tutorial: Optimizing a Prompt

![TextGrad](https://github.com/vinid/data/blob/master/logo_full.png?raw=true)

An autograd engine -- for textual gradients!

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/zou-group/TextGrad/blob/main/examples/notebooks/Prompt-Optimization.ipynb)
[![GitHub license](https://img.shields.io/badge/License-MIT-blue.svg)](https://lbesson.mit-license.org/)
[![Arxiv](https://img.shields.io/badge/arXiv-2406.07496-B31B1B.svg)](https://arxiv.org/abs/2406.07496)
[![Documentation Status](https://readthedocs.org/projects/textgrad/badge/?version=latest)](https://textgrad.readthedocs.io/en/latest/?badge=latest)
[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/textgrad)](https://pypi.org/project/textgrad/)
[![PyPI](https://img.shields.io/pypi/v/textgrad)](https://pypi.org/project/textgrad/)

**Objectives:**

* In this tutorial, we will run prompt optimization.

**Requirements:**

* You need to have an OpenAI API key to run this tutorial. This should be set as an environment variable as OPENAI_API_KEY.


In [1]:
#!pip install textgrad # you might need to restart the notebook after installing textgrad

import argparse
import concurrent
from dotenv import load_dotenv
from tqdm import tqdm
import textgrad as tg
from textgrad.tasks import load_task
import numpy as np
import random
load_dotenv(override=True)


True

Let's first define some support functions

In [2]:
def set_seed(seed):
    np.random.seed(seed)
    random.seed(seed)

In [32]:
def eval_sample(item, eval_fn, model):
    """
    This function allows us to evaluate if an answer to a question in the prompt is a good answer.

    """
    x, y = item

    x = tg.Variable(x, requires_grad=False, role_description="query to the language model")
    y = tg.Variable(y, requires_grad=False, role_description="correct answer for the query")
    response = model(x)
    #try:
    eval_output_variable = eval_fn(inputs=dict(prediction=response, ground_truth_answer=y))
    return int(eval_output_variable.value)
    #except:
    #    eval_output_variable = eval_fn([x, y, response])
    #    eval_output_parsed = eval_fn.parse_output(eval_output_variable)
    #    return int(eval_output_parsed)

In [24]:
def eval_dataset(test_set, eval_fn, model, max_samples: int=None):
    if max_samples is None:
        max_samples = len(test_set)
    accuracy_list = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
        futures = []
        for _, sample in enumerate(test_set):

            future = executor.submit(eval_sample, sample, eval_fn, model)
            futures.append(future)
            if len(futures) >= max_samples:
                break
        tqdm_loader = tqdm(concurrent.futures.as_completed(futures), total=len(futures), position=0)
        for future in tqdm_loader:
            acc_item = future.result()
            accuracy_list.append(acc_item)
            tqdm_loader.set_description(f"Accuracy: {np.mean(accuracy_list)}")
    return accuracy_list

def eval_dataset_slow(test_set, eval_fn, model, max_samples: int=None):
    if max_samples is None:
        max_samples = len(test_set)
    accuracy_list = []
    tqdm_loader = tqdm(test_set)
    for sample in tqdm_loader:
        acc_item = eval_sample(sample, eval_fn, model)
        accuracy_list.append(acc_item)
        tqdm_loader.set_description(f"Accuracy: {np.mean(accuracy_list)}")
    return accuracy_list

In [5]:
def run_validation_revert(system_prompt: tg.Variable, results, model, eval_fn, val_set):
    val_performance = np.mean(eval_dataset(val_set, eval_fn, model))
    previous_performance = np.mean(results["validation_acc"][-1])
    print("val_performance: ", val_performance)
    print("previous_performance: ", previous_performance)
    previous_prompt = results["prompt"][-1]

    if val_performance < previous_performance:
        print(f"rejected prompt: {system_prompt.value}")
        system_prompt.set_value(previous_prompt)
        val_performance = previous_performance

    results["validation_acc"].append(val_performance)

In [17]:
# custom engine
# I need to directly use huggingface locally for constrained generation

from textgrad.engine.base import EngineLM, CachedEngine
import platformdirs
import os
import importlib
from transformers import LogitsProcessorList
from ctrie import DictIndex, ConstrainedStateList, ConstrainedState, ConstrainedLogitsProcessor
import torch

class ChatConstrainedHF(EngineLM, CachedEngine):
    def __init__(
        self,
        model_config_path: str,
        index_config_path: str,
        system_prompt: None,
        **kwargs):
        """
        :param model_string:
        :param system_prompt:
        """

        if model_config_path.endswith('.py'):
            model_config_path = model_config_path[:-3]
        model_module = importlib.import_module(model_config_path)
        self.model_config = getattr(model_module, 'model_config')
        self.model_config.model.eval()

        if system_prompt is None:
            system_prompt = self.model_config.apply_prompt_template()

        if index_config_path.endswith('.py'):
            index_config_path = index_config_path[:-3]
        index_module = importlib.import_module(index_config_path)
        index_config = getattr(index_module, 'index_config')

        assert index_config.rootkey > max(self.model_config.tokenizer.vocab.values())

        root = platformdirs.user_cache_dir("textgrad")
        cache_path = os.path.join(root, f"cache_hf_{model_config_path}.db")

        super().__init__(cache_path=cache_path)

        self.system_prompt = system_prompt

        self.model = self.model_config.model

        self.is_multimodal = False

        self.num_states = self.model_config.generate_args.get('num_beams', 1)
        self.states = ConstrainedStateList(
            [ConstrainedState(
                    begin_pattern = self.model_config.switch_pattern,
                    end_pattern = self.model_config.newline_token,
                    cache_index = DictIndex(end_of_triple=index_config.end_of_triple),
                    subtree_cache = DictIndex(end_of_triple=index_config.end_of_triple),
                    oneleaf_cache = DictIndex(end_of_triple=index_config.end_of_triple),
                ) for _ in range(self.num_states)],
            num_beams=self.model_config.generate_args.get('num_beams', 1),
            batch_size = self.model_config.batch_size,
            pad_token_id = self.model_config.generate_args['pad_token_id'])

        self.constrained_processor = ConstrainedLogitsProcessor(
            index=index_config.index,
            end_token=self.model_config.newline_token,
            states=self.states,
            tokenizer=self.model_config.tokenizer
            )
        self.logits_processor_list = LogitsProcessorList([
            self.constrained_processor
        ])

    def generate(self, prompt: str, system_prompt: str=None):
        sys_prompt_arg = system_prompt if system_prompt else self.system_prompt
        '''
        messages=[
                {"role": "system", "content": sys_prompt_arg},
                {"role": "user", "content": prompt},
            ]
        '''
        self.model_config.model.eval()
        with torch.no_grad():
            #batch = [messages]
            #prompted_batch = list(map(self.model_config.apply_prompt_template, batch))
            prompted_batch = [system_prompt + prompt]

            self.states.reset() # reset caches

            batch_inputs = self.model_config.tokenize_fun(prompted_batch)

            if len(self.states) != batch_inputs.input_ids.shape[0] * self.model_config.generate_args.get('num_beams', 1):
                self.states = self.states[:batch_inputs.input_ids.shape[0] * self.model_config.generate_args.get('num_beams', 1)]
                self.states.batch_size = batch_inputs.input_ids.shape[0]
                self.constrained_processor.states = self.states

            output = self.model_config.model.generate(
                **batch_inputs,
                #logits_processor=self.logits_processor_list,
                **self.model_config.generate_args,
                kwargs = {'constrained_state': self.states}, # passing stat
            )

            self.states.beam_permutation() # final permutation to match final beams

            full_prediction = self.model_config.tokenizer.decode(output[0][len(batch_inputs.input_ids[0]):])
            prediction = self.model_config.get_prediction(full_prediction)
            prediction_complete = bool(prediction)

            response = full_prediction
            self._save_cache(sys_prompt_arg + prompt, response)

            return response

    def __call__(self, prompt, **kwargs):
        return self.generate(prompt, **kwargs)


In [37]:
def load_dataset(dataset_config_path):
    if dataset_config_path.endswith('.py'):
        dataset_config_path = dataset_config_path[:-3]
    dataset_module = importlib.import_module(dataset_config_path)
    dataset = getattr(dataset_module, 'dataset')
    dataset_x_y = [(item['question'], item['answer']['mention']) for item in dataset]
    return dataset_x_y

train_path = 'mintaka_train_ssample200'
train_set = load_dataset(train_path)
val_path = 'mintaka_dev_ssample72'
val_set = load_dataset(val_path)
test_path = 'mintaka_test_ssample200'
test_set = load_dataset(test_path)

In [18]:
del llm_api_test
llm_api_test = ChatConstrainedHF(
    model_config_path="qwen25_1B_model",
    index_config_path="qwen25_index",
    system_prompt=None,
    cache_path=os.path.join(platformdirs.user_cache_dir("textgrad"), "cache_hf_llama1.db"),
)

In [22]:
from textgrad.autograd.string_based_ops import StringBasedFunction
def answer_equality_fn(prediction: tg.Variable, ground_truth_answer: tg.Variable):
    # TODO can consider other stuff like answer in prediction or viceversa
    # can consider if there are triples generated
    return int(str(prediction.value) == str(ground_truth_answer.value))

fn_purpose = "The runtime of string-based function that checks if the prediction is correct."
eval_fn = StringBasedFunction(answer_equality_fn, function_purpose=fn_purpose)

In [10]:
from textgrad.engine.vllm import ChatVLLM

INFO 04-08 14:50:16 [__init__.py:239] Automatically detected platform cuda.


In [11]:
set_seed(12)
llm_api_eval = ChatVLLM(model_string='Qwen/Qwen2.5-1.5B-Instruct', dtype='half') # torch.bfloat16
tg.set_backward_engine(llm_api_eval, override=True)

STARTING_SYSTEM_PROMPT = llm_api_test.system_prompt

INFO 04-08 14:50:27 [config.py:600] This model supports multiple tasks: {'reward', 'embed', 'generate', 'score', 'classify'}. Defaulting to 'generate'.
INFO 04-08 14:50:27 [llm_engine.py:242] Initializing a V0 LLM engine (v0.8.3) with config: model='Qwen/Qwen2.5-1.5B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-1.5B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 04-08 14:50:32 [loader.py:447] Loading weights took 1.27 seconds
INFO 04-08 14:50:32 [model_runner.py:1146] Model loading took 2.8876 GiB and 2.089426 seconds
INFO 04-08 14:50:37 [worker.py:267] Memory profiling takes 4.36 seconds
INFO 04-08 14:50:37 [worker.py:267] the current vLLM instance can use total_gpu_memory (14.57GiB) x gpu_memory_utilization (0.90) = 13.11GiB
INFO 04-08 14:50:37 [worker.py:267] model weights take 2.89GiB; non_torch_memory takes 0.05GiB; PyTorch activation peak memory takes 2.02GiB; the rest of the memory reserved for KV Cache is 8.15GiB.
INFO 04-08 14:50:37 [executor_base.py:112] # cuda blocks: 19081, # CPU blocks: 9362
INFO 04-08 14:50:37 [executor_base.py:117] Maximum concurrency for 32768 tokens per request: 9.32x
INFO 04-08 14:50:41 [model_runner.py:1456] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. 

Capturing CUDA graph shapes: 100%|██████████| 35/35 [00:23<00:00,  1.50it/s]

INFO 04-08 14:51:04 [model_runner.py:1598] Graph capturing finished in 23 secs, took 0.19 GiB
INFO 04-08 14:51:04 [llm_engine.py:448] init engine (profile, create kv cache, warmup model) took 32.13 seconds





This is the system prompt we are going to start from:

In [12]:
print(STARTING_SYSTEM_PROMPT)


<|im_start|>system
You are a helpful question-answering assistant that bases its answers on facts from a knowledge base and always respects the prompt.

Process:

    You receive an input question.

    You get some context about the entities in the questions using description and short description.

    You determine the reasoning path needed to answer the question based on the information available.

    You explicitly list relevant facts, one per line, starting with "Fact:".

    You explain your reasoning step by step and provide a detailed answer, justifying it based on the facts.

    You conclude with a short and concise answer that must be grounded by the facts you found.

You have to answer only basing on the facts you find in the knowledge base.
If you cannot find relevant facts in the knowledge base to support an answer you stop and you reply: "I don't know.".
After generating facts, please reason on top of the facts you find and avoid generating lot of useless facts.

You m

In [38]:
# DEBUG
test_set = test_set[:4]
val_set = val_set[:4]
train_set = train_set[:10]

In [39]:
train_loader = tg.tasks.DataLoader(train_set, batch_size=3, shuffle=True)


# Testing the 0-shot performance of the evaluation engine
system_prompt = tg.Variable(STARTING_SYSTEM_PROMPT,
                            requires_grad=True,
                            role_description="system prompt to the language model")
model_evaluation = tg.BlackboxLLM(llm_api_eval, system_prompt)

system_prompt = tg.Variable(STARTING_SYSTEM_PROMPT,
                            requires_grad=True,
                            role_description="structured system prompt to a somewhat capable language model that specifies the behavior and strategies for the QA task")
model = tg.BlackboxLLM(llm_api_test, system_prompt)

optimizer = tg.TextualGradientDescent(engine=llm_api_eval, parameters=[system_prompt])

results = {"test_acc": [], "prompt": [], "validation_acc": []}
results["test_acc"].append(eval_dataset(test_set, eval_fn, model))
results["validation_acc"].append(eval_dataset(val_set, eval_fn, model))
results["prompt"].append(system_prompt.get_value())


Accuracy: 0.0: 100%|██████████| 4/4 [01:27<00:00, 21.96s/it]
Accuracy: 0.0: 100%|██████████| 4/4 [03:27<00:00, 51.91s/it]


In [40]:
for epoch in range(3):
    for steps, (batch_x, batch_y) in enumerate((pbar := tqdm(train_loader, position=0))):
        pbar.set_description(f"Training step {steps}. Epoch {epoch}")
        optimizer.zero_grad()
        losses = []
        for (x, y) in zip(batch_x, batch_y):
            x = tg.Variable(x, requires_grad=False, role_description="query to the language model")
            y = tg.Variable(y, requires_grad=False, role_description="correct answer for the query")
            response = model(x)
            try:
                eval_output_variable = eval_fn(inputs=dict(prediction=response, ground_truth_answer=y))
            except:
                eval_output_variable = eval_fn([x, y, response])
            losses.append(eval_output_variable)
        total_loss = tg.sum(losses)
        total_loss.backward()
        optimizer.step()

        run_validation_revert(system_prompt, results, model, eval_fn, val_set)

        print("sys prompt: ", system_prompt)
        test_acc = eval_dataset(test_set, eval_fn, model)
        results["test_acc"].append(test_acc)
        results["prompt"].append(system_prompt.get_value())
        if steps == 3:
            break

Training step 0. Epoch 0: : 0it [00:00, ?it/s]
[Acessed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:04<00:00,  4.02s/it, est. speed input: 142.89 toks/s, output: 34.35 toks/s]

[Acessed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:48<00:00, 48.30s/it, est. speed input: 54.51 toks/s, output: 41.41 toks/s]

[Acessed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.54s/it, est. speed input: 377.96 toks/s, output: 42.36 toks/s]

[Acessed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:42<00:00, 42.98s/it, est. speed input: 59.49 toks/s, output: 46.53 toks/s]

[Acessed prompts:   0%|          |

IndexError: TextualGradientDescent optimizer response could not be indexed. This can happen if the optimizer model cannot follow the instructions. You can try using a stronger model, or somehow reducing the context of the optimization. Response: user
In which books of A Song of Ice and Fire is Jon Snow not the Lord Commander of the Night's Watch?