## Tutorial: Optimizing a Prompt

![TextGrad](https://github.com/vinid/data/blob/master/logo_full.png?raw=true)

An autograd engine -- for textual gradients!

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/zou-group/TextGrad/blob/main/examples/notebooks/Prompt-Optimization.ipynb)
[![GitHub license](https://img.shields.io/badge/License-MIT-blue.svg)](https://lbesson.mit-license.org/)
[![Arxiv](https://img.shields.io/badge/arXiv-2406.07496-B31B1B.svg)](https://arxiv.org/abs/2406.07496)
[![Documentation Status](https://readthedocs.org/projects/textgrad/badge/?version=latest)](https://textgrad.readthedocs.io/en/latest/?badge=latest)
[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/textgrad)](https://pypi.org/project/textgrad/)
[![PyPI](https://img.shields.io/pypi/v/textgrad)](https://pypi.org/project/textgrad/)

**Objectives:**

* In this tutorial, we will use TextGrad to optimize the system prompt of a language model on a text simplification task.

**Requirements:**

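* You need an API key for an OpenAI-compatible endpoint to run this tutorial. The notebook reads the key from the Colab secret `SCW_SECRET_KEY` and sets it as the `OPENAI_API_KEY` environment variable; the endpoint URL is read from the Colab secret `SCW_BASE_URL`.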


In [1]:
#@title Python Dependencies
!uv pip install -qU textgrad httpx-retry --system

*** WARNING *** : You need to restart the session now!

In [1]:
#@title Python Imports

import argparse
import concurrent
from tqdm import tqdm
import textgrad as tg
import numpy as np
import random

In [2]:
#@title Setup Secrets
import os

from google.colab import userdata

# Copy the Colab secret named SCW_SECRET_KEY into the OPENAI_API_KEY
# environment variable of this process.
os.environ["OPENAI_API_KEY"] = userdata.get('SCW_SECRET_KEY')

In [3]:
#@title Helper Functions
def set_seed(seed):
    """Seed NumPy's global RNG and Python's ``random`` module with ``seed``."""
    for seed_rng in (np.random.seed, random.seed):
        seed_rng(seed)

In [4]:
def eval_sample(item, eval_fn, model):
    """
    Score the model's rewrite of a single (source text, reference text) pair.

    The source text is wrapped in a ``tg.Variable`` and passed through ``model``;
    the response is then handed to ``eval_fn`` together with the reference text.
    The evaluator's ``value`` is converted to ``int`` and returned.
    """
    source_text, reference_text = item
    model_input = tg.Variable(
        source_text,
        requires_grad=False,
        role_description="text to be clarified for readability and ease of understanding",
    )
    reference = tg.Variable(
        reference_text,
        requires_grad=False,
        role_description="clear, easy to understand text",
    )
    verdict = eval_fn(inputs={"prediction": model(model_input), "target": reference})
    return int(verdict.value)

In [5]:
def eval_dataset(test_set, eval_fn, model, max_samples: int=None):
    """
    Evaluate ``model`` on up to ``max_samples`` items of ``test_set`` using a
    small thread pool.

    Args:
        test_set: iterable of items accepted by ``eval_sample``.
        eval_fn: evaluation function forwarded to ``eval_sample``.
        model: model forwarded to ``eval_sample``.
        max_samples: maximum number of items to evaluate. ``None`` evaluates
            every item; zero or a negative value evaluates nothing.

    Returns:
        List of per-sample scores, in completion order (not dataset order).
    """
    # ``import concurrent`` by itself does not load the ``futures`` submodule;
    # import it explicitly so this function does not depend on some other
    # library having imported it first.
    import concurrent.futures
    from itertools import islice

    # Clamp so that max_samples <= 0 submits no work at all.
    limit = None if max_samples is None else max(max_samples, 0)

    accuracy_list = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
        futures = [
            executor.submit(eval_sample, sample, eval_fn, model)
            for sample in islice(test_set, limit)
        ]
        tqdm_loader = tqdm(concurrent.futures.as_completed(futures), total=len(futures), position=0)
        for future in tqdm_loader:
            accuracy_list.append(future.result())
            # Show the running mean of the scores collected so far.
            tqdm_loader.set_description(f"Accuracy: {np.mean(accuracy_list)}")
    return accuracy_list

In [6]:
def run_validation_revert(system_prompt: tg.Variable, results, model, eval_fn, val_set):
    """
    Validate the current prompt and roll it back when validation accuracy drops.

    The mean score on ``val_set`` is compared with the mean of the last entry in
    ``results["validation_acc"]``. If the new score is strictly lower,
    ``system_prompt`` is reset to the last entry of ``results["prompt"]`` and the
    previous score is recorded again; otherwise the new score is recorded.
    The chosen score is appended to ``results["validation_acc"]``.
    """
    new_score = np.mean(eval_dataset(val_set, eval_fn, model))
    last_score = np.mean(results["validation_acc"][-1])
    print("val_performance: ", new_score)
    print("previous_performance: ", last_score)
    last_prompt = results["prompt"][-1]

    if new_score < last_score:
        # Worse than before: report the rejected prompt and restore the old one.
        print(f"rejected prompt: {system_prompt.value}")
        system_prompt.set_value(last_prompt)
        recorded_score = last_score
    else:
        recorded_score = new_score

    results["validation_acc"].append(recorded_score)

In [7]:
#@title Setup Engines
from textgrad.engine_experimental import OpenAIEngine

# Two separate engine objects are created for the same model name. Each one has
# its client's base_url overridden with the Colab secret SCW_BASE_URL.

# Engine used further down as the backward engine and as the optimizer engine.
llm_api_eval = OpenAIEngine("llama-3.1-8b-instruct")
llm_api_eval.client.base_url = userdata.get("SCW_BASE_URL")

# Engine used further down by the model whose system prompt is optimized.
llm_api_test = OpenAIEngine("llama-3.1-8b-instruct")
llm_api_test.client.base_url = userdata.get("SCW_BASE_URL")

In [8]:
#@title Setup Rate Limiting for Scaleway

import httpx

from httpx_retry import RetryTransport, RetryPolicy

# Retry policy shared by both engines: 3 attempts, a multiplier of 2 between
# delays, and a retry predicate that only accepts the value 429
# (presumably the HTTP status code for "Too Many Requests" -- confirm).
# NOTE(review): the unit of with_min_delay(100) is not visible here -- TODO confirm
# against the httpx-retry documentation.
retry_policy = (
    RetryPolicy()
      .with_attempts(3)
      .with_min_delay(100)
      .with_multiplier(2)
      .with_retry_on(lambda code: code == 429)
)

# Attach an httpx client with the retrying transport to each engine's client.
# NOTE(review): an httpx.AsyncClient is assigned here; confirm that the engine's
# client actually sends requests through its `http_client` attribute and that an
# async client / this transport class is what it expects.
llm_api_eval.client.http_client = httpx.AsyncClient(
    transport=RetryTransport(policy=retry_policy),
)
llm_api_test.client.http_client = httpx.AsyncClient(
    transport=RetryTransport(policy=retry_policy),
)

In [9]:
#@title Get Dataset

import pandas as pd
from sklearn.model_selection import train_test_split

# Load the annotated texts from the CSV file hosted on GitHub
df = pd.read_csv('https://raw.githubusercontent.com/refugies-info/genai-for-public-good/refs/heads/main/data/ri_annotated_texts_final.csv')

X = df['Version initiale']  # inputs: the "initial version" column
y = df['Version retraitée']  # targets: the "reworked version" column

# First split: separate out the test set (80% train+val, 20% test)
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Second split: split the remaining data into train and validation sets
# 0.25 test_size on 80% of data = 20% validation set (0.25 * 0.8 = 0.2)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42)

# Print the shapes of all sets
print(f"Training set shape: {X_train.shape}")
print(f"Validation set shape: {X_val.shape}")
print(f"Testing set shape: {X_test.shape}")

# Pair each training input with its target as a list of (input, target) tuples
train_set = list(zip(X_train, y_train))

# Same pairing for the test split
test_set = list(zip(X_test, y_test))

# Same pairing for the validation split
val_set = list(zip(X_val, y_val))

# Report the number of pairs in the training and test sets
print("Train/Test Set Lengths: ", len(train_set), len(test_set))

Training set shape: (222,)
Validation set shape: (74,)
Testing set shape: (74,)
Train/Test Set Lengths:  222 74


In [10]:
#@title Setup Evaluation Function

# Seed the NumPy and `random` generators before anything else in this cell runs.
set_seed(12)

# Register llm_api_eval as TextGrad's backward engine, overriding any engine set earlier.
tg.set_backward_engine(llm_api_eval, override=True)

# Define the evaluation function
def string_based_equality_fn(prediction: tg.Variable, target: tg.Variable):
    """Return 1 when the string forms of both ``value`` attributes are identical, else 0."""
    predicted_text = str(prediction.value)
    target_text = str(target.value)
    return 1 if predicted_text == target_text else 0


from textgrad.autograd.string_based_ops import StringBasedFunction

# Wrap the exact-match comparison so it can be called on TextGrad variables.
# The second argument is the textual description attached to the function.
# NOTE(review): the description says "original text", but callers in this
# notebook pass the reference rewrite as `target` -- confirm the wording.
eval_fn = StringBasedFunction(string_based_equality_fn, "Evaluate if the generated text is equal to the original text")


This is the system prompt we are going to start from:

In [11]:
#@title Setup Starting System Prompt

# Initial system prompt, written in French. Roughly: "You are a plain-language
# writing assistant. You can rewrite texts so that they are easy to understand
# for refugees in France who are not familiar with French or with how the
# French administration is organised."
# The literal starts and ends with a newline because of the triple-quote layout.
STARTING_SYSTEM_PROMPT = """
Tu es un assistant à la rédaction en langage clair. Tu peux transformer des textes pour qu'ils soient facilement compréhensibles
par les réfugiés en France qui ne connaissent pas bien le français ni l'organisation de l'administration française.
"""

print(STARTING_SYSTEM_PROMPT)



Tu es un assistant à la rédaction en langage clair. Tu peux transformer des textes pour qu'ils soient facilement compréhensibles
par les réfugiés en France qui ne connaissent pas bien le français ni l'organisation de l'administration française.



In [12]:
#@title Test Zero Shot Performance

from textgrad.tasks import DataLoader

# Shuffled loader over the (input, target) training pairs, 3 pairs per batch.
train_loader = DataLoader(train_set, batch_size=3, shuffle=True)


# Testing the 0-shot performance of the evaluation engine
# NOTE(review): `model_evaluation` is built on llm_api_eval but is not evaluated
# anywhere in this cell; only `model` (llm_api_test) is scored below.
system_prompt = tg.Variable(STARTING_SYSTEM_PROMPT,
                            requires_grad=True,
                            role_description="system prompt to the language model")
model_evaluation = tg.BlackboxLLM(llm_api_eval, system_prompt)

# `system_prompt` is rebound here: `model_evaluation` keeps the Variable created
# above, while `model` and the optimizer share this second Variable.
system_prompt = tg.Variable(STARTING_SYSTEM_PROMPT,
                            requires_grad=True,
                            role_description="structured system prompt to a somewhat capable language model that specifies the behavior and strategies for the text simplifcation task")
model = tg.BlackboxLLM(llm_api_test, system_prompt)

# The optimizer rewrites `system_prompt` using llm_api_eval as its engine.
optimizer = tg.TextualGradientDescent(engine=llm_api_eval, parameters=[system_prompt])

# Baseline entries: per-sample score lists on the test and validation sets and
# the starting prompt text. Later code appends one entry per training step.
results = {"test_acc": [], "prompt": [], "validation_acc": []}
results["test_acc"].append(eval_dataset(test_set, eval_fn, model))
results["validation_acc"].append(eval_dataset(val_set, eval_fn, model))
results["prompt"].append(system_prompt.get_value())


  0%|          | 0/74 [00:00<?, ?it/s]INFO:textgrad:LLMCall function forward
INFO:textgrad:StringBasedFunction
Accuracy: 0.0:   1%|▏         | 1/74 [00:03<03:52,  3.19s/it]INFO:textgrad:LLMCall function forward
INFO:textgrad:StringBasedFunction
Accuracy: 0.0:   3%|▎         | 2/74 [00:05<02:51,  2.38s/it]INFO:textgrad:LLMCall function forward
INFO:textgrad:StringBasedFunction
Accuracy: 0.0:   4%|▍         | 3/74 [00:05<02:03,  1.73s/it]INFO:textgrad:LLMCall function forward
INFO:textgrad:StringBasedFunction
Accuracy: 0.0:   5%|▌         | 4/74 [00:07<01:47,  1.54s/it]INFO:textgrad:LLMCall function forward
INFO:textgrad:StringBasedFunction
Accuracy: 0.0:   7%|▋         | 5/74 [00:07<01:17,  1.12s/it]INFO:textgrad:LLMCall function forward
INFO:textgrad:StringBasedFunction
Accuracy: 0.0:   8%|▊         | 6/74 [00:09<01:27,  1.29s/it]INFO:textgrad:LLMCall function forward
INFO:textgrad:StringBasedFunction
Accuracy: 0.0:   9%|▉         | 7/74 [00:10<01:20,  1.20s/it]INFO:textgrad:LLMCall f

In [None]:
# Optimization loop: 3 epochs, each stopped after the batch with index 3
# (i.e. at most 4 optimizer steps per epoch).
for epoch in range(3):
    for steps, (batch_x, batch_y) in enumerate((pbar := tqdm(train_loader, position=0))):
        pbar.set_description(f"Training step {steps}. Epoch {epoch}")
        optimizer.zero_grad()
        # Forward pass: one evaluation result per (input, target) pair in the batch.
        losses = []
        for (x, y) in zip(batch_x, batch_y):
            x = tg.Variable(x, requires_grad=False, role_description="text to be clarified for readability and ease of understanding")
            y = tg.Variable(y, requires_grad=False, role_description="clear, easy to understand text")
            response = model(x)
            eval_output_variable = eval_fn(inputs=dict(prediction=response, target=y))
            losses.append(eval_output_variable)
        # Combine the per-sample results, backpropagate, and let the optimizer
        # update the system prompt.
        total_loss = tg.sum(losses)
        total_loss.backward()
        optimizer.step()

        # Score the updated prompt on the validation set; the helper restores the
        # previous prompt when the validation score got worse.
        run_validation_revert(system_prompt, results, model, eval_fn, val_set)

        print("sys prompt: ", system_prompt)
        test_acc = eval_dataset(test_set, eval_fn, model)
        results["test_acc"].append(test_acc)
        # Appended after validation, so the stored prompt is the one that survived
        # this step and is what the next step's validation check reads back.
        results["prompt"].append(system_prompt.get_value())
        if steps == 3:
            break

Training step 0. Epoch 0: : 0it [00:00, ?it/s]INFO:textgrad:LLMCall function forward
INFO:textgrad:StringBasedFunction
