In [1]:
!pip install pandas
!pip install polars
!pip install tqdm
!pip install faiss-cpu
!pip install scikit-learn



In [2]:
import pandas as pd

# Labelled toy conversations, including the ground-truth `intent` column.
label_intent_df = pd.read_csv("toy_conversations.csv")

# Report the number of rows, then preview the first five.
print(label_intent_df.shape[0])
label_intent_df.head(5)

239


Unnamed: 0,conversation_number,conversation,index,intent
0,0,"customer: Hi, I'm having trouble with my loyal...",0,report_problem
1,0,assistant: Hello! Sorry to hear that. Can you ...,1,ask_clarification
2,0,customer: The card isn't being recognized at c...,2,describe_issue
3,0,"assistant: I understand, that must be frustrat...",3,suggest_troubleshooting
4,0,"customer: Yes, I tried both. Still doesn't work.",4,confirm_attempted_solution


Let's test GIFA on the toy dataset. It has already been run on this dataset with GPT-4o and ADA-003; the resulting ADA-003 embeddings can be found in embeddings.csv.

In [3]:
from GIFA.generate_intent_full import IntentGenerator
from GPFA.update_prompt import IntentPromptUpdater

# Task instructions handed to the prompt updater; the text is passed through verbatim.
dynamic_prompt = '''
                  You are analyzing a customer service conversation to identify the underlying intentions behind each message. 
                  Assign a structured intent label to each message.  

                  - Use an existing intent label if it already covers the message.  
                  - Create a new intent only if the message introduces a distinct issue not covered by existing intents.  
                 '''

# Derive the static prompt used by the generator from the dynamic prompt.
updater = IntentPromptUpdater(dynamic_prompt)
static_prompt = updater.create_base_static_prompt()

# Conversations without the ground-truth labels: this is what the generator gets to see.
conv_df = label_intent_df.drop("intent", axis=1)

api_time = 3 # Time between API calls
num_samples = 25 # Number of available samples (conversations)
sampling_method = "example" # Search method for finding relevant intents for a new conversation
top_k = 2 # Number of relevant intents retrieved per new message in a new conversation (these are added to the prompt)
top_n = 1 # Number of examples added to each intent name within the prompts (only when sampling method = example!)
search = True # Whether to search for relevant intents (True) or simply add every intent found so far (False)

# `search` is passed through from the setting above instead of being hard-coded,
# so changing the configuration actually takes effect.
# dummy_mode=True loads previously saved intents from `dummy_file` (see the log output below).
generator = IntentGenerator(static_prompt, api_time=api_time, n_samples=num_samples, method=sampling_method,
                            top_k=top_k, top_n=top_n, search=search, show_tqdm=True, dummy_mode=True, dummy_file="toy_intents.csv")
full_intent_df, unique_intents = generator.run_intent_generation(conv_df)
full_intent_df.head()

2025-09-05 12:44:17,524 [INFO] ✅ IntentGenerator initialized
2025-09-05 12:44:17,526 [INFO] 🚀 Starting intent generation...
2025-09-05 12:44:17,528 [INFO] 📂 Dummy mode: loading saved intents from toy_intents.csv


index,intent,count,messages
i64,str,i64,str
0,"""report_loyalty_card_issue""",0,"""customer: Hi, I'm having troub…"
1,"""request_issue_details""",1,"""assistant: Hello! Sorry to hea…"
2,"""explain_issue""",2,"""customer: The card isn't being…"
3,"""suggest_troubleshooting""",3,"""assistant: I understand, that …"
4,"""confirm_troubleshooting_attemp…",4,"""customer: Yes, I tried both. S…"


In [4]:
from GPFA.evaluate_intent import (
    IntentClusteringEvaluator,
    encode_intent_labels,
    merge_intent_dfs,
)

# Join the ground-truth labels with the generated intents, then encode both
# label sets as cluster assignments.
df_merged = merge_intent_dfs(label_intent_df, full_intent_df)
true_clusters, predicted_clusters = encode_intent_labels(df_merged)

# The evaluator also receives the raw messages and label names alongside the
# encoded clusters.
evaluator = IntentClusteringEvaluator(
    true_clusters,
    predicted_clusters,
    list(df_merged["message"]),
    list(df_merged["intent_true"]),
    list(df_merged["intent_predicted"]),
)
results = evaluator.evaluate()

print(results)

{'Clustering Metrics': {'Purity': np.float64(0.5591836734693878), 'NCA': np.float64(0.8609629449001823), 'AMI': 0.41658768978669514, 'pairwise_precision': 0.29203084832904885, 'pairwise_recall': 0.6147186147186147, 'pairwise_F1': 0.39595677936563256, 'ARI': 0.36953093382976276, 'NMI': 0.7795144814955736, 'FMI': 0.42369422764528525}, 'Errors': {'Interesting Examples': [('Message: assistant: Alright. Have a nice day! AND assistant: You’re very welcome! should have the same intent: close_conversation', 'false_split'), ('Message: customer: It’s 88234. AND customer: Between 6 and 8 pm. should have had seperate intents: provide_receipt_number AND provide_new_timeslot', 'false_merge'), ('Message: assistant: Would you like me to email you a copy once it’s ready? AND assistant: That’s why it didn’t work. Would you like me to issue you a different voucher? should have the same intent: offer_solution', 'false_split'), ('Message: assistant: Great to hear! Anything else I can help with? AND custome

In [5]:
from GPFA.greedy_prompt_finding import greedy_prompt_optimizer

# Named `optimizer_type` rather than `type` so the builtin `type()` is not
# shadowed for the rest of the notebook.
optimizer_type = "random" # Baseline type: "random" or "greedy"
train_df = conv_df # Can have a train set and validation set. Train is used for feedback.
val_df = conv_df
population_size = 10 # Maximum number of prompts to be tested
R_max = 25 # Maximum number of conversations to be evaluated on
method = "name" # Sampling method
init_temperature = 0.7 # Temperature used for feedback application
top_k = 2 # Intent search depth
lagged_feedback = True # True: first call the LLM to create feedback, then in a second call apply this feedback to the prompt, False: direct
show_tqdm = True # Show progress bar

# NOTE: top_n is fixed to 2 in this call, independent of any `top_n` variable set in an earlier cell.
rpo = greedy_prompt_optimizer(dynamic_prompt=dynamic_prompt, train_df=train_df, val_df=val_df, label_intent_df=label_intent_df, population_size=population_size,
                              num_samples=R_max, method=method, top_k=top_k, top_n=2, init_temperature=init_temperature, lagged_feedback=lagged_feedback, type=optimizer_type,
                              show_tqdm=show_tqdm, dummy_mode=True, dummy_folder="rpo_toy")
history = rpo.run_gpo()

########################################
###### Currently at Iteration 0 ######
########################################
Updated Prompt: 
                  You are analyzing a customer service conversation to identify the underlying intentions behind each message. 
                  Assign a structured intent label to each message.  

                  - Use an existing intent label if it already covers the message.  
                  - Create a new intent only if the message introduces a distinct issue not covered by existing intents.  
                 
########################################
###### Currently at Iteration 1 ######
########################################
Updated Prompt: You are analyzing a customer service conversation to identify the underlying intentions behind each message. Assign a structured intent label to each message, using the following guidelines:

1. **Intent Definitions**: Clearly defined intent labels with examples are provided. For instance, "express_

In [6]:
# Highest `ami_x_nca` value recorded in `history`
# (presumably the product of the AMI and NCA metrics; confirm against GPFA).
history["ami_x_nca"].max()

np.float64(0.5266810320159521)

In [7]:
# Same configuration as the random baseline, but with the greedy strategy.
# Named `optimizer_type` rather than `type` so the builtin `type()` is not shadowed.
optimizer_type = "greedy"


# NOTE: the variable is still called `rpo` although this run is the greedy one
# (its dummy results live in "gpo_toy"); `history` is overwritten with this run's results.
rpo = greedy_prompt_optimizer(dynamic_prompt=dynamic_prompt, train_df=train_df, val_df=val_df, label_intent_df=label_intent_df, population_size=population_size,
                              num_samples=R_max, method=method, top_k=top_k, top_n=2, init_temperature=init_temperature, lagged_feedback=lagged_feedback, type=optimizer_type,
                              show_tqdm=show_tqdm, dummy_mode=True, dummy_folder="gpo_toy")
history = rpo.run_gpo()

########################################
###### Currently at Iteration 0 ######
########################################
Updated Prompt: 
                  You are analyzing a customer service conversation to identify the underlying intentions behind each message. 
                  Assign a structured intent label to each message.  

                  - Use an existing intent label if it already covers the message.  
                  - Create a new intent only if the message introduces a distinct issue not covered by existing intents.  
                 
########################################
###### Currently at Iteration 1 ######
########################################
Updated Prompt: You are analyzing a customer service conversation to identify the underlying intentions behind each message. Assign a structured intent label to each message, ensuring clarity, consistency, and contextual analysis. 

- Use an existing intent label if it already covers the message.
- Create a new int

In [8]:
# Highest `ami_x_nca` value recorded in `history`
# (presumably the product of the AMI and NCA metrics; confirm against GPFA).
history["ami_x_nca"].max()

np.float64(0.5600614241910931)

In [9]:
from GPFA.see_algorithm import SEE

population_size = 5 # Number of prompts in each phase
train_df = conv_df 
val_df = conv_df
val_samples = 25 
K1 = 2 # Number of feedback rounds
K2 = 0 # Number of crossover rounds (0 disables crossover; e.g. set K2 = 2 to enable it)

crossover_types = ["ADA", "error"] # Crossover round types, either semantic-wise (ADA) or error-wise (error)

# top_k, init_temperature and show_tqdm reuse the shared settings defined with the
# baseline run instead of repeating their literal values, so a configuration
# change applies to every optimizer. top_n stays fixed at 2 for this run.
see = SEE(train_df, val_df, label_intent_df, population_size=population_size, method=method,
          search_method="semantic", top_n=2, top_k=top_k, init_temperature=init_temperature,
          val_samples=val_samples, K1=K1, feedback_samples=15, K2=K2, crossover_types=crossover_types,
          lagged_feedback=lagged_feedback, show_tqdm=show_tqdm, dummy_mode=True, dummy_folder="see_toy")
history = see.run()

Feedback loop: 0
Feedback loop: 1


In [10]:
# Highest `ami_x_nca` value recorded on the SEE object.
# NOTE(review): this reads `see.history`, not the `history` returned by `see.run()`;
# confirm both refer to the same results.
see.history["ami_x_nca"].max()

np.float64(0.39456665800728324)

In [11]:
from GPFA.hyperband_scheduler import find_general_hyperband_schedule, time_to_budget

# Search space for the schedule.
# (The trailing numbers in the original comments, 11 / 10 / 4, presumably record
# values used in an earlier, larger run; confirm before relying on them.)
eta_values = [2, 3, 5]  # Halving factors to be tested
R_min = 4               # Minimum number of evaluation samples
R_max = 25              # Maximum number of evaluation samples
n_min = 1               # Minimum number of prompts needed to start PhaseEvo

# Time budget, converted into the total number of evaluations it allows.
hours = 1
B = time_to_budget(hours)

# One candidate schedule per halving factor; displayed as the cell output.
schedules = find_general_hyperband_schedule(B, eta_values, R_min, R_max, n_min)
schedules

[{'eta': 2,
  'rounds': 3,
  'x_list': [8.0, 4.0, 2.0],
  'R_list': [4, 8, 16],
  'multiplicity': [3, 2, 1],
  'total_cost': 768.0},
 {'eta': 3,
  'rounds': 2,
  'x_list': [18.0, 6.0],
  'R_list': [4, 12],
  'multiplicity': [2, 1],
  'total_cost': 864.0},
 {'eta': 5,
  'rounds': 2,
  'x_list': [15.0, 3.0],
  'R_list': [4, 20],
  'multiplicity': [2, 1],
  'total_cost': 720.0}]

In [13]:
from GPFA.hyperband_see import hb_see

# Run Hyperband-SEE with the first schedule (eta = 2 in the schedule list above).
# K1 / K2 reuse the feedback and crossover round counts configured for the SEE run
# instead of repeating their literal values, keeping both runs in sync.
hyp = hb_see(train_df, val_df, label_intent_df, schedules[0], method=method, search_method="semantic",
             top_k=top_k, init_temperature=init_temperature, feedback_samples=15, K1=K1, K2=K2,
             crossover_type=crossover_types, lagged_feedback=lagged_feedback, show_tqdm=show_tqdm, filename=None,
             dummy_mode=True, dummy_root="hyperband_see")

history = hyp.run_hyperband()

Bracket 1/3: 100%|██████████| 3/3 [00:00<00:00, 11.06it/s]


Finished Hyperband (bracket 1, round 1) in 0.1s | pop=8, val_samples=4
Finished Hyperband (bracket 1, round 2) in 0.1s | pop=4, val_samples=8
Finished Hyperband (bracket 1, round 3) in 0.1s | pop=2, val_samples=16


Bracket 2/3: 100%|██████████| 2/2 [00:00<00:00, 11.87it/s]


Finished Hyperband (bracket 2, round 1) in 0.1s | pop=4, val_samples=8
Finished Hyperband (bracket 2, round 2) in 0.1s | pop=2, val_samples=16


Bracket 3/3: 100%|██████████| 1/1 [00:00<00:00, 11.80it/s]

Finished Hyperband (bracket 3, round 1) in 0.1s | pop=2, val_samples=16





In [14]:
# Best `ami_x_nca` among the rows recorded for round 3, selected with a single
# `.loc[mask, column]` lookup.
history.loc[history["round"] == 3, "ami_x_nca"].max()

np.float64(0.5906197155356346)