In [1]:
# Reminder shown on every run: the dataset argument is set separately from the log path
# (presumably the `dataset=` override given to run_eval.py in the evaluation cell -- confirm).
print("Don't forget to change the dataset argument")

# Location of the experiment log that the parsing cells below read.
LOG_PATH = "/kaggle/input/rlprompt-results-rio/all_outputs_sst5.log"

Don't forget to change the dataset argument


In [2]:
import pandas as pd

## Extracting Experiment Data: Validation

In [3]:
import re

# Collect the validation records from the experiment log.
#
# A record is recognised by a line containing "Our Prompt:". The line just before it
# must contain "Accuracy: <number>", and the line just after it holds the prompt
# followed by its reward (the last number on that line). The step is taken from the
# most recent line that starts with "<digits> |".
#
# Results are four parallel lists: steps, rewards, accuracy, prompts.
log_file = LOG_PATH

# Signed integer, decimal or scientific-notation number. The sign must apply to every
# form: with the sign attached to the decimal alternative only, "-8" would be read as 8
# and "-5e-05" as 5.0.
number_regex = r'[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?'

step_pattern = re.compile(r'^(\d+)\s+\|')
reward_pattern = re.compile(r'(' + number_regex + r')')
accuracy_pattern = re.compile(r'Accuracy:\s*(' + number_regex + r')')

steps = []
rewards = []
accuracy = []
prompts = []

with open(log_file, 'r') as f:
    lines = f.readlines()

current_step = None

for i, line in enumerate(lines):
    step_match = step_pattern.match(line)
    if step_match:
        current_step = int(step_match.group(1))

    if "Our Prompt:" not in line:
        continue

    # A record needs both neighbours. Without this guard the first line of the file
    # would read lines[-1] as its "previous" line and the last line would raise
    # IndexError on lines[i + 1].
    if i == 0 or i + 1 >= len(lines):
        continue

    prev_line = lines[i - 1].strip()
    next_line = lines[i + 1].strip()

    accuracy_match = accuracy_pattern.search(prev_line)
    numbers = reward_pattern.findall(next_line)

    if accuracy_match and numbers and current_step is not None:
        steps.append(current_step)
        # The reward is the last number on the prompt line; earlier numbers may be
        # part of the prompt text itself.
        rewards.append(float(numbers[-1]))
        accuracy.append(float(accuracy_match.group(1)))
        # The whole stripped line (prompt and reward) is kept, as later cells split it.
        prompts.append(next_line)

In [4]:
# Split the parsed validation records into consecutive chunks of INTERVAL records,
# one chunk per seed (seed index = chunk index). For every seed:
#   * remember the record with the highest accuracy in seed_best_prompts as
#     (seed, best accuracy, prompt line),
#   * store the chunk as a DataFrame in validation_dfs[seed],
#   * write the chunk to validation_results_<seed>.csv.
INTERVAL = 600

starts = 0
end = starts + INTERVAL

seed_best_prompts = []

validation_dfs = {}

while end <= len(steps):
    interval_acc = accuracy[starts:end]
    interval_prompts = prompts[starts:end]
    interval_steps = steps[starts:end]
    interval_rewards = rewards[starts:end]
    seed = starts // INTERVAL

    # list.index returns the first occurrence, so ties go to the earliest record.
    best_acc = max(interval_acc)
    best_prompt = interval_prompts[interval_acc.index(best_acc)]
    seed_best_prompts.append((seed, best_acc, best_prompt))

    df = pd.DataFrame({
        'step': interval_steps,
        'accuracy': interval_acc,
        'rewards': interval_rewards,
        'prompts': interval_prompts,
    })
    validation_dfs[seed] = df
    df.to_csv(f"validation_results_{seed}.csv",index=False)

    starts = end
    end = starts + INTERVAL

# Only complete intervals are processed. Report what is left over instead of dropping
# it silently, so an incomplete run or a wrong INTERVAL does not go unnoticed.
leftover_records = len(steps) - starts
if leftover_records:
    print(f"Warning: {leftover_records} trailing validation record(s) do not fill a "
          f"complete interval of {INTERVAL} and were skipped.")

In [5]:
# Report, seed by seed, the highest validation accuracy and the prompt line that reached it.
separator = "-" * 50
for seed_id, best_acc, best_line in seed_best_prompts:
    report = [
        f"Seed {seed_id}: Max Accuracy = {best_acc:.4f}",
        f"Prompt: {best_line}",
        separator,
    ]
    print("\n".join(report))

Seed 0: Max Accuracy = 0.4875
Prompt: ['Switchistry'] -1.0927900075912476
--------------------------------------------------
Seed 1: Max Accuracy = 0.4875
Prompt: ['Bot animation'] -0.5068687796592712
--------------------------------------------------
Seed 2: Max Accuracy = 0.4000
Prompt: ['Cooldown proportions'] -8.97808837890625
--------------------------------------------------


In [6]:
import ast

# Extract the prompt text of every seed's best record.
#
# Each stored prompt line is a Python list literal followed by the reward, e.g.
#   ['Switchistry'] -1.0927900075912476
# The first element of that list is appended to best_prompt_words.
best_prompt_words = []

for seed_id, _, raw_prompt in seed_best_prompts:
    try:
        # Cut at the LAST ']' so a prompt that itself contains ']' is not truncated;
        # rindex raises ValueError when the line has no ']' at all.
        bracket_content = raw_prompt[:raw_prompt.rindex(']') + 1]
        prompt_list = ast.literal_eval(bracket_content)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError) as err:
        # Skipping stays best-effort, but say so: a silently missing seed would make
        # best_prompt_words shorter than seed_best_prompts without any hint.
        print(f"Warning: could not parse the prompt of seed {seed_id} ({raw_prompt!r}): {err}")
        continue

    if isinstance(prompt_list, list) and len(prompt_list) > 0:
        best_prompt_words.append(prompt_list[0])
    else:
        print(f"Warning: seed {seed_id} has no usable prompt in {raw_prompt!r}; skipped.")

print(best_prompt_words)

['Switchistry', 'Bot animation', 'Cooldown proportions']


## Extracting Experiment Data: Training

In [7]:
import numpy as np

# Collect the training metrics from the experiment log.
#
# Every line containing "Task LM" starts a new seed (0, 1, 2, ...). Within a seed,
# "Reward: <number>" and "Accuracy: <number>" values are grouped under the step given
# by the most recent line that starts with "<digits>|".
#
# Results:
#   all_rewards[seed][step]  -> list of reward values
#   all_accuracy[seed][step] -> list of accuracy values
# Lines containing "Total training time" or "Peak GPU" are echoed as they are.
log_file = LOG_PATH

all_rewards = {}
all_accuracy = {}

# Signed integer, decimal or scientific-notation number. The sign must apply to every
# form: with the sign attached to the decimal alternative only, "Reward: -8" would not
# match at the sign and "Reward: -5e-05" would not match either.
number_regex = r'[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?'

header_pattern = re.compile(r'^(\d+)\s*\|')
reward_pattern = re.compile(r'Reward:\s*(' + number_regex + r')')
accuracy_pattern = re.compile(r'Accuracy:\s*(' + number_regex + r')')

with open(log_file, 'r') as f:
    lines = f.readlines()

current_step = None
seed = -1

for line in lines:
    if 'Task LM' in line:
        seed += 1
        all_rewards[seed] = {}
        all_accuracy[seed] = {}
        # Forget the previous run's step so values logged before this run's first
        # step header are not filed under the previous run's last step. This makes
        # every seed behave like the first one, where current_step starts as None.
        current_step = None
        continue

    header_match = header_pattern.match(line)
    if header_match:
        current_step = int(header_match.group(1))

    # A metric can only be stored once a seed has started and a step is known;
    # without the seed check a metric line before the first "Task LM" line would
    # raise KeyError on all_rewards[-1].
    can_store = seed >= 0 and current_step is not None

    reward_match = reward_pattern.search(line)
    if reward_match and can_store:
        all_rewards[seed].setdefault(current_step, []).append(float(reward_match.group(1)))

    accuracy_match = accuracy_pattern.search(line)
    if accuracy_match and can_store:
        all_accuracy[seed].setdefault(current_step, []).append(float(accuracy_match.group(1)))

    if 'Total training time' in line:
        print(line)

    if 'Peak GPU' in line:
        print(line)

In [8]:
# Summarise the training rewards: one DataFrame per seed with, for every step,
# the mean and the (population, numpy default) standard deviation of its rewards.
training_results_dfs = {}

for seed, rewards_by_step in all_rewards.items():
    # Deliberately not called `steps`: that name holds the parsed validation steps,
    # and rebinding it here would make the validation chunking cell use training
    # steps if it is re-run later.
    reward_steps = sorted(rewards_by_step)

    training_results_dfs[seed] = pd.DataFrame({
        'step': reward_steps,
        'reward_mean': [np.mean(rewards_by_step[step]) for step in reward_steps],
        'reward_std': [np.std(rewards_by_step[step]) for step in reward_steps],
    })

In [9]:
# Add the per-step accuracy mean / standard deviation to every seed's training
# DataFrame and write the result to training_results_<seed>.csv.
for seed, accuracy_by_step in all_accuracy.items():
    # Deliberately not called `steps`: that name holds the parsed validation steps.
    accuracy_steps = sorted(accuracy_by_step)
    accuracy_df = pd.DataFrame({
        'step': accuracy_steps,
        'accuracy_mean': [np.mean(accuracy_by_step[step]) for step in accuracy_steps],
        'accuracy_stds': [np.std(accuracy_by_step[step]) for step in accuracy_steps],
    })

    # Dropping columns left by an earlier run keeps this cell re-runnable: merging
    # onto existing accuracy columns would create *_x / *_y duplicates.
    rewards_df = training_results_dfs[seed].drop(
        columns=['accuracy_mean', 'accuracy_stds'], errors='ignore'
    )

    # Align on the step value rather than on row position. Positional assignment is
    # only right when rewards and accuracies were logged for exactly the same steps;
    # otherwise it raises on a length mismatch or silently pairs values of different
    # steps. Steps present on one side only are kept, with NaN for the missing side.
    df = (
        rewards_df
        .merge(accuracy_df, on='step', how='outer')
        .sort_values('step', ignore_index=True)
    )

    training_results_dfs[seed] = df
    df.to_csv(f"training_results_{seed}.csv",index=False)

## Environment Prep

In [10]:
from IPython.display import clear_output

In [11]:
# Install Miniconda under root_dir so a separate Python environment can be created.
root_dir = "/kaggle/conda"
!mkdir -p $root_dir
# Download the installer script into the current working directory ...
!wget -q --show-progress https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
# ... and run it with $root_dir/miniconda3 as the install prefix.
# NOTE(review): "latest" is not a pinned installer version, so re-runs may install a
# different Miniconda release.
!bash Miniconda3-latest-Linux-x86_64.sh -b -p $root_dir/miniconda3 -f
# Clears everything the commands above printed, including any error messages.
clear_output()

In [12]:
# Create the conda environment `my_env` with Python 3.10, using the conda binary
# installed under root_dir.
!$root_dir/miniconda3/bin/conda create --name my_env python=3.10 -y
# Clears the command's output, including any error messages.
clear_output()

In [13]:
# Clone the rl-prompt repository into the current working directory.
# NOTE(review): no branch, tag or commit is specified, so the default branch's
# current state is what gets checked out.
!git clone https://github.com/malifalhakim/rl-prompt.git

Cloning into 'rl-prompt'...
remote: Enumerating objects: 1813, done.[K
remote: Counting objects: 100% (661/661), done.[K
remote: Compressing objects: 100% (385/385), done.[K
remote: Total 1813 (delta 381), reused 466 (delta 273), pack-reused 1152 (from 1)[K
Receiving objects: 100% (1813/1813), 83.59 MiB | 28.55 MiB/s, done.
Resolving deltas: 100% (790/790), done.
Updating files: 100% (291/291), done.


In [14]:
# Make the repository root the working directory for the install steps that follow.
%cd /kaggle/working/rl-prompt

/kaggle/working/rl-prompt


In [15]:
# CUDA build tag; it is interpolated into both the torch version suffix and the
# wheel index URL below, so the two always agree.
CUDA_VERSION = 'cu118'

# Install torch 2.6.0 built for CUDA_VERSION into my_env. The environment is
# activated in the same shell command because each `!` line runs in its own shell.
!source $root_dir/miniconda3/bin/activate my_env; pip install torch==2.6.0+{CUDA_VERSION} \
--extra-index-url https://download.pytorch.org/whl/{CUDA_VERSION}

# Clears pip's output, including any error messages.
clear_output()

In [16]:
# Install the package in the current directory into my_env in editable mode (-e).
# This relies on the working directory being the repository root at this point.
!source $root_dir/miniconda3/bin/activate my_env; pip install -e .

# Clears pip's output, including any error messages.
clear_output()

## Evaluation

In [17]:
# Switch to the evaluation example directory; prompts.txt and run_eval.py are
# referenced relative to it afterwards. The path is relative, so this only works
# while the working directory is the repository root (re-running it fails).
%cd examples/few-shot-classification/evaluation

/kaggle/working/rl-prompt/examples/few-shot-classification/evaluation


In [18]:
# Persist the best prompt of every seed, one per line, in prompts.txt.
with open("prompts.txt", "w") as prompt_file:
    prompt_file.writelines(word + "\n" for word in best_prompt_words)

In [19]:
%%bash
# Evaluate every prompt listed in prompts.txt (one per line) with each task LM.
source /kaggle/conda/miniconda3/bin/activate my_env

# The dataset is set once here instead of being repeated in every command.
# NOTE(review): presumably this has to match the dataset of the log selected by
# LOG_PATH -- confirm when switching datasets.
DATASET="sst-5"
TASK_LMS=(distilroberta-base roberta-base roberta-large)

while IFS= read -r prompt; do
    echo "----------------------------------------------"
    echo "Running evaluation with prompt: $prompt"
    for task_lm in "${TASK_LMS[@]}"; do
        # stdin is redirected from /dev/null so the evaluation process cannot consume
        # the remaining lines of prompts.txt, which is the loop's stdin.
        python run_eval.py "dataset=${DATASET}" "task_lm=${task_lm}" "prompt=\"${prompt}\"" < /dev/null
    done
done < prompts.txt

----------------------------------------------
Running evaluation with prompt: Switchistry
[31mnum_shots: 16
base_path: ../data
dataset: sst-5
dataset_seed: 0
task_lm: distilroberta-base
is_mask_lm: null
prompt: Switchistry
[0m
Test Size 2210
Examples: {'source_texts': ['no movement , no yuks , not much of anything .', "a gob of drivel so sickly sweet , even the eager consumers of moore 's pasteurized ditties will retch it up like rancid crème brûlée .", "` how many more voyages can this limping but dearly-loved franchise survive ? '", 'so relentlessly wholesome it made me want to swipe something .', 'gangs of new york is an unapologetic mess , whose only saving grace is that it ends by blowing just about everything up .'], 'class_labels': [1, 0, 2, 2, 0]}
Task LM: distilroberta-base
[31mprompt: Switchistry, accuracy: 0.3235294222831726[0m
[31mnum_shots: 16
base_path: ../data
dataset: sst-5
dataset_seed: 0
task_lm: roberta-base
is_mask_lm: null
prompt: Switchistry
[0m
Test Size 2

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For b