### Read all data

In [1]:
import pandas as pd

# Benchmark directory names; each directory is expected to hold a
# `question.jsonl` file in JSON Lines format.
benchmarks = ["hh_bench", "ihap_bench", "mt_bench", "vicuna_bench"]

# One DataFrame of questions per benchmark, keyed by benchmark name.
dfs = {
    name: pd.read_json(f"{name}/question.jsonl", lines=True)
    for name in benchmarks
}

### Train-test split

In [17]:
from sklearn.model_selection import train_test_split

# Fixed seed so the 80/20 split -- and every file derived from it -- is
# reproducible across kernel restarts.
SPLIT_SEED = 42

train_dfs, test_dfs = {}, {}

for bench in benchmarks:
    train_part, test_part = train_test_split(
        dfs[bench], test_size=0.2, random_state=SPLIT_SEED
    )
    # `.assign` returns a new frame, so the benchmark label is never written
    # into an object that may be a slice of `dfs[bench]`.
    train_dfs[bench] = train_part.assign(bench=bench)
    test_dfs[bench] = test_part.assign(bench=bench)


def _combine_splits(split_dfs):
    """Stack the per-benchmark frames in `benchmarks` order into one frame.

    The `category` and `reference` columns are dropped. `reset_index()` is
    called without `drop=True`, so each row's previous index is kept in an
    `index` column of the result.
    """
    return (
        pd.concat([split_dfs[name] for name in benchmarks], axis=0)
        .drop(columns=['category', 'reference'])
        .reset_index()
    )


combined_train_df = _combine_splits(train_dfs)
combined_test_df = _combine_splits(test_dfs)
# Optional: persist the raw prompt splits before they are scored.
# combined_train_df.to_json('train_prompts.jsonl', orient='records', lines=True)
# combined_test_df.to_json('test_prompts.jsonl', orient='records', lines=True)

### Identify time-criticality and complexity scores

In [40]:
# Read the OpenAI API key from the environment instead of embedding it in the
# notebook. A key that has been saved in a notebook (source or output) should
# be treated as leaked and revoked.
import os
from getpass import getpass

if not os.environ.get("OPENAI_API_KEY"):
    # Fall back to an interactive prompt; getpass does not echo the input, so
    # the key does not end up in the cell output.
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")
from tqdm import tqdm
from string import Template
from openai import OpenAI
import json
# Client is built with no explicit arguments; presumably it picks up the API
# key from the OPENAI_API_KEY environment variable -- confirm against the
# installed openai package version.
client = OpenAI()

# Regex for a free-text answer format. NOTE(review): it is not referenced in
# the visible cells (responses are parsed with json.loads below) -- confirm
# whether it is still needed.
res_string = r"the instruction complexity would be around (.*) and time-criticality would be around (.*)"

# Prompt template; `$x` is filled in via `prompt.substitute({'x': ...})`.
# The backslash line continuations sit inside the string literal, so the
# leading indentation of each continued line is part of the prompt text.
prompt = Template('Generate complexity and time-criticality scores between 0 and 1 for the task "$x". \
                  A task such as “give me some game suggestions” would have time-criticality score of 0 while “how do I treat my wasp sting immediately” would have a time-criticality score of 1. \
                  A task such as “tell me how to heat milk” would have a complexity score of 0 while “tell me how to design an aircraft” would have a complexity score of 1.\
                  Output in a value between 0 and 1 with the keys "instruction_complexity" and "time_criticality".')

def add_complexity_time_criticality(df):
    """Annotate each row of `df` with model-generated scores.

    For every row, the row's `turns` value is substituted into the
    module-level `prompt` template and sent to the chat-completions endpoint
    with a JSON response format. The reply is parsed with `json.loads`, and
    its `instruction_complexity` and `time_criticality` entries are stored.

    Args:
        df: DataFrame with a `turns` column.

    Returns:
        The same `df` object, modified in place with two new columns,
        `complexity` and `time_criticality`.

    Raises:
        KeyError: if a reply lacks one of the two expected keys.
        json.JSONDecodeError: if a reply is not valid JSON.
    """
    system_message = {
        "role": "system",
        "content": "You are an assistant that can analyze input instructions and designed to output JSON.",
    }
    score_pairs = []

    progress = tqdm(df.iterrows(), ncols=80, total=len(df))
    for _, record in progress:
        user_message = {
            "role": "user",
            "content": prompt.substitute({'x': record['turns']}),
        }
        completion = client.chat.completions.create(
            model="gpt-4-turbo-preview",
            response_format={"type": "json_object"},
            messages=[system_message, user_message],
        )
        parsed = json.loads(completion.choices[0].message.content)
        score_pairs.append(
            (parsed['instruction_complexity'], parsed['time_criticality'])
        )

    # One request per row, so the collected scores line up with df's rows.
    df['complexity'] = [pair[0] for pair in score_pairs]
    df['time_criticality'] = [pair[1] for pair in score_pairs]
    return df

# Score the test split first and write it out as JSON Lines before starting on
# the train split, so the test file exists even if the second pass fails.
combined_test_df = add_complexity_time_criticality(combined_test_df)
combined_test_df.to_json('test.jsonl', lines=True, orient='records')

combined_train_df = add_complexity_time_criticality(combined_train_df)
combined_train_df.to_json('train.jsonl', lines=True, orient='records')

env: OPENAI_API_KEY=<redacted>


100%|███████████████████████████████████████████| 72/72 [02:26<00:00,  2.03s/it]
100%|█████████████████████████████████████████| 288/288 [09:23<00:00,  1.96s/it]
