In [45]:
import pandas as pd
import os
import json
import random
# Seed the stdlib `random` module so the random.sample() draw used further down
# is repeatable from a fresh kernel. Note: this does not seed NumPy, which is
# what pandas' .sample() draws from unless it is given an explicit random_state.
random.seed(42)

In [56]:
# Datasets to load; switch the active line to change which corpora are read.
# datasets = ["flancot_filtered_15k", "alpaca-cleaned"]
datasets = ["alpaca-cleaned"]
# datasets = ["flancot_filtered_15k"]

# Models whose generations are loaded for comparison.
models = [
    "gpt-3.5-turbo",
    "gpt-4-turbo",
    "c4ai-command-r-plus",
    "c4ai-command-r-v01",
    # "Meta-Llama-3-8B-Instruct",
    "Meta-Llama-3-70B-Instruct",
]

suffix = "generate_instruct-refine"
# Records whose `idx` is at or above this bound are ignored.
max_idx = 100
results = []

for dataset in datasets:
    # Directory that holds this dataset's result files. It only depends on the
    # dataset, so compute it once per dataset rather than once per model.
    # `dataset_name` is intentionally left at module level: the CSV export
    # further down builds its file name from it.
    dataset_name = "flancot" if "flancot" in dataset else "alpaca"

    for model in models:
        print(f"Processing {dataset} {model}")
        file = f"{dataset_name}/{dataset}-{model}-{suffix}.jsonl"
        if not os.path.exists(file):
            print(f"Skipping {file}")
            continue
        # Explicit UTF-8: the records contain non-ASCII text (e.g. curly quotes),
        # and the platform default encoding is not guaranteed to decode it.
        with open(file, encoding="utf-8") as f:
            # Iterate the handle directly instead of loading every line up front.
            for line in f:
                if not line.strip():
                    # Tolerate blank lines, which json.loads would reject.
                    continue
                data = json.loads(line)
                if data["idx"] >= max_idx:
                    continue
                # Tag each record with where it came from.
                data["dataset"] = dataset
                data["model"] = model
                results.append(data)

df = pd.DataFrame(results)
df

Processing alpaca-cleaned gpt-3.5-turbo
Processing alpaca-cleaned gpt-4-turbo
Processing alpaca-cleaned c4ai-command-r-plus
Processing alpaca-cleaned c4ai-command-r-v01
Processing alpaca-cleaned Meta-Llama-3-70B-Instruct


Unnamed: 0,idx,input,instruction,completions,option,new_instruction,dataset,model,extracted_instruction,refine_instruction,extracted_refined_instruction
0,22,,"Determine which of the two words ""penchant"" an...","For the given instruction, a suitable adaptati...",D,"Determine which of the two words ""penchant"" an...",alpaca-cleaned,gpt-3.5-turbo,,,
1,43,Input: 'The sun was a golden coin in the sky.',Rewrite the sentence to reveal the metaphor.,"For the given instruction, a suitable adaptati...",D,Rewrite the sentence to reveal the metaphor.,alpaca-cleaned,gpt-3.5-turbo,,,
2,48,,Compare and contrast the US Constitution and t...,"For the given instruction, a suitable adaptati...",D,Compare and contrast the US Constitution and t...,alpaca-cleaned,gpt-3.5-turbo,,,
3,67,,What is the y-intercept of the line x - 2y = 5?,"For the given instruction, a suitable adaptati...",D,What is the y-intercept of the line x - 2y = 5?,alpaca-cleaned,gpt-3.5-turbo,,,
4,73,,Classify the following sentence as true or fal...,"For the given instruction, a suitable adaptati...",D,Classify the following sentence as true or fal...,alpaca-cleaned,gpt-3.5-turbo,,,
...,...,...,...,...,...,...,...,...,...,...,...
495,95,"Input: 'The last paragraph of the story reads,...",Determine the characters’ psychological states...,,D,Determine the characters’ psychological states...,alpaca-cleaned,Meta-Llama-3-70B-Instruct,,,
496,96,Input: 'The apple is a popular fruit.',Change the statement into a rhetorical questio...,,D,Change the statement into a rhetorical questio...,alpaca-cleaned,Meta-Llama-3-70B-Instruct,,,
497,97,,List 10 synonyms for 'sad'.,,D,List 10 synonyms for 'sad'.,alpaca-cleaned,Meta-Llama-3-70B-Instruct,,,
498,98,,Explain why an organization might use open sou...,,D,Explain why an organization might use open sou...,alpaca-cleaned,Meta-Llama-3-70B-Instruct,,,


In [58]:
# Pick a random subset of examples (by `idx`) and index the rows by (idx, model)
# so every model's output for the same example sits together.
n_sampled_idx = 20

# Draw from the unique idx values. The request is capped at the number of
# available values so a small `df` does not make random.sample raise ValueError.
unique_idx = list(df["idx"].unique())
selected_idx = random.sample(unique_idx, min(n_sampled_idx, len(unique_idx)))

sampled_df = (
    # `dataset` and `new_instruction` are not needed in the comparison table.
    df.drop(columns=["dataset", "new_instruction"])
    .loc[lambda frame: frame["idx"].isin(selected_idx)]
    # assign() builds a new frame, which avoids the SettingWithCopyWarning
    # triggered by writing a column into a boolean-filtered slice.
    .assign(idx=lambda frame: pd.Categorical(frame["idx"]))
    .set_index(["idx", "model"])
    .sort_index()
)
# Fix the column order used for display and export.
sampled_df = sampled_df[
    [
        "instruction",
        "input",
        "option",
        "completions",
        "extracted_instruction",
        "refine_instruction",
        "extracted_refined_instruction",
    ]
]
sampled_df
# save to csv
# sampled_df.to_csv("sampled_results.csv")

Unnamed: 0_level_0,Unnamed: 1_level_0,instruction,input,option,completions,extracted_instruction,refine_instruction,extracted_refined_instruction
idx,model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2,Meta-Llama-3-70B-Instruct,Generate a three paragraph essay on the theme ...,,D,,,,
2,c4ai-command-r-plus,Generate a three paragraph essay on the theme ...,,B,"For the given instruction, a suitable adaptati...","""Brainstorm a list of potential benefits that ...",No\n#New Instruction#: Premise: The nave is de...,Premise: The nave is described as a wonder of ...
2,c4ai-command-r-v01,Generate a three paragraph essay on the theme ...,,B,"For the given instruction, a suitable adaptati...",“Brainstorm the benefits of waking up early an...,"Yes. \n\nNo. Answer: No, it does not. The new ...","New Instruction#: “Generate a well-structured,..."
2,gpt-3.5-turbo,Generate a three paragraph essay on the theme ...,,A,"For the given instruction, a suitable adaptati...","“Outline the key benefits of waking up early, ...",Answer: Yes,
2,gpt-4-turbo,Generate a three paragraph essay on the theme ...,,C,"For the given instruction, a suitable adaptati...","""Generate a three paragraph essay on the theme...","Answer: No\n#New Instruction#: ""Generate a thr...","""Generate a three paragraph essay on the theme..."
...,...,...,...,...,...,...,...,...
99,Meta-Llama-3-70B-Instruct,Rearrange the following words to make a cohere...,Input: 'The my lied dog sofa on',D,assistant,,,
99,c4ai-command-r-plus,Rearrange the following words to make a cohere...,Input: 'The my lied dog sofa on',D,"For the given instruction, a suitable adaptati...",,,
99,c4ai-command-r-v01,Rearrange the following words to make a cohere...,Input: 'The my lied dog sofa on',D,"For the given instruction, a suitable adaptati...",,,
99,gpt-3.5-turbo,Rearrange the following words to make a cohere...,Input: 'The my lied dog sofa on',A,"For the given instruction, a suitable adaptati...","“Identify the subject, verb, and key component...",Answer: Yes,


In [54]:
# Randomly permute the `model` labels among the rows that share an idx, then
# export the result to CSV.
# NOTE(review): only the labels are permuted; every other column stays in place,
# so after this step `model` no longer identifies which model produced a row's
# completions. Confirm this is the intended blinding scheme (as opposed to
# shuffling whole rows) before relying on the exported `model` column.

# Flatten the (idx, model) index back into columns. The guard keeps the cell
# safe to re-run: a second reset_index() would add a stray `index` column.
if "idx" not in sampled_df.columns:
    sampled_df = sampled_df.reset_index()

for idx in sampled_df["idx"].unique():
    same_idx = sampled_df["idx"] == idx
    # pandas' .sample() is not affected by random.seed(); derive an explicit
    # seed from the seeded stdlib generator so the shuffle is reproducible.
    shuffle_seed = random.randrange(2**32)
    sampled_df.loc[same_idx, "model"] = (
        sampled_df.loc[same_idx, "model"]
        .sample(frac=1, random_state=shuffle_seed)
        .values  # .values drops the index so assignment is positional
    )
# sampled_df = sampled_df.set_index(["idx", "model"])
# sampled_df = sampled_df.sort_index()
# save to csv
sampled_df.to_csv(f"{dataset_name}-{suffix}-sampled.csv", index=False)
# Last expression of the cell, so the shuffled frame is displayed.
sampled_df

In [13]:
# Count how often each option was chosen, per (dataset, model):
# rows are (dataset, model) pairs, columns are the option labels,
# and combinations that never occur are shown as 0.
(
    df.groupby(["dataset", "model", "option"])
    .size()
    .unstack(level="option", fill_value=0)
)

Unnamed: 0_level_0,option,A,B,C,D
dataset,model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
flancot_filtered_15k,Meta-Llama-3-8B-Instruct,54,45,0,1
flancot_filtered_15k,c4ai-command-r-plus,10,3,6,81
flancot_filtered_15k,c4ai-command-r-v01,21,33,19,27
flancot_filtered_15k,gpt-3.5-turbo,34,16,35,15
flancot_filtered_15k,gpt-4-turbo,2,25,28,45
