# Results of the o1 CLadder evaluation

## Helper functions

In [22]:
def compute_accuracy(df):
    """Print overall accuracy and its breakdown by rung, anticommonsense and nonsense.

    A row counts as correct when ``parsed_result_answer`` equals ``answer``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``parsed_result_answer``, ``answer``, ``rung``,
        ``anticommonsense`` and ``nonsense``. The frame is not modified.

    Returns
    -------
    None
        All results are written to stdout.

    Notes
    -----
    * The anticommonsense breakdown groups rows by whether ``anticommonsense``
      is non-null (shown as ``has_anticommonsense``).
    * The nonsense breakdown groups by the raw ``nonsense`` value; rows whose
      ``nonsense`` is NaN are omitted because ``groupby`` drops NaN keys by default.
    """
    # Row-wise correctness, computed once. It keeps the name of the prediction
    # column so the printed series still end with "Name: parsed_result_answer".
    correct = (df['parsed_result_answer'] == df['answer']).rename('parsed_result_answer')

    accuracy = correct.mean()
    print(f"Overall accuracy: {accuracy*100:.2f}%")

    def _report(title, keys):
        # Mean of a boolean series per group == fraction of correct rows.
        by_group = correct.groupby(keys).mean() * 100
        print(f"\nAccuracy by {title}:")
        print(by_group.apply(lambda x: f"{x:.2f}%"))

    _report("rung", df['rung'])

    # Binary indicator built as a standalone series instead of a new column,
    # so the caller's DataFrame is left untouched.
    has_anticommonsense = df['anticommonsense'].notna().rename('has_anticommonsense')
    _report("anticommonsense", has_anticommonsense)

    _report("nonsense", df['nonsense'])

In [None]:
import pandas as pd

def load_data_metadata(
    root_path='../../2304_caubench/',
    meta_models_file="data/cladder-v1/cladder-v1-meta-models.json",
    data_file="data/cladder-v1/o1-preview-data-cladder-v1-q-balanced_rand-top1000.json",
):
    """Load the question data and attach per-model metadata.

    Parameters
    ----------
    root_path : str or path-like
        Directory that both files are resolved against.
    meta_models_file : str
        Path (relative to ``root_path``) of the meta-models JSON file. It must
        provide the columns ``model_id``, ``anticommonsense`` and ``nonsense``.
    data_file : str
        Path (relative to ``root_path``) of the JSON-lines question file. It must
        provide the columns ``meta`` (a dict per row) and ``model_id``.

    Returns
    -------
    pandas.DataFrame
        The question data, with ``background`` and ``rung`` lifted out of
        ``meta`` when present, left-joined with ``anticommonsense`` and
        ``nonsense`` on ``model_id``.
    """
    from pathlib import Path  # local import keeps this cell self-contained

    root = Path(root_path)
    meta_models = pd.read_json(root / meta_models_file)
    data = pd.read_json(root / data_file, lines=True)

    # Lift selected keys out of the ``meta`` dict into their own columns.
    # Only the first row is inspected to decide whether a key is available.
    first_meta = data['meta'].iloc[0]
    for key in ('background', 'rung'):
        if key in first_meta:
            data[key] = data['meta'].apply(lambda m, key=key: m.get(key))

    # Keep only the metadata columns needed downstream before joining.
    meta_models = meta_models[['model_id', 'anticommonsense', 'nonsense']]

    return data.merge(meta_models, on='model_id', how='left')

## o1-preview

In [36]:
# Load results-o1-preview-2024-09-12-o1-preview-data-cladder-v1-q-balanced_rand-top1000.jsonl
df = pd.read_json('results-o1-preview-2024-09-12-o1-preview-data-cladder-v1-q-balanced_rand-top1000.jsonl', lines=True)

# Keep only rows with a non-empty inference result, one row per question.
df = df[df['raw_inference_result'].notna() & (df['raw_inference_result'] != '')]
df = df.drop_duplicates(subset='question_id', keep='first')

data = load_data_metadata()

merged = pd.merge(df, data, on='question_id', how='left')

# Normalise answers to 'yes' / 'no'. Matching is case-insensitive and ignores
# surrounding whitespace and a trailing period, so 'Yes', 'true', 'True',
# 'False', 'false' and 'Yes.' are all covered. Non-string values (e.g. NaN)
# and unrecognised strings are left untouched so the check below can flag them.
answer_aliases = {'yes': 'yes', 'true': 'yes', 'no': 'no', 'false': 'no'}
merged['parsed_result_answer'] = merged['parsed_result_answer'].apply(
    lambda x: answer_aliases.get(x.strip().rstrip('.').lower(), x) if isinstance(x, str) else x
)

# Check the actual labels rather than just counting distinct values, so that
# e.g. {'yes', 'True'} cannot slip through.
unexpected = set(merged['parsed_result_answer'].dropna().unique()) - {'yes', 'no'}
assert not unexpected, "parsed_result_answer should have only two values: yes and no"

# compute accuracy
compute_accuracy(merged)


Overall accuracy: 86.50%

Accuracy by rung:
rung
1    95.72%
2    83.53%
3    80.48%
Name: parsed_result_answer, dtype: object

Accuracy by anticommonsense:
has_anticommonsense
False    85.20%
True     89.39%
Name: parsed_result_answer, dtype: object

Accuracy by nonsense:
nonsense
1.0    84.64%
Name: parsed_result_answer, dtype: object


## o1-mini no reasoning

In [38]:
import pandas as pd
df = pd.read_json('results-o1-mini-2024-09-12-o1-preview-data-cladder-v1-q-balanced_rand-top1000-no-reasoning.jsonl', lines=True)

# Keep only rows with a non-empty inference result, one row per question.
df = df[df['raw_inference_result'].notna() & (df['raw_inference_result'] != '')]
df = df.drop_duplicates(subset='question_id', keep='first')

assert len(df) == 1000, "df should have 1000 rows"

data = load_data_metadata()
merged = pd.merge(df, data, on='question_id', how='left')

# Normalise answers to 'yes' / 'no'. Matching is case-insensitive and ignores
# surrounding whitespace and a trailing period, so 'Yes', 'true', 'True',
# 'False', 'false' and 'Yes.' are all covered. Non-string values (e.g. NaN)
# and unrecognised strings are left untouched so the check below can flag them.
answer_aliases = {'yes': 'yes', 'true': 'yes', 'no': 'no', 'false': 'no'}
merged['parsed_result_answer'] = merged['parsed_result_answer'].apply(
    lambda x: answer_aliases.get(x.strip().rstrip('.').lower(), x) if isinstance(x, str) else x
)

# Check the actual labels rather than just counting distinct values, so that
# e.g. {'yes', 'True'} cannot slip through.
unexpected = set(merged['parsed_result_answer'].dropna().unique()) - {'yes', 'no'}
assert not unexpected, "parsed_result_answer should have only two values: yes and no"

# compute accuracy
compute_accuracy(merged)



Overall accuracy: 88.20%

Accuracy by rung:
rung
1    96.33%
2    90.00%
3    78.38%
Name: parsed_result_answer, dtype: object

Accuracy by anticommonsense:
has_anticommonsense
False    86.79%
True     91.32%
Name: parsed_result_answer, dtype: object

Accuracy by nonsense:
nonsense
1.0    83.83%
Name: parsed_result_answer, dtype: object


## o1-mini causal CoT

In [39]:
import pandas as pd
df = pd.read_json('results-o1-mini-2024-09-12-o1-preview-data-cladder-v1-q-balanced_rand-top1000-causal-cot.jsonl', lines=True)

# Keep only rows with a non-empty inference result, one row per question.
df = df[df['raw_inference_result'].notna() & (df['raw_inference_result'] != '')]
df = df.drop_duplicates(subset='question_id', keep='first')

assert len(df) == 1000, "df should have 1000 rows"

data = load_data_metadata()
merged = pd.merge(df, data, on='question_id', how='left')

# Normalise answers to 'yes' / 'no'. Matching is case-insensitive and ignores
# surrounding whitespace and a trailing period, so 'Yes', 'true', 'True',
# 'False', 'false' and 'Yes.' are all covered. Non-string values (e.g. NaN)
# and unrecognised strings are left untouched so the check below can flag them.
answer_aliases = {'yes': 'yes', 'true': 'yes', 'no': 'no', 'false': 'no'}
merged['parsed_result_answer'] = merged['parsed_result_answer'].apply(
    lambda x: answer_aliases.get(x.strip().rstrip('.').lower(), x) if isinstance(x, str) else x
)

# Check the actual labels rather than just counting distinct values, so that
# e.g. {'yes', 'True'} cannot slip through.
unexpected = set(merged['parsed_result_answer'].dropna().unique()) - {'yes', 'no'}
assert not unexpected, "parsed_result_answer should have only two values: yes and no"

# compute accuracy
compute_accuracy(merged)



Overall accuracy: 79.20%

Accuracy by rung:
rung
1    88.38%
2    76.18%
3    73.27%
Name: parsed_result_answer, dtype: object

Accuracy by anticommonsense:
has_anticommonsense
False    79.25%
True     79.10%
Name: parsed_result_answer, dtype: object

Accuracy by nonsense:
nonsense
1.0    78.44%
Name: parsed_result_answer, dtype: object


## o1-mini CoT


In [42]:
import pandas as pd
df = pd.read_json('results-o1-mini-2024-09-12-o1-preview-data-cladder-v1-q-balanced_rand-top1000.jsonl', lines=True)

# Keep only rows with a non-empty inference result, one row per question.
df = df[df['raw_inference_result'].notna() & (df['raw_inference_result'] != '')]
df = df.drop_duplicates(subset='question_id', keep='first')

assert len(df) == 1000, "df should have 1000 rows"

data = load_data_metadata()
merged = pd.merge(df, data, on='question_id', how='left')

# Normalise answers to 'yes' / 'no'. Matching is case-insensitive and ignores
# surrounding whitespace and a trailing period, so 'Yes', 'true', 'True',
# 'False', 'false' and 'Yes.' are all covered. Non-string values (e.g. NaN)
# and unrecognised strings are left untouched so the check below can flag them.
answer_aliases = {'yes': 'yes', 'true': 'yes', 'no': 'no', 'false': 'no'}
merged['parsed_result_answer'] = merged['parsed_result_answer'].apply(
    lambda x: answer_aliases.get(x.strip().rstrip('.').lower(), x) if isinstance(x, str) else x
)

# Check the actual labels rather than just counting distinct values, so that
# e.g. {'yes', 'True'} cannot slip through.
unexpected = set(merged['parsed_result_answer'].dropna().unique()) - {'yes', 'no'}
assert not unexpected, "parsed_result_answer should have only two values: yes and no"

# compute accuracy
compute_accuracy(merged)



Overall accuracy: 87.10%

Accuracy by rung:
rung
1    94.80%
2    86.47%
3    80.18%
Name: parsed_result_answer, dtype: object

Accuracy by anticommonsense:
has_anticommonsense
False    86.65%
True     88.10%
Name: parsed_result_answer, dtype: object

Accuracy by nonsense:
nonsense
1.0    84.64%
Name: parsed_result_answer, dtype: object
