# Calculate mean squared residuals (model vs. Actual proportions)

In [23]:
import pandas as pd

In [24]:
def calculate_mean_squared_difference(data, classification_column, source1, source2, fill_value=None):
    """Mean squared difference in ``proportion`` between two sources, per category.

    Rows of ``data`` whose ``source`` is ``source1`` or ``source2`` are pivoted so
    that every (``transcript_id``, category) pair becomes one row with one column
    per source. The squared difference between the two source columns is then
    averaged within each category.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``'source'``, ``'transcript_id'``,
        ``'proportion'`` and ``classification_column``.
    classification_column : str
        Name of the column holding the category labels to group by.
    source1, source2 : str
        Values of the ``'source'`` column to compare.
    fill_value : scalar, optional
        Forwarded to ``DataFrame.pivot_table``. With the default ``None`` a pair
        that exists for only one of the two sources gets a NaN difference and is
        skipped by the per-category mean; a category with no pair present for
        both sources comes back as NaN. Passing ``0`` counts a missing row as a
        proportion of zero instead.
        NOTE(review): whether an absent row really means "zero proportion"
        depends on how the input file was produced - confirm before using it.
        Only pairs present for at least one of the two sources form rows, so
        pairs absent from both are never counted either way.

    Returns
    -------
    pandas.Series
        Named ``'squared_difference'``, indexed by the category labels.

    Raises
    ------
    KeyError
        If either source has no column after pivoting (for example because it
        never occurs in ``data['source']``).

    Notes
    -----
    ``pivot_table`` aggregates with the mean by default, so duplicate rows for
    the same (transcript, category, source) are averaged before the comparison.
    """
    # Keep only the rows that belong to the two sources being compared
    filtered_data = data[data['source'].isin([source1, source2])]

    # One row per (transcript, category), one column per source
    pivot_data = filtered_data.pivot_table(
        index=['transcript_id', classification_column],
        columns='source',
        values='proportion',
        fill_value=fill_value,
    ).reset_index()

    # Fail with a message that names the problem instead of a bare KeyError
    # from the column lookup below
    missing_sources = [name for name in (source1, source2) if name not in pivot_data.columns]
    if missing_sources:
        raise KeyError(
            f"No 'proportion' values found for source(s) {missing_sources}; "
            f"cannot compare {source1!r} with {source2!r}"
        )

    # Squared difference between the two sources for every pivoted row
    pivot_data['squared_difference'] = (pivot_data[source1] - pivot_data[source2]) ** 2

    # Average within each category (NaN differences are skipped by mean())
    mean_squared_diff = pivot_data.groupby(classification_column)['squared_difference'].mean()

    return mean_squared_diff

## Valence

In [25]:
# Valence section inputs: the column that holds the category labels,
# and the proportions table read from CSV
column = 'bucketed_valence'
data = pd.read_csv('valence/combined_valence_bucketed_proportions.csv')

In [26]:
# Per-category mean squared difference of each model against 'Actual'
mean_squared_diff_gpt4_actual = calculate_mean_squared_difference(
    data=data, classification_column=column, source1='Actual', source2='GPT-4'
)
mean_squared_diff_llama_actual = calculate_mean_squared_difference(
    data=data, classification_column=column, source1='Actual', source2='LLaMA-3.1'
)

# Per-category tables, heading first and the Series on the following lines
print("Mean squared differences between GPT-4 and Actual:", mean_squared_diff_gpt4_actual, sep="\n")
print("\nMean squared differences between LLaMA and Actual:", mean_squared_diff_llama_actual, sep="\n")

# Unweighted average over the categories; Series.mean() skips NaN entries.
# NOTE(review): the label reads 'GPT 4o' while the source key above is 'GPT-4' -
# confirm which name is intended.
print(f"Average MSE GPT 4o: {mean_squared_diff_gpt4_actual.mean()}")
print(f"Average MSE Llama: {mean_squared_diff_llama_actual.mean()}")

Mean squared differences between GPT-4 and Actual:
bucketed_valence
Competitive    0.131936
Neutral        0.048372
Supportive     0.031794
Name: squared_difference, dtype: float64

Mean squared differences between LLaMA and Actual:
bucketed_valence
Competitive    0.088579
Neutral        0.084086
Supportive     0.038137
Name: squared_difference, dtype: float64
Average MSE GPT 4o: 0.07070076069786511
Average MSE Llama: 0.07026746035001497


## LegalBench

In [27]:
# LegalBench section inputs: the column that holds the category labels,
# and the proportions table read from CSV
column = 'legalbench'
data = pd.read_csv('legalbench/combined_legalbench_proportions.csv')

In [28]:
# Per-category mean squared difference of each model against 'Actual'
mean_squared_diff_gpt4_actual = calculate_mean_squared_difference(
    data=data, classification_column=column, source1='Actual', source2='GPT-4'
)
mean_squared_diff_llama_actual = calculate_mean_squared_difference(
    data=data, classification_column=column, source1='Actual', source2='LLaMA-3.1'
)

# Per-category tables, heading first and the Series on the following lines
print("Mean squared differences between GPT-4 and Actual:", mean_squared_diff_gpt4_actual, sep="\n")
print("\nMean squared differences between LLaMA and Actual:", mean_squared_diff_llama_actual, sep="\n")

# Unweighted average over the categories; Series.mean() skips NaN entries.
# NOTE(review): the label reads 'GPT 4o' while the source key above is 'GPT-4' -
# confirm which name is intended.
print(f"Average MSE GPT 4o: {mean_squared_diff_gpt4_actual.mean()}")
print(f"Average MSE Llama: {mean_squared_diff_llama_actual.mean()}")

Mean squared differences between GPT-4 and Actual:
legalbench
Background       0.035719
Clarification    0.074027
Criticism        0.130575
Implications     0.019283
Name: squared_difference, dtype: float64

Mean squared differences between LLaMA and Actual:
legalbench
Background       0.015053
Clarification    0.105719
Criticism        0.055687
Implications     0.016493
Name: squared_difference, dtype: float64
Average MSE GPT 4o: 0.06490091062860437
Average MSE Llama: 0.04823799220952987


## MetaCog

In [29]:
# MetaCog section inputs: the column that holds the category labels,
# and the proportions table read from CSV
column = 'metacog'
data = pd.read_csv('metacog/classification/combined_metacog_proportions.csv')

In [30]:
# Per-category mean squared difference of each model against 'Actual'
mean_squared_diff_gpt4_actual = calculate_mean_squared_difference(
    data=data, classification_column=column, source1='Actual', source2='GPT-4'
)
mean_squared_diff_llama_actual = calculate_mean_squared_difference(
    data=data, classification_column=column, source1='Actual', source2='LLaMA-3.1'
)

# Per-category tables, heading first and the Series on the following lines
print("Mean squared differences between GPT-4 and Actual:", mean_squared_diff_gpt4_actual, sep="\n")
print("\nMean squared differences between LLaMA and Actual:", mean_squared_diff_llama_actual, sep="\n")

# Unweighted average over the categories. Series.mean() skips NaN entries, so
# any category that comes back as NaN is left out of its model's average and
# the two averages may then cover different sets of categories.
# NOTE(review): the label reads 'GPT 4o' while the source key above is 'GPT-4' -
# confirm which name is intended.
print(f"Average MSE GPT 4o: {mean_squared_diff_gpt4_actual.mean()}")
print(f"Average MSE Llama: {mean_squared_diff_llama_actual.mean()}")

Mean squared differences between GPT-4 and Actual:
metacog
argumentation_and_clarification    0.137060
case_facts_and_context             0.041117
constitutional_issues                   NaN
judicial_role_and_review           0.149316
precedent_and_doctrine             0.265134
procedural_matters                 0.000258
statutory_interpretation           0.056040
Name: squared_difference, dtype: float64

Mean squared differences between LLaMA and Actual:
metacog
argumentation_and_clarification    0.118573
case_facts_and_context             0.054962
constitutional_issues                   NaN
judicial_role_and_review           0.076578
precedent_and_doctrine             0.240139
procedural_matters                      NaN
statutory_interpretation           0.050298
Name: squared_difference, dtype: float64
Average MSE GPT 4o: 0.10815407769891335
Average MSE Llama: 0.10810985587244892
