In [1]:
# TODO Correctly calculate biasedness (i.e., considering weights and using AggregationMetric) - see decision_analysis

In [1]:
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re

In [2]:
# Folder containing the CSV files
folder_path = "decision_datasets"

# Collect the CSV file names in sorted order: os.listdir() returns entries in
# arbitrary order, which would make the row order of the combined dataframe
# differ between machines/runs.
csv_file_names = sorted(
    file_name for file_name in os.listdir(folder_path) if file_name.endswith(".csv")
)

# Fail early with a clear message; otherwise set.intersection() below would be
# called without arguments and raise an opaque TypeError.
if not csv_file_names:
    raise FileNotFoundError(f"No CSV files found in folder '{folder_path}'")

# List to store dataframes (one per CSV file)
dataframes = [
    pd.read_csv(os.path.join(folder_path, file_name)) for file_name in csv_file_names
]

# List to store the column names of each dataframe (as sets for comparison)
columns_list = [set(frame.columns) for frame in dataframes]

# Find the common columns across all dataframes
common_columns = set.intersection(*columns_list)

# Sets are unordered, so fix the column order to the one used by the first file;
# this keeps the combined dataframe's layout stable across kernel restarts.
ordered_common_columns = [
    column for column in dataframes[0].columns if column in common_columns
]

# Filter each dataframe to only keep the common columns
filtered_dataframes = [frame[ordered_common_columns] for frame in dataframes]

# Concatenate the filtered dataframes into one large dataframe
df = pd.concat(filtered_dataframes, ignore_index=True)

# Show the result
print("Combined dataframe shape:", df.shape)

Combined dataframe shape: (540000, 20)


In [3]:
# Split CamelCase bias identifiers into words by inserting a space at every
# lowercase->uppercase boundary, then apply the spelling fixes listed below.
camel_case_boundary = re.compile(r'([a-z])([A-Z])')

bias_label_fixes = {
    'Escalation Of Commitment': 'Escalation of Commitment',
    'Illusion Of Control': 'Illusion of Control',
    'Self Serving Bias': 'Self-Serving Bias',
    'In Group Bias': 'In-Group Bias',
    'Status Quo Bias': 'Status-Quo Bias',
}

# Short display names for a subset of the model identifiers; identifiers that
# are not listed here are left unchanged by replace().
model_display_names = {
    'meta-llama/Llama-3.2-90B-Vision-Instruct': 'Llama-3.2-90B',
    'meta-llama/Meta-Llama-3.1-8B-Instruct': 'Llama-3.1-8B',
    'meta-llama/Meta-Llama-3.1-70B-Instruct': 'Llama-3.1-70B',
    'gpt-4o-mini-2024-07-18': 'GPT-4o-mini',
    'gpt-3.5-turbo-0125': 'GPT-3.5-Turbo',
}

spaced_bias_labels = df["bias"].apply(
    lambda label: camel_case_boundary.sub(r'\1 \2', label)
)
df["bias"] = spaced_bias_labels.replace(bias_label_fixes)

df["model"] = df["model"].replace(model_display_names)


In [13]:
# build aggregated results table: one row per bias, one column per model,
# each cell holding the mean of 'individual_score' for that combination
df_results = pd.pivot_table(
    df,
    values='individual_score',
    index='bias',
    columns='model',
    aggfunc='mean',
)

# model names in the column order of the results table
models = df_results.columns.tolist()

df_results.head()

model,GPT-3.5-Turbo,GPT-4o-mini,Llama-3.1-70B,Llama-3.1-8B,Qwen/Qwen2.5-72B-Instruct,accounts/fireworks/models/phi-3-vision-128k-instruct,accounts/yi-01-ai/models/yi-large,google/gemma-2-27b-it,google/gemma-2-9b-it,gpt-4o-2024-08-06,meta-llama/Llama-3.2-1B-Instruct,meta-llama/Llama-3.2-3B-Instruct,meta-llama/Meta-Llama-3.1-405B-Instruct,microsoft/WizardLM-2-7B,microsoft/WizardLM-2-8x22B,mistral-small-2409,models/gemini-1.5-flash,models/gemini-1.5-pro
bias,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Anchoring,0.350827,0.402845,0.641535,0.459008,0.625407,0.43106,0.489181,0.327632,0.360206,0.604762,0.147727,0.482855,0.399653,-0.049464,0.372813,0.372611,0.291206,0.40575
Anthropomorphism,-0.086767,-0.1233,-0.140983,-0.02725,-0.007248,-0.26855,0.012133,-0.061183,-0.09325,-0.030867,-0.018345,-0.066367,-0.135429,-0.006154,-0.077439,-0.005567,-0.076517,-0.046733
Availability Heuristic,0.160017,0.160558,0.25356,0.106506,0.147028,0.016126,0.131289,0.110937,0.088508,0.13265,0.066667,0.021098,0.141534,0.10855,-0.098035,0.210219,0.102322,0.041048
Bandwagon Effect,0.796617,0.31845,0.084733,0.117535,0.5312,0.558983,0.559083,0.37135,0.071183,0.659533,0.038183,0.604567,0.337183,0.099447,0.121438,0.01435,0.188017,0.711133
Confirmation Bias,0.089613,0.043256,0.69275,0.02021,0.2975,0.015773,0.0,0.68896,0.718067,0.0046,0.044944,-0.184519,0.026844,-0.062277,0.340307,0.061679,0.0,0.0883


In [16]:
# load the meta data csv with the assignment of decision LLMs
# (semicolon-separated file; bias names are title-cased and then mapped to the
# spellings listed in the dictionary below)
metadata_label_fixes = {
    'Escalation Of Commitment': 'Escalation of Commitment',
    'Illusion Of Control': 'Illusion of Control',
    'Self Serving Bias': 'Self-Serving Bias',
    'In Group Bias': 'In-Group Bias',
    'Status Quo Bias': 'Status-Quo Bias',
}

df_metadata = (
    pd.read_csv("biases_metadata.csv", sep=";", encoding="UTF-8")
    .assign(Bias=lambda meta: meta["Bias"].str.title().replace(metadata_label_fixes))
    .rename(columns={"Bias": "bias"})
    .set_index("bias")
)

df_metadata.head()

Unnamed: 0_level_0,Group,Sub-Group,Publications (Overall),Publications (Management),Decision LLM during Test Development,Status
bias,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Prejudice,What Should We Remember?,We discard specifics to form generalities,462000,16800,Llama-3.1-70B,Exclude
Conservatism,Too Much Information,We notice when something has changed.,232000,10600,GPT-3.5-Turbo,Test cases generated
Anchoring,Too Much Information,We notice when something has changed.,148000,9750,Llama-3.1-70B,Test cases generated
Stereotyping,Not Enough Meaning,"We fill in characteristics from stereotypes, g...",153000,5800,GPT-3.5-Turbo,Test cases generated
Social Desirability Bias,Need To Act Fast,"To act, we must be confident we can make an im...",49600,2600,Llama-3.1-70B,Test cases generated


In [17]:
# join dataframes to add decision LLM assignment column
decision_llm_column = "Decision LLM during Test Development"

# Drop a previously joined copy of the column first so this cell can be re-run:
# without it, a second execution fails with
# "ValueError: columns overlap but no suffix specified".
df_results = (
    df_results
    .drop(columns=[decision_llm_column], errors="ignore")
    .join(df_metadata[[decision_llm_column]], on='bias')
)

In [27]:
# average the per-model group means into a single score per decision LLM
df_results.groupby("Decision LLM during Test Development").mean().mean(axis=1)

Decision LLM during Test Development
GPT-3.5-Turbo    0.164552
Llama-3.1-70B    0.139279
dtype: float64

In [26]:
# get average bias score for each decision LLM
#
# Findings (read off the table displayed by this cell):
# - for most LLMs bias is higher for the group of biases tested with GPT-3.5-Turbo
# - the exceptions, where the Llama-3.1-70B group is higher, are Llama-3.1-70B,
#   Llama-3.2-1B, Llama-3.2-3B and mistral-small-2409
# - Llama-3.1-8B and (marginally) Llama-3.1-405B follow the majority pattern
decision_llm_groups = df_results.groupby("Decision LLM during Test Development")
decision_llm_groups.mean()

Unnamed: 0_level_0,GPT-3.5-Turbo,GPT-4o-mini,Llama-3.1-70B,Llama-3.1-8B,Qwen/Qwen2.5-72B-Instruct,accounts/fireworks/models/phi-3-vision-128k-instruct,accounts/yi-01-ai/models/yi-large,google/gemma-2-27b-it,google/gemma-2-9b-it,gpt-4o-2024-08-06,meta-llama/Llama-3.2-1B-Instruct,meta-llama/Llama-3.2-3B-Instruct,meta-llama/Meta-Llama-3.1-405B-Instruct,microsoft/WizardLM-2-7B,microsoft/WizardLM-2-8x22B,mistral-small-2409,models/gemini-1.5-flash,models/gemini-1.5-pro
Decision LLM during Test Development,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
GPT-3.5-Turbo,0.190136,0.174441,0.200852,0.174674,0.207726,0.24943,0.172129,0.206829,0.190679,0.224905,-0.004183,0.131006,0.167388,0.092053,0.189029,0.11254,0.139124,0.143185
Llama-3.1-70B,0.115436,0.10391,0.209426,0.155445,0.11524,0.212252,0.146202,0.171538,0.061782,0.202895,0.067223,0.14446,0.163414,0.071337,0.116112,0.215635,0.128004,0.106715


In [9]:
# for every model get mean scores and rank overall and for each decision LLM

decision_llm_column = "Decision LLM during Test Development"
decision_llms = ["GPT-3.5-Turbo", "Llama-3.1-70B"]


def _means_and_order(scores):
    """Return the per-model mean scores and the models ordered by ascending mean."""
    means = scores.mean()
    return means, means.sort_values(ascending=True).index


# Compute the means and orderings once up front; they do not depend on the
# model being processed, so there is no need to recompute them per iteration.
model_scores = df_results.drop(decision_llm_column, axis=1)
overall_means, overall_order = _means_and_order(model_scores)
stats_by_decision_llm = {
    decision_llm: _means_and_order(
        model_scores.loc[df_results[decision_llm_column] == decision_llm]
    )
    for decision_llm in decision_llms
}

ranks = []

for model in models:
    # A rank is the 0-based position of the model in the ascending ordering.
    row = {
        "Model": model,
        "Overall Mean": overall_means[model].round(2),
        "Overall Rank": overall_order.get_loc(model),
    }
    for decision_llm, (group_means, group_order) in stats_by_decision_llm.items():
        row[f"Mean with Decision LLM {decision_llm}"] = group_means[model].round(2)
        row[f"Rank with Decision LLM {decision_llm}"] = group_order.get_loc(model)
    ranks.append(row)

# higher rank means more bias
df_ranks = pd.DataFrame(ranks)

In [10]:
# Display the per-model means and ranks. Rank columns are 0-based positions in
# the ascending ordering of mean scores, so a higher rank means more bias.
df_ranks

Unnamed: 0,Model,Overall Mean,Overall Rank,Mean with Decision LLM GPT-3.5-Turbo,Rank with Decision LLM GPT-3.5-Turbo,Mean with Decision LLM Llama-3.1-70B,Rank with Decision LLM Llama-3.1-70B
0,GPT-3.5-Turbo,0.15,8,0.19,11,0.12,6
1,GPT-4o-mini,0.14,6,0.17,8,0.1,3
2,Llama-3.1-70B,0.21,15,0.2,13,0.21,15
3,Llama-3.1-8B,0.17,12,0.17,9,0.16,11
4,Qwen/Qwen2.5-72B-Instruct,0.16,10,0.21,15,0.12,5
5,accounts/fireworks/models/phi-3-vision-128k-in...,0.23,17,0.25,17,0.21,16
6,accounts/yi-01-ai/models/yi-large,0.16,9,0.17,7,0.15,10
7,google/gemma-2-27b-it,0.19,14,0.21,14,0.17,13
8,google/gemma-2-9b-it,0.13,3,0.19,12,0.06,0
9,gpt-4o-2024-08-06,0.21,16,0.22,16,0.2,14


In [11]:
# higher rank means higher bias
#
# Observations ("for <decision LLM>" = on the biases whose test cases were
# developed with that decision LLM; the GPT rows are in the full df_ranks table):
# - GPT models exhibit higher ranks for GPT-3.5-Turbo
# - GPT models exhibit lower ranks for Llama-3.1-70B
# - Llama models exhibit lower ranks for GPT-3.5-Turbo
is_llama_model = df_ranks["Model"].str.contains("Llama")
df_ranks[is_llama_model]


Unnamed: 0,Model,Overall Mean,Overall Rank,Mean with Decision LLM GPT-3.5-Turbo,Rank with Decision LLM GPT-3.5-Turbo,Mean with Decision LLM Llama-3.1-70B,Rank with Decision LLM Llama-3.1-70B
2,Llama-3.1-70B,0.21,15,0.2,13,0.21,15
3,Llama-3.1-8B,0.17,12,0.17,9,0.16,11
10,meta-llama/Llama-3.2-1B-Instruct,0.03,0,-0.0,0,0.07,1
11,meta-llama/Llama-3.2-3B-Instruct,0.14,5,0.13,3,0.14,9
12,meta-llama/Meta-Llama-3.1-405B-Instruct,0.17,13,0.17,6,0.16,12
