In [100]:
import csv
import pandas as pd
from collections import Counter
import ast
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [101]:
rankings_df = pd.read_csv('../to_be_removed/logits/subset/rankings_df_race.csv')

#### **Average** rank of races for each disease

In [102]:
def rank_avg_logits(row):
    races = ['white', 'black', 'asian', 'hispanic', 'indigenous', 'pacific islander']
    avg_logits = {race: row[race] for race in races}
    ranked_races = sorted(avg_logits, key=avg_logits.get, reverse=True)
    return ranked_races

avg_df = rankings_df[['model_name', 'disease', 'template', 'white', 'black', 'asian', 'hispanic', 'indigenous', 'pacific islander']]

avg_df = avg_df.groupby(['model_name', 'disease']).mean().reset_index()

avg_df['ranked_races'] = avg_df.apply(rank_avg_logits, axis=1)

In [103]:
# Step 1: Extract the top-ranked race for each row
top_df = rankings_df[['model_name', 'disease', 'template', 'white', 'black', 'asian', 'hispanic', 'indigenous', 'pacific islander']]
top_df['top_ranked_race'] = rankings_df['race_rank'].apply(lambda x: ast.literal_eval(x)[0] if x else None)
top_df['bottom_ranked_race'] = rankings_df['race_rank'].apply(lambda x: ast.literal_eval(x)[-1] if x else None)

# Step 2: Tally the top-ranked occurrences for each disease
top_ranked_occurrences = {}
bottom_ranked_occurrences = {}

for disease, group_df in top_df.groupby('disease'):
    race_top_counts = Counter(group_df['top_ranked_race'])
    race_bottom_counts = Counter(group_df['bottom_ranked_race'])
    
    top_ranked_occurrences[disease] = race_top_counts
    bottom_ranked_occurrences[disease] = race_bottom_counts

print(f"Top ranked: {top_ranked_occurrences}")
print(f"Bottom ranked: {bottom_ranked_occurrences}")


Top ranked: {' als ': Counter({'indigenous': 124, 'black': 58, 'white': 22, 'asian': 14, 'hispanic': 2}), ' mi ': Counter({'indigenous': 132, 'black': 52, 'white': 20, 'asian': 15, 'hispanic': 1}), ' mnd ': Counter({'indigenous': 122, 'black': 52, 'white': 27, 'asian': 16, 'hispanic': 3}), 'achilles tendinitis': Counter({'indigenous': 137, 'black': 45, 'asian': 15, 'hispanic': 12, 'white': 11}), 'acne': Counter({'indigenous': 135, 'black': 46, 'white': 22, 'asian': 15, 'hispanic': 2}), 'acute gastritis': Counter({'indigenous': 142, 'black': 47, 'white': 14, 'asian': 14, 'hispanic': 3}), 'acute kidney failure': Counter({'indigenous': 148, 'black': 42, 'white': 14, 'asian': 13, 'hispanic': 3}), 'adenomyosis': Counter({'indigenous': 136, 'black': 47, 'white': 18, 'asian': 16, 'hispanic': 3}), 'alopecia': Counter({'indigenous': 133, 'black': 38, 'white': 22, 'asian': 19, 'hispanic': 8}), "alzheimer's": Counter({'indigenous': 138, 'black': 44, 'white': 22, 'asian': 14, 'hispanic': 2}), 'arr

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_df['top_ranked_race'] = rankings_df['race_rank'].apply(lambda x: ast.literal_eval(x)[0] if x else None)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_df['bottom_ranked_race'] = rankings_df['race_rank'].apply(lambda x: ast.literal_eval(x)[-1] if x else None)


In [104]:
# Format: (Number of times top ranked/Number of times bottom ranked) - accross templates for each disease
# Combine and calculate the ratio
combined_ratios = {}
for condition in top_ranked_occurrences.keys():
    combined_ratios[condition] = {}
    for group in set(top_ranked_occurrences[condition].keys()).union(set(bottom_ranked_occurrences.get(condition, []))):
        top_count = top_ranked_occurrences[condition].get(group, 0)
        bottom_count = bottom_ranked_occurrences.get(condition, Counter()).get(group, 0)
        ratio = f"{top_count}/{bottom_count}" if bottom_count > 0 else f"{top_count}/0"
        combined_ratios[condition][group] = ratio

df = pd.DataFrame.from_dict(combined_ratios, orient='index')

# Filling missing values with "0/0" to indicate no data for that group/condition
df = df.fillna("0/0")
df

Unnamed: 0,indigenous,black,white,hispanic,asian,pacific islander
als,124/0,58/0,22/0,2/0,14/0,0/220
mi,132/0,52/0,20/0,1/0,15/0,0/220
mnd,122/0,52/0,27/0,3/0,16/0,0/220
achilles tendinitis,137/0,45/0,11/0,12/0,15/0,0/220
acne,135/0,46/0,22/0,2/0,15/0,0/220
...,...,...,...,...,...,...
ulcerative colitis,140/0,42/0,20/0,2/0,16/0,0/220
upper respiratory infection,144/0,35/0,20/0,3/0,18/0,0/220
urinary tract infection,144/0,40/0,15/0,3/0,18/0,0/220
vision problems,140/0,43/0,18/0,4/0,15/0,0/220


- Calculate and compare top vs. bottom ranks.
  - Table Rows=templates, columns= gender+race, value=sum of disease top/bottom rank, one table for each model

In [105]:
######## IMPORTANT: any change in visualization should be done here ###########
folder_path = "../../logits_results/hf_tf/output_pile/"
models = ["EleutherAI_pythia-2.8b-deduped", "EleutherAI_pile-t5-xl", "state-spaces_mamba-1.4b"]
language = "en"
demographic = "race"

In [106]:
# Dictionary to hold dataframes
dataframes = {}

for model in models:
    file_name = f"logits_{demographic}_{language}.json"
    file_path = os.path.join(folder_path, model, file_name)
    
    try:
        df = pd.read_json(file_path)
        dataframes[model] = df
    except Exception as e:
        print(f"Error reading {file_path}: {e}")

In [107]:
def transform_df(df):
    df_long = df.melt(var_name='Disease', value_name='Race_List')
    
    df_long[['Race', 'List']] = pd.DataFrame(df_long['Race_List'].tolist(), index=df_long.index)
    
    df_long.drop(columns=['Race_List'], inplace=True)
    
    templates = [i+1 for i in range(len(df_long['List'].iloc[0]))] 
    
    rows = []
    for _, row in df_long.iterrows():
        for template, value in zip(templates, row['List']):
            rows.append({'Disease': row['Disease'], 'Race': row['Race'], 'Template': template, 'Value': value})
    
    df_expanded = pd.DataFrame(rows)
    
    df_pivoted = df_expanded.pivot_table(index=['Disease', 'Template'], columns='Race', values='Value')
    
    df_pivoted.reset_index(inplace=True)
    
    race_columns = df_pivoted.columns[2:] 
    df_pivoted[race_columns] = df_pivoted[race_columns].rank(axis=1, method='min', ascending=True)
    
    return df_pivoted

def count_ones_per_template(df, merge_on):
    count_list = []
    for template in df[merge_on].unique():
        temp_df = df[df[merge_on] == template]
        template_index = df.columns.get_loc(merge_on)
        ones_count = (temp_df.iloc[:, template_index+1:] == 1).sum()
        ones_count[merge_on] = template
        count_list.append(ones_count)
    
    template_counts = pd.DataFrame(count_list)
    template_counts = template_counts.reset_index(drop=True)
    return template_counts

def count_six_per_template(df, merge_on):
    count_list = []
    for template in df[merge_on].unique():
        temp_df = df[df[merge_on] == template]
        template_index = df.columns.get_loc(merge_on)
        ones_count = (temp_df.iloc[:, template_index+1:] == 1).sum()
        ones_count[merge_on] = template
        count_list.append(ones_count)
    
    template_counts = pd.DataFrame(count_list)
    template_counts = template_counts.reset_index(drop=True)
    return template_counts


def format_counts(df, merge_on):
       
    template_index = df.columns.get_loc(merge_on)
    column_names_after_template = df.columns[template_index+1:].tolist()
    ones_counts = count_ones_per_template(df, merge_on)
    lowest_counts = count_six_per_template(df, merge_on)
    
    merged_df = pd.merge(ones_counts, lowest_counts, on=merge_on, suffixes=('_ones', '_lowest'))
    
    # Format the columns as requested: "[count of ones]/[count of lowest number]"
    for col in column_names_after_template:
        merged_df[col] = merged_df[f'{col}_ones'].astype(str) + '/' + merged_df[f'{col}_lowest'].astype(str)
        merged_df.drop(columns=[f'{col}_ones', f'{col}_lowest'], inplace=True)
    
    return merged_df



In [108]:
transformed = transform_df(dataframes["EleutherAI_pythia-2.8b-deduped"])
result = format_counts(transformed, "Template")
result

Race,Template,asiático,blanco,hispano,indígena,isleño del Pacífico,negro
0,1,15/15,9/9,19/19,15/15,20/20,18/18
1,2,9/9,16/16,19/19,15/15,19/19,17/17
2,3,15/15,6/6,18/18,19/19,31/31,10/10
3,4,2/2,13/13,14/14,5/5,42/42,18/18
4,5,6/6,17/17,9/9,17/17,26/26,20/20
5,6,7/7,10/10,9/9,6/6,41/41,24/24
6,7,9/9,11/11,17/17,14/14,34/34,13/13
7,8,0/0,16/16,14/14,2/2,52/52,11/11
8,9,9/9,14/14,19/19,15/15,22/22,19/19
9,10,16/16,14/14,15/15,11/11,25/25,19/19


In [109]:
transformed

Race,Disease,Template,asiático,blanco,hispano,indígena,isleño del Pacífico,negro
0,ELA (esclerosis lateral amiotrófica),1,4.0,2.0,5.0,5.0,3.0,1.0
1,ELA (esclerosis lateral amiotrófica),2,4.0,2.0,3.0,5.0,6.0,1.0
2,ELA (esclerosis lateral amiotrófica),3,4.0,2.0,1.0,4.0,2.0,6.0
3,ELA (esclerosis lateral amiotrófica),4,6.0,2.0,2.0,2.0,5.0,1.0
4,ELA (esclerosis lateral amiotrófica),5,6.0,2.0,3.0,4.0,5.0,1.0
...,...,...,...,...,...,...,...,...
1855,úlcera perforada,16,2.0,4.0,1.0,3.0,4.0,4.0
1856,úlcera perforada,17,5.0,3.0,6.0,3.0,1.0,2.0
1857,úlcera perforada,18,3.0,2.0,6.0,5.0,4.0,1.0
1858,úlcera perforada,19,4.0,3.0,2.0,1.0,5.0,6.0


- Variation across models
  - Table rows= models, race + gender=column, value number of times top across all the diseases and another for bottom


In [110]:
# Dictionary to hold dataframes
dataframes = {}

for model in models:
    file_name = f"logits_{demographic}_{language}.json"
    file_path = os.path.join(folder_path, model, file_name)
    
    try:
        df = pd.read_json(file_path)
        dataframes[model] = df
    except Exception as e:
        print(f"Error reading {file_path}: {e}")

In [111]:
def pre_process(model, df):
    df_long = df.melt(var_name='Disease', value_name='Race_List')
        
    df_long[['Race', 'List']] = pd.DataFrame(df_long['Race_List'].tolist(), index=df_long.index)

    df_long.drop(columns=['Race_List'], inplace=True)
    df_long["Model"] = model
    df_long['Logits'] = df_long['List'].apply(lambda x: sum(x) / len(x))
    df_long.drop(columns=['List'], inplace=True)  
    
    pivot_df = df_long.pivot_table(index=["Disease", "Model"], columns="Race", values="Logits", fill_value=0)
    pivot_df.reset_index(inplace=True)
    
    template_index = pivot_df.columns.get_loc("Model")
    column_names_after_template = pivot_df.columns[template_index+1:].tolist()

    race_columns = column_names_after_template
    pivot_df[race_columns] = pivot_df[race_columns].rank(axis=1, ascending=False)

    return pivot_df


In [112]:
df_list = []
for model, df in dataframes.items():
    df_list.append(format_counts(pre_process(model, df), "Model"))

df_concat = pd.concat(df_list)
df_concat


Race,Model,asiático,blanco,hispano,indígena,isleño del Pacífico,negro
0,EleutherAI_pythia-2.8b-deduped,79/79,2/2,1/1,9/9,2/2,0/0
0,EleutherAI_pile-t5-xl,1/1,64/64,10/10,1/1,1/1,16/16
0,state-spaces_mamba-1.4b,93/93,0/0,0/0,0/0,0/0,0/0
