### Showcase FairCLIP Models

# 1. Dataset

The dataset is the Harvard-FairVLMed Dataset [[1]](#1) from Luo et al. (2024) [[2]](#2).

In [None]:
from pathlib import Path

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
from IPython.display import display, Image


In [None]:
# Root folder of the input data; the dataset CSV and the fundus images are read from below it.
DATA_DIR = Path("./data")
# Root folder of the evaluation result CSV files.
RESULTS_DIR = Path("./results")

The dataset provides several attributes that are relevant to fairness, together with the ground-truth glaucoma diagnosis and a GPT-4-generated summary of the clinical notes.

In [None]:
# Load the summary table of the dataset.
df = pd.read_csv(DATA_DIR / 'Harvard-FairVLMed/data_summary.csv')

# Categorical columns; each one is drawn as a bar chart of its value frequencies.
l1 = ['gender', 'race', 'ethnicity', 'language', 'maritalstatus', 'glaucoma']
# 4x2 grid: six bar charts followed by two histograms in the last row.
fig, ax = plt.subplots(nrows=4, ncols=2, figsize=(10, 14))
fig.subplots_adjust(hspace=1)

# ax.flat walks the grid row by row, so category k lands in ax[k // 2, k % 2].
for cat_ax, cat_name in zip(ax.flat, l1):
    counts = df[[cat_name]].value_counts()
    counts.plot(ax=cat_ax, kind='bar', ylabel='frequency')
    # The index entries are 1-tuples because the counts come from a one-column frame.
    labels = [str(key[0]) for key in counts.index]
    cat_ax.set_xticklabels(labels, rotation=35)
    # Show the y axis as a percentage of all rows in the table.
    cat_ax.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=len(df)))
    cat_ax.set_ylim(-1, 10000)
    cat_ax.set_title(cat_name)
    cat_ax.grid(axis='y')
    # Place the percentage on top of each bar. Iterating over the values avoids
    # `counts[i]` with an integer key: the index holds labels, and the
    # positional fallback for integer keys is deprecated in pandas.
    total = counts.sum()
    for bar_pos, count in enumerate(counts.to_numpy()):
        cat_ax.text(bar_pos, count, round(count / total * 100, 1), ha='center', va='bottom')

# Last row, left: histogram of the age column.
age_ax = ax[3, 0]
df.age.plot(ax=age_ax, kind='hist', bins=20, ylabel='frequency', title='age distribution')
age_ax.set_xlabel('age (years)')
age_ax.grid(axis='y')

# Last row, right: histogram of the number of whitespace-separated words per summary.
notes_ax = ax[3, 1]
notes_length = df['gpt4_summary'].apply(lambda x: len(x.split()))
notes_length.plot(ax=notes_ax, kind='hist', bins=20, ylabel='frequency', title='GPT-4 note summary length distribution')
notes_ax.set_xlabel('length (words)')
notes_ax.grid(axis='y')

fig.show()


In [None]:
# Columns whose joint value combinations are counted.
l1 = ['gender', 'race', 'ethnicity', 'language']

# One bar per distinct (gender, race, ethnicity, language) combination.
# NOTE(review): the title says 'glaucoma' although that column is not part of
# this plot - looks like a leftover; confirm the intended title.
combination_counts = df[l1].value_counts()
combination_counts.plot(kind='bar', ylabel='frequency', title='glaucoma')

The dataset also includes ophthalmoscopy images, which medical professionals use to determine the presence of glaucoma.

In [None]:
# Show the first four training images in a 2x2 grid without axes.
fig, ax = plt.subplots(2, 2, figsize=(7.7, 10))
fig.subplots_adjust(hspace=0.01, wspace=0.01)

image_dir = DATA_DIR / 'Harvard-FairVLMed/Training'
image_names = (
    'slo_fundus_00001.jpg',
    'slo_fundus_00002.jpg',
    'slo_fundus_00003.jpg',
    'slo_fundus_00004.jpg',
)
# ax.flat visits the panels row by row: (0, 0), (0, 1), (1, 0), (1, 1).
for panel, image_name in zip(ax.flat, image_names):
    panel.imshow(plt.imread(image_dir / image_name))
    panel.axis('off')
fig.show()

## Models & Experiments
The models are based on two main architectures: CLIP and BLIP2. Using the CLIP architecture, a FairCLIP model is also generated. (TODO: add whether we also run a fair variant of BLIP2.)

In [None]:
# Load one result file and list its columns.
# NOTE(review): `.loc[3:]` keeps only the rows labelled 3 and later; the reason
# the first rows are dropped is not visible here - confirm.
results_csv = RESULTS_DIR / 'best_vit-b16_slo_fundus.csv'
df_results = pd.read_csv(results_csv).loc[3:]
df_results.columns

In [None]:
def get_results(df_results):
    """Summarise per-run metrics as one row per attribute.

    Every column whose name contains ``attr0`` .. ``attr3`` is reduced to its
    mean and standard deviation over the rows of ``df_results``, both
    multiplied by 100. The summary column name is the original name with the
    ``_attrK`` part removed and surrounding whitespace stripped, plus
    ``_mean`` / ``_std`` (e.g. ``" auc_attr0_group1"`` -> ``auc_group1_mean``).
    The overall ``auc`` column is summarised the same way and repeated on
    every row.

    Args:
        df_results: DataFrame with one row per run. It needs a column named
            ``auc`` (surrounding whitespace in the header is ignored) and the
            per-attribute columns that lead to the output columns listed in
            ``order`` below.

    Returns:
        DataFrame with the rows Race, Gender, Ethnicity and Language (in that
        order) and the columns of ``order``. Cells without a value - a group
        an attribute does not have, or a standard deviation of a single run -
        hold the string ``' '``. An empty input yields an empty frame with
        the same columns.

    Raises:
        KeyError: if there is no ``auc`` column or a column of ``order``
            cannot be derived from the input.
    """
    attributes = (
        ('attr0', 'Race'),
        ('attr1', 'Gender'),
        ('attr2', 'Ethnicity'),
        ('attr3', 'Language'),
    )
    order = ['Attribute', 'dpd_mean', 'dpd_std', 'eod_mean', 'eod_std', 'auc_mean', 'auc_std', 'esauc_mean', 'esauc_std', 'auc_group0_mean', 'auc_group0_std', 'auc_group1_mean', 'auc_group1_std', 'auc_group2_mean', 'auc_group2_std']

    # CSV headers may carry padding (e.g. "  auc"), so match on the stripped name.
    auc_col = next((col for col in df_results.columns if col.strip() == 'auc'), None)
    if auc_col is None:
        raise KeyError('auc')
    # Scale while the values are still numeric; doing it after the blanks are
    # filled in would fail when the std is undefined (a single run).
    auc_mean = df_results[auc_col].mean() * 100
    auc_std = df_results[auc_col].std() * 100

    rows = []
    for attr_idx, attr_name in attributes:
        row = {'Attribute': attr_name, 'auc_mean': auc_mean, 'auc_std': auc_std}
        for col in df_results.columns:
            if attr_idx not in col:
                continue
            base_name = col.replace('_' + attr_idx, '').strip()
            row[base_name + '_mean'] = df_results[col].mean() * 100
            row[base_name + '_std'] = df_results[col].std() * 100
        rows.append(row)

    # Attributes do not all have the same number of groups; blank out the gaps.
    summary = pd.DataFrame(rows).fillna(' ')[order]
    if df_results.empty:
        # No runs: return no rows so callers do not get placeholder lines.
        return summary.iloc[0:0]
    return summary

def print_results_latex(df_results_final):
    r"""Print the summary table as LaTeX table rows, grouped by attribute.

    First prints the order of the attributes, then one block per attribute
    with one line per row of that attribute:
    `` & <Model> & <cell> & ... & <cell> \\``, followed by an empty line.
    Each cell is ``mean $\pm$ std`` with two decimals. Within an attribute the
    best mean of a column is set in ``\textbf{}`` and the second best in
    ``\underline{}``; lower is better for ``dpd_mean`` and ``eod_mean``,
    higher is better for every other column. A mean equal to ``' '`` gives a
    blank cell.

    Args:
        df_results_final: DataFrame with the columns ``Model``, ``Attribute``
            and the metric columns listed in ``order`` below.

    Returns:
        None. The rows are written to standard output.

    Raises:
        KeyError: if a required column is missing.
    """
    order = ['Attribute', 'dpd_mean', 'dpd_std', 'eod_mean', 'eod_std', 'auc_mean', 'auc_std', 'esauc_mean', 'esauc_std', 'auc_group0_mean', 'auc_group0_std', 'auc_group1_mean', 'auc_group1_std', 'auc_group2_mean', 'auc_group2_std']
    lower_is_better = ('dpd_mean', 'eod_mean')
    # After 'Attribute', the columns alternate mean, std, mean, std, ...
    mean_positions = range(1, len(order), 2)
    attributes = list(df_results_final['Attribute'].unique())

    print(f"Order of attributes: {attributes}\n")

    for attribute in attributes:
        attr_rows = df_results_final[df_results_final['Attribute'] == attribute]
        ordered = attr_rows[order]

        # Rank every mean column once per attribute, best value first.
        # Blank placeholder strings cannot be ordered and are left out.
        rankings = {}
        for pos in mean_positions:
            numeric = [value for value in ordered[order[pos]].tolist() if not isinstance(value, str)]
            rankings[pos] = sorted(numeric, reverse=order[pos] not in lower_is_better)

        # Walk the rows actually present for this attribute, so an attribute
        # with fewer rows than there are models does not raise an IndexError.
        for model_name, row in zip(attr_rows['Model'], ordered.to_numpy().tolist()):
            cells = [_format_latex_cell(row[pos], row[pos + 1], rankings[pos]) for pos in mean_positions]
            print(' & ' + model_name + ' & ' + ' & '.join(cells) + ' \\\\')
        print()


def _format_latex_cell(mean, std, ranked):
    """Format one table cell; ``ranked`` holds the column's means, best first."""
    if isinstance(mean, str):
        # Placeholder for a value that does not exist for this attribute.
        return ' '
    text = f"{mean:.2f}"
    if mean == ranked[0]:
        text = '\\textbf{' + text + '}'
    elif len(ranked) > 1 and mean == ranked[1]:
        text = '\\underline{' + text + '}'
    if isinstance(std, str):
        # No spread available (blank placeholder): print the mean only.
        return text
    # Raw string: '\p' is not a valid escape sequence in a normal literal.
    return text + r' $\pm$ ' + f"{std:.2f}"


def create_full_table(csv_paths):
    """Build one combined summary table from several result CSV files.

    Args:
        csv_paths: iterable of ``(path, model_name)`` pairs. Each CSV is read
            in full, summarised with ``get_results`` and labelled with
            ``model_name`` in a leading ``Model`` column.

    Returns:
        The per-file summaries stacked in the given order with a fresh index,
        or ``None`` when ``csv_paths`` is empty.
    """
    per_model_tables = []
    for path, model_name in csv_paths:
        # The whole file is used; no leading rows are skipped here.
        model_table = get_results(pd.read_csv(path))
        model_table.insert(0, 'Model', model_name)
        per_model_tables.append(model_table)

    if not per_model_tables:
        return None
    return pd.concat(per_model_tables, ignore_index=True)


def create_full_table_singular(path, name_path_mapping):
    """Build one combined summary table from a single CSV holding many runs.

    Args:
        path: CSV file with a ``path`` column that identifies each run.
        name_path_mapping: mapping from model name to the ``path`` values of
            the runs that belong to that model.

    Returns:
        The ``get_results`` summaries of every model, each labelled in a
        leading ``Model`` column and stacked in mapping order with a fresh
        index, or ``None`` when the mapping is empty.
    """
    all_runs = pd.read_csv(path)

    per_model_tables = []
    for name, paths in name_path_mapping.items():
        # Keep only the rows of the runs listed for this model.
        selected_runs = all_runs[all_runs["path"].isin(paths)]
        model_table = get_results(selected_runs)
        model_table.insert(0, 'Model', name)
        per_model_tables.append(model_table)

    if not per_model_tables:
        return None
    return pd.concat(per_model_tables, ignore_index=True)


## FOR LINEAR PROBING
# df_results_final = create_full_table_singular(
#     path="../src/linear_probing_results_3.csv",
#     name_path_mapping={
#         "CLIP": [f"CLIP_{i}" for i in range(1, 4)],
#         "CLIP-FT": ["CLIP_FT_seed1542", "CLIP_FT_seed2928", "CLIP_FT_seed9350"]
#     }
# )
# display(df_results_final)
# print_results_latex(df_results_final)


## FOR FINETUNING
# Display name of every model mapped to its result file below RESULTS_DIR.
finetuned_runs = {
    "CLIP (ViT-L/14)": "result_vl14_clip.csv",
    "FairCLIP-race (ViT-L/14)": "result_vl14_race.csv",
    "FairCLIP-gender (ViT-L/14)": "result_vl14_gender.csv",
    "FairCLIP-ethnicity (ViT-L/14)": "result_vl14_ethnicity.csv",
    "FairCLIP-language (ViT-L/14)": "result_vl14_language.csv",
}
# create_full_table expects [path, model name] pairs, in table order.
csv_paths = [[RESULTS_DIR / file_name, label] for label, file_name in finetuned_runs.items()]
df_results_final = create_full_table(csv_paths)
print_results_latex(df_results_final)



# display(df_results_final)

In [None]:
# [result CSV, model name] pairs, in the order they appear in the table.
# NOTE(review): these paths start at '../results' while the other cells read
# from RESULTS_DIR ('./results'), and the first label says ViT-L/14 for a
# vit-b16 file - confirm which directory and label are intended.
csv_paths = [
    ['../results/best_vit-b16_slo_fundus.csv', 'Clip-ViT-L/14'],
    ['../results/best_vit-b16_slo_fundus_race_FairCLIP.csv', 'FairCLIP-Finetuned-Race'],
    ['../results/best_vit-b16_slo_fundus_gender_FairCLIP.csv', 'FairCLIP-Finetuned-Gender'],
    ['../results/best_vit-b16_slo_fundus_ethnicity_FairCLIP.csv', 'FairCLIP-Finetuned-Ethnicity'],
    ['../results/best_vit-b16_slo_fundus_language_FairCLIP.csv', 'FairCLIP-Finetuned-Language'],
]

# Summarise all files, print the LaTeX rows, then show the table itself.
df_results_final = create_full_table(csv_paths)
print_results_latex(df_results_final)
display(df_results_final)


In [None]:
\begin{table*}[ht]
\centering
\scriptsize
\caption{Zero-shot transfer results of CLIP vs. FairCLIP, reporting the mean and standard deviation across three random seeds.
% We report the mean and standard deviation across three random seeds for all experiments. The scores are in percentage.
}
\vspace{-2ex}
\label{tab:zero_shot_clip}
\adjustbox{width=.5\textwidth}{
\begin{tabular}{llccc}
\toprule
\textbf{Attribute} & \textbf{Model}  & \multicolumn{3}{c}{\textbf{Group-wise AUC $\uparrow$}} \\ \midrule
&&\textbf{Asian} & \textbf{Black} & \textbf{White} \\
\multirow{5}{*}{\textbf{Race}} 
 & CLIP-ViT-L/14 & 64.6 $\pm$ 6.64 & 65.74 $\pm$ 8.25 & 62.93 $\pm$ 5.65 \\
 & FairCLIP-Race & \underline{69.88} $\pm$ 2.2 & \underline{69.31} $\pm$ 1.53 & \underline{66.13} $\pm$ 2.3 \\
 & FairCLIP-Gender & 69.07 $\pm$ 2.8 & \textbf{69.87} $\pm$ 2.95 & 64.32 $\pm$ 2.08 \\
 & FairCLIP-Ethnicity & 63.57 $\pm$ 2.0 & 66.99 $\pm$ 6.09 & 61.61 $\pm$ 1.23 \\
 & FairCLIP-Language & \textbf{73.08} $\pm$ 6.24 & 69.02 $\pm$ 3.52 & \textbf{67.19} $\pm$ 4.7 \\                       \midrule
&&\textbf{Female} & \textbf{Male} \\
\multirow{5}{*}{\textbf{Gender}} 
 & CLIP-ViT-L/14 & 61.61 $\pm$ 5.24 & 67.69 $\pm$ 7.11 &   \\
 & FairCLIP-Race & \underline{64.35} $\pm$ 1.9 & \underline{71.32} $\pm$ 1.83 &      \\
 & FairCLIP-Gender & 63.39 $\pm$ 2.26 & 70.15 $\pm$ 2.01 &      \\
 & FairCLIP-Ethnicity & 61.13 $\pm$ 1.52 & 66.28 $\pm$ 3.81 &     \\
 & FairCLIP-Language & \textbf{66.08} $\pm$ 4.44 & \textbf{71.62} $\pm$ 3.69 &    \\
                               \midrule
&&\textbf{Non-Hispanic} & \textbf{Hispanic} \\
\multirow{5}{*}{\textbf{Ethnicity}} 
 & CLIP-ViT-L/14 & 64.48 $\pm$ 6.14 & 56.24 $\pm$ 3.04 &    \\
 & FairCLIP-Race & \underline{67.66} $\pm$ 1.97 & \underline{58.91} $\pm$ 2.66 &     \\
 & FairCLIP-Gender & 66.54 $\pm$ 2.17 & 57.94 $\pm$ 0.74 &      \\
 & FairCLIP-Ethnicity & 63.63 $\pm$ 2.3 & 53.88 $\pm$ 4.68 &      \\
  & FairCLIP-Language & \textbf{68.62} $\pm$ 4.1 & \textbf{63.01} $\pm$ 4.75 &     \\
                                \midrule
&&\textbf{English} & \textbf{Spanish} & \textbf{Others} \\
\multirow{5}{*}{\textbf{Language}} 
 & CLIP-ViT-L/14 & 63.97 $\pm$ 6.25 & \textbf{68.37} $\pm$ 3.13 & 61.13 $\pm$ 2.87 \\
 & FairCLIP-Race & \underline{67.23} $\pm$ 1.95 & 62.88 $\pm$ 3.27 & \textbf{63.57} $\pm$ 3.6 \\
 & FairCLIP-Gender & 66.14 $\pm$ 2.14 & \underline{64.77} $\pm$ 0.49 & 60.98 $\pm$ 2.58 \\
 & FairCLIP-Ethnicity & 63.26 $\pm$ 2.2 & 56.91 $\pm$ 14.44 & 56.51 $\pm$ 2.45 \\
& FairCLIP-Language & \textbf{68.43} $\pm$ 4.16 & \textbf{68.37} $\pm$ 7.99 & \underline{62.58} $\pm$ 6.21 \\
\bottomrule
\end{tabular}}
\end{table*}

## References
<a id="1">[1]</a> 
https://ophai.hms.harvard.edu/datasets/harvard-fairvlmed10k

<a id="2">[2]</a> 
Luo, Y., Shi, M., Khan, M. O., Afzal, M. M., Huang, H., Yuan, S., ... & Wang, M. (2024). FairCLIP: Harnessing fairness in vision-language learning. In Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (pp. 12289-12301).