In [5]:
import pandas as pd
import numpy as np
from sklearn.utils import resample
from sklearn.metrics import f1_score

In [6]:
# Function to convert the string to a list of integers or floats
def convert_to_list(string):
    """Parse the string form of a flat numeric list into a Python list.

    Accepts both comma-separated (``"[1, 0, 1]"``) and whitespace-separated
    (``"[0.5 1.0]"``) layouts, with or without the surrounding brackets.

    Parameters
    ----------
    string : str
        Text such as ``"[1, 0, 1]"`` or ``"[0.25 0.75]"``.

    Returns
    -------
    list
        A list of ``float`` if the text contains a ``'.'``, otherwise a list
        of ``int``. An empty list for ``"[]"`` or blank input.

    Raises
    ------
    ValueError
        If any token cannot be parsed as a number of the selected type.
    """
    # Drop surrounding whitespace first so that " [1, 2] " still loses its brackets.
    inner = string.strip().strip("[]")

    # Treat commas as whitespace so either separator (or a mix) tokenises the
    # same way; str.split() with no argument also discards empty tokens, which
    # makes "[]" come out as an empty list instead of raising.
    tokens = inner.replace(",", " ").split()

    # A decimal point anywhere means the whole list is parsed as floats.
    cast = float if "." in inner else int
    return [cast(token) for token in tokens]

In [7]:
# Load the dataset
sample_df = pd.read_csv('full_output_frame_zero_shot_high_res.csv')

# Convert string representations of lists back to actual lists
sample_df['parsed_answer'] = sample_df['parsed_answer'].apply(convert_to_list)
sample_df['ground_truth'] = sample_df['ground_truth'].apply(convert_to_list)

y_true = sample_df["ground_truth"].tolist()
y_pred = sample_df["parsed_answer"].tolist()

# Number of bootstrap samples
n_bootstrap_samples = 1000

# Seeded generator so the bootstrap mean/std are reproducible across runs.
# A single RandomState *instance* is shared by every iteration: passing a
# fixed integer seed to resample() would draw the identical sample each time.
bootstrap_rng = np.random.RandomState(42)

# Store the F1 scores
f1_scores = []

# Perform bootstrapping
for _ in range(n_bootstrap_samples):
    # Resample (y_true, y_pred) pairs jointly, with replacement
    y_true_resampled, y_pred_resampled = resample(
        y_true, y_pred, replace=True, random_state=bootstrap_rng
    )

    # Macro-averaged F1 on this bootstrap replicate
    f1 = f1_score(y_true_resampled, y_pred_resampled, average='macro')
    f1_scores.append(f1)

# Mean and (population, ddof=0) standard deviation over all bootstrap replicates
bootstrap_avg_f1 = np.mean(f1_scores)
bootstrap_std_f1 = np.std(f1_scores)

print(f'Bootstrap Average F1 Score: {bootstrap_avg_f1}')
print(f'Bootstrap F1 Score Standard Deviation: {bootstrap_std_f1}')

Bootstrap Average F1 Score: 0.2143597283066372
Bootstrap F1 Score Standard Deviation: 0.01187191089834755


In [8]:
# Macro F1 per race subgroup: select each subgroup's rows once, then pull
# the label and prediction columns from that selection.
white_rows = sample_df[sample_df.race == 'White']
black_rows = sample_df[sample_df.race == 'Black']

y_true_white = white_rows["ground_truth"].tolist()
y_pred_white = white_rows["parsed_answer"].tolist()

y_true_black = black_rows["ground_truth"].tolist()
y_pred_black = black_rows["parsed_answer"].tolist()

white_f1 = f1_score(y_true_white, y_pred_white, average='macro')
black_f1 = f1_score(y_true_black, y_pred_black, average='macro')

print('White F1: ', white_f1)
print('Black F1: ', black_f1)

White F1:  0.21493140982288267
Black F1:  0.21195375845242478


In [9]:
# Macro F1 per sex subgroup.
# These variables are named for the subgroup they actually hold; reusing the
# race-based names (y_true_white / y_true_black) here would silently overwrite
# the race subgroup data for any later cell that reads them.
male_rows = sample_df[sample_df.sex == 'Male']
female_rows = sample_df[sample_df.sex == 'Female']

y_true_male = male_rows["ground_truth"].tolist()
y_pred_male = male_rows["parsed_answer"].tolist()

y_true_female = female_rows["ground_truth"].tolist()
y_pred_female = female_rows["parsed_answer"].tolist()

print('Male F1: ',f1_score(y_true_male, y_pred_male, average='macro'))
print('Female F1: ',f1_score(y_true_female, y_pred_female, average='macro'))

Male F1:  0.2264418581178549
Female F1:  0.196686408700425


In [10]:
# Random-guess baseline: score uniformly random binary predictions against
# the real labels, for comparison with the model's macro F1.

# Take the shape from the labels themselves so the baseline keeps working if
# the dataset size or label count changes (previously hard-coded as 499 x 14).
n_samples, n_labels = np.asarray(y_true).shape

# Seeded generator so the baseline number is reproducible across runs.
baseline_rng = np.random.RandomState(42)

# Random predictions for each label (each entry is 0 or 1)
y_pred_random = baseline_rng.randint(0, 2, size=(n_samples, n_labels))

# Calculate F1 score
f1_random = f1_score(y_true, y_pred_random, average='macro')
print('Random F1: ', f1_random)

Random F1:  0.22004752111796164
