In [2]:
import json
import pandas as pd

In [20]:
# Load the benchmark results: a JSON Lines file holding one JSON record per line.
file = "../scandeval_benchmark_results.jsonl"
# Decode explicitly as UTF-8 so the load does not depend on the platform's
# default locale encoding.
with open(file, 'r', encoding='utf-8') as f:
    # Keep only non-blank lines: an empty (e.g. trailing) line would make
    # json.loads raise further down.
    json_list = [line for line in f if line.strip()]

# One dict per record, then one DataFrame row per record.
data = [json.loads(item) for item in json_list]
df = pd.DataFrame(data)

In [21]:
def format_mean_se(mean, se):
    """Render two numbers as ``"<mean> ± <se>"``, each with two decimals.

    Args:
        mean: Numeric value shown before the ``±`` sign.
        se: Numeric value shown after the ``±`` sign (presumably the
            standard error of ``mean``, judging by the parameter name).

    Returns:
        The formatted string, e.g. ``"67.31 ± 3.03"``.
    """
    return "{:.2f} ± {:.2f}".format(mean, se)

# Flatten every benchmark record in `df` into a display-friendly dict and
# collect those dicts, in row order, in `results`.
results = []
for _, row in df.iterrows():
    results_dict = row['results']['total']

    # Pair every metric with its "<metric>_se" companion and format both
    # numbers into a single "mean ± se" string.
    formatted_metrics = {}
    for metric, value in results_dict.items():
        # Companion entries are the keys that END in "_se". A plain substring
        # test would also discard any real metric that merely contains "_se"
        # somewhere inside its name.
        if metric.endswith("_se"):
            continue
        formatted_metrics[metric] = format_mean_se(value, results_dict[metric + "_se"])

    results.append({
        'Dataset': row['dataset'],
        'Task': row['task'],
        'Language(s)': ', '.join(row['dataset_languages']),
        'Model': row['model'],
        'Results': formatted_metrics,
        'Model Information': {
            # Shown with thousands separators, e.g. "6,700,000,000".
            'Number of Parameters': f"{int(row['num_model_parameters']):,}",
            'Max Sequence Length': row['max_sequence_length'],
            'Vocabulary Size': row['vocabulary_size'],
            'Generative': row['generative'],
            'Few-shot': row['few_shot'],
            'Validation Split': row['validation_split'],
            'Scandeval Version': row['scandeval_version']
        }
    })

In [22]:
x = 11 # Change this to see different models

# The indexing below treats `results` as consecutive groups of 8 entries,
# one group per model; `x` selects the group.
first_entry = x * 8
print("Model: ", results[first_entry]['Model'])
print("---------------------------------------------")
for entry_index in range(first_entry, first_entry + 8):
    entry = results[entry_index]
    print(entry['Dataset'], entry['Results'])

Model:  merged-models/gpt-sw3-6.7b-hopkok-v3-nosystem-DPO
---------------------------------------------
swerec {'test_mcc': '67.31 ± 3.03', 'test_macro_f1': '66.82 ± 2.68'}
suc3 {'test_micro_f1_no_misc': '31.97 ± 2.48', 'test_micro_f1': '20.12 ± 2.76'}
scala-sv {'test_mcc': '7.46 ± 2.47', 'test_macro_f1': '50.99 ± 2.71'}
scandiqa-sv {'test_em': '48.90 ± 1.25', 'test_f1': '55.62 ± 1.01'}
swedn {'test_bertscore': '61.62 ± 2.50', 'test_rouge_l': '17.60 ± 0.90'}
mmlu-sv {'test_mcc': '8.95 ± 1.51', 'test_accuracy': '31.67 ± 1.10'}
hellaswag-sv {'test_accuracy': '33.32 ± 1.31', 'test_mcc': '11.63 ± 1.87'}
speed {'test_speed': '5376.64 ± 1129.38', 'test_speed_short': '1300.07 ± 415.72'}


In [None]:
def parse_metric(metric_str):
    """Split a ``"<mean> ± <spread>"`` string into its two numbers.

    Args:
        metric_str: Text with exactly one ``" ± "`` separator (a space on
            each side of the sign), e.g. ``"67.31 ± 3.03"``.

    Returns:
        A ``(mean, spread)`` tuple of floats.

    Raises:
        ValueError: If the separator does not occur exactly once, or if
            either part is not a valid float.
    """
    parts = metric_str.split(' ± ')
    # Unpacking enforces that the separator occurred exactly once.
    mean_text, spread_text = parts
    return float(mean_text), float(spread_text)

def std_deviation_away(your_metric, best_metric):
    """Measure the gap between two scores in units of the reference spread.

    Args:
        your_metric: ``"<mean> ± <spread>"`` string for the evaluated model;
            only its mean is used.
        best_metric: ``"<mean> ± <spread>"`` string for the reference model;
            both its mean and its spread are used.

    Returns:
        ``abs(your_mean - best_mean) / best_spread`` as a float. When the
        reference spread is zero the result is ``0.0`` if the two means are
        equal and ``float('inf')`` otherwise.
    """
    your_mean, _ = parse_metric(your_metric)
    best_mean, best_std = parse_metric(best_metric)
    gap = abs(your_mean - best_mean)
    # A spread formatted to two decimals can legitimately read "0.00";
    # dividing by it would raise ZeroDivisionError.
    if best_std == 0:
        return 0.0 if gap == 0 else float('inf')
    # NOTE(review): the absolute value means a score ABOVE the reference is
    # counted as distance too - confirm that this is intended.
    return gap / best_std

def calculate_rank(your_metrics, best_metrics):
    """Turn per-metric distances from the reference into a single rank score.

    The original author describes this as the ScandEval formula; that has
    not been verified against ScandEval itself.

    Args:
        your_metrics: Mapping ``category -> {metric_name: "<mean> ± <spread>"}``
            for the evaluated model.
        best_metrics: Mapping with the same nesting for the reference model;
            it must contain every category/metric present in ``your_metrics``.

    Returns:
        ``1 +`` the arithmetic mean of ``std_deviation_away`` over all metrics.

    Raises:
        KeyError: If ``best_metrics`` lacks a category or metric.
        ZeroDivisionError: If ``your_metrics`` contains no metrics at all.
    """
    distances = [
        std_deviation_away(own_value, best_metrics[category][metric_name])
        for category, category_metrics in your_metrics.items()
        for metric_name, own_value in category_metrics.items()
    ]
    mean_distance = sum(distances) / len(distances)
    return 1 + mean_distance

# Formatted scores of the entries at positions 8-15 of `results`, keyed by
# dataset name.
metrics_json = {}
for i in range(8, 16):
    metrics_json[results[i]['Dataset']] = results[i]['Results']

# Start from a per-dataset copy of metrics_json (so that editing the reference
# never touches the model's own scores), then overwrite each score with the
# hand-entered reference value.
best_json = {k: v.copy() for k, v in metrics_json.items()}

best_reference = {
    'swerec': {'test_mcc': '56.60 ± 3.37', 'test_macro_f1': '62.73 ± 3.61'},
    'suc3': {'test_micro_f1_no_misc': '14.58 ± 1.30', 'test_micro_f1': '14.79 ± 1.27'},
    'scala-sv': {'test_mcc': '10.92 ± 1.83', 'test_macro_f1': '52.63 ± 2.98'},
    'scandiqa-sv': {'test_em': '50.18 ± 0.54', 'test_f1': '57.90 ± 0.53'},
    'swedn': {'test_bertscore': '64.89 ± 0.15', 'test_rouge_l': '18.79 ± 0.22'},
    'mmlu-sv': {'test_mcc': '6.16 ± 0.81', 'test_accuracy': '28.35 ± 0.97'},
    # Fixed: these two strings were previously stored under each other's keys
    # (accuracy 10.90 / MCC 32.01), the reverse of the accuracy/MCC
    # relationship seen for every other multiple-choice entry in this
    # notebook. NOTE(review): confirm against the source of the reference
    # numbers.
    'hellaswag-sv': {'test_accuracy': '32.01 ± 0.54', 'test_mcc': '10.90 ± 0.86'},
    'speed': {'test_speed': '2383 ± 451', 'test_speed_short': '718 ± 221'},
}
for dataset_name, reference_scores in best_reference.items():
    # Indexing (not .get) keeps the KeyError when a dataset is missing from
    # metrics_json.
    best_json[dataset_name].update(reference_scores)

In [None]:
# Display the evaluated model's formatted scores next to the reference scores
# that calculate_rank compares them with.
metrics_json, best_json

In [None]:
# Rank score of the selected model relative to the reference scores.
rank_score = calculate_rank(metrics_json, best_json)
print(rank_score)

## Run ScandEval from script

## BiaSWE Evaluations

In [34]:
import json
#../data/BiaSWE-annotated-bias-gpt-sw3-6.7b-hopkok-v3-nosystem-DPO-Run-1.jsonl
#../data/BiaSWE-annotated-bias-gpt-sw3-6.7b-v2-instruct.jsonl
#../data/BiaSWE-annotated-bias-gpt-sw3-6.7b-hopkok-v3-nosystem.jsonl
path = "../data/BiaSWE-annotated-bias-gpt-sw3-6.7b-hopkok-v3-nosystem-DPO-Run-1.jsonl"

# JSON Lines file: one annotated record per line. Decode explicitly as UTF-8
# so the load does not depend on the platform's default locale encoding, and
# skip blank lines (e.g. a trailing newline), which json.loads would reject.
with open(path, 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

In [None]:
# Peek at the ground-truth hate-speech label of the first loaded record.
data[0]['annotations']['ground_truth']['hate_speech']

In [13]:
from sklearn.metrics import f1_score, accuracy_score

# Gather ground-truth and predicted labels for both label types over every
# record in `data` (all records at once, no per-run split).
all_annotations = [d['annotations'] for d in data]

y_true_hate_speech = [a['ground_truth']['hate_speech'] for a in all_annotations]
y_pred_hate_speech = [a['labels']['hate_speech'] for a in all_annotations]

y_true_misogyny = [a['ground_truth']['misogyny'] for a in all_annotations]
y_pred_misogyny = [a['labels']['misogyny'] for a in all_annotations]

# Per-label F1 and accuracy, using scikit-learn's default settings.
f1_hate_speech = f1_score(y_true_hate_speech, y_pred_hate_speech)
f1_misogyny = f1_score(y_true_misogyny, y_pred_misogyny)

accuracy_hate_speech = accuracy_score(y_true_hate_speech, y_pred_hate_speech)
accuracy_misogyny = accuracy_score(y_true_misogyny, y_pred_misogyny)

# Joint accuracy: a record counts as correct only when BOTH labels match.
y_true_combined = list(zip(y_true_hate_speech, y_true_misogyny))
y_pred_combined = list(zip(y_pred_hate_speech, y_pred_misogyny))

exact_matches = sum(
    1 for true_pair, pred_pair in zip(y_true_combined, y_pred_combined)
    if true_pair == pred_pair
)
joint_accuracy = exact_matches / len(y_true_combined)

In [14]:
# Unweighted mean of the two per-label F1 scores.
macro_f1 = sum((f1_hate_speech, f1_misogyny)) / 2

# One report line per metric, printed in a fixed order.
report_lines = [
    f"Accuracy for Hate Speech: {accuracy_hate_speech}",
    f"Accuracy for Misogyny: {accuracy_misogyny}",
    f"Joint Accuracy: {joint_accuracy}",
    f"F1-Score for Hate Speech: {f1_hate_speech}",
    f"F1-Score for Misogyny: {f1_misogyny}",
    f"Macro-Averaged F1-Score: {macro_f1}",
]
for report_line in report_lines:
    print(report_line)

Accuracy for Hate Speech: 0.4275555555555556
Accuracy for Misogyny: 0.5353333333333333
Joint Accuracy: 0.41555555555555557
F1-Score for Hate Speech: 0.016793893129770993
F1-Score for Misogyny: 0.0760053026955369
Macro-Averaged F1-Score: 0.046399597912653946


In [35]:
from sklearn.metrics import f1_score, accuracy_score
import numpy as np

def calculate_metrics(data):
    """Score predicted hate-speech and misogyny labels against ground truth.

    Args:
        data: Iterable of records. Each record must provide
            ``record['annotations']['ground_truth']`` and
            ``record['annotations']['labels']``, both holding a
            ``'hate_speech'`` and a ``'misogyny'`` entry.

    Returns:
        Dict with the keys ``f1_hate_speech``, ``f1_misogyny``,
        ``accuracy_hate_speech``, ``accuracy_misogyny``, ``joint_accuracy``
        (share of records where both labels match) and ``macro_f1``
        (unweighted mean of the two F1 scores).

    Raises:
        ZeroDivisionError: If ``data`` is empty and the scikit-learn calls
            did not already fail.
    """
    annotation_sets = [record['annotations'] for record in data]

    truth_hate = [ann['ground_truth']['hate_speech'] for ann in annotation_sets]
    preds_hate = [ann['labels']['hate_speech'] for ann in annotation_sets]
    truth_misogyny = [ann['ground_truth']['misogyny'] for ann in annotation_sets]
    preds_misogyny = [ann['labels']['misogyny'] for ann in annotation_sets]

    # scikit-learn defaults are used for both scorers.
    hate_f1 = f1_score(truth_hate, preds_hate)
    misogyny_f1 = f1_score(truth_misogyny, preds_misogyny)

    hate_accuracy = accuracy_score(truth_hate, preds_hate)
    misogyny_accuracy = accuracy_score(truth_misogyny, preds_misogyny)

    # A record is jointly correct only when both of its labels are right.
    truth_pairs = list(zip(truth_hate, truth_misogyny))
    pred_pairs = list(zip(preds_hate, preds_misogyny))
    jointly_correct = sum(
        1 for truth_pair, pred_pair in zip(truth_pairs, pred_pairs)
        if truth_pair == pred_pair
    )

    return {
        'f1_hate_speech': hate_f1,
        'f1_misogyny': misogyny_f1,
        'accuracy_hate_speech': hate_accuracy,
        'accuracy_misogyny': misogyny_accuracy,
        'joint_accuracy': jointly_correct / len(truth_pairs),
        'macro_f1': (hate_f1 + misogyny_f1) / 2
    }

# `data` is treated as consecutive runs of RUN_SIZE records each.
RUN_SIZE = 450

num_runs = len(data) // RUN_SIZE
run_chunks = [data[i*RUN_SIZE:(i+1)*RUN_SIZE] for i in range(num_runs)]

# Integer division drops any trailing records that do not fill a whole run;
# say so instead of discarding them silently.
leftover_records = len(data) - num_runs * RUN_SIZE
if leftover_records:
    print(f"Warning: ignoring {leftover_records} trailing record(s) that do not fill a complete run of {RUN_SIZE}.")

# Store metrics for each run
metrics_list = [calculate_metrics(run_data) for run_data in run_chunks]

# Compute average metrics (same keys, same order as calculate_metrics returns)
METRIC_NAMES = (
    'f1_hate_speech',
    'f1_misogyny',
    'accuracy_hate_speech',
    'accuracy_misogyny',
    'joint_accuracy',
    'macro_f1',
)
average_metrics = {
    name: np.mean([run_metrics[name] for run_metrics in metrics_list])
    for name in METRIC_NAMES
}

# Print average metrics
for metric, average in average_metrics.items():
    print(f"Average {metric}: {average}")

Average f1_hate_speech: 0.6831262134969214
Average f1_misogyny: 0.6259955918326965
Average accuracy_hate_speech: 0.5748888888888889
Average accuracy_misogyny: 0.5173333333333334
Average joint_accuracy: 0.44333333333333336
Average macro_f1: 0.6545609026648089


In [None]:
# Calculate accuracy


