In [None]:
!pip install --quiet openai

import json
import os
from getpass import getpass

from openai import OpenAI

# Never store an API key in the notebook itself: use the value already present
# in the environment and only prompt for it (without echo) when it is missing.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OPENAI_API_KEY: ")

# No key is passed explicitly; the client is expected to read OPENAI_API_KEY
# from the environment.
client = OpenAI()


# Fine-tuned models under evaluation
LEGEND_MODEL_ID = "ft:gpt-4o-2024-08-06:sesar-lab:gpt-tuned-legend:Cae1R16c"
REG_MODEL_ID    = "ft:gpt-4o-2024-08-06:sesar-lab:gpt-tuned-reg:CaeYGbam"

# Evaluation file names
SCENARIO_FILE = "gender_eval_scenario.jsonl"
BIAS_FILE     = "gender_eval_bias.jsonl"
STANCE_FILE   = "gender_eval_stance.jsonl"

print("Setup completato")


Setup completato. Client inizializzato.


In [None]:
def load_eval_rows(path):
    """Read a JSONL file and return the ``messages`` value of every record.

    Each non-blank line is parsed as one JSON object; lines that are empty
    after stripping whitespace are skipped. The result is a list with one
    entry (the object's ``"messages"`` value) per parsed line, in file order.
    """
    with open(path, "r", encoding="utf-8") as handle:
        stripped_lines = (raw.strip() for raw in handle)
        return [json.loads(entry)["messages"] for entry in stripped_lines if entry]

def gold_label(messages):
    """Return the gold label: the last message's ``content``, whitespace-stripped.

    The final entry of each JSONL conversation (the assistant turn) is treated
    as the expected answer.
    """
    final_message = messages[-1]
    label_text = final_message["content"]
    return label_text.strip()

def call_model(model_id, messages, temperature=0.0, max_tokens=50):
    """Send a chat-completion request and return the reply text.

    Args:
        model_id: Model identifier passed to the API as ``model``.
        messages: Chat messages forwarded unchanged to the API.
        temperature: Sampling temperature (default 0.0).
        max_tokens: Upper bound on generated tokens; defaults to 50, the value
            that was previously hard-coded.

    Returns:
        The first choice's message content with surrounding whitespace
        removed, or an empty string when the reply carries no text content.
    """
    response = client.chat.completions.create(
        model=model_id,
        messages=messages,
        temperature=temperature,
        max_tokens=max_tokens,
    )
    # The message content may be None (the API schema declares it nullable);
    # treat that as an empty answer instead of crashing the evaluation loop.
    content = response.choices[0].message.content
    return (content or "").strip()


def norm_simple(text):
    """Lower-case the text, trim surrounding whitespace, then drop all periods.

    Periods are removed after trimming, so whitespace that was next to a
    leading or trailing period is left in the result.
    """
    lowered = text.lower()
    trimmed = lowered.strip()
    return "".join(trimmed.split("."))

def norm_scenario(text):
    """Map a free-form answer to ``"compliant"`` or ``"non-compliant"``.

    The text is first passed through ``norm_simple``. Negative forms are
    checked before the positive one because they also contain "compliant".
    When nothing matches, the first whitespace-separated word is returned,
    or an empty string when there is no word at all.
    """
    t = norm_simple(text)
    # accept slight variations of the negative label
    if ("non" in t and "compliant" in t) or "not compliant" in t:
        return "non-compliant"
    if "compliant" in t or "yes" in t:
        return "compliant"
    # fallback: first word. norm_simple can return whitespace-only text
    # (e.g. for ". ."), for which split() is empty, so do not index blindly.
    words = t.split()
    return words[0] if words else ""

def norm_bias(text):
    """Map a free-form answer to ``"biased"`` or ``"unbiased"``.

    The text is first passed through ``norm_simple``. The negative forms are
    tested first: "unbiased" contains "biased" as a substring, so checking
    "biased" first would label every "unbiased" answer as "biased".
    When nothing matches, the first whitespace-separated word is returned,
    or an empty string when there is no word at all.
    """
    t = norm_simple(text)
    if "unbiased" in t or "no bias" in t or "not biased" in t:
        return "unbiased"
    if "biased" in t:
        return "biased"
    # fallback: first word. norm_simple can return whitespace-only text
    # (e.g. for ". ."), for which split() is empty, so do not index blindly.
    words = t.split()
    return words[0] if words else ""

def norm_stance(text):
    """Map a free-form answer to ``"supportive"``, ``"against"`` or ``"neutral"``.

    The text is first passed through ``norm_simple``; the categories are then
    tested in the order supportive, against, neutral. When nothing matches,
    the first whitespace-separated word is returned (a bare label is already
    fine as-is), or an empty string when there is no word at all.
    """
    t = norm_simple(text)
    words = t.split()
    # "pro " needs a following space, which norm_simple strips from the end of
    # the answer, so a stand-alone "pro" word is matched explicitly as well.
    if (
        "support" in t
        or "pro " in t
        or "pro" in words
        or "in favour" in t
        or "in favor" in t
    ):
        return "supportive"
    if "against" in t or "oppose" in t or "contrary" in t:
        return "against"
    if "neutral" in t or "no strong opinion" in t:
        return "neutral"
    # fallback: first word; `words` is empty for empty or whitespace-only text
    return words[0] if words else ""

def accuracy(model_id, rows, normalizer_gold, normalizer_pred, verbose=False):
    """Return the fraction of rows for which the model's answer matches the gold label.

    For every conversation in ``rows`` the last message supplies the gold
    label (via ``gold_label``) and the remaining messages are sent to the
    model (via ``call_model``). Gold and prediction are normalized with
    ``normalizer_gold`` / ``normalizer_pred`` and compared for equality.
    With ``verbose=True`` each example's gold label, raw prediction and
    normalized prediction are printed. Returns 0.0 when ``rows`` is empty.
    """
    hits = 0
    seen = 0
    for idx, conversation in enumerate(rows, start=1):
        expected = normalizer_gold(gold_label(conversation))
        # the prompt is the conversation without its final (gold) message
        prompt = conversation[:-1]
        raw_answer = call_model(model_id, prompt)
        predicted = normalizer_pred(raw_answer)

        # per-example debug trace, only when requested
        if verbose:
            print(f"\nEsempio {idx}")
            print("Gold:", expected)
            print("Pred:", raw_answer, "->", predicted)

        if predicted == expected:
            hits += 1
        seen += 1

    if seen > 0:
        return hits / seen
    return 0.0


In [None]:
# Load the three evaluation sets; each row is one conversation whose final
# message is used as the gold label by gold_label().
scenario_rows = load_eval_rows(SCENARIO_FILE)
bias_rows     = load_eval_rows(BIAS_FILE)
stance_rows   = load_eval_rows(STANCE_FILE)

# Fail fast on unusable data: accuracy() returns 0.0 for an empty set, and a
# row with fewer than two messages has no prompt left once the gold label is
# removed.
for task_name, task_rows in (
    ("ScenarioEval", scenario_rows),
    ("BiasDetect", bias_rows),
    ("Stance", stance_rows),
):
    assert task_rows, f"{task_name}: no evaluation rows loaded"
    assert all(len(row) >= 2 for row in task_rows), (
        f"{task_name}: every row needs at least one prompt message plus the gold label"
    )
    print(f"{task_name} esempi:", len(task_rows))


ScenarioEval esempi: 10
BiasDetect esempi: 10
Stance esempi: 10


In [None]:
# Models to evaluate: display name -> fine-tuned model id.
models_under_test = {
    "Legend_tuned": LEGEND_MODEL_ID,
    "Reg_tuned": REG_MODEL_ID,
}

# Tasks: result key -> (evaluation rows, normalizer applied to both the gold
# label and the prediction).
eval_tasks = {
    "ScenarioEval_acc": (scenario_rows, norm_scenario),
    "BiasDetect_acc": (bias_rows, norm_bias),
    "Stance_acc": (stance_rows, norm_stance),
}

# results[model name][task key] = accuracy in [0, 1]
results = {}
for model_name, model_id in models_under_test.items():
    print(f"Valutazione {model_name}...")
    results[model_name] = {
        task_key: accuracy(model_id, task_rows,
                           normalizer_gold=normalizer,
                           normalizer_pred=normalizer)
        for task_key, (task_rows, normalizer) in eval_tasks.items()
    }

print("\n=== RISULTATI GENDERGLUE ===")
for model_name, scores in results.items():
    print(model_name)
    for task, acc in scores.items():
        print(f"  {task}: {acc:.3f}")


Valutazione Legend_tuned...
Valutazione Reg_tuned...

=== RISULTATI GENDERGLUE ===
Legend_tuned
  ScenarioEval_acc: 0.000
  BiasDetect_acc: 0.400
  Stance_acc: 0.900
Reg_tuned
  ScenarioEval_acc: 0.500
  BiasDetect_acc: 0.100
  Stance_acc: 1.000
