In [26]:
import argparse
import os, json, re
import pandas as pd

# Helper Functions

In [42]:
### Load Grount Truths using prompts file
def load_gts(model_responses_file):
    gt_values_list_format = {}
    ### open jsonl file
    with open(model_responses_file, "r") as f:
        for line in f:
            data = json.loads(line)
            gt_value_unproccessed = data["gt"]
            ground_truth = re.sub(r"(?<=\d),(?=\d)", "", gt_value_unproccessed)
            ground_truth = ground_truth.lower().strip()
            ground_truth = [x.lower().strip() for x in ground_truth.replace("{", "").replace("}", "").replace("||", "|").split("|")]
            
            for j in range(len(ground_truth)): # Fix the commas in numbers problem
                temp_val = ground_truth[j]
                for _ in range(5):
                    temp_val = re.sub(r'(\d+),(\d+)', r'\1\2', temp_val)
                ground_truth[j] = temp_val

            gt_values_list_format[data["qap_id"]] = data["gt"]
    
    return gt_values_list_format


### Calculate MC Score - Mean Containment
def calculate_containment_score(response, gt_tokens):
    match_count = sum([1 for token in gt_tokens if token in response])
    if len(gt_tokens) == 0:
        return 0
    return float(match_count / len(gt_tokens))


### Clean model response if verbosse
def post_process_response(response: str) -> str:
    response = response.lower().strip()
    # Define the regex pattern to match any of the junk tokens
    junk_tokens = r"(?:<eos_token>|#input|# input|# solution:|#solution|# explanation:|#explanation|note:|table:)"
    # Find the position of the first junk token
    match = re.search(junk_tokens, response)
    # If a junk token is found, truncate the response before it
    if match:
        response = response[:match.start()]
    
    ### Check for double new line (we see this trend quite often before models start yapping)
    response = response.split("\n\n")[0]
    reponse = response.split("\n```\n")[0]

    # Remove '`' characters
    response = response.replace("`", "").replace("\n","")

    # Replace "," in numbers with "" (example: 1,000 -> 1000 & 1,232.23 -> 1232.23 & |1,000,000 -> |1000000)
    for i in range(5):
        response = re.sub(r'(\d+),(\d+)', r'\1\2', response)
    
    return response.lower().strip()

### Main function for scoring
def score_model_responses(model_responses_file, promtp_type = "generic"):
    results = []
    error_ids = []
    try:
        gts = load_gts(model_responses_file)
    except:
        gts = load_gts("/export/home/mshahmmer/hcsd_github_reformat/latest_inf_analysis/realWorld_HCT-all_prompts.jsonl")


    # Load model_responses_file as jsonl
    with open(model_responses_file, 'r') as file:
        all_responses = [json.loads(line) for line in file]

    qap_ids_with_missing_gt = []

    # Process each response
    for record in all_responses:
        qap_id = record["qap_id"].split("&")[0]
        model_name = model_responses_file.split("--")[1]
        dataset_name = record["qap_id"].split("--")[0]

        if qap_id in gts:
            if record["response"]  == None:
                record["response"] =  ""
            raw_response = record["response"].lower().strip()
            cleaned_response = post_process_response(record["response"]).lower().strip()

            gt_t = [x.lower().strip() for x in gts[qap_id]]

            raw_containment_score = calculate_containment_score(raw_response, gt_t)
            raw_is_complete_containment = 1 if raw_containment_score == 1.0 else 0

            cleaned_containment_score = calculate_containment_score(cleaned_response, gt_t)
            cleaned_is_complete_containment = 1 if cleaned_containment_score == 1.0 else 0

            results.append({
                "qap_id": qap_id,
                "model_name": model_name,
                "dataset_name": dataset_name,
                "raw_response": raw_response,
                "cleaned_response": cleaned_response,
                "prompt_type" : promtp_type,
                "gt": gt_t,
                "MC_SCORE": cleaned_containment_score,
                "CC_SCORE": cleaned_is_complete_containment,
                "raw_response_length": len(raw_response.split()),
                "cleaned_response_length": len(cleaned_response.split()),
                # decrease in token count after cleaning
                "perc_junk_tokens_in_raw_response": ((len(raw_response) - len(cleaned_response))*100) / (len(raw_response)+1)
            })
        else:
            qap_ids_with_missing_gt.append(record["qap_id"])

    return pd.DataFrame(results), qap_ids_with_missing_gt

# MAIN

### SET USER INPUTS

In [None]:
### Give path to where all model responses that are to be scored are saved
model_responses_folder = "../results/model_responses/"

### Give path to where all scores files are to be saved
scores_folder = "../results/scores/"
# scores_folder = ""

### *OPTIONAL* Prompt type if specfic one is to provided
prompt_type = "generic"

In [48]:
### GIT TEST
model_name_to_results_df = {}
for fname in os.listdir(model_responses_folder):
    print(fname)
    model_name_t = fname.split("--")[0]    
    model_response_file_t = os.path.join(model_responses_folder, fname)
    results_df_t, missing_gt_ids = score_model_responses(model_response_file_t)
    print(f"Model: {model_name_t} - Missing GTs: {len(missing_gt_ids)}")
    print(f"Model: {model_name_t} - Results DF Shape: {results_df_t.shape}")
    model_name_to_results_df[model_name_t] = results_df_t



results--Meta-Llama-3.1-8B-Instruct--realWorld_HCT-all_promptsl.jsonl
Model: results - Missing GTs: 0
Model: results - Results DF Shape: (9835, 12)


In [None]:
all_results_df = pd.concat([v for k,v in model_name_to_results_df.items()])
all_results_df = all_results_df.reset_index(drop=True)
print(all_results_df.shape)

### Collapse on model_name per dataset_name level
all_results_df_grouped_by_model_dataset = all_results_df.groupby(["model_name", "dataset_name"]).agg({
    "MC_SCORE": "mean",
    "CC_SCORE": "mean",
    "raw_response_length": "mean",
    "cleaned_response_length": "mean",
    "perc_junk_tokens_in_raw_response": "mean"
}).reset_index()

all_results_df_grouped_by_model_dataset.to_csv(os.path.join(scores_folder, f"{model_responses_folder.split('/')[-1]}--perModelDataset-scores.csv"), index=False) 

### Collapse on dataset_name
all_results_df_grouped_by_model = all_results_df.groupby(["model_name"]).agg({
    "MC_SCORE": "mean",
    "CC_SCORE": "mean",
    "raw_response_length": "mean",
    "cleaned_response_length": "mean",
    "perc_junk_tokens_in_raw_response": "mean"
}).reset_index()

all_results_df_grouped_by_model.to_csv(os.path.join(scores_folder, f"{model_responses_folder.split('/')[-1]}--perModel-scores.csv"), index=False) 



(9835, 12)


#### Additonal Collapsed Results for Format Variation Scoring ONLY

In [None]:
### Collapse on Model - Dataset - Format Type
all_results_df_grouped_by_model_dataset_prompt = all_results_df.groupby(["model_name", "dataset_name", "prompt_type"]).agg({
    "MC_SCORE": "mean",
    "CC_SCORE": "mean",
    "raw_response_length": "mean",
    "cleaned_response_length": "mean",
    "perc_junk_tokens_in_raw_response": "mean"
}).reset_index()

all_results_df_grouped_by_model_dataset_prompt.to_csv(os.path.join(scores_folder, f"{model_responses_folder.split('/')[-1]}--perModelDatasetFormat--formatVariation-scores.csv"), index=False) 

### Collapse on Model - Format Type
all_results_df_grouped_by_model_prompt = all_results_df.groupby(["model_name", "prompt_type"]).agg({
    "MC_SCORE": "mean",
    "CC_SCORE": "mean",
    "raw_response_length": "mean",
    "cleaned_response_length": "mean",
    "perc_junk_tokens_in_raw_response": "mean"
}).reset_index()

all_results_df_grouped_by_model_prompt.to_csv(os.path.join(scores_folder, f"{model_responses_folder.split('/')[-1]}--perModelFormat--formatVariation-scores.csv"), index=False) 

### Collapse on Format Type
all_results_df_grouped_by_prompt = all_results_df.groupby(["prompt_type"]).agg({
    "MC_SCORE": "mean",
    "CC_SCORE": "mean",
    "raw_response_length": "mean",
    "cleaned_response_length": "mean",
    "perc_junk_tokens_in_raw_response": "mean"
}).reset_index()

all_results_df_grouped_by_model.to_csv(os.path.join(scores_folder, f"{model_responses_folder.split('/')[-1]}--perFormat--formatVariation-scores.csv"), index=False) 
