### RQ1: How does an LLM-driven synthetic guideline affect diagnostic accuracy through clinical reasoning?

In [1]:
import os
import json
import pandas as pd

# Build the RQ1 summary table: load every classification result file, tag each
# record with the test type and model parsed from its filename, and average the
# evaluation metrics per (model, type).

# Directory path containing JSONL files
directory = "results"

# Evaluation metrics that are scaled to percentages and shown in the final table
metric_columns = ['accuracy', 'sensitivity', 'specificity', 'precision', 'f1_score']

# Initialize an empty list to store data
data = []

# Read only files ending with 'eval_results-clf.jsonl' within the directory.
# Sorting makes the row order independent of the filesystem's listing order.
clf_filenames = sorted(
    f for f in os.listdir(directory) if f.endswith("eval_results-clf.jsonl")
)

for filename in clf_filenames:
    filepath = os.path.join(directory, filename)

    # Parse the part of the filename before '.eval_results': the text before the
    # first '-' is the test type, everything after it is the model name (model
    # names such as 'deepseek-r1:70b' may themselves contain '-').
    test_type, _, model_name = filename.split('.eval_results')[0].partition('-')

    with open(filepath, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file)
                continue
            # Parse JSON data
            json_data = json.loads(line)
            # Add extracted test type and model name to the data
            json_data['type'] = test_type
            json_data['model'] = model_name
            data.append(json_data)

# Convert the collected data into a DataFrame
df = pd.DataFrame(data)

# Convert accuracy, precision, sensitivity, specificity, and F1-score to percentages
df[metric_columns] = df[metric_columns] * 100

# Mapping of model names for better readability
model_name_mapping = {
    "llama3.3:70b": "Llama 3.3 (70B)",
    "phi4:14b": "Phi 4 (14B)",
    "llama3.1:8b": "Llama 3.1 (8B)",
    "deepseek-r1:70b": "DeepSeek-R1 (70B)",
    "deepseek-r1:32b": "DeepSeek-R1 (32B)",
    "deepseek-r1:14b": "DeepSeek-R1 (14B)",
}
df['model'] = df['model'].replace(model_name_mapping)

# Specify the desired order of model names and test types
model_order = [
    "Llama 3.3 (70B)", "Phi 4 (14B)", "Llama 3.1 (8B)",
    "DeepSeek-R1 (70B)", "DeepSeek-R1 (32B)", "DeepSeek-R1 (14B)"
]
type_order = ["zeroshot", "fewshot1", "fewshot2", "custom"]

# Make model and test type ordered categoricals so the grouped table follows the
# orders above. NOTE: values not listed in model_order / type_order become NaN
# here and are therefore left out of the grouped table below.
df['model'] = pd.Categorical(df['model'], categories=model_order, ordered=True)
df['type'] = pd.Categorical(df['type'], categories=type_order, ordered=True)

# Average every numeric column by model and test type. numeric_only=True keeps
# the aggregation from failing if the records carry any non-numeric fields.
df_clf = df.groupby(['model', 'type'], observed=True).mean(numeric_only=True)

# Display relevant evaluation metrics
df_clf[metric_columns]

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,sensitivity,specificity,precision,f1_score
model,type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Llama 3.3 (70B),zeroshot,80.0,100.0,53.57,74.0,85.06
Llama 3.3 (70B),fewshot1,83.08,91.89,71.43,80.95,86.08
Llama 3.3 (70B),fewshot2,81.54,91.89,67.86,79.07,85.0
Llama 3.3 (70B),custom,89.23,94.59,82.14,87.5,90.91
Phi 4 (14B),zeroshot,75.38,91.89,53.57,72.34,80.95
Phi 4 (14B),fewshot1,72.31,83.78,57.14,72.09,77.5
Phi 4 (14B),fewshot2,70.77,86.49,50.0,69.57,77.11
Phi 4 (14B),custom,81.54,91.89,67.86,79.07,85.0
Llama 3.1 (8B),zeroshot,60.0,89.19,21.43,60.0,71.74
Llama 3.1 (8B),fewshot1,70.77,97.3,35.71,66.67,79.12


### RQ2: How does an LLM-driven synthetic guideline impact the quality of rationales?

In [2]:
import os
import pandas as pd
import json

# Build the RQ2 summary table: load every rubric evaluation file, tag each
# record with the data type and model parsed from its filename, and report the
# share of each score label per (model, criteria).

# Directory containing JSONL files
directory = "results/"

# Initialize an empty list to store data
collected_data = []

# Process only files ending with 'eval_results-rubric.jsonl'.
# Sorting makes the row order independent of the filesystem's listing order.
rubric_filenames = sorted(
    f for f in os.listdir(directory) if f.endswith("eval_results-rubric.jsonl")
)

for filename in rubric_filenames:
    filepath = os.path.join(directory, filename)

    # Parse the part of the filename before '.eval_results': the text before the
    # first '-' is the data type, everything after it is the model name.
    data_type, _, model_name = filename.split('.eval_results')[0].partition('-')

    # Read and process the JSONL file
    with open(filepath, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file)
                continue
            json_data = json.loads(line)  # Parse JSON data
            json_data['data'] = data_type  # Add extracted data type
            json_data['model'] = model_name  # Add extracted model name
            collected_data.append(json_data)

# Convert collected data into a DataFrame
df = pd.DataFrame(collected_data)

# Mapping of model names for better readability
model_name_mapping = {
    "llama3.3:70b": "Llama 3.3 (70B)",
    "phi4:14b": "Phi 4 (14B)",
}
df['model'] = df['model'].replace(model_name_mapping)

# Relabel the raw score values 'A' / 'B' with the names used in the table.
# NOTE(review): presumably 'A' / 'B' identify which rationale was preferred for
# the criterion -- confirm against the rubric evaluation script.
score_mapping = {
    "A": "custom",
    "B": "fewshot"
}
df['score'] = df['score'].replace(score_mapping)

# Row order of the criteria in the final table
criterion_order = ['consistency', 'correctness', 'specificity', 'helpfulness', 'humanlikeness']

# Make 'criteria' an ordered categorical so the grouped rows follow that order
df['criteria'] = pd.Categorical(df['criteria'], categories=criterion_order, ordered=True)

# Group data by 'model' and 'criteria' and compute the share of each 'score' label.
# fill_value=0 reports a label that never occurs for a group as 0 instead of NaN
# (which would otherwise be rendered as 'nan%').
result = (
    df.groupby(['model', 'criteria'], observed=True)['score']
    .value_counts(normalize=True)
    .unstack(fill_value=0)
)

# Format the result as percentages
formatted_result = result.map(lambda x: f'{x:.2%}')

# Display the formatted DataFrame
formatted_result

Unnamed: 0_level_0,score,custom,fewshot
model,criteria,Unnamed: 2_level_1,Unnamed: 3_level_1
Llama 3.3 (70B),consistency,83.08%,16.92%
Llama 3.3 (70B),correctness,80.00%,20.00%
Llama 3.3 (70B),specificity,86.15%,13.85%
Llama 3.3 (70B),helpfulness,81.54%,18.46%
Llama 3.3 (70B),humanlikeness,84.62%,15.38%
Phi 4 (14B),consistency,66.15%,33.85%
Phi 4 (14B),correctness,69.23%,30.77%
Phi 4 (14B),specificity,76.92%,23.08%
Phi 4 (14B),helpfulness,73.85%,26.15%
Phi 4 (14B),humanlikeness,73.85%,26.15%
