In [3]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install unsloth
# Get latest Unsloth
!pip install --upgrade --no-deps "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

## Initializing the Model

In [4]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/mistral-7b-v0.3-bnb-4bit",      # New Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/llama-3-8b-bnb-4bit",           # Llama-3 15 trillion tokens model 2x faster!
    "unsloth/llama-3-8b-Instruct-bnb-4bit",
    "unsloth/llama-3-70b-bnb-4bit",
    "unsloth/Phi-3-mini-4k-instruct",        # Phi-3 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",             # Gemma 2.2x faster!
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/mistral-7b-v0.3-bnb-4bit",      # New Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/llama-3-8b-bnb-4bit",           # Llama-3 15 trillion tokens model 2x faster!
    "unsloth/llama-3-8b-Instruct-bnb-4bit",
    "unsloth/llama-3-70b-bnb-4bit",
    "unsloth/Phi-3-mini-4k-instruct",        # Phi-3 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",             # Gemma 2.2x faster!
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2024.12.8: Fast Llama patching. Transformers: 4.46.3.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/198 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

==((====))==  Unsloth 2024.12.8: Fast Llama patching. Transformers: 4.46.3.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 7.5. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [5]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.12.8 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


# Dataset Preparation :
Loading the question_with_options along with the predicted_label and the gpt_reasoning information as a csv.

## Dataset Preparation

In [7]:
from datasets import load_dataset
dataset = load_dataset(
    "csv",
    data_files = "dataset.csv",
    split = "train",
)
print(dataset.column_names)
print(dataset[0])

Generating train split: 0 examples [00:00, ? examples/s]

['questions_with_options', 'predicted_label', 'gpt_reasoning']
{'questions_with_options': "A 58-year-old man comes to the physician for a 3-month history of progressive shortness of breath on exertion and tiredness throughout the day. His wife reports that he snores at night and that he sometimes chokes in his sleep. He has a history of hypertension treated with enalapril. His blood pressure is 149/96 mmHg. There is jugular venous distention and 2+ lower extremity edema bilaterally. The lungs are clear to auscultation bilaterally. An ECG shows right axis deviation. Which of the following is the most likely underlying cause of this patient's condition?\n \n\n These are the options:\n \n\n A\n Coronary artery disease\n  B\n Chronic hypoxia\n  C\n Left ventricular hypertrophy\n  D\n Alveolar destruction\n  E\n Hypertensive nephropathy\n \n\n The correct answer is\n B: Chronic hypoxia", 'predicted_label': 0, 'gpt_reasoning': "determine the difficulty. The question presents a clinical scena

## Converting the dataset into required format:

In [8]:
from unsloth import to_sharegpt

# Define a post-processing function to combine 'predicted_label' and 'reasoning'
def combine_columns(example):
    example["predicted_label_and_reasoning"] = f"Label: {example['predicted_label']}, Reasoning: {example['gpt_reasoning']}"
    return example

# Apply the function to the dataset
dataset = dataset.map(combine_columns)

# Verify the formatted dataset
print(dataset[0])


# Merge "predicted_label" and "reasoning" into the response in the merged_prompt
dataset = to_sharegpt(
    dataset,
    merged_prompt="{questions_with_options}",
    output_column_name="predicted_label_and_reasoning",  # Combine predicted_label and reasoning
)


print(dataset[0])


Map:   0%|          | 0/542 [00:00<?, ? examples/s]

{'questions_with_options': "A 58-year-old man comes to the physician for a 3-month history of progressive shortness of breath on exertion and tiredness throughout the day. His wife reports that he snores at night and that he sometimes chokes in his sleep. He has a history of hypertension treated with enalapril. His blood pressure is 149/96 mmHg. There is jugular venous distention and 2+ lower extremity edema bilaterally. The lungs are clear to auscultation bilaterally. An ECG shows right axis deviation. Which of the following is the most likely underlying cause of this patient's condition?\n \n\n These are the options:\n \n\n A\n Coronary artery disease\n  B\n Chronic hypoxia\n  C\n Left ventricular hypertrophy\n  D\n Alveolar destruction\n  E\n Hypertensive nephropathy\n \n\n The correct answer is\n B: Chronic hypoxia", 'predicted_label': 0, 'gpt_reasoning': "determine the difficulty. The question presents a clinical scenario involving a 58-year-old man with symptoms suggestive of rig

Merging columns:   0%|          | 0/542 [00:00<?, ? examples/s]

Converting to ShareGPT:   0%|          | 0/542 [00:00<?, ? examples/s]

{'conversations': [{'from': 'human', 'value': '("A 58-year-old man comes to the physician for a 3-month history of progressive shortness of breath on exertion and tiredness throughout the day. His wife reports that he snores at night and that he sometimes chokes in his sleep. He has a history of hypertension treated with enalapril. His blood pressure is 149/96 mmHg. There is jugular venous distention and 2+ lower extremity edema bilaterally. The lungs are clear to auscultation bilaterally. An ECG shows right axis deviation. Which of the following is the most likely underlying cause of this patient\'s condition?\\n \\n\\n These are the options:\\n \\n\\n A\\n Coronary artery disease\\n  B\\n Chronic hypoxia\\n  C\\n Left ventricular hypertrophy\\n  D\\n Alveolar destruction\\n  E\\n Hypertensive nephropathy\\n \\n\\n The correct answer is\\n B: Chronic hypoxia",)'}, {'from': 'gpt', 'value': "Label: 0, Reasoning: determine the difficulty. The question presents a clinical scenario involvi

## Standardize share_gpt

In [9]:
from unsloth import standardize_sharegpt
dataset = standardize_sharegpt(dataset)

Standardizing format:   0%|          | 0/542 [00:00<?, ? examples/s]

## Chat Template

In [10]:
chat_template = """Below are questions with options. Provide the predicted label and reasoning.

>>> Question:
{INPUT}

>>> Answer:
{OUTPUT}

"""

from unsloth import apply_chat_template
dataset = apply_chat_template(
    dataset,
    tokenizer = tokenizer,
    chat_template = chat_template,
    # default_system_message = "You are a helpful assistant", << [OPTIONAL]
)

Unsloth: We automatically added an EOS token to stop endless generations.


Map:   0%|          | 0/542 [00:00<?, ? examples/s]

## Training the model
<a name="Train"></a>
Defining the training parametes and using Huggingface TRL's `SFTTrainer`!

In [13]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 60, ## comment this later
        num_train_epochs = 1, ## train for 5 epochs ideally
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

max_steps is given, it will override any value given in num_train_epochs


In [14]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 542 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 41,943,040
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss
1,1.7442
2,1.7926
3,1.7684
4,1.6439
5,1.6429
6,1.6374
7,1.4489
8,1.3693
9,1.2016
10,1.1115


## Saving the fine-tune model

In [15]:
# Step 9: Save the fine-tuned model
model.save_pretrained("fine_tuned_llama")
tokenizer.save_pretrained("fine_tuned_llama_tokenizer")


('fine_tuned_llama_tokenizer/tokenizer_config.json',
 'fine_tuned_llama_tokenizer/special_tokens_map.json',
 'fine_tuned_llama_tokenizer/tokenizer.json')

## Performing Inference using the Excel file containing the questions and options

In [35]:
import pandas as pd

# Read the CSV file into a DataFrame
df = pd.read_csv("inference.csv")

# Display the first few rows of the DataFrame
print(df.head())

top_5_df = df.head(5)




                              questions_with_options  ground_truth_label  \
0  A 60-year-old man is brought to the emergency ...                   0   
1  A previously healthy 19-year-old man is brough...                   0   
2  A 23-year-old woman is brought to the emergenc...                   0   
3  Prior to undergoing a total knee arthroplasty,...                   0   
4  A 28-year-old woman comes to the physician bec...                   0   

                                       gpt_reasoning  
0  The question is considered to have a difficult...  
1  The question is considered to have a difficult...  
2  The question is considered to have a difficult...  
3  The question is considered to have a difficult...  
4  The question is considered to have a difficult...  


In [42]:
# Define the additional string that will be added to each content
additional_content = (
    "Please provide the output in the following format:\n\n"
    "Response:\n"
    "Difficulty Level: 0 or 1 (depending on difficulty)\n"
    "Reasoning: \"<Reasoning text>\"\n"
)



# Prepare the messages for inference by extracting the 'question with options' column
messages = [{"role": "user", "content": row["questions_with_options"] + additional_content} for _, row in top_5_df.iterrows()]

In [58]:
from transformers import TextStreamer

FastLanguageModel.for_inference(model)  # Enable faster inference

responses = []  # To store model responses

# Define regex patterns to extract the label and reasoning
label_pattern = r"Label: (\d)"
reasoning_pattern = r"Reasoning: (.*)"

text_streamer = TextStreamer(tokenizer, skip_prompt=True)  # For streaming output

# Generate responses for each message
for idx, row in top_5_df.iterrows():
    # Prepare the message by adding additional content
    additional_content = (
        "Please provide the output in the following format:\n\n"
        "Response:\n"
        "Difficulty Level: 0 or 1 (depending on difficulty)\n"
        "Reasoning: \"<Reasoning text>\""
    )
    message = {"role": "user", "content": row["questions_with_options"] + additional_content}

    input_ids = tokenizer.apply_chat_template(
        [message],  # Each message as input
        add_generation_prompt=True,
        return_tensors="pt",
    ).to("cuda")

    generated_output = model.generate(
        input_ids,
        streamer=text_streamer,
        max_new_tokens=256,
        pad_token_id=tokenizer.eos_token_id
    )

    # Decode the generated output
    response_text = tokenizer.decode(generated_output[0], skip_special_tokens=True)

    # Use regex to extract the predicted label and reasoning
    label_match = re.search(label_pattern, response_text)
    reasoning_matches = re.findall(reasoning_pattern, response_text)

    # Extract label and reasoning
    predicted_label = label_match.group(1) if label_match else None
    reasoning = reasoning_matches[1] if len(reasoning_matches) > 1 else None

    # Append the response and additional info to the responses list
    responses.append({
        "difficulty": row["ground_truth_label"],  # Ground truth difficulty
        "difficulty_prediction": predicted_label,  # Predicted difficulty
        "reasoning": row["gpt_reasoning"],    # Ground truth reasoning
        "reasoning_prediction": reasoning  # Predicted reasoning
    })

# Convert responses to a DataFrame
responses_df = pd.DataFrame(responses)

Label: 0, Reasoning: determine the difficulty. The question presents a clinical scenario involving a patient with fever, confusion, and elevated creatine kinase levels, which are indicative of a serious condition. The options provided include various pharmacotherapies, but the correct answer, dantrolene, is a specific medication used to treat malignant hyperthermia, a life-threatening condition that requires immediate recognition and treatment. This question requires knowledge of pharmacology and the ability to recognize the implications of the patient's symptoms, making it more challenging than basic clinical scenarios. Therefore, this question is likely to be considered more difficult for a medical student.

<|end_of_text|>
Label: 0, Reasoning: determine the difficulty. The question presents a clinical scenario involving a young man with syncope, which is a common presentation in emergency medicine. The key features include the brief loss of consciousness, the absence of chest pain o

In [59]:
# Convert responses to a DataFrame
responses_df = pd.DataFrame(responses)
responses_df

Unnamed: 0,difficulty,difficulty_prediction,reasoning,reasoning_prediction
0,0,0,The question is considered to have a difficult...,determine the difficulty. The question present...
1,0,0,The question is considered to have a difficult...,determine the difficulty. The question present...
2,0,1,The question is considered to have a difficult...,determine the difficulty. The question present...
3,0,1,The question is considered to have a difficult...,determine the difficulty. The question involve...
4,0,0,The question is considered to have a difficult...,determine the difficulty. The question present...


## Performing Inference on a single data record

In [17]:
FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

# Define the input message
messages = [
    {"role": "user", "content": (
        'A 62-year-old man comes to the physician because of hematemesis and progressive heartburn over the past 5 days. '
        'Ten days ago, he was started on a medication to treat a condition that causes hearing difficulties and pain in the lower extremities. '
        'He has no other history of serious illness. He has smoked one pack of cigarettes daily for the past 20 years. '
        'Vital signs are within normal limits. Physical examination shows bowing of the tibias. '
        'Upper endoscopy shows inflammation of the mucosa and a 1-cm punched-out ulcer in the distal esophagus. '
        'Which of the following drugs is the most likely cause of the patient\'s current condition?\n\n'
        'These are the options:\n\n'
        'A. Calcium citrate\n'
        'B. Denosumab\n'
        'C. Risedronate\n'
        'D. Calcitonin\n'
        'E. Prednisolone\n'
        'F. Acetaminophen\n\n'
        'Please provide the output in the following format:\n\n'
        'Response:\n'
        'Difficulty Level: 0 or 1 (depending on difficulty)\n'
        'Reasoning: "<Reasoning text>"'
    )}
]

# Apply the chat template and convert to input IDs
input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
).to("cuda")

# Set up a streamer for output generation
from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer, skip_prompt=True)

# Generate the response
generated_output = model.generate(
    input_ids,
    streamer=text_streamer,
    max_new_tokens=256,
    pad_token_id=tokenizer.eos_token_id
)

Label: 1, Reasoning: determine the difficulty. The question presents a clinical scenario involving a patient with hematemesis and esophageal ulcers, which is a common presentation for esophageal varices or other gastrointestinal conditions. The options provided include various medications, but the key detail is the patient's history of smoking and the recent initiation of a medication for hearing and pain. This information suggests that the most likely cause of the patient's current condition is related to the medication he was recently started on, which is typically used to treat osteoporosis. This is a fundamental concept in pharmacology and clinical reasoning that medical students are expected to understand. While the question requires some clinical reasoning, it is based on common knowledge and is likely to be straightforward for a medical student.

<|end_of_text|>


In [29]:
import re
import pandas as pd

# Decode the generated output to a string
decoded_output = tokenizer.decode(generated_output[0], skip_special_tokens=True)

# Define regex patterns to extract the label and reasoning
label_pattern = r"Label: (\d)"
reasoning_pattern = r"Reasoning: (.*)"

# Use regex to extract the predicted label and reasoning
label_match = re.search(label_pattern, decoded_output)
reasoning_matches = re.findall(reasoning_pattern, decoded_output)


# Extract the values
predicted_label = label_match.group(1) if label_match else None
reasoning = reasoning_matches[1] if len(reasoning_matches) > 1 else None

print(predicted_label)


# Store the extracted values in a pandas DataFrame
df = pd.DataFrame([{
    "difficulty_prediction": predicted_label,
    "reasoning_prediction": reasoning
}])

df

1


Unnamed: 0,difficulty_prediction,reasoning_prediction
0,1,determine the difficulty. The question present...


In [None]:
parse_output(generated_text)

## Evaluation Metrics and pre-defined function

Function name: evaluate_difficulty_and_reasoning
Function Path: project/pi_hongyu_umass_edu/zonghai/6_usmle_qg_quality_difficulty/anirudh/bleurt/evaluation_script_diff_reasonging.py
Usage Example:

```
###Testing
import pandas as pd

# Sample data
data = {
    "difficulty": [1, 2, 1, 3],  # Ground truth difficulty levels
    "difficulty_prediction": [1, 2, 1, 2],  # Predicted difficulty levels
    "reasoning": [
        "The questionnaire is simple and does not require detailed medical history.",
        "The questionnaire involves multiple questions about medical history.",
        "The questions are straightforward and easy to answer.",
        "This requires detailed medical history and is complex."
    ],  # Reference reasoning
    "reasoning_prediction": [
        "The questionnaire appears
 simple with no detailed medical requirements.",
        "It includes many questions about past medical history, suggesting complexity.",
        "Straightforward questions make this easy to complete.",
        "Requires extensive medical history, making it complex."
    ]  # Predicted reasoning
}

*italicized text*

df = pd.DataFrame(data)
```

```
Sample Output:
{'difficulty_metrics': {'accuracy': 0.75,
  'f1': 0.6666666666666666,
  'precision': 0.625,
  'recall': 0.75,
  'confusion_matrix': [[2, 0, 0], [0, 1, 0], [0, 1, 0]]},
 'reasoning_metrics': {'bert_score': {'precision': 0.9242371618747711,
   'recall': 0.9268922507762909,
   'f1': 0.9255314916372299},
  'meteor_score': 0.46955730750248714},
 'llm_judge_scores': {'mean_score': 2.75, 'scores': [3.0, 3.0, 2.0, 3.0]}}
 ```

In [55]:
!pip install bert-score
!pip install nltk
import nltk
nltk.download('punkt_tab')
nltk.download('wordnet')



Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert-score
Successfully installed bert-score-0.3.13


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [56]:
import requests
import re
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from bert_score import score
from nltk.translate import meteor
from nltk.tokenize import word_tokenize

def get_openai_response(prompt, model):
    api_key = "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"
    org_key = "org-ewbrRzXdrHxv7hV0WyCFzGdD"

    """
    Sends a prompt to OpenAI's API and retrieves the response.

    Args:
        prompt (str): The prompt for the LLM.
        model (str): The model to use (e.g., 'gpt-4').
        api_key (str): OpenAI API key.
        org_key (str): OpenAI organization key.

    Returns:
        str: The response text from the LLM.
    """
    url = 'https://api.openai.com/v1/chat/completions'
    headers = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {api_key}',
        'OpenAI-Organization': org_key
    }
    data = {
        'messages': [{'role': 'system', 'content': prompt}],
        'model': model,
        'temperature': 0.0
    }
    response = requests.post(url, headers=headers, json=data)
    if response.status_code == 200:
        return response.json()['choices'][0]['message']['content']
    else:
        raise Exception(f"API request failed with status code {response.status_code}: {response.text}")

def llm_as_a_judge_prompt(conversation, reasoning_summary):
    """
    Generates a prompt for the LLM-as-a-judge evaluation.

    Args:
        conversation (str): The conversation or context text.
        reasoning_summary (str): The reasoning summary text.

    Returns:
        str: The generated prompt.
    """
    prompt = f"""### Instruction: Evaluate the reasoning for predicting the difficulty of medical questionnaires based on the conversation.

### Scoring Criteria:

**Case 1: Simple Questionnaire (Low Difficulty)**
- **2 points** if the reasoning clearly indicates that the questionnaire is simple with few questions, minimal medical history required, or if the conversation suggests an easy-to-understand questionnaire.
- **1 point** if the reasoning indicates that the questionnaire might be simple, but lacks clarity or supporting evidence from the conversation.
- **0 points** if no reasoning is provided or it contradicts the idea of simplicity.

**Case 2: Complex Questionnaire (High Difficulty)**
- **2 points** if the reasoning clearly indicates that the questionnaire is complex with multiple questions, detailed medical history required, or if the conversation suggests a high level of detail needed from the patient.
- **1 point** if the reasoning indicates that the questionnaire might be complex, but lacks enough supporting evidence from the conversation.
- **0 points** if no reasoning is provided or it contradicts the idea of complexity.

**General Evaluation Criteria:**
- **Clarity and Coherence**: 0.5 points for clear, well-structured reasoning.
- **Relevance**: 0.5 points if the reasoning is relevant to predicting the difficulty of the questionnaire based on the conversation.
- **Accuracy**: 1 point if the difficulty prediction aligns with the conversation content.

### Input:
- **Conversation**:
{conversation}

- **Summary (Reasoning for difficulty prediction)**:
{reasoning_summary}

### Output:
- "score: <total points>"
- Briefly justify your score, up to 50 words.
"""
    return prompt

def evaluate_difficulty_and_reasoning(df, model):
    """
    Evaluates the dataframe, including `LLM as a judge` metric.

    Args:
        df (pd.DataFrame): Input dataframe with columns for difficulty, predictions, and reasoning.
        model (str): The LLM model to use (e.g., 'gpt-4').
        api_key (str): OpenAI API key.
        org_key (str): OpenAI organization key.

    Returns:
        dict: A dictionary of metrics for difficulty, reasoning, and LLM as a judge.
    """
    # Validate required columns
    required_columns = ['difficulty', 'difficulty_prediction', 'reasoning', 'reasoning_prediction']
    if not all(col in df.columns for col in required_columns):
        raise ValueError(f"Dataframe must contain the following columns: {required_columns}")

    # Metrics for difficulty predictions
    difficulty_metrics = {
        "accuracy": accuracy_score(df['difficulty'], df['difficulty_prediction']),
        "f1": f1_score(df['difficulty'], df['difficulty_prediction'], average='weighted'),
        "precision": precision_score(df['difficulty'], df['difficulty_prediction'], average='weighted'),
        "recall": recall_score(df['difficulty'], df['difficulty_prediction'], average='weighted'),
        "confusion_matrix": confusion_matrix(df['difficulty'], df['difficulty_prediction']).tolist()
    }

    # Metrics for reasoning predictions
    reasoning_metrics = {
        "bert_score": {"precision": [], "recall": [], "f1": []},
        "meteor_score": []
    }
    llm_judge_scores = []
    for _, row in df.iterrows():
        ref = row['reasoning']
        pred = row['reasoning_prediction']
        # Compute BERTScore
        P, R, F1 = score([pred], [ref], lang='en', verbose=False)
        reasoning_metrics["bert_score"]["precision"].append(P.mean().item())
        reasoning_metrics["bert_score"]["recall"].append(R.mean().item())
        reasoning_metrics["bert_score"]["f1"].append(F1.mean().item())

        # Compute METEOR
        meteor_score_value = meteor([word_tokenize(ref)], word_tokenize(pred))
        reasoning_metrics["meteor_score"].append(meteor_score_value)

        # Generate LLM-as-a-judge prompt
        prompt = llm_as_a_judge_prompt(ref, pred)
        try:
            llm_response = get_openai_response(prompt, model)
            llm_score = extract_score_from_llm_response(llm_response)
        except Exception as e:
            print(f"Error in LLM scoring: {e}")
            llm_score = None
        llm_judge_scores.append(llm_score)

    # Aggregate BERTScore and METEOR
    reasoning_metrics["bert_score"] = {
        "precision": np.mean(reasoning_metrics["bert_score"]["precision"]),
        "recall": np.mean(reasoning_metrics["bert_score"]["recall"]),
        "f1": np.mean(reasoning_metrics["bert_score"]["f1"])
    }
    reasoning_metrics["meteor_score"] = np.mean(reasoning_metrics["meteor_score"])

    # Combine all metrics
    return {
        "difficulty_metrics": difficulty_metrics,
        "reasoning_metrics": reasoning_metrics,
        "llm_judge_scores": {
            "mean_score": np.nanmean(llm_judge_scores),
            "scores": llm_judge_scores
        }
    }

def extract_score_from_llm_response(response):
    """
    Extracts the score from LLM response text.

    Args:
        response (str): The text response from the LLM.

    Returns:
        float: The extracted score.
    """
    pattern = r"score:\s*(\d+(\.\d+)?)"
    match = re.search(pattern, response.lower())
    if match:
        return float(match.group(1))
    else:
        return None


In [53]:
data = {
    "difficulty": [1, 2, 1, 3],  # Ground truth difficulty levels
    "difficulty_prediction": [1, 2, 1, 2],  # Predicted difficulty levels
    "reasoning": [
        "The questionnaire is simple and does not require detailed medical history.",
        "The questionnaire involves multiple questions about medical history.",
        "The questions are straightforward and easy to answer.",
        "This requires detailed medical history and is complex."
    ],  # Reference reasoning
    "reasoning_prediction": [
        "The questionnaire appears simple with no detailed medical requirements.",
        "It includes many questions about past medical history, suggesting complexity.",
        "Straightforward questions make this easy to complete.",
        "Requires extensive medical history, making it complex."
    ]  # Predicted reasoning
}

df = pd.DataFrame(data)

df


Unnamed: 0,difficulty,difficulty_prediction,reasoning,reasoning_prediction
0,1,1,The questionnaire is simple and does not requi...,The questionnaire appears simple with no detai...
1,2,2,The questionnaire involves multiple questions ...,It includes many questions about past medical ...
2,1,1,The questions are straightforward and easy to ...,Straightforward questions make this easy to co...
3,3,2,This requires detailed medical history and is ...,"Requires extensive medical history, making it ..."


In [61]:
# Convert the 'difficulty' and 'difficulty_prediction' columns to integers
responses_df['difficulty'] = responses_df['difficulty'].astype(int)
responses_df['difficulty_prediction'] = responses_df['difficulty_prediction'].astype(int)

# Now, you can run your evaluation function
result = evaluate_difficulty_and_reasoning(responses_df, 'gpt-4o-mini')

print(result)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

{'difficulty_metrics': {'accuracy': 0.6, 'f1': 0.75, 'precision': 1.0, 'recall': 0.6, 'confusion_matrix': [[3, 2], [0, 0]]}, 'reasoning_metrics': {'bert_score': {'precision': 0.9034354090690613, 'recall': 0.8661732316017151, 'f1': 0.8844095349311829}, 'meteor_score': 0.2774130045760559}, 'llm_judge_scores': {'mean_score': 2.3, 'scores': [2.0, 2.0, 1.5, 4.0, 2.0]}}
