In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, classification_report, fbeta_score
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

### Data Import

In [None]:

# For Google Colaboratory
# Environment setup: when the google.colab module is loaded, mount Google Drive and
# make the shared project folder the working directory. When it is not loaded, the
# guarded body is skipped and only the imports run.
import sys, os
if 'google.colab' in sys.modules:
    # mount google drive
    # (the mount point is /content/gdrive, so Drive paths used by later cells have
    # to live under that prefix)
    from google.colab import drive
    drive.mount('/content/gdrive')
    path_to_file = '/content/gdrive/My Drive/bt5153group'
    print(path_to_file)
    # move to Google Drive directory
    os.chdir(path_to_file)
    # echo the working directory from the shell to confirm the chdir took effect
    !pwd

Mounted at /content/gdrive
/content/gdrive/My Drive/bt5153group
/content/gdrive/My Drive/bt5153group


In [None]:

# Disabled alternative mount: attaches Drive at /content/drive rather than the
# /content/gdrive mount point used by the setup cell above.
# NOTE(review): the data-loading paths must sit under whichever mount point is
# actually active in the session — confirm before re-enabling this.
# from google.colab import drive
# drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


### Prediction Quality

In [None]:
# Load the per-order safety predictions and keep only the held-out test rows.
# The path sits under /content/gdrive because that is the mount point the setup
# cell creates; nothing that runs on a fresh kernel mounts /content/drive.
pred_df = pd.read_csv('/content/gdrive/My Drive/BT5153_2025/Group Project/prediction_quality.csv')
test_pred_df = pred_df[pred_df['train_test']=='test']
test_pred_df.head()

Unnamed: 0,order_id,train_test,ground_truth,unsloth_prediction_without_monograph,unsloth_prediction_with_monograph,unsloth_prediction_trained_temp_1,unsloth_prediction_trained_temp_0.5,Qwen0.5b_RAG,GPT3.5_RAG,GPT4o_RAG
1,2,test,unsafe,safe,safe,unsafe,unsafe,safe,unsafe,unsafe
3,4,test,safe,unsafe,safe,safe,safe,safe,unsafe,unsafe
5,6,test,unsafe,safe,safe,unsafe,unsafe,safe,unsafe,unsafe
7,8,test,safe,safe,safe,unsafe,unsafe,safe,unsafe,unsafe
9,10,test,unsafe,safe,unsafe,unsafe,unsafe,safe,unsafe,unsafe


In [None]:
def get_prediction_quality(data,prediction,alpha=0.7):
    """Score one prediction column against the ground-truth safety labels.

    'unsafe' is treated as the positive class (1) and 'safe' as the negative
    class (0), so recall measures how many unsafe orders were caught and
    specificity measures how many safe orders were left alone.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'ground_truth' column and the column named by
        ``prediction``; both hold the strings 'safe' / 'unsafe'.
    prediction : str
        Name of the column holding the predicted labels.
    alpha : float, default 0.7
        Weight given to recall on 'unsafe'; specificity gets ``1 - alpha``.

    Returns
    -------
    tuple of float
        ``(recall_unsafe, specificity, weighted_recall_specificity)``.
        Recall is 0.0 when there are no 'unsafe' ground-truth rows; specificity
        (and therefore the weighted score) is NaN when there are no 'safe'
        ground-truth rows.

    Raises
    ------
    ValueError
        If either column contains a value other than 'safe' / 'unsafe'
        (including missing values).
    """
    label_to_int = {'safe': 0, 'unsafe': 1}
    ground_truth = data['ground_truth'].map(label_to_int)
    predictions = data[prediction].map(label_to_int)

    # .map() turns anything outside the mapping into NaN; fail loudly instead of
    # letting a typo such as 'Unsafe' be silently scored as a miss.
    if ground_truth.isna().any() or predictions.isna().any():
        raise ValueError(
            f"'ground_truth' and '{prediction}' must contain only 'safe' or 'unsafe'"
        )

    actual_unsafe = (ground_truth == 1).to_numpy()
    predicted_unsafe = (predictions == 1).to_numpy()

    # Count the four confusion-matrix cells directly so the result never depends
    # on which classes happen to be present in this slice of the data.
    tp = int(np.sum(actual_unsafe & predicted_unsafe))
    fn = int(np.sum(actual_unsafe & ~predicted_unsafe))
    fp = int(np.sum(~actual_unsafe & predicted_unsafe))
    tn = int(np.sum(~actual_unsafe & ~predicted_unsafe))

    recall_unsafe = tp / (tp + fn) if (tp + fn) else 0.0
    specificity = tn / (tn + fp) if (tn + fp) else float('nan')
    weighted_recall_specificity = alpha * recall_unsafe + (1 - alpha) * specificity

    return recall_unsafe, specificity, weighted_recall_specificity

In [None]:
# Score every prediction column on the held-out test rows. The numbered result
# names are kept so that anything referring to them keeps working.
recall_unsafe_1, specificity_1, weighted_recall_specificity_1 = get_prediction_quality(test_pred_df, 'unsloth_prediction_without_monograph')
recall_unsafe_2, specificity_2, weighted_recall_specificity_2 = get_prediction_quality(test_pred_df, 'unsloth_prediction_with_monograph')
recall_unsafe_3, specificity_3, weighted_recall_specificity_3 = get_prediction_quality(test_pred_df, 'unsloth_prediction_trained_temp_1')
recall_unsafe_4, specificity_4, weighted_recall_specificity_4 = get_prediction_quality(test_pred_df, 'unsloth_prediction_trained_temp_0.5')
recall_unsafe_5, specificity_5, weighted_recall_specificity_5 = get_prediction_quality(test_pred_df, 'Qwen0.5b_RAG')
recall_unsafe_6, specificity_6, weighted_recall_specificity_6 = get_prediction_quality(test_pred_df, 'GPT3.5_RAG')
recall_unsafe_7, specificity_7, weighted_recall_specificity_7 = get_prediction_quality(test_pred_df, 'GPT4o_RAG')

# Print results
# One (heading, recall, specificity, weighted score) entry per model, in display order.
_quality_report = (
    ('Prediction Quality for base model without monography information',
     recall_unsafe_1, specificity_1, weighted_recall_specificity_1),
    ('Prediction Quality for base model WITH monography information',
     recall_unsafe_2, specificity_2, weighted_recall_specificity_2),
    ('Prediction Quality for fine-tuned model with temperature = 1',
     recall_unsafe_3, specificity_3, weighted_recall_specificity_3),
    ('Prediction Quality for fine-tuned model with temperature = 0.5',
     recall_unsafe_4, specificity_4, weighted_recall_specificity_4),
    ('Prediction Quality for Qwen0.5b_RAG',
     recall_unsafe_5, specificity_5, weighted_recall_specificity_5),
    ('Prediction Quality for GPT3.5_RAG',
     recall_unsafe_6, specificity_6, weighted_recall_specificity_6),
    ('Prediction Quality for GPT4o_RAG',
     recall_unsafe_7, specificity_7, weighted_recall_specificity_7),
)

for _position, (_heading, _recall, _specificity, _weighted) in enumerate(_quality_report):
    # The divider goes between stanzas only, never before the first one.
    if _position > 0:
        print('==================================================')
    print(_heading)
    print(f"Recall: {_recall:.4f}")
    print(f"Specificity: {_specificity:.4f}")
    print(f"Weighted Scoring: {_weighted:.4f}")

Prediction Quality for base model without monography information
Recall: 0.2308
Specificity: 0.5833
Weighted Scoring: 0.3365
Prediction Quality for base model WITH monography information
Recall: 0.4615
Specificity: 0.8333
Weighted Scoring: 0.5731
Prediction Quality for fine-tuned model with temperature = 1
Recall: 0.6923
Specificity: 0.6667
Weighted Scoring: 0.6846
Prediction Quality for fine-tuned model with temperature = 0.5
Recall: 0.6923
Specificity: 0.7500
Weighted Scoring: 0.7096
Prediction Quality for Qwen0.5b_RAG
Recall: 0.0000
Specificity: 1.0000
Weighted Scoring: 0.3000
Prediction Quality for GPT3.5_RAG
Recall: 0.7692
Specificity: 0.5000
Weighted Scoring: 0.6885
Prediction Quality for GPT4o_RAG
Recall: 0.6154
Specificity: 0.5833
Weighted Scoring: 0.6058


### Justification Quality

#### Cosine Similarity

In [None]:
# Load the per-order justification texts and keep only the held-out test rows.
# The path sits under /content/gdrive because that is the mount point the setup
# cell creates; nothing that runs on a fresh kernel mounts /content/drive.
reason_df = pd.read_csv('/content/gdrive/My Drive/BT5153_2025/Group Project/reasoning_quality.csv')
test_reason_df = reason_df[reason_df['train_test']=='test']
test_reason_df.head()

Unnamed: 0,order_id,train_test,ground_truth_reason,unsloth_prediction_without_monograph,unsloth_prediction_with_monograph,unsloth_prediction_trained_temp_1,unsloth_prediction_trained_temp_0.5,Qwen0.5_RAG,GPT3.5_RAG,GPT4o_RAG
1,2,test,This order is not safe because symptoms of Par...,The medication order for oral metoclopramide 1...,The medication order for metoclopramide at a d...,The medication order for metoclopramide is not...,The medication order for metoclopramide is not...,\n\nPatient Profile:\nPatient is a 65 years ol...,"Based on the patient profile provided, the med...",The medication order for metoclopramide 10 mg ...
3,4,test,This order is safe because it aligns with the ...,The medication order for oral metoclopramide 1...,The doctor's order of metoclopramide 10mg thre...,The medication order for metoclopramide 10mg t...,The medication order for metoclopramide is saf...,\n\nPatient Profile:\nPatient is a 30 years ol...,"Based on the patient profile provided, the med...",The medication order for metoclopramide 10 mg ...
5,6,test,This order is not safe because use of nonstero...,The medication order for ibuprofen 200mg three...,The doctor's order of oral ibuprofen 200 mg th...,The medication order for oral ibuprofen 200mg ...,The medication order for ibuprofen 200 mg thre...,\n\nPatient Profile:\nPatient is a 61 years ol...,"Based on the patient profile provided, the med...",The medication order for ibuprofen 200 mg thre...
7,8,test,This order is safe because the dose of 1 table...,The medication order for ibuprofen is consider...,The doctor's order for oral ibuprofen 1 tablet...,The medication order for ibuprofen 200mg three...,The medication order is unsafe for the patient...,\n\nPatient Profile:\nPatient is a 55 years ol...,"Based on the patient profile provided, the ord...",The safety of the ibuprofen order for this pat...
9,10,test,This order is not safe because his eGFR is 40...,The medication order for oral metformin 500mg ...,The medication order for metformin is not safe...,The medication order of oral metformin 500mg t...,The medication order for metformin is unsafe f...,\n\nPatient Profile:\nPatient is a 46 years ol...,"Based on the patient profile provided, the med...",The medication order for metformin 500 mg twic...


In [None]:
# Loaded sentence-embedding models, keyed by model name, so that scoring several
# columns in a row does not re-instantiate the same model each time.
_SENTENCE_MODEL_CACHE = {}


def _load_sentence_model(model_name):
    """Return the SentenceTransformer for ``model_name``, loading it at most once."""
    if model_name not in _SENTENCE_MODEL_CACHE:
        _SENTENCE_MODEL_CACHE[model_name] = SentenceTransformer(model_name)
    return _SENTENCE_MODEL_CACHE[model_name]


def get_justification_quality_with_transformer(data,generated_reason,model=None):
    """Cosine similarity between reference and generated justifications.

    Each row's 'ground_truth_reason' text and its ``generated_reason`` text are
    embedded with a sentence-embedding model and compared row by row.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'ground_truth_reason' column and the column named by
        ``generated_reason``.
    generated_reason : str
        Name of the column holding the generated justification text.
    model : object with an ``encode(list_of_str)`` method, optional
        Embedding model to use. When omitted, the
        'sentence-transformers/all-mpnet-base-v2' model is loaded once and
        reused by later calls.

    Returns
    -------
    tuple
        ``(avg_similarity, similarities)``: the mean of the per-row cosine
        similarities and the list of per-row values, in row order. For an empty
        frame this is ``(nan, [])``.
    """
    if len(data) == 0:
        # Nothing to compare; skip loading the model just to average an empty list.
        return float('nan'), []

    if model is None:
        model = _load_sentence_model('sentence-transformers/all-mpnet-base-v2')

    reference_texts = data['ground_truth_reason'].tolist()
    generated_texts = data[generated_reason].tolist()

    # Encode each column as one batch instead of one two-sentence call per row.
    # NOTE(review): batched encoding may differ from pairwise encoding in the
    # last floating-point digits — confirm if exact reproducibility matters.
    reference_vecs = np.asarray(model.encode(reference_texts))
    generated_vecs = np.asarray(model.encode(generated_texts))

    dot_products = np.sum(reference_vecs * generated_vecs, axis=1)
    norm_products = (
        np.linalg.norm(reference_vecs, axis=1) * np.linalg.norm(generated_vecs, axis=1)
    )
    # A zero-length embedding has no direction; report similarity 0 rather than NaN.
    norm_products[norm_products == 0] = 1.0

    similarities = list(dot_products / norm_products)

    # Calculate the average cosine similarity score
    avg_similarity = np.mean(similarities)

    return avg_similarity, similarities

In [None]:
# Apply the function
# Columns are scored in this order; the position in the list decides which
# numbered pair of result variables each score is unpacked into below.
_reason_columns = [
    'unsloth_prediction_without_monograph',
    'unsloth_prediction_with_monograph',
    'unsloth_prediction_trained_temp_1',
    'unsloth_prediction_trained_temp_0.5',
    'Qwen0.5_RAG',
    'GPT3.5_RAG',
    'GPT4o_RAG',
]

_justification_scores = [
    get_justification_quality_with_transformer(data=test_reason_df, generated_reason=_column)
    for _column in _reason_columns
]

(
    (avg_similarity_1, similarities_1),
    (avg_similarity_2, similarities_2),
    (avg_similarity_3, similarities_3),
    (avg_similarity_4, similarities_4),
    (avg_similarity_5, similarities_5),
    (avg_similarity_6, similarities_6),
    (avg_similarity_7, similarities_7),
) = _justification_scores

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Display the average cosine similarity results
print(f"Average Cosine Similarity for base model without monography information: {avg_similarity_1:.4f}")
print(f"Average Cosine Similarity for base model WITH monography information: {avg_similarity_2:.4f}")
print(f"Average Cosine Similarity for fine-tuned model with temperature=1: {avg_similarity_3:.4f}")
print(f"Average Cosine Similarity for fine-tuned model with temperature=0.5: {avg_similarity_4:.4f}")
print(f"Average Cosine Similarity for Qwen0.5_RAG: {avg_similarity_5:.4f}")
print(f"Average Cosine Similarity for GPT3.5_RAG: {avg_similarity_6:.4f}")
print(f"Average Cosine Similarity for GPT4o_RAG: {avg_similarity_7:.4f}")

# Display the similarity table
# Create the final DataFrame with all similarities
similarity_df = pd.DataFrame({
    # Take the ids from the real column instead of reconstructing them as
    # index + 1, which only holds while the CSV rows stay in order_id order.
    # .to_numpy() drops the source index so the table keeps a fresh 0..n-1 index
    # and lines up positionally with the similarity lists.
    'order_id': test_reason_df['order_id'].to_numpy(),
    'unsloth_prediction_without_monograph': similarities_1,
    'unsloth_prediction_with_monograph': similarities_2,
    'unsloth_prediction_trained_temp_1': similarities_3,
    'unsloth_prediction_trained_temp_0.5': similarities_4,
    'Qwen0.5_RAG': similarities_5,
    'GPT3.5_RAG': similarities_6,
    'GPT4o_RAG': similarities_7
    })

similarity_df

Average Cosine Similarity for base model without monography information: 0.7199
Average Cosine Similarity for base model WITH monography information: 0.7639
Average Cosine Similarity for for fine-tuned model with temperature=1: 0.7713
Average Cosine Similarity for for fine-tuned model with temperature=0.5: 0.7319
Average Cosine Similarity for Qwen0.5_RAG: 0.5957
Average Cosine Similarity for GPT3.5_RAG: 0.7376
Average Cosine Similarity for GPT4o_RAG: 0.7337


Unnamed: 0,order_id,unsloth_prediction_without_monograph,unsloth_prediction_with_monograph,unsloth_prediction_trained_temp_1,unsloth_prediction_trained_temp_0.5,Qwen0.5_RAG,GPT3.5_RAG,GPT4o_RAG
0,2,0.86416,0.804937,0.804936,0.861458,0.632815,0.755748,0.811604
1,4,0.726007,0.706335,0.875586,0.703498,0.516174,0.753623,0.819674
2,6,0.765534,0.797457,0.728261,0.868502,0.589112,0.824703,0.814692
3,8,0.664631,0.846382,0.777838,0.596554,0.566978,0.69018,0.635966
4,10,0.673196,0.761991,0.845604,0.859887,0.596058,0.784697,0.848969
5,12,0.695301,0.768083,0.766734,0.696186,0.543551,0.583757,0.634346
6,14,0.741492,0.851725,0.77655,0.772741,0.603415,0.741835,0.757581
7,16,0.815833,0.792701,0.789112,0.848513,0.565631,0.792875,0.778806
8,18,0.808454,0.926972,0.869483,0.833574,0.725923,0.9192,0.899985
9,20,0.808985,0.90352,0.885688,0.904433,0.810707,0.880015,0.880277
