# A Programmatic Evaluation of Large Language Models for Clinical Decision Support in Heart Failure and Cardiomyopathy Management


---


In [1]:
import os
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from chatlas import ChatGoogle, ChatAnthropic, ChatOllama, ChatGithub
from tqdm import tqdm
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading  # github lock
from tenacity import retry, wait_exponential, stop_after_attempt, retry_if_exception_type
import pickle
import time
from collections import defaultdict  # used in parallel fn

from scripts.utils import (ChatPerplexityDirect, DeepseekChat, OpenAIChat)

In [2]:
# parameters
# n_queries is the repeat count applied to every question when questions_df_rep is built.
n_queries = 3  # number of responses per model per question to assess consistency and variability

## Load API Keys


In [3]:
# Load environment variables from .env file
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")
anthropic_api_key = os.getenv("ANTHROPIC_API_KEY")
google_api_key = os.getenv("GOOGLE_API_KEY")
deepseek_api_key = os.getenv("DEEPSEEK_API_KEY")
perplexity_api_key = os.getenv("PERPLEXITY_API_KEY")
github_pat = os.getenv("GITHUB_PAT")

# Report absent credentials up front (variable names only, never the values),
# so a missing key is visible before any long-running query cell is started.
missing_api_keys = [
    env_name
    for env_name, env_value in {
        "OPENAI_API_KEY": openai_api_key,
        "ANTHROPIC_API_KEY": anthropic_api_key,
        "GOOGLE_API_KEY": google_api_key,
        "DEEPSEEK_API_KEY": deepseek_api_key,
        "PERPLEXITY_API_KEY": perplexity_api_key,
        "GITHUB_PAT": github_pat,
    }.items()
    if not env_value
]
if missing_api_keys:
    print(f"[Warning] Missing API keys: {', '.join(missing_api_keys)}")

## Load Questions from CSV


In [4]:
# Read the FAQ sheet and keep only the rows whose 'select' column is 'yes'.
questions_df = (
    pd.read_csv("./data/questions/FAQ_HF_CMP_Patient_20250519.csv")
    .loc[lambda faq: faq['select'] == 'yes']
)
questions_df

Unnamed: 0,index,category,subcategory1,subcategory2,question,select,expected_answer,comment
0,1,Disease Understanding and Diagnosis,General Information,Heart Failure,What is heart failure?,yes,Heart failure is not a single disease but a ‘s...,https://cdn-endpoint-hfm.azureedge.net/heartfa...
1,2,Disease Understanding and Diagnosis,Diagnosis & Monitoring,Ejection Fraction,"What's my ejection fraction, and what does it ...",yes,Ejection fraction measures how much blood your...,avoid misinterpretation
2,3,Disease Understanding and Diagnosis,Symptoms,Heart Failure,I was diagnosed with heart failure. What are t...,yes,"Shortness of breath, fatigue, swelling in legs...",Failure to recognize and report new or worseni...
3,4,Disease Understanding and Diagnosis,Diagnosis & Monitoring,Diagnostic Tests,What tests are needed to diagnose cardiomyopathy?,yes,"Diagnosis involves ECG, Holter ECG, echocardio...",understand disease
4,5,Disease Understanding and Diagnosis,Genetic Considerations,Genetic Testing,How can genetic testing help in cardiomyopathy?,yes,Identify pathogenic variants that lead to dise...,"familial screening, risk stratification"
5,6,Disease Understanding and Diagnosis,Diagnosis & Monitoring,HFpEF,"I have a heart failure diagnosis, but my eject...",yes,HFpEF is characterized by normal ejection frac...,
6,7,Disease Understanding and Diagnosis,Prognosis,Cardiomyopathy,Is cardiomyopathy reversibel?,yes,Many cardiomyopathies represent chronic diseas...,
7,8,Disease Understanding and Diagnosis,Risk & Prevention,Sudden Cardiac Death,What is sudden cardiac death risk in hypertrop...,yes,SCD is one of the most feared complications of...,guides ICD indication
8,9,Disease Understanding and Diagnosis,Prognosis,Heart Function,"Will my heart function improve over time, or w...",yes,"This depends on your specific condition, cause...","Unrealistic expectations, potential disappoint..."
9,10,Disease Understanding and Diagnosis,Symptoms,Weight Gain & Edema,I gained five kilos in the last two weeks and ...,yes,Urgent medical attention must be advised.,Decompensation possible


## Define Function to Query APIs


In [5]:
def initialize_model(model_name):
    """
    Create the chat client for one provider.

    Args:
        model_name (str): One of 'openai', 'deepseek', 'google', 'anthropic',
            'perplexity', 'github', 'ollama'.

    Returns:
        The client object built for that provider, configured with an empty
        system prompt.

    Raises:
        ValueError: If model_name is not one of the supported names.
        EnvironmentError: If the provider's API-key environment variable is
            unset or empty ('ollama' needs no key).
    """
    system_prompt = ""  # simulating raw user access

    # provider name -> (environment variable holding the credential, factory).
    # The factories are lambdas so a client class is only touched when its
    # provider is actually requested.
    providers = {
        # https://platform.openai.com/docs/models, custom OpenAI class
        'openai': (
            'OPENAI_API_KEY',
            lambda key: OpenAIChat(
                model="gpt-4o-2024-11-20", system_prompt=system_prompt, api_key=key),
        ),
        # Use custom DeepseekChat class with OpenAI client
        'deepseek': (
            'DEEPSEEK_API_KEY',
            lambda key: DeepseekChat(
                model="deepseek-chat", system_prompt=system_prompt, api_key=key),
        ),
        # https://aistudio.google.com/
        'google': (
            'GOOGLE_API_KEY',
            lambda key: ChatGoogle(
                model="gemini-2.5-pro-preview-05-06", system_prompt=system_prompt, api_key=key),
        ),
        # https://docs.anthropic.com/en/docs/about-claude/models
        'anthropic': (
            'ANTHROPIC_API_KEY',
            lambda key: ChatAnthropic(
                model="claude-3-7-sonnet-20250219", system_prompt=system_prompt, api_key=key),
        ),
        # ["sonar", "sonar-pro", "r1-1776"], https://docs.perplexity.ai/models/model-cards
        # Use custom Perplexity Class
        'perplexity': (
            'PERPLEXITY_API_KEY',
            lambda key: ChatPerplexityDirect(
                api_key=key, model="sonar-pro", system_prompt=system_prompt),
        ),
        # https://github.com/marketplace/models
        'github': (
            'GITHUB_PAT',
            lambda key: ChatGithub(
                model="grok-3", system_prompt=system_prompt, api_key=key),
        ),
        # Local model, no credential required; see `ollama list`
        'ollama': (
            None,
            lambda key: ChatOllama(
                model="deepseek-r1:14b", system_prompt=system_prompt),
        ),
    }

    # Non-string names are rejected with the same ValueError as unknown names.
    if not isinstance(model_name, str) or model_name not in providers:
        raise ValueError(f"Unsupported model: {model_name}")

    env_var, build_client = providers[model_name]
    api_key = None
    if env_var is not None:
        api_key = os.getenv(env_var)
        if not api_key:
            raise EnvironmentError(f"Missing {env_var}")

    return build_client(api_key)

In [6]:
def _empty_raw_response_store(questions_df, models_to_run):
    """Build the ``{index: {query_id: {model: None}}}`` placeholder dict in one pass over the rows."""
    if 'query_id' in questions_df.columns:
        query_ids = questions_df['query_id']
    else:
        # Use default query_id 0 if not in dataframe
        query_ids = [0] * len(questions_df)

    store = {}
    for idx, qid in zip(questions_df['index'], query_ids):
        store.setdefault(idx, {}).setdefault(qid, dict.fromkeys(models_to_run))
    return store


def query_models_chatlas(questions_df, models_to_run=['openai', 'anthropic', 'google', 'deepseek', 'perplexity', 'github', 'ollama'], store_full_response=True):
    """
    Query the selected models one question at a time and collect the answers.

    Args:
        questions_df (pd.DataFrame): Must have 'index', 'category', 'subcategory1',
            'subcategory2' and 'question'. Optional: 'query_id' (0 is used when absent).
        models_to_run (list): Models to query. Options: 'openai', 'anthropic', 'google',
            'deepseek', 'perplexity', 'github', 'ollama'. The default list is never mutated.
        store_full_response (bool): If True, also keep the complete response object,
            not just its content.

    Returns:
        pd.DataFrame: One row per input row, with one '<model>_response' column per model.
            A cell holds the response content, "Initialization failed" when the model
            could not be created, or "Error: <message>" when the call raised.
        dict: Only when store_full_response=True (a tuple is returned in that case):
            raw response objects as raw_responses[index][query_id][model]; entries stay
            None for failed or skipped calls.
    """
    raw_responses = (
        _empty_raw_response_store(questions_df, models_to_run)
        if store_full_response else None
    )

    # Initialize selected models; a failure is reported and recorded as None so the
    # remaining models can still be queried.
    model_instances = {}
    for model in models_to_run:
        try:
            model_instances[model] = initialize_model(model)
        except Exception as e:
            print(f"[Warning] Failed to initialize {model}: {e}")
            model_instances[model] = None

    has_query_id = 'query_id' in questions_df.columns
    responses = []

    # NOTE(review): one client object per model is reused for every question. If a
    # client keeps conversation history between .chat() calls, later questions are
    # answered with earlier turns in context -- TODO confirm and reset if needed.
    for _, row in tqdm(questions_df.iterrows(), total=len(questions_df), desc="Querying models"):
        question = row['question']
        query_id = row['query_id'] if has_query_id else 0

        # Question metadata first, so these columns lead the result frame
        response_entry = {
            'index': row['index'],
            'category': row['category'],
            'subcategory1': row['subcategory1'],
            'subcategory2': row['subcategory2'],
            'question': question,
            'query_id': query_id,
        }

        for model_key in models_to_run:
            column = f'{model_key}_response'
            instance = model_instances.get(model_key)
            try:
                if instance:
                    response = instance.chat(question, echo="none")
                    response_entry[column] = response.content

                    # Store the full response object
                    if store_full_response:
                        raw_responses[row['index']][query_id][model_key] = response
                else:
                    response_entry[column] = "Initialization failed"
            except Exception as e:
                # Best effort: record the failure in the cell and carry on
                response_entry[column] = f"Error: {str(e)}"

        responses.append(response_entry)

    result_df = pd.DataFrame(responses)

    if store_full_response:
        return result_df, raw_responses
    return result_df

## Parallel Execution of API Calls

In [7]:
# force GitHub model calls to run one at a time (Rate limit of 1 per 0s exceeded for UserConcurrentRequests)
# query_single_model holds this lock around every 'github' request, so worker threads never overlap on that provider.
github_lock = threading.Lock()


# reraise=True surfaces the last underlying exception instead of tenacity's RetryError
# wrapper, so callers that record str(error) keep the provider's own message.
@retry(wait=wait_exponential(min=1, max=5), stop=stop_after_attempt(3), reraise=True)
def call_model_with_retry(instance, question):
    """
    Send one question to a model client, retrying on failure.

    Any exception triggers a retry (no retry predicate is configured), with
    exponential backoff bounded to 1-5 seconds, for at most 3 attempts in total.

    Args:
        instance: Model client exposing ``chat(question, echo=...)``.
        question (str): Prompt text passed to the client unchanged.

    Returns:
        Whatever ``instance.chat`` returns.

    Raises:
        Exception: The error from the final attempt once all attempts have failed.
    """
    return instance.chat(question, echo="none")


def query_single_model(question, model_key, instance):
    """
    Ask one model one question and never raise.

    Returns a ``(model_key, text, response)`` tuple. ``text`` is the response
    content on success, "Initialization failed" when ``instance`` is falsy, or
    "Error: <message>" when the call raised; ``response`` is the full response
    object on success and None otherwise.
    """
    try:
        if instance:
            if model_key != "github":
                reply = call_model_with_retry(instance, question)
            else:
                # Serialize GitHub requests
                with github_lock:
                    reply = call_model_with_retry(instance, question)
            outcome = (model_key, reply.content, reply)
        else:
            outcome = (model_key, "Initialization failed", None)
    except Exception as exc:
        outcome = (model_key, "Error: " + str(exc), None)

    return outcome


def query_models_parallel(questions_df, models_to_run, store_full_response=True, max_workers=10):
    """
    Query the selected models for every question using a thread pool.

    Args:
        questions_df (pd.DataFrame): Must have 'index', 'category', 'subcategory1',
            'subcategory2' and 'question'. Optional: 'query_id' (0 is used when absent).
        models_to_run (list): Model names accepted by initialize_model.
        store_full_response (bool): If True, also return the raw response objects.
        max_workers (int): Size of the thread pool (default 10).

    Returns:
        pd.DataFrame: One row per unique (index, query_id) with one '<model>_response'
            column per model.
        dict: Only when store_full_response=True (a tuple is returned in that case):
            raw_responses[index][query_id][model], None where the call failed.
    """
    responses_dict = {}
    raw_responses = {} if store_full_response else None

    # Initialize models. A failure is reported and stored as None, which
    # query_single_model turns into "Initialization failed", so one missing
    # credential does not abort the run for the other providers.
    model_instances = {}
    for model in models_to_run:
        try:
            model_instances[model] = initialize_model(model)
        except Exception as e:
            print(f"[Warning] Failed to initialize {model}: {e}")
            model_instances[model] = None

    futures = []

    # NOTE(review): each client object is shared by all worker threads and reused
    # across questions; whether the clients are thread-safe or keep conversation
    # history between calls is not visible here -- TODO confirm.
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        for _, row in questions_df.iterrows():
            question = row['question']
            query_id = row.get('query_id', 0)
            idx = row['index']

            # Unique key for each question and query_id
            entry_key = (idx, query_id)

            # Record the metadata only once per unique (index, query_id)
            if entry_key not in responses_dict:
                responses_dict[entry_key] = {
                    'index': idx,
                    'question': question,
                    'category': row['category'],
                    'subcategory1': row['subcategory1'],
                    'subcategory2': row['subcategory2'],
                    'query_id': query_id,
                }

            if store_full_response:
                raw_responses.setdefault(idx, {}).setdefault(query_id, {})

            for model_key in models_to_run:
                instance = model_instances.get(model_key)
                futures.append((
                    executor.submit(query_single_model,
                                    question, model_key, instance),
                    entry_key, model_key, idx, query_id
                ))

        # Results are collected in submission order; query_single_model converts
        # failures into strings, so result() is not expected to raise here.
        for future, entry_key, model_key, idx, query_id in tqdm(futures, desc="Parallel querying"):
            _, content, full_response = future.result()

            # Add model response to correct row
            responses_dict[entry_key][f"{model_key}_response"] = content

            if store_full_response:
                raw_responses[idx][query_id][model_key] = full_response

    result_df = pd.DataFrame(list(responses_dict.values()))
    return (result_df, raw_responses) if store_full_response else result_df

## Quick Test of custom class


In [11]:
# Smoke test of the sequential path: first two questions, OpenAI only.
responses_df, raw_responses = query_models_chatlas(
    questions_df=questions_df.head(2),
    models_to_run=['openai'],
    store_full_response=True,
)

Querying models: 100%|██████████| 2/2 [00:19<00:00,  9.67s/it]


In [8]:
# Smoke test of the threaded path: first two questions against six providers.
responses_df_parallel, raw_responses_parallel = query_models_parallel(
    questions_df=questions_df.head(2),
    models_to_run=[
        'openai',
        'github',
        'perplexity',
        'google',
        'anthropic',
        'deepseek',
    ],
    store_full_response=True,
)

Parallel querying: 100%|██████████| 12/12 [04:14<00:00, 21.19s/it]


In [None]:
# Inspect the raw objects stored for question index 1, query_id 0:
# token usage from the custom OpenAI wrapper's raw_response, then the GitHub response object.
print(raw_responses_parallel[1][0]['openai'].raw_response.usage)
print(raw_responses_parallel[1][0]['github'])

ResponseUsage(input_tokens=12, input_tokens_details=InputTokensDetails(cached_tokens=0), output_tokens=761, output_tokens_details=OutputTokensDetails(reasoning_tokens=0), total_tokens=773)
Heart failure, also known as congestive heart failure (CHF), is a chronic medical condition in which the heart is unable to pump blood effectively to meet the body's needs for oxygen and nutrients. It does not mean that the heart has stopped working, but rather that it is not functioning as efficiently as it should. Heart failure can affect one or both sides of the heart and can develop over time due to various underlying causes.

### Key Aspects of Heart Failure:
1. **Types of Heart Failure**:
   - **Left-sided heart failure**: The left side of the heart struggles to pump oxygen-rich blood to the body, often leading to fluid buildup in the lungs (pulmonary edema), causing shortness of breath.
   - **Right-sided heart failure**: The right side of the heart fails to pump blood effectively to the lungs

In [10]:
# Display the parallel smoke-test results: one row per question, one column per model.
responses_df_parallel

Unnamed: 0,index,question,category,subcategory1,subcategory2,query_id,openai_response,github_response,perplexity_response,google_response,anthropic_response,deepseek_response
0,1,What is heart failure?,Disease Understanding and Diagnosis,General Information,Heart Failure,0,Heart failure is a serious medical condition i...,"Heart failure, also known as congestive heart ...",Heart failure is a condition that occurs when ...,"Okay, let's break down what heart failure is i...",# Heart Failure\n\nHeart failure is a chronic ...,**Heart failure** is a chronic condition where...
1,2,"What's my ejection fraction, and what does it ...",Disease Understanding and Diagnosis,Diagnosis & Monitoring,Ejection Fraction,0,Your ejection fraction (EF) is a measure of ho...,I'm unable to tell you what your specific ejec...,## Ejection Fraction: Definition and Meaning\n...,"Okay, this is an important question, but **I c...",I don't have access to your personal medical i...,Your **ejection fraction (EF)** is a measureme...


## Run Queries and Save Responses


In [24]:
# Display the selected questions that the full run below iterates over.
questions_df

Unnamed: 0,index,category,subcategory1,subcategory2,question,select,expected_answer,comment
0,1,Disease Understanding and Diagnosis,General Information,Heart Failure,What is heart failure?,yes,Heart failure is not a single disease but a ‘s...,https://cdn-endpoint-hfm.azureedge.net/heartfa...
1,2,Disease Understanding and Diagnosis,Diagnosis & Monitoring,Ejection Fraction,"What's my ejection fraction, and what does it ...",yes,Ejection fraction measures how much blood your...,avoid misinterpretation
2,3,Disease Understanding and Diagnosis,Symptoms,Heart Failure,I was diagnosed with heart failure. What are t...,yes,"Shortness of breath, fatigue, swelling in legs...",Failure to recognize and report new or worseni...
3,4,Disease Understanding and Diagnosis,Diagnosis & Monitoring,Diagnostic Tests,What tests are needed to diagnose cardiomyopathy?,yes,"Diagnosis involves ECG, Holter ECG, echocardio...",understand disease
4,5,Disease Understanding and Diagnosis,Genetic Considerations,Genetic Testing,How can genetic testing help in cardiomyopathy?,yes,Identify pathogenic variants that lead to dise...,"familial screening, risk stratification"
5,6,Disease Understanding and Diagnosis,Diagnosis & Monitoring,HFpEF,"I have a heart failure diagnosis, but my eject...",yes,HFpEF is characterized by normal ejection frac...,
6,7,Disease Understanding and Diagnosis,Prognosis,Cardiomyopathy,Is cardiomyopathy reversibel?,yes,Many cardiomyopathies represent chronic diseas...,
7,8,Disease Understanding and Diagnosis,Risk & Prevention,Sudden Cardiac Death,What is sudden cardiac death risk in hypertrop...,yes,SCD is one of the most feared complications of...,guides ICD indication
8,9,Disease Understanding and Diagnosis,Prognosis,Heart Function,"Will my heart function improve over time, or w...",yes,"This depends on your specific condition, cause...","Unrealistic expectations, potential disappoint..."
9,10,Disease Understanding and Diagnosis,Symptoms,Weight Gain & Edema,I gained five kilos in the last two weeks and ...,yes,Urgent medical attention must be advised.,Decompensation possible


In [None]:
# Full sequential run over every selected question. Wall-clock duration in seconds
# is kept in `execution_time` for the reporting cell that follows.
# Supported names: ['openai', 'anthropic', 'google', 'deepseek', 'perplexity', 'github', 'ollama']
start_time = time.time()

responses_df, raw_responses = query_models_chatlas(
    questions_df=questions_df,
    models_to_run=[
        'openai',
        'anthropic',
        'google',
        'deepseek',
        'perplexity',
        'github',
    ],
    store_full_response=True,
)

execution_time = time.time() - start_time

Querying models:   0%|          | 0/50 [00:00<?, ?it/s]

In [None]:
# Calculate execution time
execution_minutes = execution_time / 60
execution_hours = execution_minutes / 60

if execution_hours >= 1:
    print(
        f"Query execution completed in {execution_time:.2f} seconds ({execution_minutes:.2f} minutes, {execution_hours:.2f} hours)")
else:
    print(
        f"Query execution completed in {execution_time:.2f} seconds ({execution_minutes:.2f} minutes)")

# Save raw responses to pickle file
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
raw_responses_file = f"./outputs/chats/raw_responses_{timestamp}.pkl"

# Create the output folder if needed, so a fresh checkout does not lose the
# results of a long run to a FileNotFoundError.
os.makedirs(os.path.dirname(raw_responses_file), exist_ok=True)
with open(raw_responses_file, "wb") as f:
    pickle.dump(raw_responses, f)

print(f"Raw responses saved to {raw_responses_file}")

In [None]:
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# output file path
output_file = f"./outputs/chats/responses_{timestamp}.csv"

# Create the output folder if needed so the export cannot fail on a missing directory.
os.makedirs(os.path.dirname(output_file), exist_ok=True)
responses_df.to_csv(output_file, index=False)

print(f"Responses saved to {output_file}")

Responses saved to ./outputs/responses_20250513_111615.csv


## Repeat Questions for Statistical Analysis


In [94]:
# Build the repeated question set: every row appears n_queries times, and
# query_id numbers the repetitions of each question starting at 1.
questions_df_rep = (
    questions_df
    .loc[questions_df.index.repeat(n_queries)]
    .reset_index(drop=True)
    .assign(query_id=lambda rep: rep.groupby('index').cumcount() + 1)
)

# Column order: 'query_id' sits in second position, everything else keeps its order.
cols = [name for name in questions_df_rep.columns if name != 'query_id']
cols.insert(1, 'query_id')
questions_df_rep = questions_df_rep[cols]
questions_df_rep

Unnamed: 0,index,query_id,category,subcategory1,subcategory2,question,select,expected_answer,comment
0,1,1,Disease Understanding and Diagnosis,General Information,Heart Failure,What is heart failure?,yes,Heart failure is not a single disease but a ‘s...,https://cdn-endpoint-hfm.azureedge.net/heartfa...
1,1,2,Disease Understanding and Diagnosis,General Information,Heart Failure,What is heart failure?,yes,Heart failure is not a single disease but a ‘s...,https://cdn-endpoint-hfm.azureedge.net/heartfa...
2,1,3,Disease Understanding and Diagnosis,General Information,Heart Failure,What is heart failure?,yes,Heart failure is not a single disease but a ‘s...,https://cdn-endpoint-hfm.azureedge.net/heartfa...
3,2,1,Disease Understanding and Diagnosis,Diagnosis & Monitoring,Ejection Fraction,"What's my ejection fraction, and what does it ...",yes,Ejection fraction measures how much blood your...,avoid misinterpretation
4,2,2,Disease Understanding and Diagnosis,Diagnosis & Monitoring,Ejection Fraction,"What's my ejection fraction, and what does it ...",yes,Ejection fraction measures how much blood your...,avoid misinterpretation
...,...,...,...,...,...,...,...,...,...
145,49,2,Lifestyle & Daily Activity,Lifestyle & Daily Activity,Alcohol,Can I drink alcohol with cardiomyopathy?,yes,General recommendation: Avoid excess alcohol i...,ESC CMP GL 2023 p.3586
146,49,3,Lifestyle & Daily Activity,Lifestyle & Daily Activity,Alcohol,Can I drink alcohol with cardiomyopathy?,yes,General recommendation: Avoid excess alcohol i...,ESC CMP GL 2023 p.3586
147,50,1,Lifestyle & Daily Activity,Lifestyle & Daily Activity,Caffeine,Can I drink caffeine with cardiomyopathy?,yes,The guidelines do not list caffeine as a contr...,
148,50,2,Lifestyle & Daily Activity,Lifestyle & Daily Activity,Caffeine,Can I drink caffeine with cardiomyopathy?,yes,The guidelines do not list caffeine as a contr...,
