# A Programmatic Evaluation of Large Language Models for Clinical Decision Support in Heart Failure and Cardiomyopathy Management


---


In [144]:
import os
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from chatlas import ChatOpenAI, ChatGoogle, ChatAnthropic, ChatPerplexity, ChatOllama, ChatGithub
from openai import OpenAI  # Added for Deepseek API access
from tqdm import tqdm
from datetime import datetime
import pickle

In [145]:
# parameters
# n_queries controls how often each question row is duplicated when the repeated
# question table (questions_df_rep) is built further down in this notebook.
n_queries = 3  # number of responses per model per question to assess consistency and variability

## Load API Keys


In [146]:
# Read the .env file so the API credentials become available as environment variables.
load_dotenv()

# Keep a module-level copy of four of the keys; each is None when the variable is unset.
openai_api_key = os.environ.get("OPENAI_API_KEY")
google_api_key = os.environ.get("GOOGLE_API_KEY")
perplexity_api_key = os.environ.get("PERPLEXITY_API_KEY")
github_pat = os.environ.get("GITHUB_PAT")

## Custom DeepSeek Class

In [147]:
# Custom class for Deepseek API integration

# Response object compatible with Chatlas responses: it exposes the same 'content' attribute and additionally keeps the raw API response in 'raw_response'
class ResponseWrapper:
    """Small container pairing a reply text with the API response it came from.

    Attributes:
        content: The value passed as ``content`` (the reply text).
        raw_response: The value passed as ``raw_response``, stored unchanged.
    """

    def __init__(self, content, raw_response):
        # Both arguments are stored as given; nothing is copied or validated.
        self.content, self.raw_response = content, raw_response


class DeepseekChat:
    """Chat client for the DeepSeek API, accessed through the OpenAI SDK.

    Offers a ``chat(user_input, echo=...)`` method so it can be called the same
    way as the Chatlas clients used in this notebook. Each call sends only the
    system prompt and the given user message; no conversation history is kept
    on the instance.
    """

    def __init__(self, model, system_prompt, api_key):
        self.model = model
        self.system_prompt = system_prompt
        # The OpenAI client is pointed at DeepSeek's endpoint instead of the default one.
        self.client = OpenAI(base_url="https://api.deepseek.com", api_key=api_key)

    def chat(self, user_input, echo=None):
        """Send one user message and return the reply.

        Args:
            user_input: Text sent as the user message.
            echo: Accepted for call compatibility; it is not used.

        Returns:
            ResponseWrapper: ``content`` holds the text of the first choice,
            ``raw_response`` the complete completion object.
        """
        conversation = [
            {"role": "system", "content": self.system_prompt},
            {"role": "user", "content": user_input},
        ]
        completion = self.client.chat.completions.create(
            model=self.model,
            messages=conversation,
            stream=False,
        )
        reply_text = completion.choices[0].message.content
        return ResponseWrapper(reply_text, completion)

## Load Questions from CSV


In [148]:
# Load the question sheet and keep only the rows whose 'select' column equals 'yes'.
questions_df = (
    pd.read_csv("./data/questions/FAQ_HF_CMP_Patient_20250519.csv")
    .loc[lambda table: table['select'] == 'yes']
)
questions_df

Unnamed: 0,index,category,subcategory1,subcategory2,question,select,expected_answer,comment
0,1,Disease Understanding and Diagnosis,General Information,Heart Failure,What is heart failure?,yes,Heart failure is not a single disease but a ‘s...,https://cdn-endpoint-hfm.azureedge.net/heartfa...
1,2,Disease Understanding and Diagnosis,Diagnosis & Monitoring,Ejection Fraction,"What's my ejection fraction, and what does it ...",yes,Ejection fraction measures how much blood your...,avoid misinterpretation
2,3,Disease Understanding and Diagnosis,Symptoms,Heart Failure,I was diagnosed with heart failure. What are t...,yes,"Shortness of breath, fatigue, swelling in legs...",Failure to recognize and report new or worseni...
3,4,Disease Understanding and Diagnosis,Diagnosis & Monitoring,Diagnostic Tests,What tests are needed to diagnose cardiomyopathy?,yes,"Diagnosis involves ECG, Holter ECG, echocardio...",understand disease
4,5,Disease Understanding and Diagnosis,Genetic Considerations,Genetic Testing,How can genetic testing help in cardiomyopathy?,yes,Identify pathogenic variants that lead to dise...,"familial screening, risk stratification"
5,6,Disease Understanding and Diagnosis,Diagnosis & Monitoring,HFpEF,"I have a heart failure diagnosis, but my eject...",yes,HFpEF is characterized by normal ejection frac...,
6,7,Disease Understanding and Diagnosis,Prognosis,Cardiomyopathy,Is cardiomyopathy reversibel?,yes,Many cardiomyopathies represent chronic diseas...,
7,8,Disease Understanding and Diagnosis,Risk & Prevention,Sudden Cardiac Death,What is sudden cardiac death risk in hypertrop...,yes,SCD is one of the most feared complications of...,guides ICD indication
8,9,Disease Understanding and Diagnosis,Prognosis,Heart Function,"Will my heart function improve over time, or w...",yes,"This depends on your specific condition, cause...","Unrealistic expectations, potential disappoint..."
9,10,Disease Understanding and Diagnosis,Symptoms,Weight Gain & Edema,I gained five kilos in the last two weeks and ...,yes,Urgent medical attention must be advised.,Decompensation possible


## Define Function to Query APIs

In [149]:
def initialize_model(model_name):
    """
    Create the chat client that belongs to a provider name.

    Args:
        model_name: One of 'openai', 'deepseek', 'google', 'anthropic',
            'perplexity', 'github' or 'ollama'.

    Returns:
        The client instance for that provider, created with an empty system prompt.

    Raises:
        EnvironmentError: The provider needs an API key and its environment
            variable is unset or empty.
        ValueError: ``model_name`` is not one of the supported names.
    """
    system_prompt = ""  # simulating raw user access

    # (provider name, environment variable with the API key or None, constructor).
    # The constructors are lambdas so that a client class is only looked up and
    # called for the provider that was actually requested.
    backends = (
        # https://platform.openai.com/docs/models
        ('openai', 'OPENAI_API_KEY',
         lambda key: ChatOpenAI(model="gpt-4o-2024-11-20", system_prompt=system_prompt, api_key=key)),
        # Custom DeepseekChat class built on the OpenAI client
        ('deepseek', 'DEEPSEEK_API_KEY',
         lambda key: DeepseekChat(model="deepseek-chat", system_prompt=system_prompt, api_key=key)),
        # https://aistudio.google.com/
        ('google', 'GOOGLE_API_KEY',
         lambda key: ChatGoogle(model="gemini-2.5-pro-preview-05-06", system_prompt=system_prompt, api_key=key)),
        ('anthropic', 'ANTHROPIC_API_KEY',
         lambda key: ChatAnthropic(model="claude-3-7-sonnet-20250219", system_prompt=system_prompt, api_key=key)),
        # ["sonar", "sonar-pro", "r1-1776"], https://docs.perplexity.ai/models/model-cards
        ('perplexity', 'PERPLEXITY_API_KEY',
         lambda key: ChatPerplexity(model="sonar-pro", system_prompt=system_prompt, api_key=key)),
        ('github', 'GITHUB_PAT',
         lambda key: ChatGithub(model="grok-3", system_prompt=system_prompt, api_key=key)),
        # Local model, no API key involved; see `ollama list` for installed models
        ('ollama', None,
         lambda _key: ChatOllama(model="deepseek-r1:14b", system_prompt=system_prompt)),
    )

    for name, env_var, build in backends:
        if model_name == name:
            if env_var is None:
                return build(None)
            api_key = os.getenv(env_var)
            if not api_key:
                raise EnvironmentError(f"Missing {env_var}")
            return build(api_key)

    raise ValueError(f"Unsupported model: {model_name}")

In [150]:
def query_models_chatlas(questions_df, models_to_run=None, store_full_response=True):
    """
    Query the selected models with every question and collect the answers.

    Each question is sent to a freshly initialized client. Chatlas chat objects
    keep the turns of earlier ``.chat()`` calls, so reusing one client for the
    whole table would put previous questions and answers into the context of
    later ones (including repeated queries of the same question) and make the
    answers depend on row order.

    Args:
        questions_df (pd.DataFrame): Must have the columns 'index', 'category',
            'subcategory1', 'subcategory2' and 'question'. Optional: 'query_id'
            (0 is used for every row when the column is absent).
        models_to_run (list | None): Models to query. Options: 'openai',
            'anthropic', 'google', 'deepseek', 'perplexity', 'github', 'ollama'.
            None (default) runs all of them.
        store_full_response (bool): If True, also keep the complete response
            objects, not just their content.

    Returns:
        pd.DataFrame: One row per input row with the question metadata,
            'query_id' and one '<model>_response' column per model. A cell holds
            the answer text, "Initialization failed" when the model could not be
            set up, or "Error: <message>" when the query raised.
        dict: Only when store_full_response=True (the function then returns a
            2-tuple). Nested as raw[index][query_id][model]; the value is the
            response object, or None when no response was obtained.
    """
    # None instead of a list literal as default: avoids a shared mutable default.
    if models_to_run is None:
        models_to_run = ['openai', 'anthropic', 'google', 'deepseek',
                         'perplexity', 'github', 'ollama']
    else:
        models_to_run = list(models_to_run)

    has_query_id = 'query_id' in questions_df.columns

    # Pre-fill the raw-response store with None so that every requested
    # (index, query_id, model) slot exists even if a query fails.
    raw_responses = None
    if store_full_response:
        raw_responses = {}
        for idx in questions_df['index'].unique():
            if has_query_id:
                query_ids = questions_df.loc[questions_df['index'] == idx, 'query_id'].unique()
            else:
                query_ids = [0]
            raw_responses[idx] = {qid: dict.fromkeys(models_to_run) for qid in query_ids}

    # Probe every model once so configuration problems (e.g. a missing API key)
    # are reported a single time up front instead of once per question.
    usable = {}
    for model in models_to_run:
        try:
            initialize_model(model)
        except Exception as e:
            print(f"[Warning] Failed to initialize {model}: {e}")
            usable[model] = False
        else:
            usable[model] = True

    responses = []
    for _, row in tqdm(questions_df.iterrows(), total=len(questions_df), desc="Querying models"):
        question = row['question']
        query_id = row['query_id'] if has_query_id else 0
        response_entry = {
            'index': row['index'],
            'category': row['category'],
            'subcategory1': row['subcategory1'],
            'subcategory2': row['subcategory2'],
            'question': question,
            'query_id': query_id,
        }

        for model_key in models_to_run:
            column = f'{model_key}_response'
            if not usable[model_key]:
                response_entry[column] = "Initialization failed"
                continue

            # Only the API interaction is guarded; a new client per question
            # guarantees an empty conversation history.
            try:
                response = initialize_model(model_key).chat(question, echo="none")
                content = response.content
            except Exception as e:
                response_entry[column] = f"Error: {str(e)}"
                continue

            response_entry[column] = content
            if store_full_response:
                # setdefault keeps the bookkeeping from failing on a key that
                # was not pre-filled, which would otherwise discard the answer.
                slot = raw_responses.setdefault(row['index'], {}).setdefault(query_id, {})
                slot[model_key] = response

        responses.append(response_entry)

    result_df = pd.DataFrame(responses)

    if store_full_response:
        return result_df, raw_responses
    return result_df

## Quick Test of custom class

In [130]:
# Smoke test: send only the first two questions to the custom DeepSeek client.
responses_df, raw_responses = query_models_chatlas(
    questions_df=questions_df.iloc[:2],
    models_to_run=['deepseek'],
    store_full_response=True,
)

Querying models:   0%|          | 0/2 [00:00<?, ?it/s]

Querying models: 100%|██████████| 2/2 [00:38<00:00, 19.05s/it]


In [None]:
# Inspect the token usage stored on the raw API response of the custom DeepseekChat client.
# Key path: value of the question's 'index' column -> query_id (0 when the input
# table has no 'query_id' column) -> model name.
raw_responses[np.int64(1)][0]['deepseek'].raw_response.usage

CompletionUsage(completion_tokens=381, prompt_tokens=8, total_tokens=389, completion_tokens_details=None, prompt_tokens_details=PromptTokensDetails(audio_tokens=None, cached_tokens=0), prompt_cache_hit_tokens=0, prompt_cache_miss_tokens=8)

In [154]:
# Display the collected answers: one row per question, one '<model>_response' column per queried model.
responses_df

Unnamed: 0,index,category,subcategory1,subcategory2,question,query_id,deepseek_response
0,1,Disease Understanding and Diagnosis,General Information,Heart Failure,What is heart failure?,0,**Heart failure** is a chronic condition where...
1,2,Disease Understanding and Diagnosis,Diagnosis & Monitoring,Ejection Fraction,"What's my ejection fraction, and what does it ...",0,Your **ejection fraction (EF)** is a measureme...


## Run Queries and Save Responses


In [None]:
# Create the output folder before the long-running queries start, so that a
# missing directory cannot make the save step fail after all answers were collected.
raw_output_dir = "./outputs/chats"
os.makedirs(raw_output_dir, exist_ok=True)

# Query every supported model with the full question set.
responses_df, raw_responses = query_models_chatlas(
    questions_df=questions_df,
    models_to_run=['openai', 'anthropic', 'google',
                   'deepseek', 'perplexity', 'github', 'ollama'],
    store_full_response=True
)

# Save raw responses to pickle file (only unpickle files from a trusted source)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
raw_output_file = f"{raw_output_dir}/raw_responses_{timestamp}.pkl"
with open(raw_output_file, "wb") as f:
    pickle.dump(raw_responses, f)

print(f"Raw responses saved to {raw_output_file}")

Querying models: 100%|██████████| 3/3 [02:26<00:00, 48.97s/it]


In [None]:
# Timestamp makes every export file name unique.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# output file path
output_file = f"./outputs/chats/responses_{timestamp}.csv"
# Create the target folder if it does not exist yet; to_csv does not create it.
os.makedirs(os.path.dirname(output_file), exist_ok=True)
responses_df.to_csv(output_file, index=False)

print(f"Responses saved to {output_file}")

Responses saved to ./outputs/responses_20250513_111615.csv


## Repeat Questions for Statistical Analysis

In [94]:
# Duplicate every question row n_queries times and renumber the rows from 0
questions_df_rep = (
    questions_df
    .loc[questions_df.index.repeat(n_queries)]
    .reset_index(drop=True)
)

# Number the repetitions of each question (1, 2, ...) via a running count per 'index' value
questions_df_rep['query_id'] = questions_df_rep.groupby('index').cumcount() + 1

# Move 'query_id' to column position 1, which places it after 'index'
cols = [name for name in questions_df_rep.columns if name != 'query_id']
cols[1:1] = ['query_id']
questions_df_rep = questions_df_rep[cols]
questions_df_rep

Unnamed: 0,index,query_id,category,subcategory1,subcategory2,question,select,expected_answer,comment
0,1,1,Disease Understanding and Diagnosis,General Information,Heart Failure,What is heart failure?,yes,Heart failure is not a single disease but a ‘s...,https://cdn-endpoint-hfm.azureedge.net/heartfa...
1,1,2,Disease Understanding and Diagnosis,General Information,Heart Failure,What is heart failure?,yes,Heart failure is not a single disease but a ‘s...,https://cdn-endpoint-hfm.azureedge.net/heartfa...
2,1,3,Disease Understanding and Diagnosis,General Information,Heart Failure,What is heart failure?,yes,Heart failure is not a single disease but a ‘s...,https://cdn-endpoint-hfm.azureedge.net/heartfa...
3,2,1,Disease Understanding and Diagnosis,Diagnosis & Monitoring,Ejection Fraction,"What's my ejection fraction, and what does it ...",yes,Ejection fraction measures how much blood your...,avoid misinterpretation
4,2,2,Disease Understanding and Diagnosis,Diagnosis & Monitoring,Ejection Fraction,"What's my ejection fraction, and what does it ...",yes,Ejection fraction measures how much blood your...,avoid misinterpretation
...,...,...,...,...,...,...,...,...,...
145,49,2,Lifestyle & Daily Activity,Lifestyle & Daily Activity,Alcohol,Can I drink alcohol with cardiomyopathy?,yes,General recommendation: Avoid excess alcohol i...,ESC CMP GL 2023 p.3586
146,49,3,Lifestyle & Daily Activity,Lifestyle & Daily Activity,Alcohol,Can I drink alcohol with cardiomyopathy?,yes,General recommendation: Avoid excess alcohol i...,ESC CMP GL 2023 p.3586
147,50,1,Lifestyle & Daily Activity,Lifestyle & Daily Activity,Caffeine,Can I drink caffeine with cardiomyopathy?,yes,The guidelines do not list caffeine as a contr...,
148,50,2,Lifestyle & Daily Activity,Lifestyle & Daily Activity,Caffeine,Can I drink caffeine with cardiomyopathy?,yes,The guidelines do not list caffeine as a contr...,
