In [4]:
import pandas as pd

In [8]:
# df_hyena = pd.read_csv('inference_times_hyena.csv')
df_mamba = pd.read_csv('inference_times_mamba.csv')
df_llama = pd.read_csv('inference_times_llama.csv')
df_gpt = pd.read_csv('inference_times_gpt.csv')

df = pd.concat([df_mamba, df_llama, df_gpt])
df['throughput']  = df['batch_size'] * df['num_tokens_to_generate'] / df['inference_time_seconds']
df = df[['model_name', 'context_length', 'mean_inference_time_seconds', 'throughput']]
df

Unnamed: 0,model_name,context_length,mean_inference_time_seconds,throughput
0,mamba,1024,3.466146,12.309543
1,mamba,1024,1.960118,43.534793
2,mamba,1024,1.969450,86.657002
3,mamba,1024,1.987233,171.763103
4,mamba,1024,2.011993,339.298686
...,...,...,...,...
27,gpt2,4096,3.304645,103.288969
28,gpt2,4096,3.590669,190.122421
29,gpt2,4096,4.139411,329.837608
30,gpt2,4096,5.218785,523.237971


In [29]:
import os

# Dictionary mapping folder names to actual task names
folder_to_name_mapping = {
    'guo_los': 'Long LOS',
    'guo_icu': 'ICU Prediction',
    'guo_readmission': '30-Day Readmission',
    'lab_anemia': 'Anemia',
    'lab_hyperkalemia': 'Hyperkalemia',
    'lab_hypoglycemia': 'Hypoglycemia',
    'lab_hyponatremia': 'Hyponatremia',
    'lab_thrombocytopenia': 'Thrombocytopenia',
    'new_acutemi': 'Acute MI',
    'new_celiac': 'Celiac',
    'new_hyperlipidemia': 'Hyperlipidemia',
    'new_hypertension': 'Hypertension',
    'new_lupus': 'Lupus',
    'new_pancan': 'Pancreatic Cancer',
    'chexpert': 'Chexpert'
}


# Function to clean and extract relevant information from the CSV
def clean_and_extract(df):
    # Select and clean relevant columns, including the replicate column and bounds
    df = df[['sub_task', 'model', 'k', 'score', 'value', 'replicate', 'lower', 'upper']]
    df['model'] = df['model'].str.split('--clmbr_train').str[0]
    
    # Filter for AUROC scores
    df = df[(df['k'] == -1) & (df['score'] == 'auroc')]
    
    # Extract architecture and context length
    df['architecture'] = df['model'].apply(lambda x: x.split('-')[0])
    
    # Exclude BERT from the data
    df = df[df['architecture'] != 'bert']
    
    # Extract context length safely and ensure it's an integer
    df['context_length'] = df['model'].apply(lambda x: int(x.split('-')[2]) if len(x.split('-')) > 2 and x.split('-')[2].isdigit() else None)
    
    # Filter out rows related to GPT vocab size variations
    df = df[~df['model'].str.contains('--clmbr-')]
    
    # Group by task, model, architecture, context length, and calculate mean and bounds for replicates
    df_grouped = df.groupby(['sub_task', 'model', 'architecture', 'context_length']).agg(
        value_mean=('value', 'mean'),
        lower_bound=('lower', 'mean'),
        upper_bound=('upper', 'mean')
    ).reset_index()
    
    return df_grouped
    
# Function to process all tasks and return the combined summary
def process_all_tasks_for_plotting():
    ehrshot_dir = '/share/pi/nigam/users/migufuen/ehrshot-benchmark/EHRSHOT_ASSETS/results_ehrshot'  # Point to the 'ehrshot' directory
    combined_summary = pd.DataFrame()
    include_models = [
        'gpt2-base-512--clmbr_train-tokens-total_nonPAD-ckpt_val=2000000000-persist_chunk:last_embed:last', 
        'gpt2-base-1024--clmbr_train-tokens-total_nonPAD-ckpt_val=2000000000-persist_chunk:last_embed:last', 
        'gpt2-base-2048--clmbr_train-tokens-total_nonPAD-ckpt_val=2000000000-persist_chunk:last_embed:last', 
        'gpt2-base-4096--clmbr_train-tokens-total_nonPAD-ckpt_val=2000000000-persist_chunk:last_embed:last', 
        'hyena-large-1024--clmbr_train-tokens-total_nonPAD-ckpt_val=2000000000-persist_chunk:last_embed:last', 
        'hyena-large-4096--clmbr_train-tokens-total_nonPAD-ckpt_val=2000000000-persist_chunk:last_embed:last', 
        'hyena-large-8192--clmbr_train-tokens-total_nonPAD-ckpt_val=2000000000-persist_chunk:last_embed:last', 
        'hyena-large-16384--clmbr_train-tokens-total_nonPAD-ckpt_val=2000000000-persist_chunk:last_embed:last', 
        'llama-base-512--clmbr_train-tokens-total_nonPAD-ckpt_val=2000000000-persist_chunk:last_embed:last', 
        'llama-base-1024--clmbr_train-tokens-total_nonPAD-ckpt_val=2000000000-persist_chunk:last_embed:last', 
        'llama-base-2048--clmbr_train-tokens-total_nonPAD-ckpt_val=2000000000-persist_chunk:last_embed:last', 
        'llama-base-4096--clmbr_train-tokens-total_nonPAD-ckpt_val=2000000000-persist_chunk:last_embed:last', 
        'mamba-tiny-1024--clmbr_train-tokens-total_nonPAD-ckpt_val=2000000000-persist_chunk:last_embed:last', 
        'mamba-tiny-4096--clmbr_train-tokens-total_nonPAD-ckpt_val=2000000000-persist_chunk:last_embed:last', 
        'mamba-tiny-8192--clmbr_train-tokens-total_nonPAD-ckpt_val=2000000000-persist_chunk:last_embed:last',
        'mamba-tiny-16384--clmbr_train-tokens-total_nonPAD-ckpt_val=2000000000-persist_chunk:last_embed:last', 
        'mamba-tiny-32768--clmbr_train-tokens-total_nonPAD-ckpt_val=2000000000-persist_chunk:last_embed:last', 
    ]
    
    exclude_tasks = [
        'chexpert'
    ]
    for task_name in os.listdir(ehrshot_dir):
        task_path = os.path.join(ehrshot_dir, task_name)
        if os.path.isdir(task_path) and task_name not in exclude_tasks:
                try:
                    print(f"Processing task: {task_name}")
                    # Read and clean data
                    input_file = os.path.join(task_path, 'all_results.csv')
                    df = pd.read_csv(input_file)
                    df = df[df['model'].isin(include_models)]
                    df_cleaned = clean_and_extract(df)
                    df_cleaned['task_name'] = folder_to_name_mapping.get(task_name, task_name)
                    
                    # Map the task name to a human-readable name
                    if task_name == 'chexpert':
                        df_cleaned = df_cleaned.groupby(['architecture', 'context_length', 'model', 'task_name']).agg({
                            'value_mean': 'mean',
                            'lower_bound': 'mean',
                            'upper_bound': 'mean'
                        }).reset_index()
                        df_cleaned['sub_task'] = df_cleaned['task_name']
                    
                    
                    # Combine with the overall summary
                    combined_summary = pd.concat([combined_summary, df_cleaned], ignore_index=True)
                    # print(combined_summary.head())  # Debug: Inspect the first few rows
                    
                except Exception as e:
                    print(f"Failed to process task {task_name}: {e}")
    return combined_summary

combined_summary = process_all_tasks_for_plotting()

Processing task: new_hypertension
Processing task: guo_los
Processing task: lab_hypoglycemia
Processing task: new_lupus


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['model'] = df['model'].str.split('--clmbr_train').str[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['model'] = df['model'].str.split('--clmbr_train').str[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['model'] = df['model'].str.split('--clmbr_train').str[0]
A value is trying to be s

Processing task: lab_hyponatremia
Processing task: new_pancan
Processing task: lab_anemia
Processing task: new_acutemi
Processing task: guo_readmission
Processing task: lab_thrombocytopenia
Processing task: new_hyperlipidemia
Processing task: new_celiac


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['model'] = df['model'].str.split('--clmbr_train').str[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['model'] = df['model'].str.split('--clmbr_train').str[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['model'] = df['model'].str.split('--clmbr_train').str[0]
A value is trying to be s

Processing task: lab_hyperkalemia
Processing task: guo_icu


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['model'] = df['model'].str.split('--clmbr_train').str[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['model'] = df['model'].str.split('--clmbr_train').str[0]


In [36]:
df_mean = combined_summary.groupby(['model', 'context_length'])['value_mean'].mean().reset_index()
df_mean['model_name'] = df_mean['model'].apply(lambda x: x.split('-')[0])
df_mean

Unnamed: 0,model,context_length,value_mean,model_name
0,gpt2-base-1024,1024,0.783288,gpt2
1,gpt2-base-2048,2048,0.799709,gpt2
2,gpt2-base-4096,4096,0.79148,gpt2
3,gpt2-base-512,512,0.792351,gpt2
4,hyena-large-1024,1024,0.79438,hyena
5,hyena-large-16384,16384,0.731876,hyena
6,hyena-large-4096,4096,0.796362,hyena
7,hyena-large-8192,8192,0.766145,hyena
8,llama-base-1024,1024,0.789721,llama
9,llama-base-2048,2048,0.800335,llama


In [44]:
df_ = df.groupby(['model_name', 'context_length']).agg({
    'throughput': 'max',
}).reset_index()
model2ehrshot = {
    (row['model_name'], row['context_length']) : row['value_mean']
    for idx, row in df_mean.iterrows()
}
df_['ehrshot'] = df_.apply(lambda row: model2ehrshot.get((row['model_name'], row['context_length']), None), axis=1)
df_

Unnamed: 0,model_name,context_length,throughput,ehrshot
0,gpt2,512,744.518766,0.792351
1,gpt2,1024,743.835151,0.783288
2,gpt2,2048,741.76103,0.799709
3,gpt2,4096,746.84854,0.79148
4,llama,512,1156.818764,0.78899
5,llama,1024,1157.05312,0.789721
6,llama,2048,1791.947023,0.800335
7,llama,4096,1795.433293,0.796923
8,mamba,1024,2205.268081,0.784284
9,mamba,4096,2295.818796,0.80187


In [45]:
print(df_.to_markdown(index=False))

| model_name   |   context_length |   throughput |   ehrshot |
|:-------------|-----------------:|-------------:|----------:|
| gpt2         |              512 |      744.519 |  0.792351 |
| gpt2         |             1024 |      743.835 |  0.783288 |
| gpt2         |             2048 |      741.761 |  0.799709 |
| gpt2         |             4096 |      746.849 |  0.79148  |
| llama        |              512 |     1156.82  |  0.78899  |
| llama        |             1024 |     1157.05  |  0.789721 |
| llama        |             2048 |     1791.95  |  0.800335 |
| llama        |             4096 |     1795.43  |  0.796923 |
| mamba        |             1024 |     2205.27  |  0.784284 |
| mamba        |             4096 |     2295.82  |  0.80187  |
| mamba        |             8192 |     2297.37  |  0.798959 |
| mamba        |            16384 |     2289.14  |  0.807706 |
