# Objective

This notebook uses the `Most Frequent` baseline to predict and evaluate the diagnoses of the last visit on a test set of patients.

# Imports

In [6]:
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

from Mimic import Mimic
from ICDCodesGrouper import ICDCodesGrouper
from Metrics import metrics
from MostFrequent import MostFrequent

idx = pd.IndexSlice

# Read data

In [7]:
# Build the ICD code grouper and hand it to the Mimic data accessor;
# both objects are reused by the cells below.
grouper = ICDCodesGrouper()
mimic = Mimic(grouper=grouper)

In [8]:
# Load the diagnoses table and show a single row as a quick look at its columns.
diagnoses = mimic.read_diagnoses()
diagnoses.head(1)

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE,ccs,icd9chapters
0,1297,109,172335,1.0,40301,99,7


## Prepare data

- ignore admissions without record of diagnoses
- ignore patients with only 1 admission

In [9]:
# Start from freshly read tables so this cell can be re-run on its own.
admissions = mimic.read_admissions()
diagnoses = mimic.read_diagnoses()

# Drop every admission that has a diagnosis row with a missing ICD9 code.
missing_code_rows = diagnoses.ICD9_CODE.isna()
admissions_without_records = diagnoses.loc[missing_code_rows, 'HADM_ID'].unique()
admissions = admissions[~admissions.HADM_ID.isin(admissions_without_records)]

# Keep only the patients that still have more than one admission.
admissions_per_patient = admissions.groupby('SUBJECT_ID').size()
mult_adm_patients = admissions_per_patient[admissions_per_patient > 1].index.tolist()

# Create test set of patients

In [11]:
# Hold out a fixed share of the multi-admission patients for evaluation.
test_size = 0.25
random_state = 213  # fixed so the split is the same on every run
train_patients, test_patients = train_test_split(
    mult_adm_patients,
    test_size=test_size,
    random_state=random_state,
)

# Predict

In [13]:
# Cut-offs k at which every metric is computed.
metrics_at_k = [1, 3, 5, 10, 30]
all_patients_metrics = []

# One result row per (test patient, coding scheme) pair.
for patient in tqdm(test_patients):
    for coding in grouper.get_available_groupers():
        retrieved, golden = MostFrequent.predict(patient, coding, mimic)

        # Columns are named '<metric>@<k>', ordered by k first, then by metric.
        scores = {
            f'{metric}@{k}': metrics.compute_metric(metric, golden, retrieved, k)
            for k in metrics_at_k
            for metric in metrics.get_metrics()
        }
        all_patients_metrics.append({'patient': patient, 'coding': coding, **scores})

# Build the frame once from the collected rows, indexed by (patient, coding).
results = (
    pd.DataFrame(all_patients_metrics)
    .set_index(['patient', 'coding'])
    .sort_index()
)
results.head(5)

  0%|          | 0/1875 [00:00<?, ?it/s]

Unnamed: 0_level_0,Unnamed: 1_level_0,precision@1,recall@1,precision@3,recall@3,precision@5,recall@5,precision@10,recall@10,precision@30,recall@30
patient,coding,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
21,ccs,1.0,0.05,0.666667,0.1,0.6,0.15,0.5,0.25,0.529412,0.45
21,icd9chapters,1.0,0.083333,1.0,0.25,1.0,0.416667,0.888889,0.666667,0.888889,0.666667
23,ccs,1.0,0.111111,0.666667,0.222222,0.8,0.444444,0.666667,0.444444,0.666667,0.444444
23,icd9chapters,1.0,0.166667,1.0,0.5,0.8,0.666667,0.8,0.666667,0.8,0.666667
36,ccs,1.0,0.090909,1.0,0.272727,1.0,0.454545,0.6,0.545455,0.545455,0.545455


# Evaluate

In [15]:
# Average every metric over patients, separately for each coding scheme.
# `DataFrame.mean(level=...)` was removed in pandas 2.0; grouping on the index
# level gives the same table.
results.groupby(level='coding').mean()

Unnamed: 0_level_0,precision@1,recall@1,precision@3,recall@3,precision@5,recall@5,precision@10,recall@10,precision@30,recall@30
coding,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ccs,0.649067,0.068866,0.594222,0.177693,0.56064,0.265702,0.510543,0.402167,0.465635,0.497686
icd9chapters,0.893333,0.150021,0.8072,0.376934,0.746622,0.519794,0.692715,0.666671,0.683329,0.689073


## Save

In [16]:
filename = 'baseline_most_frequent'

# Make sure the output folder exists; `to_csv` does not create missing directories.
results_dir = Path('results')
results_dir.mkdir(parents=True, exist_ok=True)

# Persist the per-coding averages. `DataFrame.mean(level=...)` was removed in
# pandas 2.0, so group on the index level instead.
results.groupby(level='coding').mean().to_csv(results_dir / f'{filename}.csv')