dataset code: A301

eligibility:
1. after 2016
2. at least 2 admissions
3. admission is eligible if there is at least 1 recognized ccs code
4. episodes with multiple diagnostics can't have them assigned more than 1 day apart from each other

process:
1. ccs codes
2. 12 months target window
3. saves delta_days and date of last admission before prediction_period of all datapoints

In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every bare expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
import pandas as pd
idx = pd.IndexSlice
pd.options.display.max_columns = None

import numpy as np
import os

from datetime import timedelta
import json

import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

from ICDMappings import ICDMappings
icdmap = ICDMappings()

# Parameters

In [3]:
# Where to save the resulting dataset: the nested per-patient JSON and the flat CSV it is built from.
dataset_filepath = 'data/model_ready_dataset/icare2021_diag_A301/dataset.json'
raw_data_filepath = 'data/model_ready_dataset/icare2021_diag_A301/dataset.csv'

# Sanity checks on the output location, run up front so a bad path fails
# before any expensive processing happens.

assert os.path.dirname(dataset_filepath) == os.path.dirname(raw_data_filepath), 'make sure both dataset are saved under the same directory'
assert os.path.isdir(os.path.dirname(dataset_filepath)),'Please create the directory first or try another path to save'

# Refuse to continue if EITHER output already exists. The previous condition
# (`not A or not B`) only fired when both files were present, so a lone
# existing file would have been overwritten silently.
assert not (os.path.isfile(dataset_filepath) or os.path.isfile(raw_data_filepath)), 'File exists, are you sure you want to overwrite it? If so, comment this line and run the notebook again'

AssertionError: File exists, are you sure you want to overwrite it? If so, comment this line and run the notebook again

----

In [None]:
class Icare:
    """Reader for the raw ICARE export stored under ``data_folder``.

    Only the diagnoses table is read by the method defined here;
    ``atividade_path`` is stored but not used in this class.
    """

    def __init__(self,data_folder):
        """Remember the export root and the relative paths of its tables."""
        self.data_folder = data_folder
        self.diagnoses_path = 'LS_ANALYTICS.ICARE_CLINICO_DIAGNOSTICOS/index.csv'
        self.atividade_path = 'LS_ANALYTICS.ICARE_ATIVIDADE_HOSPITALAR/index.csv'

    def _read_diagnoses(self):
        """Load and clean the diagnoses table.

        Steps, in order: read the tab-separated file, drop the UNIDADE,
        DATA_FIM and ICD9_DESCRICAO columns, convert PRIORIDADE_DIAGNOSTICO to
        category and DIAGNOSTICO_PRINCIPAL ('S'/'N') to booleans, remove the
        single row dated in year 9064, parse DATA_INICIO as datetime, drop
        duplicated rows and finally drop the last row.

        Returns:
            pandas.DataFrame: the cleaned diagnoses table.

        Raises:
            AssertionError: if the file does not contain exactly one row whose
                DATA_INICIO year is 9064 (the dataset probably changed).
        """
        print('Reading diagnostics table...')
        diagnoses_file = os.path.join(self.data_folder,self.diagnoses_path)

        df = pd.read_csv(diagnoses_file,sep='\t')

        print('begining: ',round(df.memory_usage(index=True).sum() / 1_000_000,1),'Mb') #Mbytes

        df = df.drop(columns=['UNIDADE','DATA_FIM','ICD9_DESCRICAO'])

        # Columns are replaced with `df[col] = ...` rather than
        # `df.loc[:, col] = ...`: since pandas 2.0 the latter writes into the
        # existing column in place, so the original object dtype would be kept
        # and the conversions below would be silently lost.
        df['PRIORIDADE_DIAGNOSTICO'] = df['PRIORIDADE_DIAGNOSTICO'].astype('category')
        df['DIAGNOSTICO_PRINCIPAL'] = df['DIAGNOSTICO_PRINCIPAL'].map({'S':True,'N':False})

        # A single row dated in year 9064 breaks pd.to_datetime (it is outside
        # the range representable by nanosecond-resolution timestamps), so it
        # is removed. The check runs before the drop so that an unexpected
        # number of such rows fails with the message below.
        start_year = df['DATA_INICIO'].str[:4].astype(float)
        is_overflow_row = start_year == 9064.0
        assert int(is_overflow_row.sum()) == 1, 'Ooops, expecting to drop exactly 1 row. maybe dataset changed.'
        df = df.loc[~is_overflow_row]

        df['DATA_INICIO'] = pd.to_datetime(df['DATA_INICIO'],format='%Y-%m-%d %H:%M:%S')

        df = df.drop_duplicates()

        # last row is trash
        df = df.iloc[:-1]

        print('end: ',round(df.memory_usage(index=True).sum() / 1_000_000),'Mb') #Mbytes
        print('Done')
        return df
        

In [None]:
# Point the reader at the raw ICARE export (relative path).
icare = Icare('../../icare-dataset_2021-08')

In [None]:
# Load the cleaned diagnoses table and peek at its first row.
df = icare._read_diagnoses()
df.head(1)

In [None]:
# First and last diagnosis timestamp recorded for each (episode, patient) pair.
# The string aliases 'min'/'max' are used instead of the Python builtins:
# passing the builtins to agg is deprecated in recent pandas, while the
# aliases give the same columns ('min', 'max') on every version.
res = df.groupby(['EPISODIO','NHC']).DATA_INICIO.agg(['min','max'])
res.head(3)

In [None]:
# Show a few more rows of the per-episode first/last diagnosis timestamps.
res.head(5)

----

# ICD9 to ICD9_3 then CCS

In [None]:
# Truncate each ICD-9 code to its first 3 characters (missing codes are left
# as NaN), then look the truncated code up in the 'icd9_3toccs' mapping.
df.loc[:,'ICD9_3'] = df.ICD9.map(lambda code: code if code is np.nan else code[:3])
df.loc[:,'ICD9_3->CCS'] = icdmap.lookup('icd9_3toccs', df['ICD9_3'])

## define eligibility criteria

1. data after 2016
1. patients with at least 2 admissions
1. all admissions must have at least 1 ccs diagnostic that is eligible

## define windows

3,6,12 months

what metrics to keep track of:
1. distribution of #admissions per target window
1. distribution of # diagnoses per target window
1. distribution of # admissions of input per target window
1. distribution # diagnoses of input per target window

In [None]:
# eligibility
df = icare._read_diagnoses()

print('Preparing eligibility filtering...')
df['ICD9_3'] = df.ICD9.apply(lambda x: x[:3] if x is not np.nan else x)
df['ICD9_3->CCS'] = icdmap.lookup('icd9_3toccs',df['ICD9_3'])

## after 2016
df = df.loc[df.DATA_INICIO > '2016-01-01']

## admission is eligible if there is at least 1 recognized ccs code
df['is_ccs_na'] = df['ICD9_3->CCS'].isna()
admissions_without_any_eligible_ccs = df.groupby('EPISODIO')['is_ccs_na'].all().where(lambda x: x == True).dropna().index
df = df.loc[~df.EPISODIO.isin(admissions_without_any_eligible_ccs)]

## drop rows where diagnostic is not recognized
df = df.loc[~df['ICD9_3->CCS'].isna()]

## patient with at least 2 admissions
patients_2_admissions = df.groupby('NHC').EPISODIO.size().where(lambda x: x > 1).dropna().index
df = df.loc[df.NHC.isin(patients_2_admissions)]

## episodes with multiple diagnostics can't have diagnostics assigned with more than 1 day delay between each other
episodes_far_diagnostics = df.groupby('EPISODIO').DATA_INICIO.agg([min,max]).diff(axis=1)['max'].dt.days.where(lambda x: x > 0).dropna().index
df = df.loc[~df.EPISODIO.isin(episodes_far_diagnostics)]
print('Done')

df = df.sort_values('DATA_INICIO')
df.DATA_INICIO = pd.to_datetime(df.DATA_INICIO.dt.date,format='%Y-%m-%d')

df.shape
df.EPISODIO.nunique()
df.NHC.nunique()
df.groupby('NHC').EPISODIO.size().value_counts().rename('Distribution of #episodes per eligible patient')

In [None]:
# Per-row single-element list, kept for compatibility with the previous
# version of this cell, which also added this column to df.
df['ccs_list'] = df['ICD9_3->CCS'].apply(lambda x: [x])

# One row per (patient, day) holding the list of all CCS codes recorded that
# day, in row order. Collecting with agg(list) gives the same lists as summing
# the one-element lists, without the repeated list concatenation that sum()
# performs inside each group.
df_ = (
    df.groupby(['NHC','DATA_INICIO'])['ICD9_3->CCS']
    .agg(list)
    .to_frame('ccs_list')
)

In [None]:
# Work on the first 50,000 (patient, day) rows only.
# NOTE(review): every later cell (res, res2, the saved CSV/JSON) is derived from
# this subsample, and a positional cut can split the last patient's rows —
# confirm whether the full df_ should be used for the final dataset.
test = df_.iloc[:50_000].copy()

#### 1000 rows
1. 1.68
2. 1.08

#### 10_000 rows
1. 14.4 (8.5x)
2. 8.49 (7.9x)

#### 100_000 rows
1. 189 (112x) (13x)
2. 85 (78x) (10x)

In [None]:
%%time
#assert 1==2,'Prevent myself from running this cell
m = 12 #months
one_day = timedelta(days=1)
target_span = timedelta(days=30*m)  # a month is approximated as 30 days


def _with_target_and_history(subdf):
    """Add `target` and `history` columns to one patient's admissions.

    `subdf` is the slice of `test` for a single NHC, indexed by
    (NHC, DATA_INICIO) with a `ccs_list` column.

    For the admission on date d:
      * target  = concatenation of every ccs_list dated in [d + 1 day, d + 30*m days];
                  when the window is empty the sum of an empty selection is 0,
                  which later cells rely on (`res.target != 0`).
      * history = list of the ccs_lists dated up to and including d.

    Label slices are inclusive on both ends. The history slice therefore stops
    at d itself: stopping at d + 1 day (as before) put a next-day admission in
    both the history and the target of the same datapoint.
    """
    def _target(row):
        admission_date = row.name[1]
        window = pd.IndexSlice[:, admission_date + one_day:admission_date + target_span]
        return subdf.loc[window, 'ccs_list'].sum()

    def _history(row):
        up_to_admission = pd.IndexSlice[:, :row.name[1]]
        return subdf.loc[up_to_admission, 'ccs_list'].tolist()

    return subdf.assign(
        target=subdf.apply(_target, axis=1),
        history=subdf.apply(_history, axis=1),
    )


# group_keys=False keeps the result indexed by (NHC, DATA_INICIO) on every
# pandas version; without it newer versions prepend a second NHC level, which
# breaks the later reset_index()/groupby('NHC') cells.
# pd.IndexSlice is used directly (not the `idx` alias) so this cell does not
# depend on a global that another cell may overwrite.
res = test.groupby('NHC', group_keys=False).apply(_with_target_and_history)

# Add feature: delta days

In [None]:
# Days elapsed since the same patient's previous row in res; the first row of
# each patient has no predecessor and gets 0.
flat_admissions = res.reset_index()
days_since_previous = flat_admissions.groupby('NHC')['DATA_INICIO'].diff().dt.days
res['delta_days'] = days_since_previous.fillna(0).values

# Print out some distributions

In [None]:
# How many datapoints have a non-empty target (an empty target is stored as 0),
# and the relative frequency of the 15 most common target lengths.
print(f'Number of non-empty targets out of {res.shape[0]}: {res[res.target != 0].shape[0]}')
print(f'Distribution of target size')
non_empty_targets = res.loc[res.target != 0, 'target']
non_empty_targets.map(len).value_counts(normalize=True).head(15)

In [None]:
# Median delta_days per patient over datapoints with a non-empty target,
# summarised by its quartiles across patients.
median_gap_per_patient = res.loc[res.target != 0].groupby('NHC').delta_days.median()
quartiles = median_gap_per_patient.describe()[['25%','50%','75%']]
quartiles.rename('Quartiles of all patients of the median delta_days of each patient, on eligible datapoints')

In [None]:
# Keep only datapoints with a non-empty target, move the date level out of the
# index (leaving NHC as the index) and expose it as the DATA column.
has_target = res.target != 0
res2 = (
    res[has_target]
    .reset_index(level=1)
    .rename(columns={'DATA_INICIO':'DATA'})
)

In [None]:
# Quartiles of the number of kept datapoints per patient.
datapoints_per_patient = res2.groupby('NHC').size()
datapoints_per_patient.describe()[['25%','50%','75%']].rename('Quartiles #admissions eligible per patient')

In [None]:
# Patients that kept at least one datapoint (res2) versus all patients in `test`.
print(f'{res2.index.get_level_values(0).nunique()} patients eligible out of {test.index.get_level_values(0).nunique()}')

----

In [None]:
# Cast the DATA column to str; its values are later placed in the dictionary written with json.dump.
res2 = res2.astype({'DATA':str})

# Save

csv

In [None]:
# Write the flat table to CSV; index=True also writes the index (NHC) to the file.
res2.to_csv(raw_data_filepath,index=True)

and dict

In [None]:
# where it all begins
data = {}

patients = res2.index.unique()
for idx,p in tqdm(enumerate(patients)):
    
    history = res2.loc[p,'history']
    targets = res2.loc[p,'target']
    delta_days = res2.loc[p,'delta_days']
    date_last_history = res2.loc[p,'DATA']
    
    history = [history] if type(history) != pd.Series else history.tolist()
    targets = [targets] if type(targets) != pd.Series else targets.tolist()
    delta_days = [delta_days] if type(delta_days) != pd.Series else delta_days.tolist()
    date_last_history = [date_last_history] if type(date_last_history) != pd.Series else date_last_history.tolist()
    
    data[p] = { 'ccs': #only ccs for now
               {
                   'history':history,
                   'targets':targets,
                   'extra_features':
                   {
                       'delta_days': delta_days,
                       'date_last_history': date_last_history
                   }
               }
              }

In [None]:
# Serialize the per-patient dictionary to the JSON dataset file.
with open(dataset_filepath, 'w') as fp:
    json.dump(data, fp)

----

# test

In [4]:
# Read the saved JSON file back into memory.
with open(dataset_filepath, 'r') as fp:
    test_data = json.load(fp)

In [7]:
# Inspect the 'ccs' entry of one patient (hard-coded patient id).
test_data['0000676389D1EE60EB48AF5693F3F3DE']['ccs']

{'history': [[[670.0]]],
 'targets': [[670.0]],
 'extra_features': {'delta_days': [0.0], 'date_last_history': ['2016-02-28']}}