In [1]:
# Similar patients
# Time series can be applied with two approaches: 1. similarity-based(distance) and 2. feature-based

In [2]:
from gensim.corpora import Dictionary
from gensim.models import ldamodel
from gensim.matutils import kullback_leibler, jaccard, hellinger, sparse2full
import numpy

In [3]:
MOST_SIMILAR_PAT_NUM = 500

## Diagnoses

In [4]:
diagnoses = pd.read_csv('/Users/grace/mimic/csv/DIAGNOSES_ICD.csv', sep=',')
diagnoses[diagnoses.ICD9_CODE.isin(['77181', '99591', '99592', '67020', '67022', '67024'])].shape

(5409, 5)

In [5]:
sepsis_patients = diagnoses[diagnoses.ICD9_CODE.isin(['77181', '99591', '99592', '67020', '67022', '67024'])]['SUBJECT_ID'].unique()
sepsis_patients

array([  117,   124,    64, ..., 95803, 97143, 97158])

In [6]:
#seed patient
unique_d_for_25030 = diagnoses[diagnoses.SUBJECT_ID==25030]['ICD9_CODE'].unique()
unique_d_for_25030

array(['0389', '4275', '78551', '4260', '4210', '40391', '25041', '2767',
       'V4975', '99592', '41092', '99662', '03842', '2869', '99681',
       '4254', '2851', 'V5867', '51889', '7904', '71941', '0383', '70703',
       '78552', '07070', '28521', '40301', '43491', '4372', '2720',
       '25051', '2761', '36201', '431', '3314', '7070', '25081', '78039',
       '51881'], dtype=object)

In [7]:
def get_patient_info(groupby_col, collection_col, target_df):
    patients = {}
    for i, grp in target_df.groupby(groupby_col):
    #     print(i)
        if i == 25030:
            continue
        else:
            patients[i] = list(map(lambda x: str(x), grp[collection_col].unique()))
    return patients

patients = get_patient_info('SUBJECT_ID', 'ICD9_CODE', diagnoses[diagnoses.SUBJECT_ID.isin(sepsis_patients)])

In [8]:
def jaccard_index(first_set, second_set):
    """ Computes jaccard index of two sets
        Arguments:
          first_set(set):
          second_set(set):
        Returns:
          index(float): Jaccard index between two sets; it is 
            between 0.0 and 1.0
    """
    # If both sets are empty, jaccard index is defined to be 1
    index = 1.0
    if first_set or second_set:
        index = (float(len(first_set.intersection(second_set))) 
             / len(first_set.union(second_set)))

    return index

In [9]:
first_set = set(patients[21])
second_set = set(patients[38])
index = jaccard_index(first_set, second_set)
print(index)

0.09523809523809523


In [10]:
len(patients)

4780

In [11]:
#find similar patients to 25030

d_for_25030 = set(unique_d_for_25030)

def compute_jaccard(base_set, rest_dic):
    jac_dic = []
    for i in rest_dic.keys():
#         print(base_set.intersection(set(rest_dic[i])))
        jac_dic.append({'SUBJECT_ID': i,
        'jaccard_index': jaccard_index(base_set, set(rest_dic[i])),
        'set':rest_dic[i]})

    return jac_dic

In [12]:
jac_dic = compute_jaccard(d_for_25030, patients)

In [13]:
jac_df = pd.DataFrame(jac_dic)
# jac_df.head()

In [14]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', -1)  

In [15]:
jac_df[jac_df.jaccard_index>0.1].sort_values(by='jaccard_index', ascending=False)[:5]

Unnamed: 0,SUBJECT_ID,jaccard_index,set
1038,12733,0.208333,"[44023, 4280, 5856, 40391, 70714, 25000, V1082, 0389, 51881, 42842, 78552, 4271, 41071, 00845, 72886, 7854, 99592, 2720, V4975, 4439, V4581, 412, 70705, 70703, 70720, 0272, 1125, 71103, 99662, 6824, 99702, 43491, 4275, 6822, 570, 70723, 94421, E9248, 28521, 43310, V5867, 25080, 25050, 36201, 99769, 5789, 2851, 4589]"
146,1795,0.203704,"[0389, 41071, 4280, 78552, 40391, 5856, 44024, 2761, 4254, 5119, 570, 431, 2869, 25070, 99592, 4589, 41401, 25060, 25050, 5363, 36201, 2720, 04111, 73300, V5867, V090]"
1404,17564,0.2,"[99662, 03811, 99592, 78552, 5990, 70714, 40391, 5856, 2761, 4538, 2851, 00845, 0417, 44023, 25040, V5867, 45184, 73028, 72290, 4592, 4280, 0389, 51881, 2767, 5070, 70703, 37601]"
4185,82512,0.190476,"[4241, 5845, 0389, 99592, 51881, 486, 4254, 42822, 70714, 4271, 2851, 99681, 5119, 2762, 2761, V707, V4501, V4973, 4019, 2720, 4168, 4439, V1251, 41401, V4582, 42731, 4280, 32723, 58381, 25041, 4275, 42741, 2875, E8780, 78551, 2767]"
1568,19632,0.175439,"[431, 496, 4280, 3314, 4439, 25000, 53081, 4019, 99662, 03811, 99592, 51881, 78552, 40391, 2874, 4538, V4581, 28521, 25060, 3572, 25040, 4271, 2639, 2869, V090, V4975, V4976, V180]"


In [16]:
similar_by_d = jac_df[jac_df.jaccard_index>0.1].sort_values(by='jaccard_index', ascending=False).SUBJECT_ID.unique()[:MOST_SIMILAR_PAT_NUM]
# similar_by_d

## Lab

In [17]:
labs = pd.read_csv('/Users/grace/mimic/csv/LABEVENTS.csv', sep=',')
# labs.head()

In [18]:
lab_items = pd.read_csv('/Users/grace/mimic/csv/D_LABITEMS.csv', sep=',')
# lab_items.head()

In [19]:
lab_patients = get_patient_info('SUBJECT_ID', 'ITEMID', labs)

In [20]:
#find similar patients to 25030

unique_lab_for_25030 = set(map(lambda x: str(x), labs[labs.SUBJECT_ID==25030]['ITEMID'].unique()))
jac_dic_lab = compute_jaccard(unique_lab_for_25030, lab_patients)

In [21]:
jac_lab_df = pd.DataFrame(jac_dic_lab)
# jac_lab_df[jac_lab_df.jaccard_index>0.0].sort_values(by='jaccard_index', ascending=False)[:10]
similar_by_lab = jac_lab_df[jac_lab_df.jaccard_index>0.0].sort_values(by='jaccard_index', ascending=False).SUBJECT_ID.unique()[:MOST_SIMILAR_PAT_NUM]

## Procedure

In [22]:
proc = pd.read_csv('/Users/grace/mimic/csv/PROCEDURES_ICD.csv', sep=',')
# proc.head()

In [23]:
proc_patients = get_patient_info('SUBJECT_ID', 'ICD9_CODE', proc)

In [24]:
#find similar patients to 25030

unique_proc_for_25030 = set(map(lambda x: str(x), proc[proc.SUBJECT_ID==25030]['ICD9_CODE'].unique()))
jac_dic_proc = compute_jaccard(unique_proc_for_25030, proc_patients)

In [25]:
jac_proc_df = pd.DataFrame(jac_dic_proc)
# jac_proc_df[jac_proc_df.jaccard_index>0.0].sort_values(by='jaccard_index', ascending=False)[:10]
similar_by_proc = jac_proc_df[jac_proc_df.jaccard_index>0.0].sort_values(by='jaccard_index', ascending=False).SUBJECT_ID.unique()[:MOST_SIMILAR_PAT_NUM]

## Prescription

In [26]:
pres = pd.read_csv('/Users/grace/mimic/csv/PRESCRIPTIONS.csv', sep=',')
# pres.head()


Columns (11) have mixed types. Specify dtype option on import or set low_memory=False.



In [27]:
pres.isna().sum()

ROW_ID               0      
SUBJECT_ID           0      
HADM_ID              0      
ICUSTAY_ID           1447708
STARTDATE            3182   
ENDDATE              5421   
DRUG_TYPE            0      
DRUG                 0      
DRUG_NAME_POE        1664234
DRUG_NAME_GENERIC    1662989
FORMULARY_DRUG_CD    1933   
GSN                  507164 
NDC                  4463   
PROD_STRENGTH        1362   
DOSE_VAL_RX          1350   
DOSE_UNIT_RX         1342   
FORM_VAL_DISP        1355   
FORM_UNIT_DISP       1409   
ROUTE                1156   
dtype: int64

In [28]:
pres_patients = get_patient_info('SUBJECT_ID', 'DRUG', pres)

In [29]:
#find similar patients to 25030

unique_pres_for_25030 = set(map(lambda x: str(x), pres[pres.SUBJECT_ID==25030]['DRUG'].unique()))
jac_dic_pres = compute_jaccard(unique_pres_for_25030, pres_patients)

In [30]:
jac_pres_df = pd.DataFrame(jac_dic_pres)
# jac_pres_df[jac_pres_df.jaccard_index>0.0].sort_values(by='jaccard_index', ascending=False)[:10]
similar_by_pres = jac_pres_df[jac_pres_df.jaccard_index>0.0].sort_values(by='jaccard_index', ascending=False).SUBJECT_ID.unique()[:MOST_SIMILAR_PAT_NUM]

## 가장 비슷한 환자 Top 500 중 4개 모든 항목에 포함된 환자

In [31]:
set(similar_by_d).intersection(set(similar_by_lab)).intersection(set(similar_by_proc)).intersection(set(similar_by_pres))

{1982, 6613, 17863, 17977, 26139}

In [38]:
### diagnoses + procedure
print(set(similar_by_d).intersection(set(similar_by_proc)))

### diagnoses + lab
print(set(similar_by_d).intersection(set(similar_by_lab)))

### diagnoses + prescription
print(set(similar_by_d).intersection(set(similar_by_pres)))

### lab + procedure
print(set(similar_by_lab).intersection(set(similar_by_proc)))

### lab + prescription
print(set(similar_by_lab).intersection(set(similar_by_pres)))

### prescription + procedure
print(set(similar_by_pres).intersection(set(similar_by_proc)))

{27905, 3841, 17795, 17668, 15875, 19592, 19851, 22283, 3852, 14990, 17423, 27280, 22289, 7062, 14873, 3866, 26139, 5281, 26274, 28457, 86314, 19632, 14641, 17330, 29872, 50743, 17977, 81593, 1982, 17863, 77383, 9800, 588, 1356, 11724, 12237, 65232, 8915, 6613, 26709, 17112, 16351, 26208, 21990, 96232, 19177, 26601, 26219, 23533, 9206, 22394, 32763, 6908}
{15749, 25225, 3722, 19851, 14990, 17423, 16014, 5909, 14873, 29466, 26139, 6428, 3866, 8734, 5666, 24995, 19620, 26274, 20133, 2984, 29866, 4655, 433, 90802, 9266, 18996, 11318, 21431, 12856, 17977, 1982, 24510, 5824, 20546, 1988, 7621, 4678, 17863, 18252, 12237, 6613, 54229, 16855, 2136, 13401, 19038, 24032, 353, 4577, 4962, 21990, 13033, 21613, 7666, 13938, 26868, 9206, 11003}
{3841, 899, 1544, 7062, 4760, 26139, 17564, 1436, 8734, 19236, 19620, 9768, 16558, 1967, 9266, 11318, 21431, 17977, 6334, 1982, 24510, 20546, 323, 7237, 20678, 17863, 19911, 9800, 22475, 18252, 15057, 15570, 6613, 16855, 2136, 18649, 1113, 1114, 3804, 3294, 2