In [183]:
import os
import pickle
import sys
from collections import Counter
from datetime import datetime
from operator import itemgetter

In [184]:
def convert_to_icd9(dxStr):
    """Insert the decimal point into an undotted ICD-9 code string.

    Codes starting with 'E' keep four characters before the dot; every other
    code keeps three. A code that is not longer than that prefix is returned
    unchanged (no trailing dot is added).
    """
    prefix_len = 4 if dxStr.startswith('E') else 3
    if len(dxStr) <= prefix_len:
        return dxStr
    return dxStr[:prefix_len] + '.' + dxStr[prefix_len:]
	
def convert_to_3digit_icd9(dxStr):
    """Truncate an undotted ICD-9 code string to its leading category part.

    Codes starting with 'E' are cut to four characters, all others to three.
    Shorter codes come back unchanged, because slicing past the end of a
    string is a no-op.
    """
    keep = 4 if dxStr.startswith('E') else 3
    return dxStr[:keep]

In [185]:
# Input CSV locations, relative to the notebook's working directory.
# Each file is read line by line by one of the mapping cells below.
admissionFile = 'MIMIC3/ADMISSIONS.csv'
diagnosisFile = 'MIMIC3/DIAGNOSES_ICD.csv'
drugFile = 'MIMIC3/PRESCRIPTIONS.csv'

In [222]:
# Path to the fixed-width text file that pairs ICD-10 codes with ICD-9 codes.
icd_converter = 'icd_converter.txt'

In [228]:
# Build the ICD-9 -> ICD-10 lookup from the fixed-width converter file.
# Characters 0-7 of each line are read as the ICD-10 code and characters 8-13
# as the ICD-9 code; padding spaces are stripped from both. When an ICD-9 code
# occurs on several lines, the last occurrence wins.
icd9_to_icd10 = {}
# The context manager guarantees the handle is closed (it was previously left open).
with open(icd_converter, 'r') as converter:
    converter.readline()  # discard the first line (presumably a header)
    for line in converter:
        icd10 = line[:8].strip(' ')
        icd9 = line[8:14].strip(' ')
        icd9_to_icd10[icd9] = icd10

In [186]:
print('Building admission-year mapping')

# admYearMap: {admId (int, CSV column 2): admission year as a 4-character string}
admYearMap = {}
# `with` closes the file even when a malformed row raises part-way through.
with open(admissionFile, 'r') as infd:
    infd.readline()  # discard the first line (presumably the header)
    for line in infd:
        tokens = line.strip().split(',')
        admId = int(tokens[2])
        # Parsing validates the timestamp layout; a bad value raises ValueError.
        admTime = datetime.strptime(tokens[3], '%Y-%m-%d %H:%M:%S')
        # Take the year straight from the parsed value instead of formatting the
        # whole timestamp and slicing its first four characters.
        admYear = '%04d' % admTime.year
        admYearMap[admId] = admYear

Building admission-year mapping


In [230]:
print('Building admission-dxList mapping')

# admDxMap:         {admId: [dxStr, ...]} where dxStr is 'D_' + dotted ICD-9 code
# disease_to_icd10: {dxStr: ICD-10 code}, only for codes present in icd9_to_icd10
admDxMap = {}
disease_to_icd10 = {}
# `with` closes the file even when a malformed row raises part-way through.
with open(diagnosisFile, 'r') as infd:
    infd.readline()  # discard the first line (presumably the header)
    for line in infd:
        tokens = line.strip().split(',')
        admId = int(tokens[2])
        # Drop the first and last character of the field (presumably the
        # surrounding double quotes).
        # NOTE(review): an empty field produces the bare label 'D_' -- confirm
        # whether such rows should be skipped.
        origin = tokens[4][1:-1]
        dxStr = 'D_' + convert_to_icd9(origin)
        if origin in icd9_to_icd10:
            disease_to_icd10[dxStr] = icd9_to_icd10[origin]
        admDxMap.setdefault(admId, []).append(dxStr)

Building admission-dxList mapping


In [188]:
print('Building admission-drugList mapping')

# admDrugMap: {admId: [drugStr, ...]} where drugStr is the raw text of CSV
# column 12, kept exactly as it appears in the file (including any quotes).
admDrugMap = {}
# `with` closes the file even when a malformed row raises part-way through.
with open(drugFile, 'r') as infd:
    infd.readline()  # discard the first line (presumably the header)
    for line in infd:
        # NOTE(review): a plain split(',') misaligns the columns if any quoted
        # field before column 12 contains a comma -- confirm against the CSV,
        # or switch to csv.reader (which would also strip the quotes that the
        # '"0"' check and the stored drug strings currently rely on).
        tokens = line.strip().split(',')
        admId = int(tokens[2])
        drugStr = str(tokens[12])
        # Skip rows with a missing identifier or the placeholder value "0".
        if not drugStr or drugStr == '"0"':
            continue
        admDrugMap.setdefault(admId, []).append(drugStr)

Building admission-drugList mapping


In [189]:
print('Building visit-year-dxList-drugList mapping; Statistics')

# Each entry of visit_mappings has the shape
# {'visit_id': admId, 'visit_year': year, 'diag_list': [...], 'drug_list': [...]}
# total_diag_list / total_year_list collect every diagnosis / year of the kept
# visits so their frequencies can be counted afterwards.
visit_mappings = []
total_diag_list = []
total_year_list = []
for visit, year in admYearMap.items():
    # Keep only admissions that have both diagnoses and drugs recorded.
    if visit not in admDxMap or visit not in admDrugMap:
        continue
    visit_mapping = {
        'visit_id': visit,
        'visit_year': year,
        # sorted(set(...)) de-duplicates in a reproducible order. A bare
        # list(set(...)) of strings depends on the per-process hash seed, and
        # that order leaks into the tie-breaking of the later frequency ranking.
        'diag_list': sorted(set(admDxMap[visit])),
        'drug_list': sorted(set(admDrugMap[visit])),
    }
    total_diag_list.extend(visit_mapping['diag_list'])
    total_year_list.append(year)
    visit_mappings.append(visit_mapping)

Building visit-year-dxList-drugList mapping; Statistics


In [190]:
# Count the frequency of each item
# diag_list_count: number of kept visits in which each diagnosis code appears
#                  (diagnoses are de-duplicated per visit before counting).
# year_list_count: number of kept visits per admission year.
diag_list_count = Counter(total_diag_list)
year_list_count = Counter(total_year_list)

In [191]:
# Sort list according to frequency
# Lists of (item, count) pairs, most frequent first. Counter.most_common()
# with no argument is the same stable descending sort on the count, so items
# with equal counts keep their first-seen order.
sorted_diag_list = diag_list_count.most_common()
sorted_year_list = year_list_count.most_common()

In [192]:
# Record the index of sorted diagnosis and drugs
# diag_order: {diagnosis code: rank}, where rank 0 is the most frequent code.
diag_order = {code: rank for rank, (code, _count) in enumerate(sorted_diag_list)}

In [208]:
# Clean visit data by filtering diseases
# Keep a visit only if every one of its diagnoses has a frequency rank of at
# most 2000 (i.e. is among the 2001 most frequent codes); a single rarer
# diagnosis drops the whole visit.
visits = [
    candidate
    for candidate in visit_mappings
    if all(diag_order[code] <= 2000 for code in candidate['diag_list'])
]

In [209]:
# Split data by year and generate indices
def split_indices(visits, year):
    """Partition visit positions into train and test by admission year.

    Args:
        visits: sequence of dicts, each with a 'visit_year' entry convertible
            to int.
        year: cutoff year (anything int() accepts).

    Returns:
        (train, test): two lists of indices into `visits`. Visits whose year
        is >= the cutoff go to `test`, all earlier ones to `train`.
    """
    train, test = [], []
    for position, record in enumerate(visits):
        target = test if int(record['visit_year']) >= int(year) else train
        target.append(position)
    return train, test

In [210]:
# Visits admitted in year 2181 or later form the test split; earlier ones are training data.
train, test = split_indices(visits, '2181')

In [211]:
# Drug statistics are computed on the training visits only.
X_train = [visits[position] for position in train]
total_drug_list = [
    drug_code
    for train_visit in X_train
    for drug_code in train_visit['drug_list']
]

In [212]:
# Frequency table of training-set drugs and the rank of each drug
# (rank 0 = most frequent; equal counts keep their first-seen order).
drug_list_count = Counter(total_drug_list)
sorted_drug_list = drug_list_count.most_common()
drug_order = {code: rank for rank, (code, _count) in enumerate(sorted_drug_list)}

In [213]:
# Write files
# Create the output directory first so the cell also works on a fresh checkout.
if not os.path.isdir('clean_data_leap'):
    os.makedirs('clean_data_leap')

# Pickled artefacts, written in this order.
# NOTE: visit_mappings.pkl holds `visits` (the diagnosis-filtered list), not
# the unfiltered `visit_mappings` variable.
for pkl_path, payload in (
    ('clean_data_leap/visit_mappings.pkl', visits),
    ('clean_data_leap/sorted_diag_list.pkl', sorted_diag_list),
    ('clean_data_leap/sorted_drug_list.pkl', sorted_drug_list),
    ('clean_data_leap/diag_order.pkl', diag_order),
    ('clean_data_leap/drug_order.pkl', drug_order),
):
    with open(pkl_path, 'wb') as f:
        pickle.dump(payload, f)

# Human-readable frequency tables: left-aligned name column followed by the count.
with open('clean_data_leap/diag_list.txt', "w") as f:
    for diag_code, diag_count in sorted_diag_list:
        f.write("%-20s%-20d\n" % (diag_code, diag_count))

with open('clean_data_leap/drug_list.txt', "w") as f:
    for drug_code, drug_count in sorted_drug_list:
        f.write("%-45s%-20d\n" % (drug_code, drug_count))

In [214]:
# Re-sort the per-year visit counts chronologically (by the year key) and
# write them as a two-column text table.
sorted_year_list = sorted(year_list_count.items(), key=lambda pair: pair[0])
with open('clean_data_leap/year_list.txt', "w") as year_file:
    for year_label, visit_count in sorted_year_list:
        year_file.write("%-10s%-20d\n" % (year_label, visit_count))

In [218]:
# Sanity check: number of admissions that have both diagnoses and drugs
# (counted before the diagnosis-frequency filter).
len(visit_mappings)

50209

In [162]:
# Flatten every entry of visit_mappings into [diag_list, drug_list, visit_year].
# NOTE: this rebinds `visits` and is built from the unfiltered visit_mappings,
# not from the diagnosis-filtered list created earlier.
visits = [
    [entry['diag_list'], entry['drug_list'], entry['visit_year']]
    for entry in visit_mappings
]

In [164]:
# Store the episode list with pickle protocol 2.
with open('mimic_episodes.pkl', 'wb') as episode_file:
    pickle.dump(visits, episode_file, protocol=2)

In [165]:
# Sanity check: number of records currently held in `visits`.
len(visits)

50209

In [231]:
# Persist the {'D_' + ICD-9 code: ICD-10 code} lookup built while reading the diagnoses.
with open('clean_data_leap/disease_to_icd10.pkl', 'wb') as lookup_file:
    pickle.dump(disease_to_icd10, lookup_file)