In [19]:
import pandas as pd

# Read the XBERT label map as a single-column frame: the file is parsed as
# tab-separated with no header row, so the column is named 0.
labelmap = pd.read_csv(
    '../data/intermediary-data/xbert_inputs/label_map.txt',
    header=None,
    sep='\t',
)
labelmap

Unnamed: 0,0
0,"Tuberculous pneumonia [any form], tubercle bac..."
1,"Tuberculous pneumothorax, unspecified"
2,"Tuberculous pneumothorax, bacteriological or h..."
3,"Tuberculous pneumothorax, bacteriological or h..."
4,"Tuberculous pneumothorax, tubercle bacilli fou..."
...,...
14562,Special screening examination for unspecified ...
14563,Screening examination for cholera
14564,Screening examination for pulmonary tuberculosis
14565,Screening examination for leprosy (Hansen's di...


In [2]:
from pathlib import Path

import typing as t
import re
import pandas as pd
import scipy
import yaml
import numpy as np
from loguru import logger
from tqdm import tqdm

import scipy.sparse

import sys
sys.path.append(".")
# from icd9 import ICD9


# try:
#     import format_data_for_training #script from auto-icd
# except ImportError:
#     # when running in a pytest context
import format_data_for_training

# ---------------------------------------------------------------------------
# File locations. Every path is relative to the current working directory.
# NOTE(review): other cells in this notebook read from '../data/...'; confirm
# which working directory these './data/...' paths are meant to resolve from.
# ---------------------------------------------------------------------------

# Inputs: MIMIC-III ICD assignment tables and their code dictionaries.
DIAGNOSIS_CSV_FP = "./data/mimiciii-14/DIAGNOSES_ICD.csv.gz"
PROCEDURES_CSV_FP = "./data/mimiciii-14/PROCEDURES_ICD.csv"
ICD9_DIAG_KEY_FP = "./data/mimiciii-14/D_ICD_DIAGNOSES.csv.gz"
ICD9_PROC_KEY_FP = "./data/mimiciii-14/D_ICD_PROCEDURES.csv"

# Disabled input, only needed for conversion to ICD10:
# ICD_GEM_FP = "./data/ICD_general_equivalence_mapping.csv"

# Outputs: label map and raw note texts for each split.
XBERT_LABEL_MAP_FP = "./data/intermediary-data/xbert_inputs/label_map.txt"
XBERT_TRAIN_RAW_TEXTS_FP = "./data/intermediary-data/xbert_inputs/train_raw_texts.txt"
XBERT_VAL_RAW_TEXTS_FP = "./data/intermediary-data/xbert_inputs/val_raw_texts.txt"
XBERT_TEST_RAW_TEXTS_FP = "./data/intermediary-data/xbert_inputs/test_raw_texts.txt"

# Outputs: label matrices (.npz) for each split.
XBERT_Y_TRN_FP = "./data/intermediary-data/xbert_inputs/Y.trn.npz"
XBERT_Y_VAL_FP = "./data/intermediary-data/xbert_inputs/Y.val.npz"
XBERT_Y_TST_FP = "./data/intermediary-data/xbert_inputs/Y.tst.npz"

# Outputs: pickled train/test DataFrames.
DF_TRAIN_FP = "./data/intermediary-data/df_train.pkl"
DF_TEST_FP = "./data/intermediary-data/df_test.pkl"


def main():
    """Build per-split label frames with the note text as the first column.

    Steps, in order:

    1. Read the ``prepare_for_xbert`` section of ``params.yaml`` (opened
       relative to the current working directory) and validate the
       ``diag_or_proc`` and ``icd_version`` settings.
    2. Build the train/validation/test datasets via
       ``format_data_for_training.construct_datasets``.
    3. Build the ICD label list via ``xbert_create_label_map`` and write it
       to ``data/icd_labels.csv``.
    4. Build one label frame per split via ``xbert_prepare_Y_maps`` and
       insert that split's ``TEXT`` column at position 0.

    Returns:
        None.

    Raises:
        FileNotFoundError: if ``params.yaml`` is not in the working directory.
        AssertionError: if ``diag_or_proc`` is not ``'proc'``/``'diag'`` or
            ``icd_version`` is not ``'9'``/``'10'``.

    NOTE(review): the final log message announces that a CSV is being saved,
    but this function neither writes the three frames nor returns them.
    Confirm where they are supposed to be persisted.
    """
    with open('params.yaml', 'r') as f:
        params = yaml.safe_load(f)

    xbert_params = params['prepare_for_xbert']
    icd_version_specified = str(xbert_params['icd_version'])
    diag_or_proc_param = xbert_params['diag_or_proc']
    assert diag_or_proc_param == 'proc' or diag_or_proc_param == 'diag', 'Must specify either \'proc\' or \'diag\'.'
    note_category_param = xbert_params['note_category']
    # Read but not used below; the lookup is kept so a missing key still
    # fails here, exactly as before.
    icd_seq_num_param = xbert_params['one_or_all_icds']
    subsampling_param = xbert_params['subsampling']

    logger.info(f'Using ICD version {icd_version_specified}...')
    assert icd_version_specified == '9' or icd_version_specified == '10', 'Must specify one of ICD9 or ICD10.'
    logger.info('Reformatting raw data with subsampling {}', 'enabled' if subsampling_param else 'disabled')

    df_train, df_val, df_test = \
        format_data_for_training.construct_datasets(
            diag_or_proc_param, note_category_param, subsampling_param)

    # Only the codes are needed right now; the descriptions are unused.
    icd_labels, desc_labels = xbert_create_label_map(icd_version_specified, diag_or_proc_param)

    icd_labels.to_csv('data/icd_labels.csv')

    # A fresh list is passed to each call, as in the original code.
    data_train = xbert_prepare_Y_maps(
        df_train, icd_labels.tolist(), icd_version_specified)

    data_val = xbert_prepare_Y_maps(
        df_val, icd_labels.tolist(), icd_version_specified)

    data_test = xbert_prepare_Y_maps(
        df_test, icd_labels.tolist(), icd_version_specified)

    logger.info('Created dataframes of hospital admission to assigned ICD codes.')

    # DataFrame.insert modifies the frame in place and returns None, so its
    # result must NOT be assigned back (the previous code rebound all three
    # frames to None).
    # NOTE(review): insert aligns the Series on index labels; this assumes
    # each label frame shares its index with the matching df_* frame. Confirm
    # against xbert_prepare_Y_maps.
    for label_frame, notes_frame in (
        (data_train, df_train),
        (data_val, df_val),
        (data_test, df_test),
    ):
        label_frame.insert(loc=0, column='TEXT', value=notes_frame['TEXT'])

    logger.info(
        'Done preprocessing for multilabel classification. Saving CSV of columns [text, code1,code2,...,codeL].'
    )

FileNotFoundError: [Errno 2] No such file or directory: 'params.yaml'

In [39]:
# Load the pickled train/test note frames and drop their CATEGORY column.
# The paths use the same '../data' root as the other relative-path cells in
# this notebook instead of a user-specific absolute path, so the cell also
# runs on other machines.
# NOTE: unpickling can execute arbitrary code; only load pickles you trust.
train_data = pd.read_pickle(
    '../data/intermediary-data/notes2diagnosis-icd-train_df'
).drop(columns='CATEGORY')
test_data = pd.read_pickle(
    '../data/intermediary-data/notes2diagnosis-icd-test_df'
).drop(columns='CATEGORY')


Unnamed: 0_level_0,TEXT,ICD9_CODE
HADM_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
188354.0,: : : Sex: F Service: MEDICINE Allergies: Sulf...,44023
104471.0,: : : Sex: F Service: CARDIOTHORACIC Allergies...,4240
157504.0,: : : Sex: M Service: NBB HISTORY OF PRESENT I...,7746
157762.0,: : : Sex: F Service: NEONATOLOGY HISTORY: wee...,V3001
109320.0,: : Service: MEDICINE Allergies: Patient recor...,41071
...,...,...
183203.0,: : : Sex: M Service: MEDICINE Allergies: Peni...,42823
121367.0,: : : Sex: M Service: Neonatology HISTORY: . i...,V3001
137842.0,: : : Sex: M Service: SURGERY Allergies: Demer...,1550
167746.0,: : Service: GU Allergies: Amoxicillin / Aldom...,59010


In [40]:
# Preview the CSV export of the training notes. The path uses the same
# '../data' root as the other relative-path cells in this notebook rather
# than a user-specific absolute path.
pd.read_csv('../data/intermediary-data/notes2diagnosis-icd-train.csv')

Unnamed: 0,HADM_ID,TEXT,CATEGORY,ICD9_CODE
0,188044.0,: : : Sex: M Service: Cardiothoracic HISTORY O...,Discharge summary,99859
1,105949.0,"Name: , Unit No: : : : Sex: F Service: This is...",Discharge summary,85102
2,124546.0,: : : Sex: F Service: MEDICINE Allergies: Sulf...,Discharge summary,5789
3,112611.0,: : : Sex: M Service: CARDIOTHORACIC Allergies...,Discharge summary,41401
4,162732.0,: : : Sex: M Service: NB was born at -/ weeks ...,Discharge summary,V3000
...,...,...,...,...
75,131460.0,: : : Sex: F Service: MICU CHIEF COMPLAINT: Sh...,Discharge summary,51881
76,120068.0,: : : Sex: F Service: MEDICINE Allergies: Zolp...,Discharge summary,99662
77,112776.0,: : : Sex: F Service: SURGERY Allergies: Cepha...,Discharge summary,5570
78,132480.0,: : : Sex: F Service: SURGERY Allergies: Patie...,Discharge summary,4280


In [23]:
# Load the training label matrix from its SciPy sparse .npz archive and
# expand it into a fully dense DataFrame. Densifying can take a lot of
# memory for a matrix this wide.
# Presumably one row per training note and one column per label; TODO confirm.
# NOTE(review): the saved output for this cell shows columns 0..14566, while
# the label_map display ends at index 14565; confirm the label counts agree.
instance2label = pd.DataFrame(
    scipy.sparse.load_npz(
        '../data/intermediary-data/xbert_inputs/Y.trn.npz'
    ).toarray()
)
instance2label

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14557,14558,14559,14560,14561,14562,14563,14564,14565,14566
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30910,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
30911,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
30912,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
30913,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Re-display the instance2label frame built from Y.trn.npz in an earlier cell.
instance2label

In [31]:
# Read the raw training texts as a single-column frame: the file is parsed as
# tab-separated with no header row. The path uses the same '../data' root as
# the other relative-path cells in this notebook rather than a user-specific
# absolute path.
trn_texts = pd.read_csv(
    '../data/intermediary-data/xbert_inputs/train_raw_texts.txt',
    header=None,
    sep='\t',
)
trn_texts

Unnamed: 0,0
0,: : : Sex: F Service: MEDICINE Allergies: Sulf...
1,: : : Sex: F Service: CARDIOTHORACIC Allergies...
2,: : : Sex: M Service: NBB HISTORY OF PRESENT I...
3,: : : Sex: F Service: NEONATOLOGY HISTORY: wee...
4,: : Service: MEDICINE Allergies: Patient recor...
...,...
30910,: : : Sex: M Service: MEDICINE Allergies: Peni...
30911,: : : Sex: M Service: Neonatology HISTORY: . i...
30912,: : : Sex: M Service: SURGERY Allergies: Demer...
30913,: : Service: GU Allergies: Amoxicillin / Aldom...


In [None]:
# Bind `y` to the same object as `instance2label`; plain assignment makes no
# copy, so in-place changes through either name affect both.
y = instance2label