In [1]:
# Imports
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
import numpy as np
import json
from nltk.corpus import stopwords

### Data Loading and Prep (from eda.ipynb)

In [2]:

SW = stopwords.words('english')
with open('transformed_data/icd10_l2.json','r') as f:
    DOCUMENTS_2 = json.loads(f.read())
    
with open('transformed_data/icd10_l1.json','r') as f:
    DOCUMENTS_1 = json.loads(f.read())

DOCUMENTS_12 = {}
for k,v in DOCUMENTS_2.items():
    k_n1 = '/'.join(k.split('/')[:-1])
    DOCUMENTS_12[k] = v+' '+v  + ' <s> ' + DOCUMENTS_1[k_n1]

doc_to_df = lambda doc:pd.DataFrame({k.split('/')[-1]:[v.lower()] for k,v in doc.items()}).T 

dig_1, dig_2, dig_12 = map(doc_to_df,[DOCUMENTS_1,DOCUMENTS_2,DOCUMENTS_12])


In [3]:
notes = pd.read_csv('Data/notes_cleaned.csv')
notes.set_index('HADM_ID',inplace=True)
notes['DISCHARGE_COALESCE'] = notes['DISCHARGE_PRIMARY'].combine_first(notes['DISCHARGE'])
notes.head()


Unnamed: 0_level_0,HISTORY,DISCHARGE_PRIMARY,DISCHARGE,DISCHARGE_COALESCE
HADM_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
107527.0,This is an 81-year-old female with a history o...,,,
167118.0,This 81 year old woman has a history of COPD. ...,,"COPD, Coronary Artery Disease/atypical angina ...","COPD, Coronary Artery Disease/atypical angina ..."
196489.0,"87 yo F with h/o CHF, COPD on 5 L oxygen at ba...",1. Chronic Obstructive Pulmonary Disease Exace...,,1. Chronic Obstructive Pulmonary Disease Exace...
135453.0,Mr. is a 82 year old male who had a slip and ...,,1. Cervical spondylosis with calcification of ...,1. Cervical spondylosis with calcification of ...
170490.0,"is a 62-year-old woman, with longstanding hist...",,brain lesion,brain lesion


In [4]:
diagnoses = pd.read_json('transformed_data/mimic_aggregated_icd10.json')
# we can save a lot of headache by shortening to 3 and cleaning now even though we never did before 
diagnoses['PAIR'] = diagnoses['PAIR'].apply(lambda x: {int(float(k)):v[:3] for k,v in x.items() if v[:3] in dig_12.index})
diagnoses.head()


Unnamed: 0,PAIR
100001,"{1: 'E10', 2: 'G99', 3: 'N17', 4: 'K92', 5: 'Z..."
100003,"{1: 'K25', 2: 'D62', 3: 'B18', 4: 'K74', 5: 'I..."
100006,"{1: 'J44', 2: 'J96', 3: 'J18', 4: 'C90', 5: 'E..."
100007,"{1: 'K56', 2: 'K55', 4: 'J18', 5: 'I10'}"
100009,"{1: 'I25', 2: 'T82', 3: 'I25', 4: 'E11', 5: 'E..."


In [5]:
notes3 = notes.join(diagnoses, how = 'inner')
notes3.head()

Unnamed: 0,HISTORY,DISCHARGE_PRIMARY,DISCHARGE,DISCHARGE_COALESCE,PAIR
107527.0,This is an 81-year-old female with a history o...,,,,"{2: 'J44', 3: 'J96', 4: 'J18', 5: 'E87', 6: 'E..."
167118.0,This 81 year old woman has a history of COPD. ...,,"COPD, Coronary Artery Disease/atypical angina ...","COPD, Coronary Artery Disease/atypical angina ...","{3: 'J44', 4: 'E87', 5: 'I82', 6: 'K44'}"
196489.0,"87 yo F with h/o CHF, COPD on 5 L oxygen at ba...",1. Chronic Obstructive Pulmonary Disease Exace...,,1. Chronic Obstructive Pulmonary Disease Exace...,"{1: 'J96', 2: 'N17', 3: 'G93', 4: 'J44', 5: 'E..."
135453.0,Mr. is a 82 year old male who had a slip and ...,,1. Cervical spondylosis with calcification of ...,1. Cervical spondylosis with calcification of ...,"{1: 'S12', 2: 'J69', 3: 'I50', 4: 'F05', 6: 'W..."
170490.0,"is a 62-year-old woman, with longstanding hist...",,brain lesion,brain lesion,"{1: 'D32', 2: 'M06', 3: 'M35', 4: 'I73', 5: 'K..."


### Preprocess

In [6]:
# #Start - preprocess_tfidf
# def preprocess_tfidf(doc_list, vec_params, keep_sparse =False):
#     vec = TfidfVectorizer(**vec_params)
#     doc_vec = vec.fit_transform(doc_list.values)
#     return vec
# # Train-Test -Split 
# def train_test_split(df, train_pct, split_random_seed:int):
#     df_shuffled = df.sample(len(df),random_state = split_random_seed)
#     threshhold = int(len(df_shuffled)*train_pct)
#     df_shuffled['Split'] = ''
#     df_shuffled.iloc[:threshhold,-1] = 'TRAIN'
#     df_shuffled.iloc[threshhold:,-1] = 'TEST'
#     return df_shuffled

# # Y # Y 
# def preprocess_y(df_orig, diag_list, top_only = True, dummy = True):
#     df = df_orig.copy()
#     if top_only:
#         #Changed after PAIR refactor (see above)
#         fn = (lambda x: [x]) if  dummy else (lambda x: x)
#         df['DIAGS'] = df['PAIR'].apply(lambda x: fn(x[min(x.keys())]) if x[min(x.keys())] in diag_list else '')
#     else:
#         df['DIAGS'] = df['PAIR'].apply(lambda x: list( set(x.values()).intersection(set(diag_list))))
#     del df['PAIR']
#     cols = list(set(df_orig.columns) - set(['PAIR']))
#     if dummy:
#         output =  df.set_index(cols).DIAGS.str.join('|').str.get_dummies()
#         return output.reindex(output.columns.union(diag_list, sort=None), axis=1, fill_value=0)
#     else:
#         return  df.set_index(cols)

# first_diag = lambda x: x[min(x.keys())]


In [7]:
from model import preprocess
from model import postmortem
from model import postprocess
from model import evaluate

In [8]:
note3_dc = preprocess.train_test_split(notes3.dropna(subset = ['DISCHARGE_COALESCE']), train_pct=0.8, split_random_seed=3)
train,test = note3_dc[note3_dc['Split']=='TRAIN'], note3_dc[note3_dc['Split']=='TEST']

In [9]:
tr_diag = set(train['PAIR'].apply(preprocess.first_diag))
te_diag = set(test['PAIR'].apply(preprocess.first_diag))
train_diags = set(train['PAIR'].apply(preprocess.first_diag).values)
test_diags = set(test['PAIR'].apply(preprocess.first_diag).values )

danger = test_diags - train_diags
test = test[test['PAIR'].apply(preprocess.first_diag).apply(lambda x: x not in danger)]
len(danger)

43

In [10]:
self_vec = preprocess.preprocess_tfidf(train['DISCHARGE_COALESCE'], {'stop_words':SW,'ngram_range':(1,2)}, keep_sparse=True)
X_train, X_test = [self_vec.transform(df['DISCHARGE_COALESCE']) for df in [train,test]]
y_train, y_test = [preprocess.preprocess_y(df,dig_12.index) for df in [train,test]]

In [11]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=15, n_jobs=5, max_features="sqrt", verbose = 3)
rf.fit(X_train,y_train)

[Parallel(n_jobs=5)]: Using backend ThreadingBackend with 5 concurrent workers.


building tree 1 of 15
building tree 2 of 15
building tree 3 of 15
building tree 4 of 15
building tree 5 of 15
building tree 6 of 15
building tree 7 of 15
building tree 8 of 15
building tree 9 of 15
building tree 10 of 15
building tree 11 of 15
building tree 12 of 15
building tree 13 of 15
building tree 14 of 15
building tree 15 of 15


[Parallel(n_jobs=5)]: Done  12 out of  15 | elapsed:  2.3min remaining:   35.1s
[Parallel(n_jobs=5)]: Done  15 out of  15 | elapsed:  2.4min finished


RandomForestRegressor(max_features='sqrt', n_estimators=15, n_jobs=5, verbose=3)

In [12]:
y_hat = rf.predict(X_test)

[Parallel(n_jobs=5)]: Using backend ThreadingBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  12 out of  15 | elapsed:    0.5s remaining:    0.0s
[Parallel(n_jobs=5)]: Done  15 out of  15 | elapsed:    0.6s finished


In [13]:
# def argmax_to_one(y:np.array):
#     y_hat_rf_max = np.zeros(y.shape)
#     for i,doc in enumerate(y):
#         y_hat_rf_max[i][np.argmax(doc)] = 1
#     return y_hat_rf_max

In [14]:
# def text_dot_product(matrix:np.array,col_names:list):
#     predictions = []
#     for enc in matrix:
#         pred = ''
#         for doc,val  in zip(col_names, enc):
#             pred += doc*int(val)
#         predictions.append(pred)
#     return predictions

In [15]:
# def evaluate_single_prediction_accuracy(labeled_df:pd.DataFrame)-> float:
#     labeled_df['DIAG'] = labeled_df['PAIR'].apply(lambda x: x[min(x.keys())])
#     return (labeled_df['DIAG'] == labeled_df['PREDICTION']).mean()
# def evaluate_in_the_list_accuracy(labeled_df:pd.DataFrame)-> float:
#     labeled_df['DIAG'] = labeled_df['PAIR'].apply(lambda x: x.values())
#     return (labeled_df.apply(lambda x: x['PREDICTION'] in x['DIAG'],axis=1 )).mean()
# def print_accuracy_results(test_df:pd.DataFrame, predictions) -> tuple: 
#     X_labeled = test_df.copy()
#     X_labeled['PREDICTION'] = predictions
#     spa =evaluate_single_prediction_accuracy(X_labeled)
#     itla = evaluate_in_the_list_accuracy(X_labeled)
#     print('ACCURACY: TOP_DIAGNOSIS:{:.2f}%, IN THE LIST: {:.2f}%'.format(spa*100,itla*100))

In [16]:
rf = RandomForestRegressor(n_estimators=25, n_jobs=5, max_features="sqrt", verbose = 3)
rf.fit(X_train,y_train)
y_hat_25_sqrt = rf.predict(X_test)


[Parallel(n_jobs=5)]: Using backend ThreadingBackend with 5 concurrent workers.


building tree 1 of 25building tree 2 of 25

building tree 3 of 25
building tree 4 of 25
building tree 5 of 25
building tree 6 of 25
building tree 7 of 25
building tree 8 of 25
building tree 9 of 25
building tree 10 of 25
building tree 11 of 25
building tree 12 of 25
building tree 13 of 25
building tree 14 of 25
building tree 15 of 25
building tree 16 of 25
building tree 17 of 25
building tree 18 of 25
building tree 19 of 25
building tree 20 of 25
building tree 21 of 25
building tree 22 of 25
building tree 23 of 25
building tree 24 of 25
building tree 25 of 25


[Parallel(n_jobs=5)]: Done  25 out of  25 | elapsed:  4.6min remaining:    0.0s
[Parallel(n_jobs=5)]: Done  25 out of  25 | elapsed:  4.6min finished
[Parallel(n_jobs=5)]: Using backend ThreadingBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  25 out of  25 | elapsed:    1.1s remaining:    0.0s
[Parallel(n_jobs=5)]: Done  25 out of  25 | elapsed:    1.1s finished


In [17]:
inv_25sqrt = test.copy()
inv_25sqrt['PREDICTION'] =  postprocess.text_dot_product(postprocess.argmax_to_one(y_hat_25_sqrt),y_test.columns)
pmt = postmortem.failed_partial_success(inv_25sqrt)
pmt.head()

Unnamed: 0,HISTORY,DISCHARGE_PRIMARY,DISCHARGE,DISCHARGE_COALESCE,PAIR,Split,PREDICTION,PRIMARY,DIAG_LIST,PRIMARY_MATCH,ONE_IN_LIST_MATCH,STATUS
159804.0,Patient is a 82 yo LHW with hx of HTN and hype...,,Left parieto-occipital intracerebral hemorrhag...,Left parieto-occipital intracerebral hemorrhag...,"{1: 'I61', 2: 'G93', 3: 'G93', 4: 'E85', 5: 'I...",TEST,I61,I61,"[I61, G93, G93, E85, I48, M48]",True,True,True
100884.0,HPI: Mr. is a 57 year-old man with hx of ESRD...,,Fever New pericardial effusion End-stage renal...,Fever New pericardial effusion End-stage renal...,"{1: 'I30', 2: 'T82', 3: 'N18', 4: 'I12', 5: 'B...",TEST,Z49,I30,"[I30, T82, N18, I12, B18, Y83, E11, K22, D63, ...",False,False,False
178240.0,This 71 year old white male with known coronar...,,Coronary Artery Disease with tight left main d...,Coronary Artery Disease with tight left main d...,"{1: 'I25', 2: 'I20', 4: 'J95', 5: 'J44', 6: 'E...",TEST,I25,I25,"[I25, I20, J95, J44, E11, I25, R50, K22, I10, ...",True,True,True
151280.0,HPI: (obtained with assistance from his son an...,,SDH,SDH,"{1: 'S06', 2: 'J69', 3: 'W19', 4: 'Y92', 5: 'I...",TEST,S06,S06,"[S06, J69, W19, Y92, I48, I25, Z95, I10, Z85, ...",True,True,True
137307.0,52F with history and physical consistent with ...,,Right knee osteoarthritis,Right knee osteoarthritis,"{13: 'Y83', 1: 'M17', 3: 'I97', 4: 'I24', 5: '...",TEST,M17,M17,"[Y83, M17, I97, I24, E87, I45, I10, F32, K21, ...",True,True,True


In [18]:
train['PAIR'].apply(preprocess.first_diag).value_counts()

A41    1774
I25    1613
I21    1462
J96     833
I50     773
       ... 
G45       1
R62       1
C44       1
I73       1
T54       1
Name: PAIR, Length: 729, dtype: int64

In [19]:
evaluate.print_accuracy_results(test, postprocess.text_dot_product( postprocess.argmax_to_one(y_hat_25_sqrt),y_test.columns))

ACCURACY: TOP_DIAGNOSIS:43.81%, IN THE LIST: 68.36%
