In [3]:
# Imports
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
import pandas as pd
import numpy as np
import json
from nltk.corpus import stopwords

### Data Loading and Prep (from eda.ipynb)

In [4]:

# English stop-word list from NLTK; passed to the TF-IDF vectorizer further down.
SW = stopwords.words('english')


def _read_json(path):
    """Parse and return the JSON document stored at ``path``.

    The file is decoded explicitly as UTF-8 so the result does not depend on
    the platform's default text encoding.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


DOCUMENTS_2 = _read_json('transformed_data/icd10_l2.json')
DOCUMENTS_1 = _read_json('transformed_data/icd10_l1.json')

# For every entry of DOCUMENTS_2 build a combined text: its own text twice,
# a ' <s> ' separator, then the DOCUMENTS_1 text stored under the same key
# with the last '/'-separated segment removed.
# NOTE(review): the duplication presumably up-weights the more specific text -- confirm.
DOCUMENTS_12 = {}
for k, v in DOCUMENTS_2.items():
    k_n1 = '/'.join(k.split('/')[:-1])
    DOCUMENTS_12[k] = v + ' ' + v + ' <s> ' + DOCUMENTS_1[k_n1]


def doc_to_df(doc):
    """Turn a ``{path: text}`` mapping into a one-column DataFrame.

    The index is the last '/'-separated segment of each key and the single
    column (labelled 0) holds the lower-cased text.
    """
    return pd.DataFrame({k.split('/')[-1]: [v.lower()] for k, v in doc.items()}).T


dig_1, dig_2, dig_12 = map(doc_to_df, [DOCUMENTS_1, DOCUMENTS_2, DOCUMENTS_12])


In [5]:
# Read the cleaned notes, index them by the HADM_ID column, and add
# DISCHARGE_COALESCE: DISCHARGE_PRIMARY where present, otherwise DISCHARGE.
notes = (
    pd.read_csv('Data/notes_cleaned.csv')
    .set_index('HADM_ID')
    .assign(
        DISCHARGE_COALESCE=lambda frame: frame['DISCHARGE_PRIMARY'].combine_first(frame['DISCHARGE'])
    )
)
notes.head()


Unnamed: 0_level_0,HISTORY,DISCHARGE_PRIMARY,DISCHARGE,DISCHARGE_COALESCE
HADM_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
107527.0,This is an 81-year-old female with a history o...,,,
167118.0,This 81 year old woman has a history of COPD. ...,,"COPD, Coronary Artery Disease/atypical angina ...","COPD, Coronary Artery Disease/atypical angina ..."
196489.0,"87 yo F with h/o CHF, COPD on 5 L oxygen at ba...",1. Chronic Obstructive Pulmonary Disease Exace...,,1. Chronic Obstructive Pulmonary Disease Exace...
135453.0,Mr. is a 82 year old male who had a slip and ...,,1. Cervical spondylosis with calcification of ...,1. Cervical spondylosis with calcification of ...
170490.0,"is a 62-year-old woman, with longstanding hist...",,brain lesion,brain lesion


In [6]:
def _shorten_pair(pair):
    """Return ``pair`` with integer keys and codes truncated to 3 characters.

    Entries whose 3-character prefix is not a label of ``dig_12.index`` are dropped.
    """
    shortened = {}
    for position, code in pair.items():
        prefix = code[:3]
        if prefix in dig_12.index:
            shortened[int(float(position))] = prefix
    return shortened


diagnoses = pd.read_json('transformed_data/mimic_aggregated_icd10.json')
# we can save a lot of headache by shortening to 3 and cleaning now even though we never did before
diagnoses['PAIR'] = diagnoses['PAIR'].apply(_shorten_pair)
diagnoses.head()


Unnamed: 0,PAIR
100001,"{1: 'E10', 2: 'G99', 3: 'N17', 4: 'K92', 5: 'Z..."
100003,"{1: 'K25', 2: 'D62', 3: 'B18', 4: 'K74', 5: 'I..."
100006,"{1: 'J44', 2: 'J96', 3: 'J18', 4: 'C90', 5: 'E..."
100007,"{1: 'K56', 2: 'K55', 4: 'J18', 5: 'I10'}"
100009,"{1: 'I25', 2: 'T82', 3: 'I25', 4: 'E11', 5: 'E..."


In [7]:
# Inner join on the index: only rows whose index value occurs in both
# `notes` and `diagnoses` are kept.
notes3 = notes.join(other=diagnoses, how='inner')
notes3.head(n=5)

Unnamed: 0,HISTORY,DISCHARGE_PRIMARY,DISCHARGE,DISCHARGE_COALESCE,PAIR
107527.0,This is an 81-year-old female with a history o...,,,,"{2: 'J44', 3: 'J96', 4: 'J18', 5: 'E87', 6: 'E..."
167118.0,This 81 year old woman has a history of COPD. ...,,"COPD, Coronary Artery Disease/atypical angina ...","COPD, Coronary Artery Disease/atypical angina ...","{3: 'J44', 4: 'E87', 5: 'I82', 6: 'K44'}"
196489.0,"87 yo F with h/o CHF, COPD on 5 L oxygen at ba...",1. Chronic Obstructive Pulmonary Disease Exace...,,1. Chronic Obstructive Pulmonary Disease Exace...,"{1: 'J96', 2: 'N17', 3: 'G93', 4: 'J44', 5: 'E..."
135453.0,Mr. is a 82 year old male who had a slip and ...,,1. Cervical spondylosis with calcification of ...,1. Cervical spondylosis with calcification of ...,"{1: 'S12', 2: 'J69', 3: 'I50', 4: 'F05', 6: 'W..."
170490.0,"is a 62-year-old woman, with longstanding hist...",,brain lesion,brain lesion,"{1: 'D32', 2: 'M06', 3: 'M35', 4: 'I73', 5: 'K..."


### Preprocessing

In [14]:
#Start - preprocess_tfidf
def preprocess_tfidf(doc_list, vec_params, keep_sparse =False):
    """Fit a ``TfidfVectorizer`` on ``doc_list`` and return the fitted vectorizer.

    Parameters
    ----------
    doc_list : object exposing ``.values`` (e.g. a pandas Series of strings)
        The documents used to learn the vocabulary and IDF weights.
    vec_params : dict
        Keyword arguments forwarded unchanged to ``TfidfVectorizer``.
    keep_sparse : bool, optional
        Currently unused; accepted only so existing callers keep working.

    Returns
    -------
    TfidfVectorizer
        The fitted vectorizer. Call ``.transform`` on it to obtain the
        TF-IDF matrix for any collection of documents.
    """
    vec = TfidfVectorizer(**vec_params)
    # Only the fitted vectorizer is returned, so there is no need to build
    # (and immediately discard) the transformed training matrix.
    vec.fit(doc_list.values)
    return vec


In [9]:
# Train-test split
def train_test_split(df, train_pct, split_random_seed:int):
    """Shuffle ``df`` and label each row as 'TRAIN' or 'TEST'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split; it is not modified.
    train_pct : float
        Fraction of rows (after shuffling) labelled 'TRAIN'; the row count is
        truncated with ``int``.
    split_random_seed : int
        Seed passed to ``DataFrame.sample`` so the shuffle is reproducible.

    Returns
    -------
    pandas.DataFrame
        A shuffled copy of ``df`` with a 'Split' column holding 'TRAIN' for
        the first ``int(len(df) * train_pct)`` rows and 'TEST' for the rest.
    """
    df_shuffled = df.sample(len(df), random_state=split_random_seed)
    threshhold = int(len(df_shuffled) * train_pct)
    df_shuffled['Split'] = ''
    # Locate 'Split' by name: when the input already has a 'Split' column the
    # assignment above overwrites it in place, so it is not necessarily the
    # last column and writing to position -1 would clobber unrelated data.
    split_pos = df_shuffled.columns.get_loc('Split')
    df_shuffled.iloc[:threshhold, split_pos] = 'TRAIN'
    df_shuffled.iloc[threshhold:, split_pos] = 'TEST'
    return df_shuffled


In [17]:
# Y 
def preprocess_y(df_orig, diag_list):
    """Build the target frame: one 'DIAGS' label per row of ``df_orig``.

    For each row the code stored under the smallest key of its 'PAIR' dict is
    taken as the label. The label is '' when that code is not in
    ``diag_list`` or when the dict is empty.

    Parameters
    ----------
    df_orig : pandas.DataFrame
        Frame with a 'PAIR' column of ``{key: code}`` dicts; it is not modified.
    diag_list : container
        Accepted codes; membership is tested with ``in``.

    Returns
    -------
    pandas.DataFrame
        A frame whose only column is 'DIAGS', indexed by every other column
        of ``df_orig`` in their original order.
    """
    def _first_code(pair):
        # An empty dict has no smallest key; treat it as "no usable label".
        if not pair:
            return ''
        code = pair[min(pair.keys())]
        return code if code in diag_list else ''

    df = df_orig.copy()
    df['DIAGS'] = df['PAIR'].apply(_first_code)
    del df['PAIR']
    # Keep the original column order so the index levels are deterministic
    # (building them from a set made the order vary between runs).
    cols = [col for col in df_orig.columns if col != 'PAIR']
    return df.set_index(cols)


### NAIVE BAYES

In [15]:
# Drop rows without a DISCHARGE_COALESCE value, then label 80% of the
# shuffled rows as TRAIN and the remainder as TEST.
note3_dc = train_test_split(
    notes3.dropna(subset=['DISCHARGE_COALESCE']),
    train_pct=0.8,
    split_random_seed=3,
)
train = note3_dc.loc[note3_dc['Split'] == 'TRAIN']
test = note3_dc.loc[note3_dc['Split'] == 'TEST']

In [16]:
# Fit the TF-IDF vectorizer (unigrams and bigrams, English stop words
# removed) on the training text only, then vectorize that same text.
self_vec = preprocess_tfidf(
    train['DISCHARGE_COALESCE'],
    dict(stop_words=SW, ngram_range=(1, 2)),
    keep_sparse=True,
)
X_self_arr = self_vec.transform(train['DISCHARGE_COALESCE'])

In [19]:

# Build the label frames for both splits with the same set of accepted codes.
y_train, y_test = (
    preprocess_y(split_df, dig_12.index) for split_df in (train, test)
)

### Evaluation

In [20]:
def evaluate_single_prediction_accuracy(labeled_df:pd.DataFrame)-> float:
    """Fraction of rows whose prediction equals the first-listed diagnosis.

    The "first-listed" diagnosis is the code stored under the smallest key of
    the row's 'PAIR' dict. Rows with an empty 'PAIR' dict count as misses.

    Parameters
    ----------
    labeled_df : pandas.DataFrame
        Frame with a 'PAIR' column of ``{key: code}`` dicts and a
        'PREDICTION' column. It is not modified.

    Returns
    -------
    float
        Mean of the per-row match indicator.
    """
    # Work on a separate Series instead of writing a 'DIAG' column into the
    # caller's frame; None marks rows that have no diagnosis to compare with.
    top_diag = labeled_df['PAIR'].apply(
        lambda pair: pair[min(pair.keys())] if pair else None
    )
    return (top_diag == labeled_df['PREDICTION']).mean()
def evaluate_in_the_list_accuracy(labeled_df:pd.DataFrame)-> float:
    """Fraction of rows whose prediction appears anywhere among the row's diagnoses.

    Parameters
    ----------
    labeled_df : pandas.DataFrame
        Frame with a 'PAIR' column of ``{key: code}`` dicts and a
        'PREDICTION' column. It is not modified.

    Returns
    -------
    float
        Mean of the per-row "prediction is one of the codes" indicator
        (NaN for an empty frame).
    """
    # Pair the two columns positionally; this avoids writing a 'DIAG' column
    # of dict views into the caller's frame and the overhead of a row-wise
    # DataFrame.apply.
    hits = [
        prediction in pair.values()
        for pair, prediction in zip(labeled_df['PAIR'], labeled_df['PREDICTION'])
    ]
    return pd.Series(hits, dtype=bool).mean()
def print_accuracy_results(test_df:pd.DataFrame, predictions) -> tuple: 
    """Print and return both accuracy metrics for ``predictions``.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Frame with a 'PAIR' column; it is copied, so the caller's frame is
        left untouched.
    predictions : array-like
        One prediction per row of ``test_df``, stored as the 'PREDICTION'
        column of the copy.

    Returns
    -------
    tuple
        ``(top_diagnosis_accuracy, in_the_list_accuracy)`` as fractions.
        The printed line shows the same two values as percentages.
    """
    X_labeled = test_df.copy()
    X_labeled['PREDICTION'] = predictions
    spa = evaluate_single_prediction_accuracy(X_labeled)
    itla = evaluate_in_the_list_accuracy(X_labeled)
    print('ACCURACY: TOP_DIAGNOSIS:{:.2f}%, IN THE LIST: {:.2f}%'.format(spa*100,itla*100))
    # The signature promises a tuple; return the metrics so callers can use
    # them programmatically instead of parsing the printed line.
    return spa, itla

In [22]:
# Fit a multinomial Naive Bayes classifier on the TF-IDF training matrix.
lm_model = MultinomialNB()
# y_train is a one-column frame, so .values is shaped (n, 1); flatten it to
# the 1-D target scikit-learn expects (this avoids the column_or_1d warning).
lm_model.fit(X_self_arr, y_train.values.ravel())
#TEST PREPROC
# Vectorize the held-out text with the vectorizer fitted on the training split.
X_test = self_vec.transform(test['DISCHARGE_COALESCE'])
y_hat = lm_model.predict(X_test)
print_accuracy_results(test,y_hat)

  y = column_or_1d(y, warn=True)


ACCURACY: TOP_DIAGNOSIS:28.26%, IN THE LIST: 43.73%
