In [1]:
# Imports
import json
import warnings

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

#### Data Loading 

In [2]:
# English stop-word list; later cells pass it to the vectoriser as `stop_words`.
SW = stopwords.words('english')

# Level-2 and level-1 description documents, each a JSON object of key -> text.
with open('transformed_data/icd10_l2.json', 'r') as l2_file:
    DOCUMENTS_2 = json.load(l2_file)

with open('transformed_data/icd10_l1.json', 'r') as l1_file:
    DOCUMENTS_1 = json.load(l1_file)

# For every level-2 entry: its own text twice, the marker '<s>', then the text
# of the level-1 entry whose key is the level-2 key minus its last '/' segment.
DOCUMENTS_12 = {
    key: ' '.join([text, text, '<s>', DOCUMENTS_1[key.rpartition('/')[0]]])
    for key, text in DOCUMENTS_2.items()
}

In [3]:
def doc_to_df(doc):
    """Turn a ``{key: text}`` mapping into a one-column DataFrame.

    The index holds the last '/'-separated segment of each key and column ``0``
    holds the lower-cased text. Keys sharing the same last segment collapse
    into one row carrying the text of the last such key.
    """
    lowered = {}
    for key, text in doc.items():
        lowered[key.split('/')[-1]] = [text.lower()]
    # Each dict entry is a one-element column; transposing makes them rows.
    return pd.DataFrame(lowered).T

# One single-column frame per document set, in the order level 1, level 2,
# level 2 + level 1 context.
dig_1, dig_2, dig_12 = (
    doc_to_df(documents) for documents in (DOCUMENTS_1, DOCUMENTS_2, DOCUMENTS_12)
)


In [4]:
# Input locations: the cleaned notes CSV and the aggregated diagnoses JSON.
NOTES = 'Data/notes_cleaned.csv'
DIAGS = 'transformed_data/mimic_aggregated_icd10.json'

s1 = pd.read_csv(NOTES)

# Move the JSON frame's index into a regular column, then name both columns.
s2 = pd.read_json(DIAGS).reset_index()
s2.columns = ['HADM_ID', 'PAIR']

# Keep only admissions present in both the notes and the diagnoses.
BASE = pd.merge(s1, s2, how='inner', on='HADM_ID')
# Subset of BASE whose HISTORY field is populated.
BASE_HISTORY = BASE[BASE['HISTORY'].notna()]

### Cosine similarity metrics

In [5]:
def _primary_code(pair):
    """Return the code stored under the smallest key of ``pair``, or None if it is empty.

    Keys are compared numerically when every key converts to ``float`` so that
    string keys such as '2' and '10' order as 2 < 10 instead of
    lexicographically; otherwise the keys' natural ``min`` ordering is used.
    """
    if not pair:
        return None
    try:
        first_key = min(pair, key=float)
    except (TypeError, ValueError):
        first_key = min(pair)
    return pair[first_key]


def model(vec_params: dict, diag_list: pd.Series, df: pd.DataFrame, note_column: str,
          df_dropna_subset: 'str | list') -> pd.DataFrame:
    """Predict one diagnosis label per note by TF-IDF similarity and score it.

    A ``TfidfVectorizer`` is fitted on the diagnosis texts only; each note is
    projected into that vocabulary and assigned the index label of the
    diagnosis text with the largest dot product (this equals cosine similarity
    under the vectoriser's default L2 normalisation).

    Parameters
    ----------
    vec_params : dict
        Keyword arguments forwarded to ``TfidfVectorizer``.
    diag_list : pd.Series
        Diagnosis texts; the index supplies the labels written to ``PRED``.
    df : pd.DataFrame
        Notes. Must contain ``note_column`` and a ``PAIR`` column of dicts
        whose values are sliceable codes.
        NOTE(review): PAIR keys are presumably diagnosis sequence numbers, with
        the smallest one marking the primary diagnosis -- confirm against the
        code that builds the JSON.
    note_column : str
        Column of ``df`` holding the note text to vectorise.
    df_dropna_subset : str or list
        Column label(s) passed to ``DataFrame.dropna(subset=...)``; rows that
        are null there are excluded.

    Returns
    -------
    pd.DataFrame
        A new frame (``df`` is left untouched) with the surviving rows plus:
        ``PRED`` -- the predicted label;
        ``ACCURACY`` -- whether ``PRED`` equals the first three characters of
        any code in ``PAIR``;
        ``PRIMARY_ACCURACY`` -- whether ``PRED`` equals the first three
        characters of the code under the smallest ``PAIR`` key (False when
        ``PAIR`` is empty).
    """
    # Explicit copy: the columns added below must not be written onto a
    # possible view of the caller's frame (source of SettingWithCopyWarning).
    df_full = df.dropna(subset=df_dropna_subset).copy()

    vec = TfidfVectorizer(**vec_params)
    doc_vec = vec.fit_transform(diag_list.values)
    X_tr = vec.transform(df_full[note_column])

    # (n_notes x n_diagnoses) similarity; a single argmax call replaces the
    # per-row Python loop. The result may be an np.matrix, hence asarray/ravel.
    similarity = X_tr.dot(doc_vec.T)
    best = np.asarray(similarity.argmax(axis=1)).ravel()

    # Materialise the labels once rather than once per prediction.
    labels = diag_list.index.to_numpy()
    df_full['PRED'] = labels[best]

    pairs = df_full['PAIR'].tolist()
    preds = df_full['PRED'].tolist()

    df_full['ACCURACY'] = [
        pred in [code[:3] for code in pair.values()]
        for pred, pair in zip(preds, pairs)
    ]

    primary_hits = []
    for pred, pair in zip(preds, pairs):
        primary = _primary_code(pair)
        primary_hits.append(primary is not None and pred == primary[:3])
    df_full['PRIMARY_ACCURACY'] = primary_hits

    return df_full


##### Simple TF-IDF + Cosine similarity

Which of the three diagnosis-adjacent fields should we use? 

In [6]:
# Report how many merged rows have each candidate text field populated.
for field in ('HISTORY', 'DISCHARGE', 'DISCHARGE_PRIMARY'):
    populated = len(BASE.dropna(subset=[field]))
    print(f"{field} has {populated} non-null values")

HISTORY has 40645 non-null values
DISCHARGE has 30170 non-null values
DISCHARGE_PRIMARY has 2359 non-null values


HISTORY is the most complete field, DISCHARGE comes second, and DISCHARGE_PRIMARY is populated for barely any notes. But how predictive is each of them? We explore this with TF-IDF cosine similarity.

In [7]:
def print_accuracy(df):
    """Print the two accuracy rates of a scored frame as percentages.

    ``df`` must provide ``PRIMARY_ACCURACY`` and ``ACCURACY`` columns; the mean
    of each is reported, multiplied by 100 and shown with two decimals.
    """
    primary_pct = df['PRIMARY_ACCURACY'].mean() * 100
    any_pct = df['ACCURACY'].mean() * 100
    print(f"The model made the primary diagnosis {primary_pct:.2f}% of the time and made a diagnosis {any_pct:.2f}% of the time")

In [8]:
# Work on a copy so that columns added to note2 in later cells never touch BASE.
note2 = BASE.copy()

# `warnings` is imported in the notebook's import cell.
# NOTE(review): this silences every warning for the rest of the session; once
# the noisy warning categories are known, narrow it with `category=`/`module=`.
warnings.filterwarnings('ignore')

In [9]:
# Score the same unigram TF-IDF model on each candidate note field.
# Each pair is (printed heading, column used both as text and as dropna subset).
for heading, column in (
    ('History', 'HISTORY'),
    ('DISCHARGE', 'DISCHARGE'),
    ('DISCHARGE_PRIMARY', 'DISCHARGE_PRIMARY'),
):
    print(heading)
    res = model({'stop_words': SW}, dig_2[0], note2, column, column)
    print_accuracy(res)

History
The model made the primary diagnosis 3.76% of the time and made a diagnosis 13.36% of the time
DISCHARGE
The model made the primary diagnosis 10.89% of the time and made a diagnosis 35.11% of the time
DISCHARGE_PRIMARY
The model made the primary diagnosis 15.30% of the time and made a diagnosis 45.99% of the time


Unfortunately, accuracy is the exact inverse of data availability. DISCHARGE_PRIMARY is unusable on its own because it covers too few notes, but maybe we can boost DISCHARGE by coalescing it with DISCHARGE_PRIMARY (use DISCHARGE_PRIMARY where present, otherwise DISCHARGE).

In [10]:
# Prefer DISCHARGE_PRIMARY where it is populated; otherwise fall back to DISCHARGE.
note2['DISCHARGE_COALESCE'] = note2['DISCHARGE_PRIMARY'].where(
    note2['DISCHARGE_PRIMARY'].notna(), note2['DISCHARGE']
)
print('DISCHARGE_COALESCE')
res = model(
    vec_params={'stop_words': SW},
    diag_list=dig_2[0],
    df=note2,
    note_column='DISCHARGE_COALESCE',
    df_dropna_subset='DISCHARGE_COALESCE',
)
print_accuracy(res)

DISCHARGE_COALESCE
The model made the primary diagnosis 11.16% of the time and made a diagnosis 35.86% of the time


So we can see that DISCHARGE_COALESCE is the way to go.
Can we increase performance by including bigrams?


In [11]:
# Same field as before, with the vocabulary extended to unigrams + bigrams.
print('DISCHARGE_COALESCE - BIGRAMS')
res = model(
    vec_params={'stop_words': SW, 'ngram_range': (1, 2)},
    diag_list=dig_2[0],
    df=note2,
    note_column='DISCHARGE_COALESCE',
    df_dropna_subset='DISCHARGE_COALESCE',
)
print_accuracy(res)

DISCHARGE_COALESCE - BIGRAMS
The model made the primary diagnosis 11.30% of the time and made a diagnosis 37.01% of the time


What about dig_12? It appends the description of the parent ("super") diagnosis category to each diagnosis,
which gives the shorter documents (some are just one word) additional context.

In [12]:
# Bigram model again, matched against the context-augmented documents (dig_12).
print('DISCHARGE_COALESCE - BIGRAMS + Context Diagnosis')
res = model(
    vec_params={'stop_words': SW, 'ngram_range': (1, 2)},
    diag_list=dig_12[0],
    df=note2,
    note_column='DISCHARGE_COALESCE',
    df_dropna_subset='DISCHARGE_COALESCE',
)
print_accuracy(res)

DISCHARGE_COALESCE - BIGRAMS + Context Diagnosis
The model made the primary diagnosis 12.20% of the time and made a diagnosis 37.07% of the time


These unsupervised models will serve as a benchmark for a supervised model: the numbers to beat are a primary-diagnosis accuracy of 12.20\% and an in-the-list diagnosis accuracy of 37.07\%.