### Medical Records Classification

##### How to use
- Run all cells up to the Feature Vector section
- Run any one feature vector approach to get test_vectors and doc_vectors, along with the pairwise distance calculations
- For clustering, go to the Clustering Analysis section and run any one algorithm
- Tune parameters as needed

In [1]:
import pandas as pd
import numpy as np

import simple_icd_10_cm as icd

import spacy
import re

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.cluster import KMeans,AgglomerativeClustering,AffinityPropagation,Birch
from sklearn.metrics import silhouette_score
from sklearn.manifold import TSNE
from matplotlib import pyplot as plt

from scipy.spatial.distance import pdist
from scipy.sparse import csr_matrix
from sklearn.metrics import pairwise_distances
from sklearn.decomposition import TruncatedSVD

import gensim.models
from gensim.utils import simple_preprocess
from gensim.models.doc2vec import TaggedDocument,Doc2Vec
from gensim.models.keyedvectors import KeyedVectors
import gensim.downloader as api

import collections
from IPython.display import clear_output, display

from nltk.stem import WordNetLemmatizer

### Common Functions

In [2]:
# Load spaCy's "en_core_web_md" pipeline; transformText calls it to tokenize each record.
nlp = spacy.load("en_core_web_md")
# The pipeline's default stop-word collection, used by transformText to filter tokens.
all_stopwords = nlp.Defaults.stop_words

# Matches every run of characters that is neither an ASCII letter nor whitespace.
# Raw string: '\s' inside a normal string literal is an invalid escape sequence.
_NON_ALPHA = re.compile(r'[^A-Za-z\s]+')


def transformText(text):
    """Normalise a single HOPI record into a space-separated string of tokens.

    Each spaCy token is lower-cased, replaced by its entry in ``med_dict`` when
    the lower-cased text is a key there, and stripped of every character that
    is not an ASCII letter or whitespace. Whitespace tokens, punctuation
    tokens, tokens that end up empty after stripping (for example pure
    numbers) and tokens found in ``all_stopwords`` are dropped.

    Parameters
    ----------
    text : str
        Raw text of one record.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.

    Notes
    -----
    Relies on the notebook-level names ``nlp``, ``med_dict`` and
    ``all_stopwords`` being defined before the first call.
    """
    words = []
    for token in nlp(text):
        if token.is_space or token.is_punct:
            continue
        word = token.text.lower()
        # Expand known abbreviations before stripping, so keys containing
        # digits or symbols can still match.
        word = med_dict.get(word, word)
        word = _NON_ALPHA.sub('', word)
        # Skip tokens that were reduced to nothing; keeping them would put
        # doubled spaces in the joined result.
        if word and word not in all_stopwords:
            words.append(word)
    return " ".join(words)

In [3]:
# Takes comma separated string of ICD-10 codes as input
# Returns comma separated ICD-10 codes string for specified level
# index = 1 for Level 1
# index = 2 for Level 2
# index = 3 for Level 3

def getAncestors(x,index):
    """Map each ICD-10 code in a comma-separated string to one of its ancestors.

    Every code is stripped of surrounding whitespace and then shortened one
    character at a time from the right until ``icd.is_valid_item`` accepts it;
    codes that never become valid are skipped. A valid code is replaced by the
    element at position ``-index`` of ``icd.get_ancestors(code)``. When the
    code has fewer than ``index`` ancestors it is kept as is, exactly once.

    Parameters
    ----------
    x : str
        Comma-separated ICD-10 codes.
    index : int
        1-based position counted from the end of the ancestor list.

    Returns
    -------
    str
        The mapped codes joined by commas, in input order. Duplicates are
        preserved; the result is empty when no code could be validated.
    """
    mapped = []
    for raw_code in x.split(','):
        code = raw_code.strip()
        # Trim trailing characters until the code is recognised (or runs out).
        while code and not icd.is_valid_item(code):
            code = code[:-1]
        if not code:
            continue

        ancestors = icd.get_ancestors(code)
        if len(ancestors) >= index:
            code = ancestors[-index]
        # Single append for both branches, so a code that is already at or
        # above the requested level is no longer emitted twice.
        mapped.append(code)

    return ",".join(mapped)

In [53]:
# Takes array of actual and predicted labels as input
# Returns tuple of mean recall, precision and f-score
# Set mode parameter other than mean to evaluate a single record
def _split_codes(codes):
    """Split a comma-separated code string into a set of whitespace-trimmed codes."""
    return {code.strip() for code in codes.split(",")}


def _score_pair(actual, predicted):
    """Return (recall, precision, f_score) for one actual/predicted code-string pair."""
    actual_codes = _split_codes(actual)
    predicted_codes = _split_codes(predicted)

    hits = len(actual_codes & predicted_codes)
    # str.split always yields at least one item, so neither set is ever empty.
    recall = hits / len(actual_codes)
    precision = hits / len(predicted_codes)

    # The only possible zero denominator: nothing overlaps, so both are 0.
    if recall + precision == 0:
        f_score = 0
    else:
        f_score = 2 * recall * precision / (recall + precision)
    return (recall, precision, f_score)


def getScores(actual,predicted,mode='mean'):
    """Score predicted code sets against the actual code sets.

    Parameters
    ----------
    actual, predicted
        With ``mode == 'mean'``: equally long iterables of comma-separated
        code strings, paired by position. With any other ``mode``: one
        comma-separated code string each.
    mode : str, optional
        ``'mean'`` (default) averages over all records; any other value
        scores a single record.

    Returns
    -------
    tuple
        ``(recall, precision, f_score)``: the per-column means in ``'mean'``
        mode, the values for the single pair otherwise.

    Raises
    ------
    ValueError
        In ``'mean'`` mode, when ``actual`` and ``predicted`` differ in length.

    Notes
    -----
    In ``'mean'`` mode the per-record values are also stored in the
    notebook-level ``test_df`` as columns ``recall``, ``precision`` and
    ``f_score``, but only when ``test_df`` exists and has one row per scored
    record.
    """
    if mode != 'mean':
        return _score_pair(actual, predicted)

    if len(actual) != len(predicted):
        raise ValueError("actual and predicted must have the same length")

    # zip pairs by position, so this works whatever index the inputs carry.
    per_record = [_score_pair(a, p) for a, p in zip(actual, predicted)]
    scores = pd.DataFrame(per_record, columns=['recall', 'precision', 'f_score'], dtype=float)

    # Kept for backward compatibility: earlier versions always wrote the
    # per-record scores into the global test_df.
    frame = globals().get('test_df')
    if frame is not None and len(frame) == len(scores):
        for column in scores.columns:
            frame[column] = scores[column].to_numpy()

    return (scores['recall'].mean(), scores['precision'].mean(), scores['f_score'].mean())

### Approach 1

### Approach 2

#### Load medical terms dictionary

In [61]:
# Build the abbreviation -> term lookup that transformText uses for expansion.
# Forward slashes keep the path portable; a backslash separator only resolves on Windows.
med_terms = pd.read_excel('Data/medicalTermsDictionary (1).xlsx')
med_dict = dict(zip(med_terms.Abbreviation, med_terms.Term))

##### Prepare ICD-10 Dataframe

In [62]:
# One row per ICD-10 code (dotted form), held in a single 'code' column.
all_codes = icd.get_all_codes(with_dots=True)
code_df = pd.DataFrame({'code': all_codes})

In [63]:
# Attach, for every code, its description and its ancestor / descendant lists.
for column, lookup in (
    ('description', icd.get_description),
    ('ancestor', icd.get_ancestors),
    ('descendants', icd.get_descendants),
):
    code_df[column] = code_df['code'].map(lambda code, lookup=lookup: lookup(code))

In [64]:
# `level` is the number of ancestors a code must have to be kept:
#   level = 0 -> Level 1
#   level = 1 -> Level 2
#   level = 2 -> Level 3

level=0
filtered = (
    code_df[code_df['ancestor'].map(len) == level]
    .drop_duplicates(['description'])
    .reset_index(drop=True)
)

In [65]:
# Run this cell if you want each description expanded with its descendants' descriptions.
# The value is written with filtered.at[row, column]: assigning through
# `filtered.loc[row].description = ...` modifies a temporary row object and is not
# guaranteed to update `filtered` itself.
# The text is rebuilt from the code's own description each time, so re-running the
# cell does not append the descendants' descriptions a second time.
for row_label, code, descendants in zip(filtered.index, filtered['code'], filtered['descendants']):
    parts = [icd.get_description(code)]
    parts.extend(icd.get_description(d) for d in descendants)
    filtered.at[row_label, 'description'] = " ".join(parts)

#### Load preprocessed data for 20,000 medical records

In [9]:
# Load the preprocessed records; the HOPI_modified column is used later as part of
# the corpus the TF-IDF vectorizer is fitted on.
df = pd.read_excel(r'Data/data.xlsx')
# Preview the first rows.
df.head()

Unnamed: 0.1,Unnamed: 0,ER_NO,AGE_YEARS,TRIAGECOMPLAINT,HOPI_,ED_DX,Code,HOPI_2,Unnamed: 8,Unnamed: 9,HOPI_modified
0,,E001589137,85.0011,Unwell adult,fever since yesterday gernalised body ache\n\n...,AGE,A09,fever since yesterday gernalised body ache\n\n...,,,fever yesterday gernalised body ache loose sto...
1,,E001604784,81.9989,Unwell adult,"PT IS A 82 YO MALE\n\n\n\nKC OF IHD, EF 25%\n\...",FEVER,R50.9,"PT IS A 82 YO MALE\n\n\n\nKC OF IHD, EF 25%\n\...",,,patient yo male known case ischemic heart dis...
2,,E001642697,89.9999,"Falls,Limb problems",90 year old\n\nhypertensive (non compliant)\n\...,RTA,"V89.2XXA, Y92.488",90 year old\n\nhypertensive (non compliant)\n\...,,,year old hypertensive non compliant history f...
3,,E001464239,94.3118,Abdominal pain in adults,"K/C HTN taking atenolol, compliant\n\npresente...",UTI,N39.0,"K/C HTN taking atenolol, compliant\n\npresente...",,,k c hypertension taking atenolol compliant pre...
4,,E001531532,91.0108,"Falls,Head injury","91 yr old female \nk/c HTN , DM\n\ncame e c/o\...",trauma,S09.90XA,"91 yr old female \nk/c HTN , DM\n\ncame e c/o\...",,,yr old female k c hypertension diabetes melli...


##### Load and Preprocess Golden Dataset

In [66]:
# Load the golden dataset. Forward slashes keep the path portable; a backslash
# separator only resolves on Windows.
test_df = pd.read_excel('Data/Sample_HOPI.xlsx')

In [67]:
# Run every raw 'hopi_' note through transformText and store the result next to it.
modified = test_df['hopi_'].map(transformText)
test_df['HOPI_modified'] = modified

In [68]:
# Drop records missing either the processed text or the codes, then renumber the rows
# (the previous index is kept as an 'index' column).
test_df = test_df.dropna(subset=['HOPI_modified', 'code']).reset_index()
test_df.head()

Unnamed: 0,index,mr_code,er_no,age_years,triagecomplaint,hopi_,ed_dx,code,h,HOPI_modified
0,0,10190001490,E001400336,32.0019,Unwell adult,32YR OLD M \n\nC/O--FEVER--SINCE YESTERDAY \nV...,RTI/CONSTIPATION,"R50.9, R11.10, R52, M54.9, R05.9, K59.00",32YR OLD M \n\nC/O--FEVER--SINCE YESTERDAY \nV...,yr old m c o fever yesterday vomiting episode ...
1,1,86180009581,E001404472,40.1911,Pregnancy,g6p4+1 ( 2nd marriage) svd\n\ncurrent preg i...,31 wk preg came in er with generalised body w...,"O09.293, Z3A.31, O99.891, R53.1",g6p4+1 ( 2nd marriage) svd\n\ncurrent preg i...,gp nd marriage spontaneous vaginal delivery ...
2,2,10190005384,E001404582,90.0862,Unwell adult,nkcm \nhx of cva several years \nlft sided wea...,infected bed sore / htn,"I69.398, M62.81, L89.321, L89.150",nkcm \nhx of cva several years \nlft sided wea...,No known comorbidities history cerebral vascul...
3,3,10120025208,E001405244,69.5342,Urinary problems,nkcm \nh/o turp - 1 year back \nradical cystec...,bleeding per urethra,"Z85.51, N36.8",nkcm \nh/o turp - 1 year back \nradical cystec...,No known comorbidities h o turp year radical ...
4,4,10190006313,E001405573,70.0006,Limb problems,"kc of HTN, on oral meds, compliant\n\nhx of fa...",PUBIC RAMUS FRACTURE,"I10, W01.0XXA","kc of HTN, on oral meds, compliant\n\nhx of fa...",known case hypertension oral meds compliant hi...


##### Transform codes

In [70]:
# Roll every record's codes up to the hierarchy level selected by `level`
# (getAncestors counts levels from 1, hence the +1).
test_df['code_modified'] = test_df['code'].apply(getAncestors, args=(level + 1,))
test_df.head()

Unnamed: 0,index,mr_code,er_no,age_years,triagecomplaint,hopi_,ed_dx,code,h,HOPI_modified,code_modified
0,0,10190001490,E001400336,32.0019,Unwell adult,32YR OLD M \n\nC/O--FEVER--SINCE YESTERDAY \nV...,RTI/CONSTIPATION,"R50.9, R11.10, R52, M54.9, R05.9, K59.00",32YR OLD M \n\nC/O--FEVER--SINCE YESTERDAY \nV...,yr old m c o fever yesterday vomiting episode ...,181818131811
1,1,86180009581,E001404472,40.1911,Pregnancy,g6p4+1 ( 2nd marriage) svd\n\ncurrent preg i...,31 wk preg came in er with generalised body w...,"O09.293, Z3A.31, O99.891, R53.1",g6p4+1 ( 2nd marriage) svd\n\ncurrent preg i...,gp nd marriage spontaneous vaginal delivery ...,15211518
2,2,10190005384,E001404582,90.0862,Unwell adult,nkcm \nhx of cva several years \nlft sided wea...,infected bed sore / htn,"I69.398, M62.81, L89.321, L89.150",nkcm \nhx of cva several years \nlft sided wea...,No known comorbidities history cerebral vascul...,9131212
3,3,10120025208,E001405244,69.5342,Urinary problems,nkcm \nh/o turp - 1 year back \nradical cystec...,bleeding per urethra,"Z85.51, N36.8",nkcm \nh/o turp - 1 year back \nradical cystec...,No known comorbidities h o turp year radical ...,2114
4,4,10190006313,E001405573,70.0006,Limb problems,"kc of HTN, on oral meds, compliant\n\nhx of fa...",PUBIC RAMUS FRACTURE,"I10, W01.0XXA","kc of HTN, on oral meds, compliant\n\nhx of fa...",known case hypertension oral meds compliant hi...,920


##### Create Feature Vector

In [71]:
# Corpus for fitting the vectorizer: the preprocessed record texts followed by
# the ICD descriptions.
train_data = pd.concat([df.HOPI_modified, filtered.description], axis=0)

In [72]:
# TF-IDF vectorizer configured with scikit-learn's built-in English stop-word list,
# fitted on the combined corpus in train_data.
vectorizer = TfidfVectorizer(stop_words="english")
vectorizer.fit(train_data)

TfidfVectorizer(stop_words='english')

In [73]:
# Dense TF-IDF vectors for the ICD descriptions in `filtered` (one row per description).
test_vectors = vectorizer.transform(filtered['description']).toarray()

In [74]:
# Dense TF-IDF vectors for the processed golden-dataset records (one row per record).
doc_vectors = vectorizer.transform(test_df['HOPI_modified']).toarray()

In [75]:
# Project both sets of TF-IDF vectors into the same 5-dimensional space.
# The SVD is fitted on the record vectors only; the ICD description vectors are
# projected with the same fitted components.
# random_state pins the randomized solver so the scores below are reproducible.
# NOTE: this cell overwrites doc_vectors / test_vectors, so re-run the two
# vectorizer.transform cells before running it again.
doc_sparse = csr_matrix(doc_vectors)
tsvd = TruncatedSVD(n_components=5, random_state=42).fit(doc_sparse)

doc_vectors = tsvd.transform(doc_sparse)
test_vectors = tsvd.transform(csr_matrix(test_vectors))

##### Predict

In [76]:
# Euclidean distances between doc_vectors (first argument) and test_vectors (second argument).
dist = pairwise_distances(doc_vectors,test_vectors,metric='euclidean')

In [77]:
def get_labels(i, n=2):
    """Return the codes of the ``n`` ICD entries closest to record ``i``.

    Parameters
    ----------
    i : int
        Row position of the record in the notebook-level ``dist`` matrix.
    n : int, optional
        Number of suggested labels (default 2).

    Returns
    -------
    str
        The ``n`` nearest codes from ``filtered['code']``, closest first,
        joined by commas.
    """
    # Rank only this record's row; sorting the whole matrix on every call
    # repeats the same work once per record.
    nearest = np.argsort(dist[i])[:n]
    # Columns of `dist` follow the row order of `filtered`, so look up by position.
    return ",".join(filtered['code'].iloc[position] for position in nearest)

# Suggest labels for every record, in row order.
test_df['predicted'] = [get_labels(row) for row in range(len(test_df))]

In [78]:
# Mean recall / precision / F-score of the predictions over all records.
mean_scores = getScores(test_df['code_modified'], test_df['predicted'])
avg_recall, avg_prec, avg_f_score = mean_scores
print("Recall: {}\nPrecision: {}\nF-Score: {}".format(*mean_scores))

Recall: 0.4448250728862974
Precision: 0.5025510204081632
F-Score: 0.4422942986718499


In [80]:
i=20 # Record index
n=2 # Suggested labels count
# Rank only this record's row of the distance matrix; sorting the full matrix
# twice just to read one row is unnecessary.
record_dist = dist[i]
order = np.argsort(record_dist)[:n]
nearest = list(zip(record_dist[order], order))


print('HOPI: ',test_df['hopi_'][i].replace("\n"," "),end="\n\n")
print("Actual Codes: ",test_df["code_modified"][i],end="\n\n")

# Nearest ICD entries with their description and rounded distance.
for score, position in nearest:
    code = filtered['code'][position]
    print("{} - {} ({})".format(code,icd.get_description(code),round(score,2)))

# Any mode other than 'mean' scores this single record.
r,p,f_score = getScores(test_df['code_modified'][i],test_df['predicted'][i],mode=None)
print("\n\nPrecision: {}\nRecall: {}\nf_score: {}".format(p,r,f_score))


Actual Codes:  15,21,15

15 - Pregnancy, childbirth and the puerperium (O00-O9A) (0.4)
14 - Diseases of the genitourinary system (N00-N99) (0.4)


Precision: 0.5
Recall: 0.5
f_score: 0.5
