In [3]:
from nltk.tokenize import word_tokenize, wordpunct_tokenize
from nltk.corpus import stopwords 
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
import gensim.downloader as api
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
import pandas as pd
from gensim.models import KeyedVectors
import numpy as np
import nltk
import re 

## README
### Remark:
* as for manual comparison, in the case that multiple diseases occurs in one medical transcription, as long as one of them is recgonized correctly by the similarity matrix, it is considered as correct match. <br/> (i.e.: mt {Patient with a diagnosis of pancreatitis, developed hypotension and possible sepsis and respiratory, as well as renal failure.}; icd {Diseases of the respiratory system})
* the .csv file "allvalid2011.csv" was modified from the original version. Please find the cooresponding file attahced with the code
* the ICD type 21, 22 were dropped due to lack of sufficient description data. 

### Version log:
**version template: v3.1.0 **
* 3: Vectorization method 3
* 1: The second experimented threshold value
* 0: The first examined shffuled dataset <br/>

#### Version Desc
* v1: normal W2V process; take average as doc rep
* v2: lower the implication from high-frequency words (like "history") by subtracting the mean of 22 ICD vectors from each element in each ICD vector as well as each medical transcription vector ; still, take average as doc rep 
* v3: for POS-tag, also keep adj and verb
* v4: use tf-idf weighted measure to represent a doc

version|w2v|POS tag|doc rep
-|:-|:-|:-
v1|pretrained w2v model|nouns|mean value of word vectors
v2|pretrained w2v model|nouns|
v3|pretrained w2v model|nouns, verbs, adjectives|mean value of word vectors
v4|pretrained w2v model|nouns|tf-idf weighted measure


---
#### Result Log
version|median|mean|max|threshold|# after threshold|accuracy
-|-|-|-|-|-|-
v1.0.0|0.2796395575950238|0.2833969991805676|0.8583709831829136|0.5|3381|40%
v1.1.0||||0.6|1817|80%
v1.2.0||||0.7|639|80%
v2.0.0|-0.02174311680385281|-0.0012455847865164764|1.0|0.5|1135|70%
v2.0.1||||||70%
v3.0.0|0.3386405344879612|0.3407677889369111|0.8583447111956868|0.5|3814|50%
v3.0.1||||||50%
v3.1.0||||0.6|2294|50%
v4.0.0|0.2073973010080073|0.22690196166648854|0.9109624806359937|0.5|2435|70%
v4.0.1||||||60%
v4.1.0||||0.6|1030|100%




#### Code Version Instruction
* To implement version 1, 2, 3, run sections 1-2-3-5-6; please pay attention to comments saying "RUN THIS CELL ONLY WHEN ..." and modify the code according to the vesion you are implementing
* To implement version 4, run sections 1-4-5-6.

## 1 Text Aggregation

In [6]:
icd = pd.read_csv("./data/allvalid2011.csv")

In [40]:
# open files storing description strings
# only execute for initializing the process

txt = []
for i in range(20):
    i+=1
    file = open("./data/icd%d.txt" %i, "a+", encoding='UTF-8')
    txt.append(file)
#     file.close()

In [42]:
# classify each row of icd into 22 cates according to col type

for i in range(len(icd)):
#     print(icd['type'][i])
    type = icd['type'][i] # txt_index = type-1
    txt[type-1].write(icd["ICD Title"][i])

In [43]:
# close files
for i in range(22):
    txt[i].close()

## 2 Vectorization - ICD Descriptions

In [153]:
# RUN THIS CELL ONLY WHEN IMPLEMENTING VERSION 3
# to implement version 3, replace the current POS-tag filter with this list for both vectorization sections
listPos = ['NN', 'NNP', 'NNS', 'NNPS', 'JJ', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']

In [7]:
model = KeyedVectors.load_word2vec_format('C:\PubMed-and-PMC-w2v.bin', binary=True)

In [87]:
icd_vec=np.empty((0, 200), float)

for i in range(20): 
    ite_name = i+1
    file = open("./data/icd%d.txt" %ite_name, "r")
    raw_data = file.read()
    file.close()

    stop_words = set(stopwords.words('english')) 
    word_tokens = word_tokenize(raw_data)
    
    #convert all words to lower case
    word_tokens = [w.lower() for w in word_tokens] 
    #remove words in stopwords list
    word_tokens = [w.lower() for w in word_tokens if w not in stop_words] 
    #only keep alphabetic terms
    word_tokens = [w.lower() for w in word_tokens if w.isalpha()] 
    
    token_pos=nltk.pos_tag(word_tokens)

    word2vec_ls=np.empty((0, 200), float)
    for word, pos in token_pos:
        if (pos == 'NN' or pos == 'NNP' or pos == 'NNS' or pos == 'NNPS'):
#         if (pos in listPos):
            try:
                word2vec_ls=np.vstack([word2vec_ls, model.word_vec(word)])
            except:
                pass

    if np.any(np.isnan(word2vec_ls)) or len(word2vec_ls)==0:
        desc_vec=np.zeros((1, 200))
    else:
        desc_vec = np.mean(word2vec_ls, axis=0).reshape(1, 200)
    icd_vec=np.vstack([icd_vec, desc_vec])

In [86]:
icd_vec.shape

(20, 200)

In [92]:
# RUN THIS CELL ONLY WHEN IMPLEMENTING VERSION 2

icd_vec_mean = np.mean(icd_vec, axis = 0)
icd_vec_new = np.empty((0,200),float)
for item in icd_vec:
    item = np.subtract(item, icd_vec_mean) # subtracting the mean value from document reps
    icd_vec_new = np.vstack([icd_vec_new, item])

icd_vec_new.shape

(22, 200)

In [157]:
# write icd vectors to a csv file
dfIcdV = pd.DataFrame(icd_vec)
dfIcdV.to_csv(path_or_buf = './data/IcdV_v3.csv', index=False)

## 3 Vectorization - Medical Transcriptions

In [4]:
mt = pd.read_csv("./data/medicaltranscriptions-1.csv")

In [160]:
description_vec=np.empty((0, 200), float)

for i in range(len(mt["description"])):
    word2vec_ls=np.empty((0, 200), float)
    mt_token_pos=nltk.pos_tag(nltk.word_tokenize(mt["description"][i]))
    # for each word in a case, convert t into vector using model
    # take the exception into consideration (i.e.: word is not in the model)
    for word, pos in mt_token_pos:
        if (pos == 'NN' or pos == 'NNP' or pos == 'NNS' or pos == 'NNPS'):
#         if (pos in listPos):
            try:
                word2vec_ls=np.vstack([word2vec_ls, model.word_vec(word)])
            except:
                pass
    # if the case is empty, convert the document vector into zeros, otherwise calculate the average value of word vectors
    if np.any(np.isnan(word2vec_ls)) or len(word2vec_ls)==0:
        desc_vec=np.zeros((1, 200))
    else:
        desc_vec = np.mean(word2vec_ls, axis=0).reshape(1, 200)
    # append the new document vector into the parent array
    description_vec=np.vstack([description_vec, desc_vec])

In [161]:
description_vec.shape

(4999, 200)

In [115]:
# RUN THIS CELL ONLY WHEN IMPLEMENTING VERSION 2

icd_vec_mean = np.mean(icd_vec, axis = 0)
desc_vec_new = np.empty((0,200),float)
for item in description_vec:
    item = np.subtract(item, icd_vec_mean)
    desc_vec_new = np.vstack([desc_vec_new, item])

desc_vec_new.shape

(4999, 200)

In [162]:
# write mt vector matrix to a csv file
dfDesV = pd.DataFrame(description_vec)
dfDesV.to_csv(path_or_buf = './data/DesV_v3.csv', index=False)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,-0.107323,0.179122,0.147116,-0.064020,0.007943,-0.081266,0.027539,-0.003150,-0.127391,0.278302,...,0.054462,-0.059412,0.066633,-0.103957,-0.110595,-0.014091,0.231983,0.042359,-0.003940,-0.287913
1,-0.106285,0.064924,-0.109951,-0.148879,0.156490,-0.190221,-0.013110,-0.040781,-0.207537,0.161243,...,-0.046445,0.046741,0.008469,-0.236932,-0.119958,0.226861,-0.031890,-0.023429,-0.012719,-0.068181
2,-0.106285,0.064924,-0.109951,-0.148879,0.156490,-0.190221,-0.013110,-0.040781,-0.207537,0.161243,...,-0.046445,0.046741,0.008469,-0.236932,-0.119958,0.226861,-0.031890,-0.023429,-0.012719,-0.068181
3,-0.275537,0.273051,-0.350958,-0.063018,0.076781,-0.092217,0.056712,0.213214,0.297947,-0.235032,...,0.087096,-0.023042,-0.255889,0.044475,-0.017224,-0.147943,-0.109404,-0.106315,0.141747,-0.269607
4,0.024801,0.325037,-0.287396,0.014443,0.311813,-0.103446,0.034335,0.066452,0.122171,-0.166897,...,0.059265,-0.047493,-0.008037,-0.005760,0.122810,-0.074632,0.082274,0.012977,0.107769,-0.250130
5,-0.050463,0.108150,-0.017314,-0.058184,0.088634,-0.121709,0.020999,-0.028762,-0.110757,0.138270,...,0.113721,0.000248,0.018468,-0.095150,0.068646,0.014750,0.079423,0.161136,-0.229912,-0.072466
6,0.070051,0.163544,-0.068169,0.002374,0.070425,-0.122149,0.137190,-0.113452,-0.121725,0.221143,...,-0.053970,-0.081184,0.071499,-0.079434,0.019684,0.055188,0.084148,0.052534,0.172394,-0.118910
7,0.024801,0.325037,-0.287396,0.014443,0.311813,-0.103446,0.034335,0.066452,0.122171,-0.166897,...,0.059265,-0.047493,-0.008037,-0.005760,0.122810,-0.074632,0.082274,0.012977,0.107769,-0.250130
8,0.048351,0.261652,-0.108791,0.004016,0.106185,-0.162764,0.264707,-0.053007,-0.135607,0.417078,...,0.103555,0.011864,0.048534,0.003039,0.033244,0.108272,0.044231,-0.070172,0.026509,-0.015345
9,-0.165771,0.302159,-0.401926,0.090448,0.052564,-0.120847,-0.150021,0.203446,0.185956,-0.215108,...,0.071532,-0.008810,-0.142136,-0.106683,0.006676,-0.093788,-0.016801,-0.254040,0.110575,-0.463978


## 4 TF-IDF Weighted Measure (Version 4) 
** RUN THIS SECTION ONLY WHEN IMPLEMENTING VERSION 4 **

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
 

In [6]:
# construct the collection of all icd desc and 5000 mts

corpus = []

# extract icd desc
stop_words = set(stopwords.words('english'))
for i in range(20): 
    ite_name = i+1
    file = open("./data/icd%d.txt" %ite_name, "r", encoding='UTF-8')
    raw_data = file.read()
    file.close()
    corpus.append(raw_data)

# extract mt
for i in range(len(mt['description'])):
    corpus.append(mt['description'][i])

In [23]:
# generate the tf-idf weight matrix

# construct a count matrix and get the list of words
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
listWord = vectorizer.get_feature_names()

# construct a tf-idf matrix
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(X)

# convert the output to an array for better usage
arrWeight = tfidf.toarray()

In [91]:
# convert icd desc and medical transcription to vectors
icd_vec=np.empty((0, 200), float)

for i in range(20): 
    ite_name = i+1
    file = open("./data/icd%d.txt" %ite_name, "r")
    raw_data = file.read()
    file.close()

    stop_words = set(stopwords.words('english')) 
    word_tokens = word_tokenize(raw_data)
    
    #convert all words to lower case
    word_tokens = [w.lower() for w in word_tokens] 
    #remove words in stopwords list
    word_tokens = [w.lower() for w in word_tokens if w not in stop_words] 
    #only keep alphabetic terms
    word_tokens = [w.lower() for w in word_tokens if w.isalpha()] 
    
    token_pos=nltk.pos_tag(word_tokens)

    word2vec_ls=np.empty((0, 200), float)
    weight_ls = []
    
    for word, pos in token_pos:
        if (pos in ['NN', 'NNP', 'NNS', 'NNPS'] and word in listWord):
            try:
                indWord = listWord.index(word)
                word2vec_ls=np.vstack([word2vec_ls, model.word_vec(word)])
                weight_ls.append(arrWeight[i][indWord])
            except:
                pass

    if np.any(np.isnan(word2vec_ls)) or len(word2vec_ls)==0:
        desc_vec=np.zeros((1, 200))
    else:
        desc_vec = np.average(word2vec_ls, axis=0, weights=weight_ls, returned=False)
    icd_vec=np.vstack([icd_vec, desc_vec])

In [92]:
description_vec=np.empty((0, 200), float)

for i in range(len(mt["description"])):
    word2vec_ls=np.empty((0, 200), float)
    weight_ls = []
    mt_token_pos=nltk.pos_tag(nltk.word_tokenize(mt["description"][i]))
    
    #convert all words to lower case
    word_tokens = [w.lower() for w in word_tokens] 
    #remove words in stopwords list
    word_tokens = [w.lower() for w in word_tokens if w not in stop_words] 
    #only keep alphabetic terms
    word_tokens = [w.lower() for w in word_tokens if w.isalpha()] 
    
    # for each word in a case, convert t into vector using model
    # take the exception into consideration (i.e.: word is not in the model)
    for word, pos in mt_token_pos:
        if (pos in ['NN', 'NNP', 'NNS', 'NNPS'] and word in listWord):
            try:
                indWord = listWord.index(word)
                word2vec_ls=np.vstack([word2vec_ls, model.word_vec(word)])
                weight_ls.append(arrWeight[i+20][indWord])
            except:
                pass
    # if the case is empty, convert the document vector into zeros, otherwise calculate the average value of word vectors
    if np.any(np.isnan(word2vec_ls)) or len(word2vec_ls)==0:
        desc_vec=np.zeros((1, 200))
    else:
        desc_vec = np.average(word2vec_ls, axis=0, weights=weight_ls, returned=False)
    # append the new document vector into the parent array
    description_vec=np.vstack([description_vec, desc_vec])

In [93]:
# write sim matrix to a csv file
dfIcdV = pd.DataFrame(icd_vec)
dfIcdV.to_csv(path_or_buf = './data/IcdV_v4.csv', index=False)

# write mt vector matrix to a csv file
dfDesV = pd.DataFrame(description_vec)
dfDesV.to_csv(path_or_buf = './data/DesV_v4.csv', index=False)

## 5 Similarity

In [3]:
from sklearn.metrics.pairwise import cosine_similarity

In [15]:
sim = cosine_similarity(description_vec, icd_vec)

In [170]:
sim

array([[0.3553884 , 0.38406894, 0.36978634, ..., 0.43369216, 0.35759767,
        0.31261114],
       [0.07466101, 0.18418707, 0.05656946, ..., 0.13179793, 0.24739479,
        0.09449073],
       [0.07466101, 0.18418707, 0.05656946, ..., 0.13179793, 0.24739479,
        0.09449073],
       ...,
       [0.36978281, 0.22404419, 0.33582207, ..., 0.39386017, 0.28608588,
        0.28919567],
       [0.36644213, 0.20221795, 0.23334095, ..., 0.41779517, 0.2854135 ,
        0.28181405],
       [0.55487984, 0.50869265, 0.46281272, ..., 0.5107827 , 0.35825746,
        0.25132562]])

In [17]:
# calculate the mean, median value and maximum value as reference to determine a threshold 
print(np.median(sim))
print(np.mean(sim))
print(np.max(sim))

0.2073973010080073
0.22690196166648854
0.9109624806359938


In [18]:
# construct a list of ICD titles
dictIcd = {
    0: 'Certain infectious and parasitic diseases',
    1: 'Neoplasms',
    2: 'Diseases of the blood and blood-forming organs and certain disorders involving the immune mechanism',
    3: 'Endocrine, nutritional and metabolic diseases',
    4: 'Mental and behavioural disorders',
    5: 'Diseases of the nervous system',
    6: 'Diseases of the eye and adnexa',
    7: 'Diseases of the ear and mastoid process',
    8: 'Diseases of the circulatory system',
    9: 'Diseases of the respiratory system',
    10:'Diseases of the digestive system',
    11:'Diseases of the skin and subcutaneous tissue',
    12:'Diseases of the musculoskeletal system and connective tissue',
    13:'Diseases of the genitourinary system',
    14:'Pregnancy, childbirth and the puerperium',
    15:'Certain conditions originating in the perinatal period',
    16:'Congenital malformations, deformations and chromosomal abnormalities',
    17:'Symptoms, signs and abnormal clinical and laboratory findings, not elsewhere classified',
    18:'Injury, poisoning and certain other consequences of external causes',
    19:'External causes of morbidity and mortality',
    20:'Factors influencing health status and contact with health services',
    21:'Codes for special purposes'
}

In [20]:
icd_match = np.empty((0,1), )
icd_sim = np.empty((0,1),)
for i in range(len(sim)):
    index, =np.where(sim[i] == np.amax(sim[i]))
    simMax = np.amax(sim[i])
    if sim[i,index[0]] > 0.6:
        icd_match = np.vstack([icd_match, dictIcd[index[0]]])
        icd_sim = np.vstack([icd_sim, simMax])
    else: 
        icd_match = np.vstack([icd_match, 'No type matched'])
        icd_sim = np.vstack([icd_sim, simMax])

In [21]:
# print the data volume after filtering using the threshold
len(np.where(icd_match != "No type matched")[0])

1030

In [27]:
icd_match

array([['Diseases of the skin and subcutaneous tissue'],
       ['No type matched'],
       ['No type matched'],
       ...,
       ['Diseases of the respiratory system'],
       ['No type matched'],
       ['Diseases of the respiratory system']], dtype='<U99')

In [24]:
from sklearn.utils import shuffle
dfMatch = pd.DataFrame(icd_match, columns = ['type'])
dfMatch['desc'] = mt['description']
dfMatch['sim'] = icd_sim

In [25]:
dfMatch

Unnamed: 0,type,desc,sim
0,No type matched,A 23-year-old white female presents with comp...,0.553067
1,No type matched,Consult for laparoscopic gastric bypass.,0.419301
2,No type matched,Consult for laparoscopic gastric bypass.,0.419301
3,No type matched,2-D M-Mode. Doppler.,0.000000
4,No type matched,2-D Echocardiogram,0.000000
5,No type matched,Morbid obesity. Laparoscopic antecolic anteg...,0.425851
6,No type matched,"Liposuction of the supraumbilical abdomen, re...",0.528354
7,No type matched,2-D Echocardiogram,0.000000
8,No type matched,Suction-assisted lipectomy - lipodystrophy of...,0.499652
9,No type matched,Echocardiogram and Doppler,0.000000


In [26]:
dfMatchShfl = shuffle(dfMatch[dfMatch['type']!="No type matched"])

In [27]:
print(dfMatchShfl[0:10])

                                                   type  \
1245  Diseases of the musculoskeletal system and con...   
398                  Diseases of the circulatory system   
3922                   Diseases of the digestive system   
4860                 Diseases of the circulatory system   
4148  Injury, poisoning and certain other consequenc...   
4099            Diseases of the ear and mastoid process   
1009                   Diseases of the digestive system   
4904                 Diseases of the circulatory system   
2570  Certain conditions originating in the perinata...   
1350               Diseases of the genitourinary system   

                                                   desc       sim  
1245   Anterior cervical discectomy fusion C3-C4 and...  0.648441  
398    Pulmonary valve stenosis, supple pulmonic nar...  0.797496  
3922   Acute gastroenteritis, resolved.  Gastrointes...  0.774666  
4860   A 10-1/2-year-old born with asplenia syndrome...  0.703149  
4148   Rec

In [30]:
# write the randomly selected ten matches for accuracy score calculation
dfMatchShfl[0:10].to_csv(path_or_buf = './data/accuracy_v4.1.0.csv', index = False)

## 6 Finalization

In [106]:
# write sim matrix to a csv file
dfSim = pd.DataFrame(sim)
dfSim['desc'] = mt["description"]
dfSim.to_csv(path_or_buf = './data/sim_v4.csv', index=False)

In [29]:
# write mt-type match to a csv file
dfMatch.to_csv(path_or_buf = './data/match_v4.1.csv', index = False)