### Medical Records Classification

##### How to use
- Run all cells till Feature Vector section
- Run any one feature vector to get test_vectors and doc_vectors along with pairwise dist calculations
- For clustering, go the Clustering Analysis section, and run any one algorithm
- Tune parameters as needed

In [18]:
import pandas as pd
import numpy as np

import simple_icd_10_cm as icd

import spacy
import re

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.cluster import KMeans,AgglomerativeClustering,AffinityPropagation,Birch
from sklearn.metrics import silhouette_score
from sklearn.manifold import TSNE
from matplotlib import pyplot as plt

from scipy.spatial.distance import pdist
from scipy.sparse import csr_matrix
from sklearn.metrics import pairwise_distances
from sklearn.decomposition import TruncatedSVD

import gensim.models
from gensim.utils import simple_preprocess
from gensim.models.doc2vec import TaggedDocument,Doc2Vec
from gensim.models.keyedvectors import KeyedVectors
import gensim.downloader as api

import collections
from IPython.display import clear_output, display

from nltk.stem import WordNetLemmatizer

In [None]:
df = pd.read_excel(r'Data\ERSampleforCoding_150322.xlsx')
med_dict = pd.read_excel(r'Data\medicalTermsDictionary (1).xlsx')
med_dict = dict(zip(med_dict.Abbreviation, med_dict.Term))

In [None]:
df.head()

##### Get all codes

In [None]:
all_codes = icd.get_all_codes(with_dots=True)

In [None]:
code_df = pd.DataFrame(all_codes,columns=['code'])

In [None]:
code_df['description'] = code_df['code'].apply(lambda x: icd.get_description(x))
code_df['ancestor'] = code_df['code'].apply(lambda x: icd.get_ancestors(x))
code_df['descendants'] = code_df['code'].apply(lambda x: icd.get_descendants(x))

In [None]:
filtered = code_df[code_df.apply(lambda x: len(x['ancestor']),axis=1) == 0]
filtered = filtered.drop_duplicates(['description'])
filtered = filtered.reset_index(drop=True)

In [None]:
filtered.shape

In [None]:
for c in filtered.iterrows():
    desc = [filtered.loc[c[0]].description]
    for d in c[1]['descendants']:
        desc.append(icd.get_description(d))
    filtered.loc[c[0]].description = " ".join(desc)

#### Load Data

In [None]:
df = pd.read_excel(r'Data/data.xlsx')
df.head()

##### Golden Dataset

##### Data Preprocessing

In [None]:
test_df = pd.read_excel(r'Data\Sample_HOPI.xlsx')

In [None]:
nlp = spacy.load("en_core_web_md")
all_stopwords = nlp.Defaults.stop_words

In [None]:
def transformText(text):
    doc = nlp(text)
    sent = list()
    for token in doc:
        w = token.text.lower()
        if w in med_dict.keys():
            w = med_dict[w]
        w = re.sub('[^A-Za-z\s]+', '', w)
        if not(token.is_space) and not(token.is_punct):
            sent.append(w)
            
    sent= [word for word in sent if not word in all_stopwords]
    return " ".join(sent)

modified = test_df['hopi_'].apply(lambda x: transformText(x))
test_df['HOPI_modified'] = modified

In [None]:
test_df = test_df[test_df['HOPI_modified'].notna()]
test_df = test_df[test_df['code'].notna()]
test_df = test_df.reset_index()

In [None]:
# test_df.to_csv('Data/test_df.csv',index=False)
test_df.head()

##### Transform codes

In [None]:
def getAncestors(x):
    index = 3
    codes = x.split(',')
    temp = list()
    for c in codes:
        i = c.strip()
        while(len(i) > 0 and not(icd.is_valid_item(i))):
            i = i[:-1]
        
        if len(i) == 0:
            continue 
        if len(icd.get_ancestors(i)) < index:
            temp.append(i)
        else:
            i = icd.get_ancestors(i)[-index]
        temp.append(i)
    
    
    return ",".join(temp)

In [None]:
getAncestors('R50.9, R11.10, R52, M54.9, R05.9, K59.00')

In [None]:
test_df['code_modified'] = test_df['code'].apply(lambda x: getAncestors(x))
test_df.head()

##### Best Approach

In [None]:
train_data = pd.concat([df['HOPI_modified'],filtered['description']])

In [None]:
vectorizer = TfidfVectorizer(stop_words="english")
vectorizer.fit(train_data)

In [None]:
test_vectors = vectorizer.transform(filtered['description']).toarray()

In [None]:
doc_vectors = vectorizer.transform(test_df['HOPI_modified']).toarray()

In [None]:
doc_vectors = csr_matrix(doc_vectors)
tsvd = TruncatedSVD(n_components=10)
_ = tsvd.fit(doc_vectors)

doc_vectors = tsvd.transform(doc_vectors)
test_vectors = tsvd.transform(csr_matrix(test_vectors))

##### Average Word2Vec Approach

In [None]:
class MyCorpus:
    """An iterator that yields sentences (lists of str)."""
    def __init__(self,data):
        self.data = data
        
    def __iter__(self):
        for line in self.data:
            # assume there's one document per line, tokens separated by whitespace
            yield line.split(" ")
            
vec_size = 20

In [None]:
model = gensim.models.Word2Vec(sentences=MyCorpus(pd.concat([df['HOPI_modified'],filtered['description']])),
                               window=15,
                               vector_size=vec_size,
                               min_count=5,
                               epochs=100)

In [None]:
def getDocVec(doc):
    tokens = nlp(doc)
    vec = np.zeros(vec_size)
    count = 1e-5
    for t in tokens:
        if not(t.is_space) and model.wv.has_index_for(t.text):
            count += 1
            vec += model.wv[t.text]
    
    return vec/count

In [None]:
test_vectors = np.zeros((vec_size,len(filtered['description'])))
for n,d in enumerate(filtered['description']):
    test_vectors[:,n] = getDocVec(d)
    
test_vectors = test_vectors.T

doc_vectors = np.zeros((vec_size,len(test_df['HOPI_modified'])))
for n,d in enumerate(test_df['HOPI_modified']):
    doc_vectors[:,n] = getDocVec(d)
    
doc_vectors = doc_vectors.T

##### PmC Word2Vec

In [None]:
#download from http://evexdb.org/pmresources/vec-space-models/
model = KeyedVectors.load_word2vec_format('F:\PmC-w2v.bin', binary=True)

In [None]:
def getDocVec(doc):
    tokens = nlp(doc)
    vec = np.zeros(model.vector_size)
    count = 1e-5
    for t in tokens:
        if not(t.is_space) and model.has_index_for(t.text):
            count += 1
            vec += model[t.text]
    
    return vec/count

nlp = spacy.load("en_core_web_md")

In [None]:
doc_vectors = np.zeros((model.vector_size,len(test_df['HOPI_modified'])))
for n,d in enumerate(test_df['HOPI_modified']):
    doc_vectors[:,n] = getDocVec(d)
    
doc_vectors = doc_vectors.T

test_vectors = np.zeros((model.vector_size,len(filtered['description'])))
for n,d in enumerate(filtered['description']):
    test_vectors[:,n] = getDocVec(d)
    
test_vectors = test_vectors.T

##### Predict

In [None]:
dist = pairwise_distances(doc_vectors,test_vectors,metric='euclidean')

In [None]:
def get_labels(i):
    # selected = np.sort(dist[i,:])
    # threshold = selected[0] + (0.1*selected[0]) #0.025,0.025,
    # selected = selected[selected < threshold]
    # nearest = list(zip(selected,np.argsort(dist)[i,:(selected.shape[0])]))
    
    n=8
    nearest = list(zip(np.sort(dist)[i,:n],np.argsort(dist)[i,:n]))


    temp = list()
    for j in nearest:
        code = filtered['code'][j[1]]
        temp.append(code)
        # print(code, icd.get_description(code),"Score:",j[0])
    
    return ",".join(temp)

test_df['predicted'] = list(map(get_labels,list(range(0,test_df.shape[0]))))

In [None]:
recall = []
precision = []
f_score = []

for i in range(test_df.shape[0]):
    s1 = set(test_df['code_modified'][i].split(","))
    s2 = set(test_df['predicted'][i].split(","))

    inter = len(s1.intersection(s2))
    r = inter/len(s1)
    recall.append(r)
    p = inter/len(s2)
    precision.append(p)

    try:
        f_score.append(2 * r * p/(r + p))
    except:
        f_score.append(0)
    
test_df['recall'] = recall
test_df['precision'] = precision
test_df['f_score'] = f_score


In [None]:
avg_recall = test_df['recall'].mean()
avg_prec = test_df['precision'].mean()
avg_f_score = test_df['f_score'].mean()
print("Recall: {}\nPrecision: {}\nF-Score: {}".format(avg_recall,avg_prec,avg_f_score))

In [None]:
test_df['predicted'].apply(lambda x: len(x.split(","))).mean()

In [None]:
i=0 #0,97,186
# selected = np.sort(dist[i,:])
# threshold = selected[0] + 0.01 #0.025
# selected = selected[selected < threshold]
# nearest = list(zip(selected,np.argsort(dist)[i,:(selected.shape[0])]))

n=8
nearest = list(zip(np.sort(dist)[i,:n],np.argsort(dist)[i,:n]))


print('HOPI: ',test_df['hopi_'][i].replace("\n"," "),end="\n\n")
print("Actual Code",test_df["code_modified"][i],end="\n\n")

for j in nearest:
    code = filtered['code'][j[1]]
    print("{} - {} ({})".format(code,icd.get_description(code),round(j[0],2)))

In [None]:
s1 = set(test_df['code_modified'][i].split(","))
s2 = set(test_df['predicted'][i].split(","))

inter = len(s1.intersection(s2))
r = inter/len(s1)
p = inter/len(s2)
f_score = 0
try:
    f_score= 2 * r * p/(r + p)
except:
    f_score = 0
    
print("Precision: {}\nRecall: {}\nf_score: {}".format(p,r,f_score))

##### Sumaira's Results

In [None]:
results = [['R68', 'R11', 'K59', 'R10', 'M54', 'R05'],
 ['R53'],
 ['T79', 'I64', 'R26', 'L89', 'R53'],
 [],
 ['R52', 'W01'],
 ['R20', 'R19', 'K92', 'R42'],
 ['R52', 'R10', 'R31', 'K59'],
 [],
 ['R11', 'K59', 'R42'],
 ['E66'],
 ['R11', 'K59'],
 ['I38', 'I46', 'H18', 'R09', 'R99', 'R10'],
 ['R22', 'B99', 'R19'],
 ['R06', 'R11', 'R10', 'A16'],
 ['R06', 'A16', 'K80', 'R09', 'I21', 'K75', 'R05'],
 [],
 ['N25', 'G82', 'S19', 'R19'],
 ['R06', 'R07', 'A16', 'T14', 'R10'],
 ['R11', 'R19'],
 ['R30', 'R19', 'N48', 'R36', 'N34'],
 ['N92', 'R19'],
 ['N25', 'A35', 'S92'],
 ['R19', 'K92', 'S09', 'R41'],
 ['R52', 'T65', 'R20', 'R19'],
 ['N05', 'K51', 'M54'],
 ['M47', 'K59', 'T14', 'R52', 'R40', 'M54'],
 ['J34', 'R04', 'T14', 'R52', 'Z72', 'A35'],
 ['N25', 'R04', 'A35'],
 ['R06', 'A16', 'R19', 'R55', 'I48', 'R11', 'R07', 'T14', 'I95', 'S00'],
 ['R63', 'R56', 'R19'],
 ['R19'],
 ['R10', 'R31', 'R19'],
 ['R52', 'M25', 'A16'],
 ['R06', 'A16', 'R19', 'E04', 'R27'],
 ['L29', 'L81', 'M17'],
 ['M25', 'R06', 'R11', 'R07', 'R10', 'R05'],
 ['R22', 'K64', 'K59', 'K62', 'E14'],
 ['R07', 'T14', 'R52', 'S72', 'W01'],
 ['R00', 'R11', 'R19', 'R10', 'A09'],
 ['L29', 'R19'],
 ['R82', 'R39', 'M54'],
 ['T14', 'S09', 'S80', 'R52', 'S02'],
 ['Z71', 'K59'],
 ['N25'],
 ['N32', 'N20', 'R39', 'N13'],
 ['E87', 'T14', 'E03', 'R31', 'R19'],
 ['R04', 'R68', 'R11', 'R19', 'E14'],
 ['R11', 'N13', 'R19', 'R52', 'R33', 'R53'],
 ['R22', 'N40', 'K80', 'R53', 'E14', 'E87', 'R68', 'R11', 'K59', 'A41'],
 ['N25', 'T14'],
 ['R51', 'R11', 'R50', 'R19', 'R42', 'R10'],
 ['R19', 'M43', 'R52', 'R26'],
 ['N93'],
 ['A97', 'R00', 'R05'],
 ['R68', 'R06', 'R50', 'K59', 'R19', 'J45'],
 ['R06', 'I46', 'R09', 'R99'],
 ['R10', 'R19'],
 ['R07', 'E87', 'R39'],
 ['N25', 'M86', 'M79'],
 ['K40'],
 ['R17', 'L03', 'R23', 'R19', 'M79', 'E14', 'E87', 'T14'],
 ['R06', 'R31', 'R19', 'A16', 'K92', 'R53', 'N93', 'D64'],
 ['R68', 'R30'],
 ['R52', 'R06', 'J45'],
 ['R11', 'R30', 'K59', 'R73', 'E14'],
 ['R06', 'R11', 'R07', 'R19', 'R09', 'R10', 'R34'],
 ['Z86', 'A16', 'R19', 'R53', 'E14'],
 ['R06', 'N40', 'I25', 'R19', 'R20', 'R52', 'T00'],
 ['R06', 'R11', 'R50', 'J45', 'R05'],
 ['N25', 'R52'],
 ['B99', 'K05'],
 ['R05', 'Z71', 'R19'],
 ['R11', 'R14', 'R19', 'K59', 'R10', 'E14'],
 ['M25', 'R06', 'R07', 'I25', 'R42'],
 ['R53'],
 ['R46', 'R69', 'R19', 'R40'],
 ['R52', 'N25'],
 [],
 ['R52', 'R29', 'R32', 'R19'],
 ['R10', 'R19'],
 ['R52', 'R06', 'Z92'],
 ['R52'],
 ['R52', 'K59', 'R19'],
 ['L02'],
 ['I48', 'R06', 'R11', 'R07', 'A16', 'T14', 'K92', 'R10'],
 ['Y09', 'S09', 'R19', 'A35'],
 ['L29', 'Z91'],
 ['N89'],
 ['R30'],
 ['R11', 'R53', 'R10'],
 ['R06', 'R07', 'T14', 'R52', 'M54'],
 ['N63', 'R53'],
 ['N25', 'S02', 'T14'],
 ['T14', 'R19', 'M79', 'S72', 'W01'],
 ['R22', 'R52', 'R11'],
 ['O03', 'N93'],
 ['R06', 'R68', 'R07', 'R19', 'R42', 'R53'],
 ['Y09', 'T01', 'R19', 'S49', 'M54', 'S42', 'R60', 'T14', 'A35'],
 ['R19', 'R52', 'K92', 'E14', 'K75', 'R18', 'R07', 'K59', 'K62', 'B18', 'R10'],
 ['R10', 'K59', 'R41'],
 ['R06', 'E03', 'R07', 'R19', 'R09'],
 ['N25', 'S09', 'S22', 'R52', 'B18', 'A35'],
 ['R19', 'K12', 'L29', 'A97', 'R57', 'R11', 'R50', 'R16', 'K59', 'B18'],
 [],
 ['R11', 'R50', 'K59', 'R19', 'B18', 'K75'],
 ['J03'],
 ['E87', 'R11', 'R10', 'D64'],
 ['T12', 'R09', 'R06', 'E87'],
 ['R52', 'R10'],
 ['O36', 'B37', 'N89'],
 ['R99'],
 ['N25', 'W01'],
 ['I44', 'E87', 'R68', 'R19', 'R53', 'R05'],
 ['A41', 'R63', 'R11'],
 ['R10'],
 ['N89', 'R42'],
 ['A16', 'R19', 'A09', 'E86', 'R63', 'R11', 'R07', 'K59', 'R40', 'R10'],
 ['R06', 'A16', 'R19', 'J45', 'M54'],
 ['N95', 'R06', 'R23', 'R07', 'D50', 'E14'],
 ['B83', 'E14'],
 ['R06'],
 ['R07'],
 ['R06', 'I25', 'R19', 'R09', 'Z86', 'R11', 'I50', 'R07', 'R00', 'B18', 'R05'],
 ['R00', 'R53'],
 ['R22', 'R06', 'A16', 'X51', 'R05'],
 ['R52', 'L98', 'M19', 'T14'],
 ['Z86', 'A16', 'R19', 'N92', 'R10', 'K76'],
 ['R06', 'R20', 'I63', 'R09', 'R05'],
 ['H93', 'R42'],
 ['R50', 'J45', 'R10', 'M54'],
 ['R06', 'A16', 'X51', 'Z86', 'I48', 'R11', 'R07', 'K29', 'T14', 'R10', 'K76'],
 ['K40', 'K59', 'R19'],
 ['S52', 'W10', 'T14'],
 [],
 ['R52', 'K81', 'K80', 'K59'],
 ['R29', 'E87', 'K52', 'E34', 'R53', 'R41'],
 ['R06', 'A16', 'R19', 'R09', 'B34'],
 ['R22', 'R06', 'R11', 'K29', 'R19', 'R10', 'E14', 'R05'],
 ['A16', 'R19', 'R52', 'K92', 'R10'],
 ['N25', 'R19', 'S09', 'T14', 'R42', 'S80', 'S00', 'A35'],
 ['O30', 'R35'],
 ['R52', 'E14'],
 ['R52', 'R22', 'R10'],
 ['R11', 'R19', 'R40', 'R10', 'E14'],
 ['R52'],
 [],
 ['N25', 'S82', 'T14', 'M79'],
 ['N25', 'S82'],
 ['R11', 'R18', 'R14', 'K59', 'R52', 'F90', 'K56'],
 ['R52'],
 ['N25', 'S62', 'T14'],
 ['E87', 'R06', 'R19'],
 ['R19', 'R11', 'G44', 'R51'],
 ['N05', 'O24', 'K51', 'R05'],
 ['S05', 'A35', 'Q10', 'T14'],
 ['R63', 'R11', 'R31', 'R19', 'R52', 'K59', 'R30'],
 ['R05', 'I48', 'R19', 'J69', 'R10', 'A09'],
 ['R40', 'T14', 'E04', 'R19'],
 ['N94', 'N92'],
 ['R06', 'R53', 'R07'],
 ['R22', 'R39', 'N20', 'R19', 'R52', 'K40'],
 ['N25', 'M79', 'R52', 'S42'],
 ['O24', 'B17', 'K75', 'R19'],
 [],
 ['R06', 'I25', 'R09', 'E14', 'R05'],
 ['R26', 'R11', 'R19'],
 ['L97', 'L29'],
 ['R06', 'R07', 'K59', 'R19', 'Z72', 'A09'],
 ['M25', 'R11', 'K59', 'R19'],
 ['R23', 'R52', 'M79', 'E14', 'M54', 'L29', 'R11', 'R07'],
 ['G04', 'Z88', 'R19', 'I25', 'R26', 'E14', 'R41', 'R68', 'A41'],
 ['R19', 'K92', 'E14', 'M54', 'I89', 'R11', 'R50', 'R59', 'B18', 'R10'],
 ['Q66', 'T13', 'T14'],
 ['Z71', 'R06', 'F22', 'R20', 'F99', 'K92', 'R44', 'R50', 'B18', 'R45'],
 ['E87', 'R00', 'R07', 'R19', 'I95', 'R55', 'R53', 'D64'],
 ['I48', 'D61', 'R06', 'I51', 'R19'],
 ['R50', 'K29', 'R19', 'N30', 'R10'],
 ['R68'],
 ['R09', 'R06', 'I25', 'R05'],
 ['R19', 'N13', 'R30', 'R10'],
 ['R22', 'R06', 'A16', 'R51', 'B02', 'I48', 'R60', 'R11', 'R07', 'K59'],
 ['R11', 'K59', 'R19', 'R52', 'K64'],
 ['R52', 'T14'],
 ['J98', 'R11', 'R07'],
 ['W01'],
 ['Z71', 'R06', 'R30', 'R52'],
 ['R22', 'N39', 'R11', 'R39', 'R31', 'R10'],
 ['N50'],
 ['R51', 'R11', 'T14', 'R52', 'M54'],
 ['A97', 'R06', 'R07', 'R50', 'R19'],
 ['R26'],
 ['R52', 'K56', 'R10', 'K59'],
 ['M79'],
 ['R11', 'R51', 'R55', 'B18', 'R10', 'E14'],
 [],
 ['R10', 'R00', 'R07']]


In [None]:
results_df = pd.DataFrame()
results_df['hopi_'] = test_df['HOPI_modified']
results_df['approach_1'] = results
results_df['approach_2'] = test_df['predicted'].apply(lambda x: x.split(","))
results_df.head()

In [None]:
def get_inter(x):
    set1 = set(x['approach_1'])
    set2 = set(x['approach_2'])
    return set1.intersection(set2)

results_df['intersection'] = results_df.apply(get_inter,axis=1)

In [None]:
results_df['approach_1'] = results_df['approach_1'].apply(lambda x: ",".join(list(x)))
results_df['approach_2'] = results_df['approach_2'].apply(lambda x: ",".join(list(x)))
results_df['intersection'] = results_df['intersection'].apply(lambda x: ",".join(list(x)))

results_df.to_csv('results.csv',index=False)
results_df.head(20)

### Grid Search

In [54]:
corpus = ['med','icd','all']
stop_words = ['english',None] 
feature_length = [5,10,25]
sl_count = range(1,11)
metrics = ['cosine','euclidean','chebyshev','minkowski','manhattan']
levels = ['l1','l2','l3']
icd_desc = ['all','one']

delim = "$$$"
thresholds = [0.05,0.1,0.15,0.2,0.25,0.3,0.5,0.75,1]

In [55]:
def transformText(text):
    if not(type(text) is str):
        return ""
    doc = text.split("\n")
    sents = []
    for d in doc:
        tokens = d.split(' ')
        s = []
        for t in tokens:
            word = t.lower()
            if word in med_dict.keys():
                word = med_dict[word]
            s.append(word)
        s = " ".join(s)
        s = re.sub(r'[^\w\s]', '', s).strip()
        sents.append(s)
    sents = list(filter(lambda x: len(x) > 0,sents))
    sents = list(filter(lambda x: type(x) is str,sents))
    return  delim.join(sents)

def getICDCodes(level,desc):
    l = levels.index(level)
    all_codes = icd.get_all_codes(with_dots=True)
    code_df = pd.DataFrame(all_codes,columns=['code'])
    code_df['description'] = code_df['code'].apply(lambda x: icd.get_description(x))
    code_df['ancestor'] = code_df['code'].apply(lambda x: icd.get_ancestors(x))
    code_df['descendants'] = code_df['code'].apply(lambda x: icd.get_descendants(x))
    
    filtered = code_df[code_df.apply(lambda x: len(x['ancestor']),axis=1) == l]
    filtered = filtered.drop_duplicates(['description'])
    filtered = filtered.reset_index(drop=True)
        
    if desc == 'all':
        for c in filtered.iterrows():
            desc = [filtered.loc[c[0]].description]
            for d in c[1]['descendants']:
                desc.append(icd.get_description(d))
            filtered.loc[c[0]].description = " ".join(desc)
            
        filtered = filtered.drop('description', axis=1).join(filtered['description'].str.split('$', expand=True).stack().reset_index(level=1, drop=True).rename('description'))
        filtered.drop(['ancestor','descendants'],axis=1,inplace=True)
        filtered.reset_index(inplace=True)
            
    return filtered


def getVectorizer(corpus,stop_words,records,description):
    
    vectorizer = TfidfVectorizer(stop_words=stop_words)
    if corpus == 'med':
        vectorizer.fit(records)
    if corpus == 'icd':
        vectorizer.fit(description)
    if corpus == 'all':
        vectorizer.fit(pd.concat([records,description]))
    
    return vectorizer

# def get_labels(i,n,dist,filtered):
#     nearest = list(zip(np.sort(dist)[i,:n],np.argsort(dist)[i,:n]))
#     temp = list()
#     for j in nearest:
#         code = filtered[j[1]]
#         temp.append(code)
        
#     return ",".join(temp)

def get_labels(doc, vectorizer,tsvd,test_vectors,metric,filtered,threshold):
    docs = doc.split(delim)
    doc_vectors = vectorizer.transform(docs)
    doc_vectors = tsvd.transform(doc_vectors)
    
    dist = pairwise_distances(doc_vectors,test_vectors,metric=metric)
    
    n=1 #suggested words
    temp = []
    for j in range(len(docs)):
        nearest = list(zip(np.sort(dist)[j,:n],np.argsort(dist)[j,:n]))
        for k in nearest:
            code = filtered[k[1]]
            if k[0] < threshold:
                temp.append(code)
        
    return ",".join(temp)

def getAncestors(x,level):
    if not(x is str):
        x = str(x)
    
    index = int(level[-1])
    codes = x.split(',')
    temp = list()
    for c in codes:
        i = c.strip()
        while(len(i) > 0 and not(icd.is_valid_item(i))):
            i = i[:-1]
        
        if len(i) == 0:
            continue 
        if len(icd.get_ancestors(i)) < index:
            temp.append(i)
        else:
            i = icd.get_ancestors(i)[-index]
        temp.append(i)
    
    
    return ",".join(temp)

def getScores(actual,predicted):
    recall = []
    precision = []
    f_score = []

    for i in range(test_df.shape[0]):
        s1 = set(actual[i].split(","))
        s2 = set(predicted[i].split(","))

        inter = len(s1.intersection(s2))
        r = inter/len(s1)
        recall.append(r)
        p = inter/len(s2)
        precision.append(p)

        try:
            f_score.append(2 * r * p/(r + p))
        except:
            f_score.append(0)
        
    test_df['recall'] = recall
    test_df['precision'] = precision
    test_df['f_score'] = f_score

    avg_recall = test_df['recall'].mean()
    avg_prec = test_df['precision'].mean()
    avg_f_score = test_df['f_score'].mean()
    
    return (avg_recall,avg_prec,avg_f_score)

In [56]:
med_dict = pd.read_excel(r'Data\medicalTermsDictionary (1).xlsx')
med_dict = dict(zip(med_dict.Abbreviation, med_dict.Term))

test_df = pd.read_excel(r'Data\Sample_HOPI.xlsx')
test_df['HOPI_modified'] = test_df['hopi_'].apply(lambda x: transformText(x))
test_df = test_df[test_df['HOPI_modified'].notna()]

df = pd.read_excel(r'Data/data.xlsx')
grid_df = pd.DataFrame(columns=['level','icd_desc','corpus','stop_words','feature_length','metric','threshold','recall','precision','f_score'])

In [57]:
for l in levels:  
    for d in icd_desc:
        codes = getICDCodes(l,d) 
        test_df['code_modified'] = test_df['code'].apply(lambda x: getAncestors(x,l))
        for c in corpus:
            for s in stop_words:
                vectorizer = getVectorizer(c,s,df['HOPI_modified'],codes['description'])
                icd_vec = vectorizer.transform(codes['description']).toarray()
                hopi_vec = vectorizer.transform(test_df['HOPI_modified']).toarray()
                clear_output(wait=True)
                for f in feature_length:
                    doc_vectors = csr_matrix(hopi_vec)
                    tsvd = TruncatedSVD(n_components=f)
                    _ = tsvd.fit(doc_vectors)
                    doc_vectors = tsvd.transform(doc_vectors)
                    test_vectors = tsvd.transform(csr_matrix(icd_vec))
                    
                    for m in metrics:
                        for t in thresholds:    
                            predictions = [get_labels(doc,vectorizer,tsvd,test_vectors,m,codes['code'],t) for doc in test_df['HOPI_modified']]
                            scores = getScores(test_df['code_modified'],predictions)
                            
                            row = pd.DataFrame([[l,d,c,s,f,m,t,scores[0],scores[1],scores[2]]], columns=grid_df.columns)
                            grid_df = pd.concat([grid_df,row])
                            print(row)

  level icd_desc corpus stop_words  feature_length  metric  threshold  \
0    l3      one    med       None               5  cosine       0.05   

     recall  precision   f_score  
0  0.029227   0.010235  0.014149  
  level icd_desc corpus stop_words  feature_length  metric  threshold  \
0    l3      one    med       None               5  cosine        0.1   

     recall  precision   f_score  
0  0.032129   0.009466  0.013668  
  level icd_desc corpus stop_words  feature_length  metric  threshold  \
0    l3      one    med       None               5  cosine       0.15   

     recall  precision   f_score  
0  0.032129   0.009448  0.013644  
  level icd_desc corpus stop_words  feature_length  metric  threshold  \
0    l3      one    med       None               5  cosine        0.2   

     recall  precision   f_score  
0  0.032129   0.009448  0.013644  
  level icd_desc corpus stop_words  feature_length  metric  threshold  \
0    l3      one    med       None               5  cosine 

In [None]:
grid_df.to_excel('grid_search_lemma.xlsx',index=False)

##### Old

In [None]:
for l in levels:  
    for d in icd_desc:
        codes = getICDCodes(l,d) 
        test_df['code_modified'] = test_df['code'].apply(lambda x: getAncestors(x,l))
        for c in corpus:
            for s in stop_words:
                vectorizer = getVectorizer(c,s,df['HOPI_modified'],codes['description'])
                icd_vec = vectorizer.transform(codes['description']).toarray()
                hopi_vec = vectorizer.transform(test_df['HOPI_modified']).toarray()
                clear_output(wait=True)
                for f in feature_length:
                    doc_vectors = csr_matrix(hopi_vec)
                    tsvd = TruncatedSVD(n_components=f)
                    _ = tsvd.fit(doc_vectors)
                    doc_vectors = tsvd.transform(doc_vectors)
                    test_vectors = tsvd.transform(csr_matrix(icd_vec))
                    
                    for m in metrics:
                        dist = pairwise_distances(doc_vectors,test_vectors,metric=m)
                        
                        for sl in sl_count:  
                            predictions = [get_labels(i,sl,dist,codes['code']) for i in range(0,test_df.shape[0])]
                            scores = getScores(test_df['code_modified'],predictions)
                            
                            row = pd.DataFrame([[l,d,c,s,f,sl,m,scores[0],scores[1],scores[2]]], columns=grid_df.columns)
                            grid_df = pd.concat([grid_df,row])
                            print(row)