# Comparing the performance of fuzzy matching, Usagi, and MedCAT for matching drugs

## Define base directory

In [1]:
import os

# Root directory for the files this notebook reads and writes.
# The original location stays the default; set DRUG_MATCHING_BASE_DIR to run the
# notebook against a different directory without editing the code.
baseDir = os.environ.get('DRUG_MATCHING_BASE_DIR', '/superbugai-data/yash/temp')

## Read concept names

In [9]:
import pandas as pd

# Load the tab-separated concept table, then keep only the rows whose
# source vocabulary is 'mimiciv_drug_ndc'.
conceptNamesPath = baseDir + '/' + 'concept_names.txt'
allConceptsDf = pd.read_csv(conceptNamesPath, sep='\t')
isDrugNdc = allConceptsDf['source_vocabulary_id'] == 'mimiciv_drug_ndc'
conceptsDf = allConceptsDf[isDrugNdc]
conceptsDf

Unnamed: 0,concept_name,concept_code,source_concept_id,source_vocabulary_id,source_domain_id,source_concept_class_id,target_concept_id,relationship_id,reverese_relationship_id,concept_name_con,domain_id,vocab_id,concept_class_id,standard
5,"Ultralente Insulin, Human 100 UNT/ML [Humulin ...",Ultralente Humulin Insulin 100 units per ml,2000010001,mimiciv_drug_ndc,Drug,Prescription Drug,19120831,Maps to,Mapped from,"ultralente insulin, human 100 UNT/ML [Humulin ...",Drug,RxNorm,Branded Drug Comp,S
6,Vitamin B 12 0.5 MG/ACTUAT Nasal Spray [Nascobal],Nascobal 500mcg,2000010002,mimiciv_drug_ndc,Drug,Prescription Drug,40165411,Maps to,Mapped from,vitamin B12 0.5 MG/ACTUAT Nasal Spray [Nascobal],Drug,RxNorm,Branded Drug,S
7,Brimonidine tartrate 2 MG/ML / Timolol 5 MG/ML...,Combigan 0.2-0.5%,2000010003,mimiciv_drug_ndc,Drug,Prescription Drug,40164140,Maps to,Mapped from,brimonidine tartrate 2 MG/ML / timolol 5 MG/ML...,Drug,RxNorm,Branded Drug Comp,S
8,2.5 ML defibrotide sodium 80 MG/ML Injection,Defibrotide <IND> 80 mg / mL - 2.5 mL Amp,2000010004,mimiciv_drug_ndc,Drug,Prescription Drug,35603976,Maps to,Mapped from,2.5 ML defibrotide sodium 80 MG/ML Injection,Drug,RxNorm,Quant Clinical Drug,S
9,Budesonide 0.5 MG/ML,Budesonide (Nasal) 0.5mg,2000010005,mimiciv_drug_ndc,Drug,Prescription Drug,939422,Maps to,Mapped from,budesonide 0.5 MG/ML,Drug,RxNorm,Clinical Drug Comp,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1405,Sodium Chloride 0.9% Flush 10 mL Syringe,Sodium Chloride 0.9% Flush 10 mL Syringe,2000011396,mimiciv_drug_ndc,Drug,Prescription Drug,19127213,Maps to,Mapped from,10 ML sodium chloride 9 MG/ML Prefilled Syringe,Drug,RxNorm,Quant Clinical Drug,S
1406,Sterile Water 50 mL Bag,Sterile Water 50 mL Bag,2000011397,mimiciv_drug_ndc,Drug,Prescription Drug,43647680,Maps to,Mapped from,50 ML Water 1000 MG/ML Injectable Solution,Drug,RxNorm Extension,Quant Clinical Drug,S
1407,SW 100ml Bag,SW 100ml Bag,2000011398,mimiciv_drug_ndc,Drug,Prescription Drug,43613338,Maps to,Mapped from,100 ML Water 1000 MG/ML Injectable Solution,Drug,RxNorm Extension,Quant Clinical Drug,S
1408,SW 50 mL Bag,SW 50 mL Bag,2000011399,mimiciv_drug_ndc,Drug,Prescription Drug,43647680,Maps to,Mapped from,50 ML Water 1000 MG/ML Injectable Solution,Drug,RxNorm Extension,Quant Clinical Drug,S


## Read RxNorm Vocabulary

In [6]:
import pandas as pd

# The RRF file is read as pipe-delimited text with no header row, so the
# resulting columns are labelled by integer position.
rxnconsoPath = baseDir + '/' + 'trained_vocs/rrf/RXNCONSO.RRF'
rxnormDf = pd.read_csv(rxnconsoPath, header=None, sep='|')
rxnormDf

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,3,ENG,,,,,,8717795,,58488005,,SNOMEDCT_US,PT,58488005,"1,4-alpha-Glucan branching enzyme",,N,,
1,3,ENG,,,,,,8717796,,58488005,,SNOMEDCT_US,FN,58488005,"1,4-alpha-Glucan branching enzyme (substance)",,N,,
2,3,ENG,,,,,,8717808,,58488005,,SNOMEDCT_US,SY,58488005,"Amylo-(1,4,6)-transglycosylase",,N,,
3,3,ENG,,,,,,8718164,,58488005,,SNOMEDCT_US,SY,58488005,Branching enzyme,,N,,
4,19,ENG,,,,,,10794494,,112116001,,SNOMEDCT_US,SY,112116001,17-hydrocorticosteroid,,N,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1118617,2612162,ENG,,,,,,12763011,,,,MTHSPL,DP,37662-1564,VIROLA SEBIFERA RESIN 200 [hp_C] ORAL PELLET,,N,,
1118618,2612163,ENG,,,,,,12763012,,,,MTHSPL,DP,37662-1562,VIROLA SEBIFERA RESIN 30 [hp_C] ORAL PELLET,,N,,
1118619,2612164,ENG,,,,,,12763013,,,,MTHSPL,DP,37662-1565,VIROLA SEBIFERA RESIN 500 [hp_C] ORAL PELLET,,N,,
1118620,2612165,ENG,,,,,,12763014,,,,MTHSPL,DP,37662-1560,VIROLA SEBIFERA RESIN 6 [hp_C] ORAL PELLET,,N,,


## Define a function to perform fuzzy matching

In [11]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

def performFuzzyMatching(conceptId, conceptName, conceptVocabularyId, choices=None):
    """Find the single best fuzzy match for a concept name.

    Args:
        conceptId: Identifier of the source concept; returned unchanged.
        conceptName: Text to match against the candidate names.
        conceptVocabularyId: Source vocabulary of the concept; returned unchanged.
        choices: Optional collection of candidate names to search. When omitted,
            column 14 of the module-level ``rxnormDf`` is used, which is what
            the function always did before this parameter existed.

    Returns:
        The tuple ``(conceptId, conceptName, conceptVocabularyId, matchingConcept)``,
        where ``matchingConcept`` is the result of ``process.extract`` called with
        ``limit=1`` and the ``fuzz.token_sort_ratio`` scorer.
    """
    if choices is None:
        # Looked up at call time so the default always follows the current global frame.
        choices = rxnormDf[14]
    matchingConcept = process.extract(conceptName, choices, limit=1, scorer=fuzz.token_sort_ratio)
    return conceptId, conceptName, conceptVocabularyId, matchingConcept



In [12]:
from multiprocessing import Pool
# The whole starmap result is stored as the single element of this list,
# which is why it is read back later as matchingOutputFuzzy[0].
matchingOutputFuzzy = []

# One (id, name, vocabulary) triple per concept; each triple becomes the
# positional arguments of one performFuzzyMatching call.
fuzzyTasks = zip(
    conceptsDf.source_concept_id,
    conceptsDf.concept_name,
    conceptsDf.source_vocabulary_id,
)

with Pool(15) as workerPool:
    fuzzyResults = workerPool.starmap(performFuzzyMatching, fuzzyTasks)
    matchingOutputFuzzy.append(fuzzyResults)

In [19]:
# Column labels follow the order of the tuple returned by performFuzzyMatching.
fuzzyColumns = ['concept_id', 'concept_name', 'source_vocabulary_id', 'matching_concept']
matchingOutputFuzzyDf = pd.DataFrame(data=matchingOutputFuzzy[0], columns=fuzzyColumns)
matchingOutputFuzzyDf

Unnamed: 0,concept_id,concept_name,source_vocabulary_id,matching_concept
0,2000010001,"Ultralente Insulin, Human 100 UNT/ML [Humulin ...",mimiciv_drug_ndc,"[(ultralente insulin, human 100 UNT/ML [Humuli..."
1,2000010002,Vitamin B 12 0.5 MG/ACTUAT Nasal Spray [Nascobal],mimiciv_drug_ndc,[(vitamin B12 0.5 MG/ACTUAT Nasal Spray [Nasco...
2,2000010003,Brimonidine tartrate 2 MG/ML / Timolol 5 MG/ML...,mimiciv_drug_ndc,[(brimonidine tartrate 2 MG/ML / timolol 5 MG/...
3,2000010004,2.5 ML defibrotide sodium 80 MG/ML Injection,mimiciv_drug_ndc,[(2.5 ML defibrotide sodium 80 MG/ML Injection...
4,2000010005,Budesonide 0.5 MG/ML,mimiciv_drug_ndc,"[(budesonide 0.5 MG/ML, 100, 309593)]"
...,...,...,...,...
1395,2000011396,Sodium Chloride 0.9% Flush 10 mL Syringe,mimiciv_drug_ndc,[(sodium chloride 0.9 % (flush) 0.9 % INJECTIO...
1396,2000011397,Sterile Water 50 mL Bag,mimiciv_drug_ndc,"[(Water, Sterile, 72, 94517)]"
1397,2000011398,SW 100ml Bag,mimiciv_drug_ndc,"[(AGS-005, 63, 939293)]"
1398,2000011399,SW 50 mL Bag,mimiciv_drug_ndc,"[(TPN Bag 250 mL, 77, 442149)]"


In [23]:
import ast

# Each 'matching_concept' cell is a list whose first element is a
# (matched text, score, key) tuple; take that tuple and spread it over three columns.
# NOTE(review): with a pandas Series passed as the choices, the key appears to be the
# Series index label of the matched entry (i.e. a row label of rxnormDf), not a value
# taken from one of rxnormDf's columns -- confirm that is the id wanted in 'Mapped ID Fuzzy'.
topMatch = matchingOutputFuzzyDf['matching_concept'].apply(
    lambda matchingConceptList: matchingConceptList[0]
)

# Build the result as a new frame instead of adding columns and dropping in place:
# calling drop(..., inplace=True) on a boolean-filtered frame is what triggers pandas'
# SettingWithCopyWarning, and in-place edits also alter the frame shown by the earlier cell.
matchingOutputFuzzyDf = (
    matchingOutputFuzzyDf
    .assign(**{
        'Mapped Concept Fuzzy': topMatch.apply(lambda match: match[0]),
        'Mapped ID Fuzzy': topMatch.apply(lambda match: match[2]),
        'Mapped Score Fuzzy': topMatch.apply(lambda match: match[1]),
    })
    .loc[lambda df: df['concept_name'].notna()]
    .drop(columns='matching_concept')
)
matchingOutputFuzzyDf

Unnamed: 0,concept_id,concept_name,source_vocabulary_id,Mapped Concept Fuzzy,Mapped ID Fuzzy,Mapped Score Fuzzy
0,2000010001,"Ultralente Insulin, Human 100 UNT/ML [Humulin ...",mimiciv_drug_ndc,"ultralente insulin, human 100 UNT/ML [Humulin ...",395098,100
1,2000010002,Vitamin B 12 0.5 MG/ACTUAT Nasal Spray [Nascobal],mimiciv_drug_ndc,vitamin B12 0.5 MG/ACTUAT Nasal Spray [Nascobal],525254,95
2,2000010003,Brimonidine tartrate 2 MG/ML / Timolol 5 MG/ML...,mimiciv_drug_ndc,brimonidine tartrate 2 MG/ML / timolol 5 MG/ML...,527187,100
3,2000010004,2.5 ML defibrotide sodium 80 MG/ML Injection,mimiciv_drug_ndc,2.5 ML defibrotide sodium 80 MG/ML Injection,911273,100
4,2000010005,Budesonide 0.5 MG/ML,mimiciv_drug_ndc,budesonide 0.5 MG/ML,309593,100
...,...,...,...,...,...,...
1395,2000011396,Sodium Chloride 0.9% Flush 10 mL Syringe,mimiciv_drug_ndc,sodium chloride 0.9 % (flush) 0.9 % INJECTION ...,1045059,85
1396,2000011397,Sterile Water 50 mL Bag,mimiciv_drug_ndc,"Water, Sterile",94517,72
1397,2000011398,SW 100ml Bag,mimiciv_drug_ndc,AGS-005,939293,63
1398,2000011399,SW 50 mL Bag,mimiciv_drug_ndc,TPN Bag 250 mL,442149,77


In [25]:
import pandas as pd

# Write the fuzzy-matching results to a CSV file under baseDir.
fuzzyOutputPath = baseDir + '/mapped_drugs_fuzzy.csv'
matchingOutputFuzzyDf.to_csv(fuzzyOutputPath)