In [1]:
import numpy as np
import pandas as pd

from sentence_transformers import SentenceTransformer, util
import torch
import time
from bert_score import score
import itertools

import re

import nltk
from nltk.corpus import wordnet as wn
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [2]:
df_fda = pd.read_csv('fda_cfr.csv')
fda_new = df_fda.copy()
print(fda_new.shape)
fda_new.head()

(8404, 5)


Unnamed: 0,FDA_Chapter,FDA_Section,CFR,FDA_Subpart,FDA_Description
0,GENERAL ENFORCEMENT REGULATIONS,General Provisions,"""1.1""",General.,(a) The provisions of regulations promulgated ...
1,GENERAL ENFORCEMENT REGULATIONS,General Provisions,"""1.3""",Definitions.,"(a) Labeling includes all written, printed,..."
2,GENERAL ENFORCEMENT REGULATIONS,General Provisions,"""1.4""",Authority citations.,"(a) For each part of its regulations, the Food..."
3,GENERAL ENFORCEMENT REGULATIONS,General Labeling Requirements,"""1.20""",Presence of mandatory label information.,In the regulations specified in 1.1(c) of this...
4,GENERAL ENFORCEMENT REGULATIONS,General Labeling Requirements,"""1.21""",Failure to reveal material facts.,"(a) Labeling of a food, drug, device, cosmetic..."


In [3]:
# Reserved data
fda_reserved = fda_new.loc[fda_new['FDA_Subpart'] == '[Reserved]']
print(fda_reserved.shape)
fda_reserved.head()

(488, 5)


Unnamed: 0,FDA_Chapter,FDA_Section,CFR,FDA_Subpart,FDA_Description
998,,,"""84""",[Reserved],
999,,,"""85""",[Reserved],
1000,,,"""86""",[Reserved],
1001,,,"""87""",[Reserved],
1002,,,"""88""",[Reserved],


In [4]:
# Extract Reserved regulations
cond = fda_new['FDA_Subpart'].isin(fda_reserved['FDA_Subpart'])
fda_new.drop(fda_new[cond].index, inplace = True)
fda_new.reset_index(drop=True, inplace=True)
print(fda_new.shape)
fda_new.head()

(7916, 5)


Unnamed: 0,FDA_Chapter,FDA_Section,CFR,FDA_Subpart,FDA_Description
0,GENERAL ENFORCEMENT REGULATIONS,General Provisions,"""1.1""",General.,(a) The provisions of regulations promulgated ...
1,GENERAL ENFORCEMENT REGULATIONS,General Provisions,"""1.3""",Definitions.,"(a) Labeling includes all written, printed,..."
2,GENERAL ENFORCEMENT REGULATIONS,General Provisions,"""1.4""",Authority citations.,"(a) For each part of its regulations, the Food..."
3,GENERAL ENFORCEMENT REGULATIONS,General Labeling Requirements,"""1.20""",Presence of mandatory label information.,In the regulations specified in 1.1(c) of this...
4,GENERAL ENFORCEMENT REGULATIONS,General Labeling Requirements,"""1.21""",Failure to reveal material facts.,"(a) Labeling of a food, drug, device, cosmetic..."


In [5]:
print(fda_new.shape)
fda_new.isnull().sum()

(7916, 5)


FDA_Chapter        0
FDA_Section        0
CFR                0
FDA_Subpart        0
FDA_Description    0
dtype: int64

In [6]:
fda_new['fda_desc_cleaned'] = ''

# Pre-processing FDA Descriptions
lemmatizer = WordNetLemmatizer()
pattern1 = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')
pattern2 = '[0-9]'
for i in range(0,len(fda_new)):
        sent_ref = fda_new.iloc[i]['FDA_Description']
        sent_ref = sent_ref.lower() # convert to lower case
        word_list = nltk.word_tokenize(sent_ref)
        sent_ref = ' '.join([lemmatizer.lemmatize(w) for w in word_list]) # lemmatize
        sent_ref = re.sub(r'\([^)]*\)', '', sent_ref) # remove characters which are within () and []
        sent_ref = pattern1.sub('', sent_ref) # remove stopwords
        sent_ref = re.sub(r'[^\w\s]','',sent_ref) # remove spaces (new line, tabs etc)
        sent_ref = re.sub(pattern2, '', sent_ref) # remove numbers
        sent_ref = " ".join(sent_ref.split()) # remove whitespaces, if any
        fda_new['fda_desc_cleaned'][i] = sent_ref
        
fda_new

Unnamed: 0,FDA_Chapter,FDA_Section,CFR,FDA_Subpart,FDA_Description,fda_desc_cleaned
0,GENERAL ENFORCEMENT REGULATIONS,General Provisions,"""1.1""",General.,(a) The provisions of regulations promulgated ...,provision regulation promulgated federal food ...
1,GENERAL ENFORCEMENT REGULATIONS,General Provisions,"""1.3""",Definitions.,"(a) Labeling includes all written, printed,...",labeling includes written printed graphic matt...
2,GENERAL ENFORCEMENT REGULATIONS,General Provisions,"""1.4""",Authority citations.,"(a) For each part of its regulations, the Food...",part regulation food drug administration inclu...
3,GENERAL ENFORCEMENT REGULATIONS,General Labeling Requirements,"""1.20""",Presence of mandatory label information.,In the regulations specified in 1.1(c) of this...,regulation specified chapter term package mean...
4,GENERAL ENFORCEMENT REGULATIONS,General Labeling Requirements,"""1.21""",Failure to reveal material facts.,"(a) Labeling of a food, drug, device, cosmetic...",labeling food drug device cosmetic tobacco pro...
...,...,...,...,...,...,...
7911,MANDATORY DECLASSIFICATION REVIEW,Information in the custody of ONDCP.,"""1402.3""",Information in the custody of ONDCP.,Information contained in ONDCP files and under...,information contained ondcp file exclusive dec...
7912,MANDATORY DECLASSIFICATION REVIEW,Information classified by another agency.,"""1402.4""",Information classified by another agency.,When a request is received for information tha...,request received information wa classified ano...
7913,MANDATORY DECLASSIFICATION REVIEW,Appeal procedure.,"""1402.5""",Appeal procedure.,Appeals reviewed as a result of a denial will ...,appeal reviewed result denial routed director ...
7914,MANDATORY DECLASSIFICATION REVIEW,Fees.,"""1402.6""",Fees.,There will normally be no fees charged for the...,normally fee charged mandatory review classifi...


In [7]:
# Read Health Canada data
df_hc = pd.read_csv('hc_crc.csv',engine='python')
hc_new = df_hc.copy()
hc_new.drop(['Level4'], axis = 1, inplace = True)# Drop these columns
hc_new['HC_Subpart'].fillna('Not Available', inplace = True) 
print(hc_new.shape)
hc_new.head()

(1860, 5)


Unnamed: 0,HC_Chapter,HC_Section,HC_Subpart,CRC,HC_Description
0,Administration,General,Not Available,A.01.001,These Regulations may be cited as the Food an...
1,Administration,General,Not Available,A.01.002,"These Regulations, where applicable, prescribe..."
2,Administration,General,Not Available,A.01.003,"[Repealed, SOR/94-289, s. 1]"
3,Administration,Interpretation,Not Available,A.01.010,In these Regulations acceptable method Act Foo...
4,Administration,Interpretation,Not Available,A.01.011,"The Minister shall, upon request, furnish copi..."


In [8]:
hc_reserved = hc_new.loc[(hc_new['HC_Description'] == '[Reserved]')] 
print(hc_reserved.shape)
hc_reserved.head()

(86, 5)


Unnamed: 0,HC_Chapter,HC_Section,HC_Subpart,CRC,HC_Description
162,Foods,Nutrient Content Claims,Nutrient Content Claims,B.01.514,[Reserved]
163,Foods,Nutrient Content Claims,Nutrient Content Claims,B.01.515,[Reserved]
164,Foods,Nutrient Content Claims,Nutrient Content Claims,B.01.516,[Reserved]
165,Foods,Nutrient Content Claims,Nutrient Content Claims,B.01.517,[Reserved]
166,Foods,Nutrient Content Claims,Nutrient Content Claims,B.01.518,[Reserved]


In [9]:
hc_new['counter']=''

for i in range(0,len(hc_new)):
    txt = hc_new['HC_Description'][i]
    x = re.search('^\[Repealed*', txt)
    if(x!=None):
        hc_new['counter'][i]=1
        
hc_repealed = hc_new.loc[hc_new['counter'] == 1]
hc_repealed.drop(['counter'], axis=1, inplace = True)
print(hc_repealed.shape)
hc_repealed.head()

(350, 5)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0,HC_Chapter,HC_Section,HC_Subpart,CRC,HC_Description
2,Administration,General,Not Available,A.01.003,"[Repealed, SOR/94-289, s. 1]"
11,Administration,Analysts; Inspectors,Not Available,A.01.020,"[Repealed, SOR/81-935, s. 1]"
12,Administration,Analysts; Inspectors,Not Available,A.01.021,"[Repealed, SOR/81-935, s. 1]"
31,Administration,Labelling of Food and Drugs in Pressurized Con...,Not Available,A.01.064,"[Repealed, SOR/93-243, s. 2]"
69,Foods,General,General,B.01.034,"[Repealed, SOR/88-559, s. 7]"


In [10]:
hc_repealed_reserved = pd.concat([hc_repealed,hc_reserved])
print(hc_repealed_reserved.shape)
hc_repealed_reserved.head()

(436, 5)


Unnamed: 0,HC_Chapter,HC_Section,HC_Subpart,CRC,HC_Description
2,Administration,General,Not Available,A.01.003,"[Repealed, SOR/94-289, s. 1]"
11,Administration,Analysts; Inspectors,Not Available,A.01.020,"[Repealed, SOR/81-935, s. 1]"
12,Administration,Analysts; Inspectors,Not Available,A.01.021,"[Repealed, SOR/81-935, s. 1]"
31,Administration,Labelling of Food and Drugs in Pressurized Con...,Not Available,A.01.064,"[Repealed, SOR/93-243, s. 2]"
69,Foods,General,General,B.01.034,"[Repealed, SOR/88-559, s. 7]"


In [11]:
hc_new.drop(['counter'], axis=1, inplace = True)

cond = hc_new['CRC'].isin(hc_repealed_reserved['CRC'])
hc_new.drop(hc_new[cond].index, inplace = True)

print(hc_new.shape)
hc_new.head()

(1423, 5)


Unnamed: 0,HC_Chapter,HC_Section,HC_Subpart,CRC,HC_Description
0,Administration,General,Not Available,A.01.001,These Regulations may be cited as the Food an...
1,Administration,General,Not Available,A.01.002,"These Regulations, where applicable, prescribe..."
3,Administration,Interpretation,Not Available,A.01.010,In these Regulations acceptable method Act Foo...
4,Administration,Interpretation,Not Available,A.01.011,"The Minister shall, upon request, furnish copi..."
5,Administration,Interpretation,Not Available,A.01.012,"The Minister shall, upon request, indicate tha..."


In [12]:
hc_new.reset_index(drop=True, inplace=True)
print(hc_new.shape)
hc_new.isnull().sum()

(1423, 5)


HC_Chapter        0
HC_Section        0
HC_Subpart        0
CRC               0
HC_Description    0
dtype: int64

In [13]:
hc_new['hc_desc_cleaned'] = ''

# Pre-processing Health Canada Descriptions
lemmatizer = WordNetLemmatizer()
pattern1 = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')
pattern2 = '[0-9]'
for i in range(0,len(hc_new)):
        sent_ref = hc_new.iloc[i]['HC_Description']
        sent_ref = sent_ref.lower() # convert to lower case
        word_list = nltk.word_tokenize(sent_ref)
        sent_ref = ' '.join([lemmatizer.lemmatize(w) for w in word_list]) # lemmatize
        sent_ref = re.sub(r'\([^)]*\)', '', sent_ref) # remove characters which are within parenthesis
        sent_ref = pattern1.sub('', sent_ref) # remove stopwords
        sent_ref = re.sub(r'[^\w\s]','',sent_ref) # remove spaces (new line, tabs etc)
        sent_ref = re.sub(pattern2, '', sent_ref) # remove numbers
        sent_ref = " ".join(sent_ref.split()) # remove whitespaces, if any
        hc_new['hc_desc_cleaned'][i] = sent_ref
        
hc_new

Unnamed: 0,HC_Chapter,HC_Section,HC_Subpart,CRC,HC_Description,hc_desc_cleaned
0,Administration,General,Not Available,A.01.001,These Regulations may be cited as the Food an...,regulation may cited food drug regulation
1,Administration,General,Not Available,A.01.002,"These Regulations, where applicable, prescribe...",regulation applicable prescribe standard compo...
2,Administration,Interpretation,Not Available,A.01.010,In these Regulations acceptable method Act Foo...,regulation acceptable method act food drug act...
3,Administration,Interpretation,Not Available,A.01.011,"The Minister shall, upon request, furnish copi...",minister shall upon request furnish copy offic...
4,Administration,Interpretation,Not Available,A.01.012,"The Minister shall, upon request, indicate tha...",minister shall upon request indicate method ac...
...,...,...,...,...,...,...
1418,Restricted Drugs,Documents,Record Keeping,J.01.084,"Retention period A licensed dealer, a former l...",retention period licensed dealer former licens...
1419,Restricted Drugs,Documents,Record Keeping,J.01.085,Location The documents must be kept (a) in the...,location document must kept case licensed deal...
1420,Restricted Drugs,Documents,Record Keeping,J.01.086,Quality of documents The documents must be com...,quality document document must complete readil...
1421,Restricted Drugs,Notification of Application for Order of Resto...,Not Available,J.01.087,Written notification (1) For the purpose of su...,written notification purpose subsection act no...


In [14]:
# model embedding
embedder = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens') # distilled BERT model

In [15]:
# define Corpus 
corpus = fda_new['fda_desc_cleaned'].to_list()
# corpus embedding
corpus_embeddings = embedder.encode(corpus, convert_to_tensor = True)

In [16]:
# define queries
queries = hc_new['hc_desc_cleaned'].to_list()

In [17]:
hc_new['matched'] = ''
hc_new['score']=''
hc_new['CFR']=''
hc_new['FDA_Chapter']=''
hc_new['FDA_Section']=''
hc_new['FDA_Subpart']=''
hc_new['FDA_Description']=''
hc_new['fda_desc_cleaned']=''

fda_new.head()

Unnamed: 0,FDA_Chapter,FDA_Section,CFR,FDA_Subpart,FDA_Description,fda_desc_cleaned
0,GENERAL ENFORCEMENT REGULATIONS,General Provisions,"""1.1""",General.,(a) The provisions of regulations promulgated ...,provision regulation promulgated federal food ...
1,GENERAL ENFORCEMENT REGULATIONS,General Provisions,"""1.3""",Definitions.,"(a) Labeling includes all written, printed,...",labeling includes written printed graphic matt...
2,GENERAL ENFORCEMENT REGULATIONS,General Provisions,"""1.4""",Authority citations.,"(a) For each part of its regulations, the Food...",part regulation food drug administration inclu...
3,GENERAL ENFORCEMENT REGULATIONS,General Labeling Requirements,"""1.20""",Presence of mandatory label information.,In the regulations specified in 1.1(c) of this...,regulation specified chapter term package mean...
4,GENERAL ENFORCEMENT REGULATIONS,General Labeling Requirements,"""1.21""",Failure to reveal material facts.,"(a) Labeling of a food, drug, device, cosmetic...",labeling food drug device cosmetic tobacco pro...


In [18]:
threshold = float(0.5)

for query in queries:
    query_embedding = embedder.encode(query, convert_to_tensor = True)
    cos_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]
    cos_scores = cos_scores.cpu()
    top_results = torch.topk(cos_scores, k=1)
    
    hc_new['score'][queries.index(query)] = float(top_results[0])
          
    if (float(top_results[0])) <= threshold:
        hc_new['matched'][queries.index(query)] = 0 # if score is less than threshold, it is not a match
        

    elif (float(top_results[0])) > threshold:
        hc_new['matched'][queries.index(query)] = 1
        hc_new['CFR'][queries.index(query)] = fda_new['CFR'][float(top_results[1])]
        hc_new['FDA_Chapter'][queries.index(query)] = fda_new['FDA_Chapter'][float(top_results[1])]
        hc_new['FDA_Section'][queries.index(query)] = fda_new['FDA_Section'][float(top_results[1])]
        hc_new['FDA_Subpart'][queries.index(query)] = fda_new['FDA_Subpart'][float(top_results[1])]
        hc_new['FDA_Description'][queries.index(query)] = fda_new['FDA_Description'][float(top_results[1])]
        hc_new['fda_desc_cleaned'][queries.index(query)] = fda_new['fda_desc_cleaned'][float(top_results[1])]

In [19]:
hc_new.rename(columns = {'CRC':'HC_Code',
                         'CFR':'FDA_Code'}, inplace = True)

In [20]:
hc_new.to_csv(r'hc_matched_new.csv',index = False)

In [21]:
print(hc_new.shape)
hc_new.head()

(1423, 14)


Unnamed: 0,HC_Chapter,HC_Section,HC_Subpart,HC_Code,HC_Description,hc_desc_cleaned,matched,score,FDA_Code,FDA_Chapter,FDA_Section,FDA_Subpart,FDA_Description,fda_desc_cleaned
0,Administration,General,Not Available,A.01.001,These Regulations may be cited as the Food an...,regulation may cited food drug regulation,1,0.758713,"""56.105""",GENERAL ENFORCEMENT REGULATIONS,General Provisions,Waiver of IRB requirement.,On the application of a sponsor or sponsor-inv...,application sponsor sponsorinvestigator food d...
1,Administration,General,Not Available,A.01.002,"These Regulations, where applicable, prescribe...",regulation applicable prescribe standard compo...,1,0.764611,"""184.1851""",DIRECT FOOD SUBSTANCES AFFIRMED AS GENERALLY R...,Listing of Specific Substances Affirmed as GRAS,Stearyl citrate.,"(a) Stearyl citrate is a mixture of the mono-,...",stearyl citrate mixture mono di tristearyl est...
2,Administration,Interpretation,Not Available,A.01.010,In these Regulations acceptable method Act Foo...,regulation acceptable method act food drug act...,1,0.769075,"""200.7""",GENERAL,General Provisions,Supplying pharmacists with indications and dos...,There are presently no regulations under the F...,presently regulation federal food drug cosmeti...
3,Administration,Interpretation,Not Available,A.01.011,"The Minister shall, upon request, furnish copi...",minister shall upon request furnish copy offic...,1,0.637533,"""117.320""","CURRENT GOOD MANUFACTURING PRACTICE, HAZARD AN...",Requirements Applying to Records That Must Be ...,Requirements for official review.,All records required by this part must be made...,record required part must made promptly availa...
4,Administration,Interpretation,Not Available,A.01.012,"The Minister shall, upon request, indicate tha...",minister shall upon request indicate method ac...,1,0.613881,"""1316.55""","ADMINISTRATIVE FUNCTIONS, PRACTICES, AND PROCE...",Administrative Hearings,Prehearing ruling.,The presiding officer may have the prehearing ...,presiding officer may prehearing conference re...
