In [1]:
import numpy as np
import pandas as pd

import math
from numpy.linalg import norm

from sklearn.metrics.pairwise import cosine_similarity,euclidean_distances

import warnings
warnings.filterwarnings("ignore")

import re
import nltk
from nltk.corpus import wordnet as wn
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sentence_transformers import SentenceTransformer

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.metrics import cohen_kappa_score

In [2]:
def cosine_sim(src, tgt):
    """Return scikit-learn's ``cosine_similarity`` of ``src`` against ``tgt``.

    Thin wrapper: both arguments are forwarded unchanged and the result is
    returned as-is (callers in this notebook index it as a 2-D matrix).
    """
    return cosine_similarity(src, tgt)

def euclidean_dist(src, tgt):
    """Return scikit-learn's ``euclidean_distances`` of ``src`` against ``tgt``.

    Thin wrapper: both arguments are forwarded unchanged and the result is
    returned as-is.
    """
    return euclidean_distances(src, tgt)

def vector_summation(sentences):
    """Average ``sentences`` over axis 1.

    Despite the name, the value returned is the axis-1 sum divided by the
    axis-1 length, i.e. a mean rather than a plain sum.

    Parameters
    ----------
    sentences : array-like exposing ``.shape`` and ``.sum(axis=1)``
        Must have at least two dimensions.

    Returns
    -------
    The input with axis 1 collapsed to its average.
    """
    # Read the axis length first so a 1-D input fails on the shape lookup.
    axis_length = sentences.shape[1]
    return sentences.sum(axis=1) / axis_length

def show_topics(word_topic_matrix, word_labels, num_top_words=5):
    """Summarise each topic as a space-separated string of its top words.

    Parameters
    ----------
    word_topic_matrix : iterable of 1-D weight vectors
        One row per topic; entry ``i`` of a row is the weight of
        ``word_labels[i]`` in that topic.
    word_labels : sequence
        Vocabulary, indexable by the positions found in each row.
    num_top_words : int, default 5
        How many of the highest-weighted words to keep per topic.

    Returns
    -------
    list of str
        One string per topic, words ordered from highest to lowest weight.
    """
    summaries = []
    for topic_weights in word_topic_matrix:
        # argsort is ascending; the reversed slice walks back from the largest
        # weight and stops after num_top_words entries.
        ranked_positions = np.argsort(topic_weights)[:-num_top_words-1:-1]
        summaries.append(' '.join(word_labels[pos] for pos in ranked_positions))
    return summaries

In [3]:
# Load the reviewed FDA / Health Canada description pairs and drop the stray
# unnamed spreadsheet column.
data = pd.read_excel('review.xlsx').drop(columns=['Unnamed: 13'])

# Encode the reviewer verdict: 'Y' -> 1, 'N' -> 0, anything else stays ''.
# The column is built in a single assignment; element-wise chained writes
# (data[col][i] = ...) are not guaranteed to reach the frame and are silently
# dropped when pandas Copy-on-Write is active.
truth_to_label = {'Y': 1, 'N': 0}
data['y_true'] = [truth_to_label.get(verdict, '') for verdict in data['truth']]

# Exclude pairs whose Health Canada subpart is 'Interpretation' or 'General',
# then renumber the rows so positional and label indexing agree downstream.
excluded_subparts = ['Interpretation', 'General']
data = data.loc[~data['HC_Subpart'].isin(excluded_subparts)].reset_index(drop=True)

# Shared text-cleaning resources.
lemmatizer = WordNetLemmatizer()
# NOTE: the stopword match is case-sensitive and the text is not lower-cased,
# so capitalised stopwords such as "The" / "Any" survive cleaning.
pattern1 = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')
pattern2 = '[0-9]'


def clean_description(text):
    """Normalise one regulation description for embedding / topic modelling.

    Steps, in order: tokenise and lemmatise each token, drop parenthesised
    spans, drop (case-sensitive) English stopwords, strip punctuation, strip
    digits, and collapse runs of whitespace.

    Parameters
    ----------
    text : str
        Raw description text.

    Returns
    -------
    str
        The cleaned description.
    """
    tokens = nltk.word_tokenize(text)
    cleaned = ' '.join(lemmatizer.lemmatize(token) for token in tokens)  # lemmatize
    cleaned = re.sub(r'\([^)]*\)', '', cleaned)   # remove text enclosed in parentheses
    cleaned = pattern1.sub('', cleaned)           # remove stopwords
    cleaned = re.sub(r'[^\w\s]', '', cleaned)     # remove punctuation
    cleaned = re.sub(pattern2, '', cleaned)       # remove digits
    return ' '.join(cleaned.split())              # collapse whitespace


# Apply the same cleaning to both sides of every pair.
data['fda_desc_cleaned'] = data['FDA_Description'].apply(clean_description)
data['hc_desc_cleaned'] = data['HC_Description'].apply(clean_description)

data.head(3)

Unnamed: 0,FDA_Chapter,FDA_Section,FDA_Code,FDA_Subpart,FDA_Description,matched,score,HC_Code,HC_Chapter,HC_Section,HC_Subpart,HC_Description,truth,y_true,fda_desc_cleaned,hc_desc_cleaned
0,CURRENT GOOD MANUFACTURING PRACTICE FOR FINISH...,Buildings and Facilities,"""211.56""",Sanitation.,"(a) Any building used in the manufacture, proc...",1,0.773877,C.02.004,Drugs,Good Manufacturing Practices,Premises,The premises in which a lot or batch of a drug...,Y,1,Any building used manufacture processing packi...,The premise lot batch drug fabricated packaged...
1,CURRENT GOOD MANUFACTURING PRACTICE FOR TYPE A...,Construction and Maintenance of Facilities and...,"""226.20""",Buildings.,Buildings in which Type A medicated article(s)...,1,0.724563,C.02.004,Drugs,Good Manufacturing Practices,Premises,The premises in which a lot or batch of a drug...,Y,1,Buildings Type A medicated article manufacture...,The premise lot batch drug fabricated packaged...
2,CURRENT GOOD MANUFACTURING PRACTICE FOR FINISH...,Buildings and Facilities,"""211.58""",Maintenance.,"Any building used in the manufacture, processi...",1,0.717186,C.02.004,Drugs,Good Manufacturing Practices,Premises,The premises in which a lot or batch of a drug...,Y,1,Any building used manufacture processing packi...,The premise lot batch drug fabricated packaged...


In [4]:
# Sentence-embedding model used for every similarity score in this notebook.
EMBEDDING_MODEL_NAME = 'distilbert-base-nli-mean-tokens'
embedder = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [5]:
# Score every FDA / Health Canada pair by the cosine similarity of their
# sentence embeddings. Scores are collected in a list and assigned as whole
# columns: element-wise chained writes (data[col][i] = ...) are not guaranteed
# to reach the frame and are dropped under pandas Copy-on-Write.
bert_scores = []
for sent1, sent2 in zip(data['fda_desc_cleaned'], data['hc_desc_cleaned']):
    enc_sentences = embedder.encode([sent1, sent2])
    # Row 0 holds sent1 vs [sent1, sent2]; column 1 is sent1 vs sent2.
    bert_scores.append(cosine_sim(enc_sentences[0:1], enc_sentences)[0, 1])

# A pair is predicted as a match when its similarity is at least 0.81.
data['y_bert'] = [1 if score >= 0.81 else 0 for score in bert_scores]
data['score_bert'] = bert_scores

# Confusion-matrix cell per row, keyed by (actual, predicted). Any combination
# not listed (including rows without a 0/1 ground truth) is counted as 'TN'.
bert_outcomes = {(1, 1): 'TP', (0, 1): 'FP', (1, 0): 'FN'}
data['cm_bert'] = [
    bert_outcomes.get((actual, predicted), 'TN')
    for actual, predicted in zip(data['y_true'], data['y_bert'])
]

print('---------- Bert + Cosine ----------')
# Outcomes that never occur are absent from value_counts(); default them to 0.
cm_counts = data['cm_bert'].value_counts()
tp = int(cm_counts.get('TP', 0))
print('true positives:',tp)
tn = int(cm_counts.get('TN', 0))
print('true negatives:',tn)
fp = int(cm_counts.get('FP', 0))
print('false positives:',fp)
fn = int(cm_counts.get('FN', 0))
print('false negatives:',fn)

# Guard each ratio with a value comparison (!= 0); an identity test against a
# literal (`is not 0`) only works through CPython's small-int caching.
precision = (tp/(tp+fp)) if (tp+fp) != 0 else 0
print('precision:',precision)

specificity = (tn / (tn + fp)) if (tn + fp) != 0 else 0
print('specificity:',specificity)

recall = (tp/(tp+fn)) if (tp+fn) != 0 else 0
print('recall:',recall)

acc = ((tp+tn)/(tp+tn+fp+fn)) if (tp+tn+fp+fn) != 0 else 0
print('accuracy:',acc)

data.head(3)

---------- Bert + Cosine ----------
true positives: 13
true negatives: 43
false positives: 26
false negatives: 6
precision: 0.3333333333333333
specificity: 0.6231884057971014
recall: 0.6842105263157895
accuracy: 0.6363636363636364


Unnamed: 0,FDA_Chapter,FDA_Section,FDA_Code,FDA_Subpart,FDA_Description,matched,score,HC_Code,HC_Chapter,HC_Section,HC_Subpart,HC_Description,truth,y_true,fda_desc_cleaned,hc_desc_cleaned,y_bert,score_bert,cm_bert
0,CURRENT GOOD MANUFACTURING PRACTICE FOR FINISH...,Buildings and Facilities,"""211.56""",Sanitation.,"(a) Any building used in the manufacture, proc...",1,0.773877,C.02.004,Drugs,Good Manufacturing Practices,Premises,The premises in which a lot or batch of a drug...,Y,1,Any building used manufacture processing packi...,The premise lot batch drug fabricated packaged...,0,0.797674,FN
1,CURRENT GOOD MANUFACTURING PRACTICE FOR TYPE A...,Construction and Maintenance of Facilities and...,"""226.20""",Buildings.,Buildings in which Type A medicated article(s)...,1,0.724563,C.02.004,Drugs,Good Manufacturing Practices,Premises,The premises in which a lot or batch of a drug...,Y,1,Buildings Type A medicated article manufacture...,The premise lot batch drug fabricated packaged...,1,0.81801,TP
2,CURRENT GOOD MANUFACTURING PRACTICE FOR FINISH...,Buildings and Facilities,"""211.58""",Maintenance.,"Any building used in the manufacture, processi...",1,0.717186,C.02.004,Drugs,Good Manufacturing Practices,Premises,The premises in which a lot or batch of a drug...,Y,1,Any building used manufacture processing packi...,The premise lot batch drug fabricated packaged...,1,0.829109,TP


In [6]:
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import LatentDirichletAllocation

In [7]:
# Bag-of-words (unigrams + bigrams) over the cleaned FDA descriptions.
bow1 = CountVectorizer(stop_words = 'english',
                       min_df = 8,
                       max_df = 25,
                       ngram_range = (1,2),
                       max_features = 10000)

documents_bow1 = bow1.fit_transform(data['fda_desc_cleaned'])

# Fit LDA for 1..10 topics and record the log-likelihood of each fit on the
# same documents it was trained on.
scores1 = []
for k in range(1, 11):
    lda = LatentDirichletAllocation(n_components=k, random_state = 100)
    lda.fit(documents_bow1)
    scores1.append(lda.score(documents_bow1)) # loglikelihood score

# scores1[j] belongs to k = j + 1 topics.
max_score1 = np.argmax(scores1)
num_topics1 = int(max_score1) + 1

# Refit with the selected topic count and get per-document topic weights.
lda1 = LatentDirichletAllocation(n_components = num_topics1, random_state = 100)
lda_results1 = lda1.fit_transform(documents_bow1)

lda_results1 = pd.DataFrame(lda_results1)

# get_feature_names() was removed in scikit-learn 1.2; use the replacement
# when it exists and fall back to the old name on older releases.
if hasattr(bow1, 'get_feature_names_out'):
    feature_names1 = bow1.get_feature_names_out()
else:
    feature_names1 = bow1.get_feature_names()
topic_lst1 = show_topics(lda1.components_, feature_names1)

# Dominant topic per document, plus that topic's top-word summary. The summary
# column is assigned in one step rather than through chained element writes,
# which pandas does not guarantee will reach the frame.
lda_results1['topic_number'] = lda_results1.idxmax(axis=1)
lda_results1['topic'] = [topic_lst1[topic_idx] for topic_idx in lda_results1['topic_number']]

lda_results1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,topic_number,topic
0,0.0858,0.003127,0.118544,0.003125,0.003126,0.003126,0.284433,0.163771,0.331823,0.003126,8,component container device intended lot
1,0.001429,0.001429,0.287255,0.001429,0.001429,0.001429,0.454924,0.001429,0.24782,0.001429,6,drug product control quality process material
2,0.014287,0.014291,0.323193,0.014288,0.014286,0.014288,0.014291,0.014286,0.562503,0.014287,8,component container device intended lot
3,0.014286,0.01429,0.014288,0.014286,0.014287,0.014286,0.01429,0.014286,0.871415,0.014286,8,component container device intended lot
4,0.004764,0.138245,0.004763,0.004763,0.004763,0.004764,0.004763,0.195332,0.633082,0.004763,8,component container device intended lot


In [8]:
# Bag-of-words (unigrams + bigrams) over the cleaned Health Canada descriptions.
bow2 = CountVectorizer(stop_words = 'english',
                       min_df = 8,
                       max_df = 25,
                       ngram_range = (1,2),
                       max_features = 10000)

documents_bow2 = bow2.fit_transform(data['hc_desc_cleaned'])

# Fit LDA for 1..10 topics and record the log-likelihood of each fit on the
# same documents it was trained on.
scores2 = []
for k in range(1, 11):
    lda = LatentDirichletAllocation(n_components=k, random_state = 100)
    lda.fit(documents_bow2)
    scores2.append(lda.score(documents_bow2)) # loglikelihood score

# scores2[j] belongs to k = j + 1 topics.
max_score2 = np.argmax(scores2)
num_topics2 = int(max_score2) + 1

# Refit with the selected topic count and get per-document topic weights.
lda2 = LatentDirichletAllocation(n_components = num_topics2, random_state = 100)
lda_results2 = lda2.fit_transform(documents_bow2)
lda_results2 = pd.DataFrame(lda_results2)

# get_feature_names() was removed in scikit-learn 1.2; use the replacement
# when it exists and fall back to the old name on older releases.
if hasattr(bow2, 'get_feature_names_out'):
    feature_names2 = bow2.get_feature_names_out()
else:
    feature_names2 = bow2.get_feature_names()
topic_lst2 = show_topics(lda2.components_, feature_names2)

# Dominant topic per document, plus that topic's top-word summary. The summary
# column is assigned in one step rather than through chained element writes,
# which pandas does not guarantee will reach the frame.
lda_results2['topic_number'] = lda_results2.idxmax(axis=1)
lda_results2['topic'] = [topic_lst2[topic_idx] for topic_idx in lda_results2['topic_number']]

lda_results2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,topic_number,topic
0,0.007693,0.007692,0.325079,0.007693,0.613381,0.007693,0.007692,0.007692,0.007692,0.007692,4,material packaging batch lot batch lot
1,0.007693,0.007692,0.325079,0.007693,0.613381,0.007693,0.007692,0.007692,0.007692,0.007692,4,material packaging batch lot batch lot
2,0.007693,0.007692,0.325079,0.007693,0.613381,0.007693,0.007692,0.007692,0.007692,0.007692,4,material packaging batch lot batch lot
3,0.007143,0.007143,0.524134,0.007143,0.418721,0.007143,0.007143,0.007144,0.007143,0.007143,2,contamination drug contamination fabricator ac...
4,0.010002,0.010003,0.778668,0.010002,0.141321,0.010001,0.01,0.010001,0.01,0.010001,2,contamination drug contamination fabricator ac...


In [9]:
# Dominant-topic summary assigned to FDA description 25.
lda_results1.loc[25, 'topic']

'prescription date food drug food number'

In [10]:
# Dominant-topic summary assigned to Health Canada description 25.
lda_results2.loc[25, 'topic']

'ingredient human period form year'

In [11]:
# Spot check: embed the FDA and Health Canada topic summaries of row 16 and
# compare them with cosine similarity.
sent1, sent2 = lda_results1.topic[16], lda_results2.topic[16]
enc_sentences = embedder.encode([sent1, sent2])
# Entry [0, 1] of the matrix is sent1 against sent2.
cosine_sim(enc_sentences[0:1], enc_sentences)[0, 1]

0.5598774

In [12]:
# Spot check: embed the FDA and Health Canada topic summaries of row 0 and
# compare them with cosine similarity.
sent1, sent2 = lda_results1.topic[0], lda_results2.topic[0]
enc_sentences = embedder.encode([sent1, sent2])
# Entry [0, 1] of the matrix is sent1 against sent2.
cosine_sim(enc_sentences[0:1], enc_sentences)[0, 1]

0.7286489

In [13]:
# Compare, row by row, the dominant-topic summary of the FDA description with
# that of the paired Health Canada description. Scores are gathered in a list
# and assigned as whole columns: element-wise chained writes
# (data[col][i] = ...) are not guaranteed to reach the frame and are dropped
# under pandas Copy-on-Write.
topic_scores = []
for sent1, sent2 in zip(lda_results1['topic'], lda_results2['topic']):
    enc_sentences = embedder.encode([sent1, sent2])
    # Entry [0, 1] of the matrix is sent1 against sent2.
    topic_scores.append(cosine_sim(enc_sentences[0:1], enc_sentences)[0, 1])

data['topic_score'] = topic_scores
# Topics are treated as matching when their similarity is at least 0.7.
data['topic_match'] = ['Y' if score >= 0.7 else 'N' for score in topic_scores]

In [14]:
# Confusion-matrix cell per row, keyed by (reviewer verdict, topic match).
# Any combination not listed is counted as 'TN'. The column is assigned in one
# step; element-wise chained writes (data[col][k] = ...) are not guaranteed to
# reach the frame and are dropped under pandas Copy-on-Write.
topic_outcomes = {('Y', 'Y'): 'TP', ('N', 'Y'): 'FP', ('Y', 'N'): 'FN'}
data['cm_topic'] = [
    topic_outcomes.get((actual, predicted), 'TN')
    for actual, predicted in zip(data['truth'], data['topic_match'])
]

print('---------- LDA results ---------- ')
# Outcomes that never occur are absent from value_counts(); default them to 0.
cm_counts = data['cm_topic'].value_counts()
tp = int(cm_counts.get('TP', 0))
print('true positives:',tp)
tn = int(cm_counts.get('TN', 0))
print('true negatives:',tn)
fp = int(cm_counts.get('FP', 0))
print('false positives:',fp)
fn = int(cm_counts.get('FN', 0))
print('false negatives:',fn)

# Guard each ratio with a value comparison (!= 0); an identity test against a
# literal (`is not 0`) only works through CPython's small-int caching.
precision = (tp/(tp+fp)) if (tp+fp) != 0 else 0
print('precision:',precision)

specificity = (tn / (tn + fp)) if (tn + fp) != 0 else 0
print('specificity:',specificity)

recall = (tp/(tp+fn)) if (tp+fn) != 0 else 0
print('recall:',recall)

acc = ((tp+tn)/(tp+tn+fp+fn)) if (tp+tn+fp+fn) != 0 else 0
print('accuracy:',acc)


---------- LDA results ---------- 
true positives: 3
true negatives: 68
false positives: 1
false negatives: 16
precision: 0.75
specificity: 0.9855072463768116
recall: 0.15789473684210525
accuracy: 0.8068181818181818


In [15]:
# Raw FDA description of the first pair.
data.loc[0, 'FDA_Description']

'(a) Any building used in the manufacture, processing, packing, or holding of a drug product shall be maintained in a clean and sanitary condition, Any such building shall be free of infestation by rodents, birds, insects, and other vermin (other than laboratory animals). Trash and organic waste matter shall be held and disposed of in a timely and sanitary manner.  (b) There shall be written procedures assigning responsibility for sanitation and describing in sufficient detail the cleaning schedules, methods, equipment, and materials to be used in cleaning the buildings and facilities; such written procedures shall be followed.  (c) There shall be written procedures for use of suitable rodenticides, insecticides, fungicides, fumigating agents, and cleaning and sanitizing agents. Such written procedures shall be designed to prevent the contamination of equipment, components, drug product containers, closures, packaging, labeling materials, or drug products and shall be followed. Rodenti

In [16]:
# Dominant-topic summary assigned to the first FDA description.
lda_results1.loc[0, 'topic']

'component container device intended lot'

In [17]:
# Raw Health Canada description of the first pair.
data.loc[0, 'HC_Description']

'The premises in which a lot or batch of a drug is fabricated, packaged/labelled or stored shall be designed, constructed and maintained in a manner that (a) permits the operations therein to be performed under clean, sanitary and orderly conditions; (b) permits the effective cleaning of all surfaces therein; and (c) prevents the contamination of the drug and the addition of extraneous material to the drug. '

In [18]:
# Dominant-topic summary assigned to the first Health Canada description.
lda_results2.loc[0, 'topic']

'material packaging batch lot batch lot'

In [19]:
# Raw FDA description of the second pair.
data.loc[1, 'FDA_Description']

'Buildings in which Type A medicated article(s) are manufactured, processed, packaged, labeled, or held shall be maintained in a clear and orderly manner and shall be of suitable size, construction and location in relation to surroundings to facilitate maintenance and operation for their intended purpose. The building shall:  (a) Provide adequate space for the orderly placement of equipment and materials used in any of the following operations for which they are employed to minimize risk of mixups between different Type A medicated article(s), their components, packaging, or labeling:  (1) The receipt, sampling, control, and storage of components.  (2) Manufacturing and processing operations performed on the Type A medicated article(s).  (3) Packaging and labeling operations.  (4) Storage of containers, packaging materials, labeling, and finished products.  (5) Control laboratory operations.  (b) Provide adequate lighting and ventilation, and when necessary for the intended production 

In [20]:
# Dominant-topic summary assigned to the second FDA description.
lda_results1.loc[1, 'topic']

'drug product control quality process material'

In [21]:
# Raw Health Canada description of the second pair.
data.loc[1, 'HC_Description']

'The premises in which a lot or batch of a drug is fabricated, packaged/labelled or stored shall be designed, constructed and maintained in a manner that (a) permits the operations therein to be performed under clean, sanitary and orderly conditions; (b) permits the effective cleaning of all surfaces therein; and (c) prevents the contamination of the drug and the addition of extraneous material to the drug. '

In [22]:
# Dominant-topic summary assigned to the second Health Canada description.
lda_results2.loc[1, 'topic']

'material packaging batch lot batch lot'

In [23]:
# Write the frame, with every column added above, to a spreadsheet without the row index.
PREDICTIONS_PATH = 'topic_modelling_predictions.xlsx'
data.to_excel(PREDICTIONS_PATH, index=False)