In [None]:
# # uncomment and run to install any required modules from requirements.txt
# import sys
# !{sys.executable} -m pip install -r requirements.txt

# NP-FAERS model train and test
This notebook uses the natural products annotation corpus to train and test a Siamese recurrent network that maps strings in FAERS to standardized natural product names.

# Load data

In [1]:
import numpy as np
import pandas as pd
import pickle
#import matplotlib.pyplot as plt
#import seaborn as sns
from IPython.display import display, clear_output
import re

## Preprocess Data

1. Remove digits and punctuation
2. Upper case
3. Mutating strings (not in this version)

In [2]:
#if starting from preprocess, load the drugs directly
df = pd.read_csv('data/NP_FAERS_mapped_indications_join_on_drugname_20220218.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8274 entries, 0 to 8273
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   FAERS_drug_match  8274 non-null   object
 1   lookup_value      8274 non-null   object
 2   indi_pt           4930 non-null   object
dtypes: object(3)
memory usage: 194.0+ KB


In [3]:
#make all casing upper
df['FAERS_drug_match'] = df['FAERS_drug_match'].str.upper()
df['lookup_value'] = df['lookup_value'].str.upper()
df['indi_pt'] = df['indi_pt'].str.upper()

In [4]:
df.head()

Unnamed: 0,FAERS_drug_match,lookup_value,indi_pt
0,"ADRENAL HEALTH (BRAND) = (SIBERIAN RHODIOLA, R...",ASHWAGANDA,
1,ASHWAGANDHA,ASHWAGANDA,PRODUCT USED FOR UNKNOWN INDICATION
2,ASHWAGANDHA,ASHWAGANDA,ROUTINE HEALTH MAINTENANCE
3,ASHWAGANDHA,ASHWAGANDA,STRESS
4,ASHWAGANDHA,ASHWAGANDA,ARTHROPATHY


In [5]:
df.lookup_value.unique().shape

(126,)

# RNN character-based encoder - Siamese Network

## Experiments
1. Unique natural product strings 
2. Unique natural product strings, with additional randomly sampled negative pairs
3. Unique natural product strings with indications
4. Mutations?

### Set up positive pairs and negative pairs  
  - positive pairs coded 'yes' in annotation corpus
  - negative pairs: coded 'no' in the annotation corpus, plus pairs sampled at random from the training data

In [7]:
import tensorflow as tf
import string
import random
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
import tqdm

## Functions to process data

In [8]:
# how i add noise 
# string w 
# proportion of noise added 
# uniform random from [0,1]
# if <1/3 edit one position with new random character, else if <2/3 delete one position, else add one random character 
def add_noise(w, percent):
  ''' edit, del, add'''
  positions = random.choices(range(len(w)), k=int(percent*len(w)))
  for p in positions:
    r = random.random()
    if r <= 0.3333: # edit
      w = w[:p] + random.choice(string.ascii_uppercase) + w[p+1:]
    elif r<= 0.6667: # delete
      w = w[:p] + w[p+1:]
    elif r<=1: # add
      w = w[:p] + random.choice(string.ascii_uppercase) + w[p:]
  return w

def clean(text):
    #remove all non-ascii, special characters and keep alphabets and space only. Can also use isalpha()
    #convert to uppercase
    #remove extra spaces
    regex = re.compile('[^a-zA-Z ]')
    r = regex.sub('', text)
    result = re.sub(' +', ' ', r)
    result = result.strip()
    return result.upper()

In [9]:
def clean_dataset(data):
  x = []
  y = []
  for i in range(data.shape[0]):
    w = clean(data.FAERS_drug_match_indi.iloc[i])
    v = clean(data.lookup_value.iloc[i])
    x.append(w)
    y.append(v)
  return x,y

def encode_dataset(x,y):
  encode_dict = {l:i+1 for i,l in enumerate(string.ascii_uppercase + " ")}
  Xtrain = [[encode_dict[m] for m in n] for n in x]
  Ytrain = [[encode_dict[m] for m in n] for n in y]
  return Xtrain, Ytrain

def clean_encode_padding(q, maxlen):
  q = clean(q)
  encode_dict = {l:i+1 for i,l in enumerate(string.ascii_uppercase + " ")}
  return tf.keras.preprocessing.sequence.pad_sequences(
    [encode_dict[m] for m in q] , padding="post", maxlen=maxlen)

def padding_dataset(X,Y,maxlen):
  padded_y = tf.keras.preprocessing.sequence.pad_sequences(
    Y, padding="post", maxlen=maxlen)
  padded_x = tf.keras.preprocessing.sequence.pad_sequences(
      X, padding="post", maxlen=maxlen)
  return padded_x, padded_y


In [10]:
def cosine_distance(vects):
    x, y = vects
    return 1-tf.reduce_sum(tf.multiply(x,y),axis=1, keepdims=True)/(tf.norm(x,axis=1,keepdims=True)*tf.norm(y,axis=1,keepdims=True))

def loss(margin=1):
    def contrastive_loss(y_true, y_pred):
        square_pred = tf.math.square(y_pred)
        margin_square = tf.math.square(tf.math.maximum(margin - (y_pred), 0))
        return tf.math.reduce_mean(
            (1 - y_true) * square_pred + (y_true) * margin_square
        )

    return contrastive_loss

In [11]:
def plt_metric(history, metric, title, has_valid=True):
    """Plots the given 'metric' from 'history'.

    Arguments:
        history: history attribute of History object returned from Model.fit.
        metric: Metric to plot, a string value present as key in 'history'.
        title: A string to be used as title of plot.
        has_valid: Boolean, true if valid data was passed to Model.fit else false.

    Returns:
        None.
    """
    plt.plot(history[metric])
    if has_valid:
        plt.plot(history["val_" + metric])
        plt.legend(["train", "validation"], loc="upper left")
    plt.title(title)
    plt.ylabel(metric)
    plt.xlabel("epoch")
    plt.show()


## Experiment 3 - combined negative pairs from random train and reference set, add indications data joined on FAERS drug match for both positive and negative pairs

In [12]:
df.head()

Unnamed: 0,FAERS_drug_match,lookup_value,indi_pt
0,"ADRENAL HEALTH (BRAND) = (SIBERIAN RHODIOLA, R...",ASHWAGANDA,
1,ASHWAGANDHA,ASHWAGANDA,PRODUCT USED FOR UNKNOWN INDICATION
2,ASHWAGANDHA,ASHWAGANDA,ROUTINE HEALTH MAINTENANCE
3,ASHWAGANDHA,ASHWAGANDA,STRESS
4,ASHWAGANDHA,ASHWAGANDA,ARTHROPATHY


In [13]:
#append indications string to drug name (if not NA)
df['indi_pt'] = df['indi_pt'].fillna('')
df.head()

Unnamed: 0,FAERS_drug_match,lookup_value,indi_pt
0,"ADRENAL HEALTH (BRAND) = (SIBERIAN RHODIOLA, R...",ASHWAGANDA,
1,ASHWAGANDHA,ASHWAGANDA,PRODUCT USED FOR UNKNOWN INDICATION
2,ASHWAGANDHA,ASHWAGANDA,ROUTINE HEALTH MAINTENANCE
3,ASHWAGANDHA,ASHWAGANDA,STRESS
4,ASHWAGANDHA,ASHWAGANDA,ARTHROPATHY


In [14]:
df['FAERS_drug_match_indi'] = df['FAERS_drug_match'] + ' ' + df['indi_pt']


In [15]:
df.head()

Unnamed: 0,FAERS_drug_match,lookup_value,indi_pt,FAERS_drug_match_indi
0,"ADRENAL HEALTH (BRAND) = (SIBERIAN RHODIOLA, R...",ASHWAGANDA,,"ADRENAL HEALTH (BRAND) = (SIBERIAN RHODIOLA, R..."
1,ASHWAGANDHA,ASHWAGANDA,PRODUCT USED FOR UNKNOWN INDICATION,ASHWAGANDHA PRODUCT USED FOR UNKNOWN INDICATION
2,ASHWAGANDHA,ASHWAGANDA,ROUTINE HEALTH MAINTENANCE,ASHWAGANDHA ROUTINE HEALTH MAINTENANCE
3,ASHWAGANDHA,ASHWAGANDA,STRESS,ASHWAGANDHA STRESS
4,ASHWAGANDHA,ASHWAGANDA,ARTHROPATHY,ASHWAGANDHA ARTHROPATHY


In [16]:
x = df.FAERS_drug_match_indi.str.len()
len(x)

8274

In [17]:
train, test = train_test_split(df, test_size=0.20, random_state=42)

In [18]:
train.shape

(6619, 4)

In [19]:
test.shape

(1655, 4)

In [20]:
#padding length = maxlen
maxlen = 400

In [21]:
xtest, ytest = clean_dataset(test)
Xtest, Ytest = encode_dataset(xtest,ytest)
padded_xTest, padded_yTest = padding_dataset(Xtest,Ytest,maxlen)

In [22]:
padded_xTest.shape

(1655, 400)

In [23]:
#without noise (Experiment 1)
x, y = clean_dataset(train)

In [24]:
len(y)

6619

In [25]:
encode_dict = {l:i+1 for i,l in enumerate(string.ascii_uppercase + " ")}

In [26]:
x_all, y_all = clean_dataset(df)
len(y_all)

8274

In [27]:
Xtrain = [[encode_dict[m] for m in n] for n in x]
Ytrain = [[encode_dict[m] for m in n] for n in y]

In [28]:
print(len(Xtrain))
print(len(Ytrain))
np.unique(Ytrain).shape

6619
6619


  ar = np.asanyarray(ar)


(124,)

In [29]:
padded_y = tf.keras.preprocessing.sequence.pad_sequences(
    Ytrain, padding="post", maxlen=maxlen
)
padded_x = tf.keras.preprocessing.sequence.pad_sequences(
    Xtrain, padding="post", maxlen=maxlen
)

print(padded_y)

[[ 1 12 12 ...  0  0  0]
 [12  9 14 ...  0  0  0]
 [15  5 14 ...  0  0  0]
 ...
 [ 3  1 14 ...  0  0  0]
 [ 7  9 14 ...  0  0  0]
 [20 18  9 ...  0  0  0]]


In [30]:
len(encode_dict)

27

In [31]:
dUnique_df = pd.DataFrame(columns = ['dUnique_label','dUnique_seq', 'dUnique_seq_padded'])
dUnique_df['dUnique_label'] = np.unique(y_all)
dUnique_df.head()

Unnamed: 0,dUnique_label,dUnique_seq,dUnique_seq_padded
0,ACTAEA RACEMOSA,,
1,AESCULUS HIPPOCASTANUM,,
2,ALLIUM SATIVUM,,
3,ALOE VERA,,
4,ANGELICA SINENSIS,,


In [32]:
dUnique_seq_list = [[encode_dict[m] for m in n] for n in dUnique_df['dUnique_label'].tolist()]
len(dUnique_seq_list)

126

In [33]:
for i in range(len(dUnique_df.index)):
    dUnique_df.at[i, 'dUnique_seq'] = np.array(dUnique_seq_list[i])
dUnique_df.head()

Unnamed: 0,dUnique_label,dUnique_seq,dUnique_seq_padded
0,ACTAEA RACEMOSA,"[1, 3, 20, 1, 5, 1, 27, 18, 1, 3, 5, 13, 15, 1...",
1,AESCULUS HIPPOCASTANUM,"[1, 5, 19, 3, 21, 12, 21, 19, 27, 8, 9, 16, 16...",
2,ALLIUM SATIVUM,"[1, 12, 12, 9, 21, 13, 27, 19, 1, 20, 9, 22, 2...",
3,ALOE VERA,"[1, 12, 15, 5, 27, 22, 5, 18, 1]",
4,ANGELICA SINENSIS,"[1, 14, 7, 5, 12, 9, 3, 1, 27, 19, 9, 14, 5, 1...",


In [34]:
#setup negative pairs - with indications data 
dfneg = pd.read_csv('data/NP_FAERS_mapped_indications_join_on_drugname_negative_20220223.csv')
dfneg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13843 entries, 0 to 13842
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   FAERS_drug_match  13843 non-null  object
 1   lookup_value      13843 non-null  object
 2   indi_pt           8808 non-null   object
dtypes: object(3)
memory usage: 324.6+ KB


In [35]:
#do same processing for negative pairs with indications


In [36]:
#append indications string to drug name (if not NA)
dfneg['indi_pt'] = dfneg['indi_pt'].fillna('')
dfneg.head()

Unnamed: 0,FAERS_drug_match,lookup_value,indi_pt
0,"ANUSOL (BENZYL BENZOATE, BISMUTH OXIDE, BISMUT...",Cinnamon,
1,ANUSOL (BENZYL BENZOATE/BISMUTH RESORCINOL/BIS...,Cinnamon,
2,"ANUSOL-HC (BALSAM PERU, BENZYL BENZOATE, BISMU...",Cinnamon,
3,ANUSOL-HC [BENZYL BENZOATE;BISMUTH HYDROXIDE;,Cinnamon,
4,"ANUSOL-HC (BENZYL BENZOATE, BISMUTH HYDROXIDE,...",Cinnamon,


In [37]:
dfneg['FAERS_drug_match_indi'] = dfneg['FAERS_drug_match'] + ' ' + dfneg['indi_pt']

In [38]:
dfneg.head()

Unnamed: 0,FAERS_drug_match,lookup_value,indi_pt,FAERS_drug_match_indi
0,"ANUSOL (BENZYL BENZOATE, BISMUTH OXIDE, BISMUT...",Cinnamon,,"ANUSOL (BENZYL BENZOATE, BISMUTH OXIDE, BISMUT..."
1,ANUSOL (BENZYL BENZOATE/BISMUTH RESORCINOL/BIS...,Cinnamon,,ANUSOL (BENZYL BENZOATE/BISMUTH RESORCINOL/BIS...
2,"ANUSOL-HC (BALSAM PERU, BENZYL BENZOATE, BISMU...",Cinnamon,,"ANUSOL-HC (BALSAM PERU, BENZYL BENZOATE, BISMU..."
3,ANUSOL-HC [BENZYL BENZOATE;BISMUTH HYDROXIDE;,Cinnamon,,ANUSOL-HC [BENZYL BENZOATE;BISMUTH HYDROXIDE;
4,"ANUSOL-HC (BENZYL BENZOATE, BISMUTH HYDROXIDE,...",Cinnamon,,"ANUSOL-HC (BENZYL BENZOATE, BISMUTH HYDROXIDE,..."


In [39]:
x = dfneg.FAERS_drug_match_indi.str.len()
len(x)

13843

In [40]:
#encode negative pairs and add padding
xneg, yneg = clean_dataset(dfneg)
Xneg, Yneg = encode_dataset(xneg,yneg)
padded_xneg, padded_yneg = padding_dataset(Xneg,Yneg,maxlen)

In [41]:
dUnique = tf.keras.preprocessing.sequence.pad_sequences(
    list(dUnique_df['dUnique_seq']), padding="post", maxlen=maxlen)

In [42]:
train.head()

Unnamed: 0,FAERS_drug_match,lookup_value,indi_pt,FAERS_drug_match_indi
8118,GARLIC AND PARSLEY GELCAP,ALLIUM SATIVUM,BLOOD PRESSURE ABNORMAL,GARLIC AND PARSLEY GELCAP BLOOD PRESSURE ABNORMAL
4823,NATURES BOUNTY FISH FLAX AND BORAGE,LINUM USITATISSIMUM,,NATURES BOUNTY FISH FLAX AND BORAGE
7999,PRIMROSE OIL CRANBERR,OENOTHERA BIENNIS,,PRIMROSE OIL CRANBERR
5602,HORSE CHESTNUT (FOR BACK),AESCULUS HIPPOCASTANUM,,HORSE CHESTNUT (FOR BACK)
6901,DONGQUAI,ANGELICA SINENSIS,SUPPLEMENTATION THERAPY,DONGQUAI SUPPLEMENTATION THERAPY


In [43]:
np_unique = dUnique_df.dUnique_label.tolist()
len(np_unique)

126

In [44]:
#add positive pairs
x1TrainRNN = []
x2TrainRNN = []
yTrainRNN = []
for i in range(len(padded_x)):
    yTrainRNN.append(1)
    x1TrainRNN.append(padded_x[i])
    x2TrainRNN.append(padded_y[i])
print(len(x1TrainRNN), len(x2TrainRNN), len(yTrainRNN))

6619 6619 6619


In [45]:
train_res = train.reset_index()
train_res.head()

Unnamed: 0,index,FAERS_drug_match,lookup_value,indi_pt,FAERS_drug_match_indi
0,8118,GARLIC AND PARSLEY GELCAP,ALLIUM SATIVUM,BLOOD PRESSURE ABNORMAL,GARLIC AND PARSLEY GELCAP BLOOD PRESSURE ABNORMAL
1,4823,NATURES BOUNTY FISH FLAX AND BORAGE,LINUM USITATISSIMUM,,NATURES BOUNTY FISH FLAX AND BORAGE
2,7999,PRIMROSE OIL CRANBERR,OENOTHERA BIENNIS,,PRIMROSE OIL CRANBERR
3,5602,HORSE CHESTNUT (FOR BACK),AESCULUS HIPPOCASTANUM,,HORSE CHESTNUT (FOR BACK)
4,6901,DONGQUAI,ANGELICA SINENSIS,SUPPLEMENTATION THERAPY,DONGQUAI SUPPLEMENTATION THERAPY


In [46]:
#take negative pairs from training data
faers_match = []
lookup = []
for i in range(len(train_res)):
    np_name = train_res.at[i, 'FAERS_drug_match_indi']
    for j in random.choices(range(len(np_unique)), k=4):
        np_temp = np_unique[j]
        np_match = train_res.loc[train_res['FAERS_drug_match_indi'] == np_name].lookup_value.tolist()
        if np_temp not in np_match:
            faers_match.append(np_name)
            lookup.append(np_temp)
len(faers_match)

26141

In [47]:
dfneg2 = pd.DataFrame(columns=['FAERS_drug_match_indi', 'lookup_value'])
dfneg2['FAERS_drug_match_indi'] = faers_match
dfneg2['lookup_value'] = lookup
xneg2, yneg2 = clean_dataset(dfneg2)
Xneg2, Yneg2 = encode_dataset(xneg2,yneg2)
padded_xneg2, padded_yneg2 = padding_dataset(Xneg2,Yneg2,maxlen)

In [48]:
##add negative pairs from training data
for j in range(len(padded_xneg2)):
    yTrainRNN.append(0)
    x1TrainRNN.append(padded_xneg2[j])
    x2TrainRNN.append(padded_yneg2[j])
print(len(x1TrainRNN), len(x2TrainRNN), len(yTrainRNN))

32760 32760 32760


In [49]:
##add negative pairs from reference set
for j in range(len(padded_xneg)):
    yTrainRNN.append(0)
    x1TrainRNN.append(padded_xneg[j])
    x2TrainRNN.append(padded_yneg[j])
print(len(x1TrainRNN), len(x2TrainRNN), len(yTrainRNN))

46603 46603 46603


In [50]:
for i in range(len(dUnique_df.index)):
    dUnique_df.at[i, 'dUnique_seq_padded'] = dUnique[i]
dUnique_df.head()

Unnamed: 0,dUnique_label,dUnique_seq,dUnique_seq_padded
0,ACTAEA RACEMOSA,"[1, 3, 20, 1, 5, 1, 27, 18, 1, 3, 5, 13, 15, 1...","[1, 3, 20, 1, 5, 1, 27, 18, 1, 3, 5, 13, 15, 1..."
1,AESCULUS HIPPOCASTANUM,"[1, 5, 19, 3, 21, 12, 21, 19, 27, 8, 9, 16, 16...","[1, 5, 19, 3, 21, 12, 21, 19, 27, 8, 9, 16, 16..."
2,ALLIUM SATIVUM,"[1, 12, 12, 9, 21, 13, 27, 19, 1, 20, 9, 22, 2...","[1, 12, 12, 9, 21, 13, 27, 19, 1, 20, 9, 22, 2..."
3,ALOE VERA,"[1, 12, 15, 5, 27, 22, 5, 18, 1]","[1, 12, 15, 5, 27, 22, 5, 18, 1, 0, 0, 0, 0, 0..."
4,ANGELICA SINENSIS,"[1, 14, 7, 5, 12, 9, 3, 1, 27, 19, 9, 14, 5, 1...","[1, 14, 7, 5, 12, 9, 3, 1, 27, 19, 9, 14, 5, 1..."


In [51]:
x1TrainRnnS, x1ValRnnS, x2TrainRnnS, x2ValRnnS, yTrainRnnS, yValRnnS = train_test_split(x1TrainRNN, x2TrainRNN, yTrainRNN, test_size=0.20, random_state=42)

### build model and train

In [52]:
def build_model2(model_type, embedding_dim, num_rnn_node, num_dense_node, num_layer, activation_fn, learning_rate, optimizer, margin):
    input_x = tf.keras.layers.Input(maxlen)
    input_1 = tf.keras.layers.Input(maxlen)
    input_2 = tf.keras.layers.Input(maxlen)
    embedding = tf.keras.layers.Embedding(input_dim=28, output_dim=embedding_dim, mask_zero=True)
    x = embedding(input_x)
    
    if model_type == "lstm":
        x = tf.keras.layers.LSTM(num_rnn_node)(x)
    elif model_type=="gru":
        x = tf.keras.layers.GRU(num_rnn_node)(x)
 
    num = num_dense_node
    for _ in range(num_layer):
        x = tf.keras.layers.Dense(num, activation=activation_fn)(x)
        num /= 2
        
    embedding_network = tf.keras.Model(input_x, x)

    tower_1 = embedding_network(input_1)
    tower_2 = embedding_network(input_2)

    merge_layer = tf.keras.layers.Lambda(cosine_distance)([tower_1, tower_2])
    output_layer = tf.keras.layers.Dense(1, activation="sigmoid")(merge_layer)
    contr = tf.keras.Model(inputs=[input_1, input_2], outputs=output_layer)
    
    if optimizer == "Adam":
        opt = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    elif optimizer =="RMSprop":                
        opt = tf.keras.optimizers.RMSprop(learning_rate=learning_rate)
    contr.compile(loss=loss(margin= margin), optimizer=opt, metrics=["accuracy"])
    return contr

model = build_model2("lstm", 256, 512, 256, 1, "tanh", 2e-4, "Adam", 0.8)


2022-03-02 18:22:38.808037: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-03-02 18:22:39.780599: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 9648 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 Ti, pci bus id: 0000:af:00.0, compute capability: 7.5


In [53]:
history = model.fit(x=[np.array(x1TrainRnnS), np.array(x2TrainRnnS)],
                         y=np.array(yTrainRnnS, dtype=np.float32),
                         epochs=30,
                         batch_size=4,
                         validation_data=([np.array(x1ValRnnS), np.array(x2ValRnnS)], np.array(yValRnnS, dtype=np.float32)))

Epoch 1/30


2022-03-02 18:22:57.925426: I tensorflow/stream_executor/cuda/cuda_dnn.cc:366] Loaded cuDNN version 8302


Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [54]:
#save weights
model.save("exp3/alstm-22-0_with_indications.01.hdf5")
#model.save_weights("exp3/alstm-22-0_with_indications.01.hdf5")

In [56]:
model.load_weights("exp3/alstm-22-0_with_indications.01.hdf5")

### Load model, get predictions and evaluate siamese network

In [57]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 400)]        0           []                               
                                                                                                  
 input_3 (InputLayer)           [(None, 400)]        0           []                               
                                                                                                  
 model (Functional)             (None, 256)          1713408     ['input_2[0][0]',                
                                                                  'input_3[0][0]']                
                                                                                                  
 lambda (Lambda)                (None, 1)            0           ['model[0][0]',            

In [58]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1655 entries, 586 to 2884
Data columns (total 4 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   FAERS_drug_match       1655 non-null   object
 1   lookup_value           1655 non-null   object
 2   indi_pt                1655 non-null   object
 3   FAERS_drug_match_indi  1655 non-null   object
dtypes: object(4)
memory usage: 64.6+ KB


In [59]:
test_eval = pd.DataFrame(columns = ['FAERS_drug_match_indi', 'lookup_value', 'rank1_drug', 'rank2_drug', 'rank3_drug', 'rank4_drug', 'rank5_drug', 'lookup_rank'])

In [60]:
test_res = test.reset_index()
test_res.head()

Unnamed: 0,index,FAERS_drug_match,lookup_value,indi_pt,FAERS_drug_match_indi
0,586,FLAXSEED OIL [LINUM USITATISSIMUM OIL],FLAX SEED,PROPHYLAXIS,FLAXSEED OIL [LINUM USITATISSIMUM OIL] PROPHYL...
1,7013,CHORDYCEPS,OPHIOCORDYCEPS SINENSIS,,CHORDYCEPS
2,7716,CRANBERRY PLUS [JUNIPERUS COMMUNIS;VACCINIUM M...,VACCINIUM MACROCARPON,PRODUCT USED FOR UNKNOWN INDICATION,CRANBERRY PLUS [JUNIPERUS COMMUNIS;VACCINIUM M...
3,970,GREEN TEA 150MG,GREEN TEA,,GREEN TEA 150MG
4,8173,ODOR FREE GARLIC?X,ALLIUM SATIVUM,,ODOR FREE GARLIC?X


In [62]:
#find ranks 1, 2 and 3 (can go upto any number n) from the predicted similarities for the test data
for i in range(padded_xTest.shape[0]):
    predicts = model.predict([np.tile(padded_xTest[i,:], (dUnique.shape[0],1)), dUnique])
    argsort = np.argsort(-predicts.flatten())
    test_eval.at[i, 'FAERS_drug_match_indi'] = test_res.at[i, 'FAERS_drug_match_indi']
    test_eval.at[i, 'lookup_value'] = test_res.at[i, 'lookup_value']
    test_eval.at[i, 'rank1_drug'] = dUnique_df.iloc[argsort[0]].dUnique_label
    test_eval.at[i, 'rank2_drug'] = dUnique_df.iloc[argsort[1]].dUnique_label
    test_eval.at[i, 'rank3_drug'] = dUnique_df.iloc[argsort[2]].dUnique_label
    test_eval.at[i, 'rank4_drug'] = dUnique_df.iloc[argsort[3]].dUnique_label
    test_eval.at[i, 'rank5_drug'] = dUnique_df.iloc[argsort[4]].dUnique_label
test_eval.head()

Unnamed: 0,FAERS_drug_match_indi,lookup_value,rank1_drug,rank2_drug,rank3_drug,rank4_drug,rank5_drug,lookup_rank
0,FLAXSEED OIL [LINUM USITATISSIMUM OIL] PROPHYL...,FLAX SEED,LINUM USITATISSIMUM,FLAX SEED,EVENING PRIMROSE OIL,FENUGREEK,KRATOM,
1,CHORDYCEPS,OPHIOCORDYCEPS SINENSIS,CORDYCEPS,OPHIOCORDYCEPS SINENSIS,IVY LEAF,HEDERA HELIX,UNCARIA TOMENTOSA,
2,CRANBERRY PLUS [JUNIPERUS COMMUNIS;VACCINIUM M...,VACCINIUM MACROCARPON,VACCINIUM MACROCARPON,CRANBERRY,CINNAMON,MITRAGYNA SPECIOSA,CINNAMOMUM VERUM,
3,GREEN TEA 150MG,GREEN TEA,GREEN TEA,CAMELLIA SINENSIS,APPLE CIDER VINEGAR,CINNAMON,MALUS DOMESTICA,
4,ODOR FREE GARLIC?X,ALLIUM SATIVUM,GARLIC,ALLIUM SATIVUM,GREEN TEA,EQUISETUM HYEMALE,ZINGIBER OFFICINALE,


In [63]:
for i in range(len(test_eval.index)):
    lookup = test_eval.at[i, 'lookup_value']
    lookup_clean = clean(lookup)
    if lookup_clean == test_eval.at[i, 'rank1_drug']:
        test_eval.at[i, 'lookup_rank'] = 1
    elif lookup_clean == test_eval.at[i, 'rank2_drug']:
        test_eval.at[i, 'lookup_rank'] = 2
    elif lookup_clean == test_eval.at[i, 'rank3_drug']:
        test_eval.at[i, 'lookup_rank'] = 3
    elif lookup_clean == test_eval.at[i, 'rank4_drug']:
        test_eval.at[i, 'lookup_rank'] = 4
    elif lookup_clean == test_eval.at[i, 'rank5_drug']:
        test_eval.at[i, 'lookup_rank'] = 5
test_eval.head()

Unnamed: 0,FAERS_drug_match_indi,lookup_value,rank1_drug,rank2_drug,rank3_drug,rank4_drug,rank5_drug,lookup_rank
0,FLAXSEED OIL [LINUM USITATISSIMUM OIL] PROPHYL...,FLAX SEED,LINUM USITATISSIMUM,FLAX SEED,EVENING PRIMROSE OIL,FENUGREEK,KRATOM,2
1,CHORDYCEPS,OPHIOCORDYCEPS SINENSIS,CORDYCEPS,OPHIOCORDYCEPS SINENSIS,IVY LEAF,HEDERA HELIX,UNCARIA TOMENTOSA,2
2,CRANBERRY PLUS [JUNIPERUS COMMUNIS;VACCINIUM M...,VACCINIUM MACROCARPON,VACCINIUM MACROCARPON,CRANBERRY,CINNAMON,MITRAGYNA SPECIOSA,CINNAMOMUM VERUM,1
3,GREEN TEA 150MG,GREEN TEA,GREEN TEA,CAMELLIA SINENSIS,APPLE CIDER VINEGAR,CINNAMON,MALUS DOMESTICA,1
4,ODOR FREE GARLIC?X,ALLIUM SATIVUM,GARLIC,ALLIUM SATIVUM,GREEN TEA,EQUISETUM HYEMALE,ZINGIBER OFFICINALE,2


In [64]:
test_eval.to_csv("evaluation/test_siamese_evaluation_lstm_model_with_indications.csv", index=False)

In [65]:
##add related mappings rank to test set evaluation


In [66]:
vocab = pd.read_csv('data/lb_to_common_names.csv')
vocab.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 958 entries, 0 to 957
Data columns (total 4 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   latin_binomial        958 non-null    object
 1   common_name           958 non-null    object
 2   latin_binomial_clean  958 non-null    object
 3   common_name_clean     958 non-null    object
dtypes: object(4)
memory usage: 30.1+ KB


In [67]:
test_eval = pd.read_csv('evaluation/test_siamese_evaluation_lstm_model_with_indications.csv')
test_eval.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1655 entries, 0 to 1654
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   FAERS_drug_match_indi  1655 non-null   object 
 1   lookup_value           1655 non-null   object 
 2   rank1_drug             1655 non-null   object 
 3   rank2_drug             1655 non-null   object 
 4   rank3_drug             1655 non-null   object 
 5   rank4_drug             1655 non-null   object 
 6   rank5_drug             1655 non-null   object 
 7   lookup_rank            1585 non-null   float64
dtypes: float64(1), object(7)
memory usage: 103.6+ KB


In [68]:
test_eval['lookup_rank_related'] = np.nan
test_eval.head()

Unnamed: 0,FAERS_drug_match_indi,lookup_value,rank1_drug,rank2_drug,rank3_drug,rank4_drug,rank5_drug,lookup_rank,lookup_rank_related
0,FLAXSEED OIL [LINUM USITATISSIMUM OIL] PROPHYL...,FLAX SEED,LINUM USITATISSIMUM,FLAX SEED,EVENING PRIMROSE OIL,FENUGREEK,KRATOM,2.0,
1,CHORDYCEPS,OPHIOCORDYCEPS SINENSIS,CORDYCEPS,OPHIOCORDYCEPS SINENSIS,IVY LEAF,HEDERA HELIX,UNCARIA TOMENTOSA,2.0,
2,CRANBERRY PLUS [JUNIPERUS COMMUNIS;VACCINIUM M...,VACCINIUM MACROCARPON,VACCINIUM MACROCARPON,CRANBERRY,CINNAMON,MITRAGYNA SPECIOSA,CINNAMOMUM VERUM,1.0,
3,GREEN TEA 150MG,GREEN TEA,GREEN TEA,CAMELLIA SINENSIS,APPLE CIDER VINEGAR,CINNAMON,MALUS DOMESTICA,1.0,
4,ODOR FREE GARLIC?X,ALLIUM SATIVUM,GARLIC,ALLIUM SATIVUM,GREEN TEA,EQUISETUM HYEMALE,ZINGIBER OFFICINALE,2.0,


In [69]:
#find related mappings to lookup value in predicted values 
for i in range(len(test_eval.index)):
    lookup = test_eval.at[i, 'lookup_value']
    lookup_rank = test_eval.at[i, 'lookup_rank']
    lookup_clean = clean(lookup)
    lb_res = vocab.loc[vocab['latin_binomial_clean'] == lookup_clean]
    common_res = vocab.loc[vocab['common_name_clean'] == lookup_clean]
    lookup_result = ''
    if len(lb_res) > 0:
        lookup_result = lb_res.common_name_clean.values[0]
    elif len(common_res) > 0:
        lookup_result = common_res.latin_binomial_clean.values[0]
    if lookup_result != '':
        if lookup_result == test_eval.at[i, 'rank1_drug']:
            test_eval.at[i, 'lookup_rank_related'] = 1
        elif lookup_result == test_eval.at[i, 'rank2_drug']:
            if lookup_rank:
                if lookup_rank > 2:
                    test_eval.at[i, 'lookup_rank_related'] = 2
                else:
                    test_eval.at[i, 'lookup_rank_related'] = lookup_rank
            elif np.isnan(lookup_rank):
                test_eval.at[i, 'lookup_rank_related'] = 2
        elif lookup_result == test_eval.at[i, 'rank3_drug']:
            if lookup_rank:
                if lookup_rank > 3:
                    test_eval.at[i, 'lookup_rank_related'] = 3
                else:
                    test_eval.at[i, 'lookup_rank_related'] = lookup_rank
            elif np.isnan(lookup_rank):
                test_eval.at[i, 'lookup_rank_related'] = 3
        elif lookup_result == test_eval.at[i, 'rank4_drug']:
            if lookup_rank:
                if lookup_rank > 4:
                    test_eval.at[i, 'lookup_rank_related'] = 4
                else:
                    test_eval.at[i, 'lookup_rank_related'] = lookup_rank
            elif np.isnan(lookup_rank):
                test_eval.at[i, 'lookup_rank_related'] = 4
        elif lookup_result == test_eval.at[i, 'rank5_drug']:
            if np.isnan(lookup_rank):
                test_eval.at[i, 'lookup_rank_related'] = 5
        
test_eval.head()

Unnamed: 0,FAERS_drug_match_indi,lookup_value,rank1_drug,rank2_drug,rank3_drug,rank4_drug,rank5_drug,lookup_rank,lookup_rank_related
0,FLAXSEED OIL [LINUM USITATISSIMUM OIL] PROPHYL...,FLAX SEED,LINUM USITATISSIMUM,FLAX SEED,EVENING PRIMROSE OIL,FENUGREEK,KRATOM,2.0,1.0
1,CHORDYCEPS,OPHIOCORDYCEPS SINENSIS,CORDYCEPS,OPHIOCORDYCEPS SINENSIS,IVY LEAF,HEDERA HELIX,UNCARIA TOMENTOSA,2.0,1.0
2,CRANBERRY PLUS [JUNIPERUS COMMUNIS;VACCINIUM M...,VACCINIUM MACROCARPON,VACCINIUM MACROCARPON,CRANBERRY,CINNAMON,MITRAGYNA SPECIOSA,CINNAMOMUM VERUM,1.0,1.0
3,GREEN TEA 150MG,GREEN TEA,GREEN TEA,CAMELLIA SINENSIS,APPLE CIDER VINEGAR,CINNAMON,MALUS DOMESTICA,1.0,1.0
4,ODOR FREE GARLIC?X,ALLIUM SATIVUM,GARLIC,ALLIUM SATIVUM,GREEN TEA,EQUISETUM HYEMALE,ZINGIBER OFFICINALE,2.0,1.0


In [70]:
#post process to make lookup rank = lookup rank related if related is nan still
for i in range(len(test_eval.index)):
    lookup_rank = test_eval.at[i, 'lookup_rank']
    lookup_rank_related = test_eval.at[i, 'lookup_rank_related']
    if lookup_rank:
        if np.isnan(lookup_rank_related):
            test_eval.at[i, 'lookup_rank_related'] = lookup_rank
test_eval.head()

Unnamed: 0,FAERS_drug_match_indi,lookup_value,rank1_drug,rank2_drug,rank3_drug,rank4_drug,rank5_drug,lookup_rank,lookup_rank_related
0,FLAXSEED OIL [LINUM USITATISSIMUM OIL] PROPHYL...,FLAX SEED,LINUM USITATISSIMUM,FLAX SEED,EVENING PRIMROSE OIL,FENUGREEK,KRATOM,2.0,1.0
1,CHORDYCEPS,OPHIOCORDYCEPS SINENSIS,CORDYCEPS,OPHIOCORDYCEPS SINENSIS,IVY LEAF,HEDERA HELIX,UNCARIA TOMENTOSA,2.0,1.0
2,CRANBERRY PLUS [JUNIPERUS COMMUNIS;VACCINIUM M...,VACCINIUM MACROCARPON,VACCINIUM MACROCARPON,CRANBERRY,CINNAMON,MITRAGYNA SPECIOSA,CINNAMOMUM VERUM,1.0,1.0
3,GREEN TEA 150MG,GREEN TEA,GREEN TEA,CAMELLIA SINENSIS,APPLE CIDER VINEGAR,CINNAMON,MALUS DOMESTICA,1.0,1.0
4,ODOR FREE GARLIC?X,ALLIUM SATIVUM,GARLIC,ALLIUM SATIVUM,GREEN TEA,EQUISETUM HYEMALE,ZINGIBER OFFICINALE,2.0,1.0


In [71]:
test_eval.to_csv('evaluation/test_siamese_evaluation_related_lstm_model_with_indications.csv', index=False)

In [2]:
###compute MRR from test set evaluation
test_eval = pd.read_csv('evaluation/test_siamese_evaluation_related_lstm_model_with_indications.csv')

In [72]:
test_mrr = test_eval[['FAERS_drug_match_indi', 'lookup_rank', 'lookup_rank_related']]
test_mrr.head()

Unnamed: 0,FAERS_drug_match_indi,lookup_rank,lookup_rank_related
0,FLAXSEED OIL [LINUM USITATISSIMUM OIL] PROPHYL...,2.0,1.0
1,CHORDYCEPS,2.0,1.0
2,CRANBERRY PLUS [JUNIPERUS COMMUNIS;VACCINIUM M...,1.0,1.0
3,GREEN TEA 150MG,1.0,1.0
4,ODOR FREE GARLIC?X,2.0,1.0


In [73]:
test_mrr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1655 entries, 0 to 1654
Data columns (total 3 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   FAERS_drug_match_indi  1655 non-null   object 
 1   lookup_rank            1585 non-null   float64
 2   lookup_rank_related    1589 non-null   float64
dtypes: float64(2), object(1)
memory usage: 38.9+ KB


In [74]:
#number of 0 relevant results
test_mrr.loc[test_mrr['lookup_rank'].isna()].shape

(70, 3)

In [75]:
test_mrr.loc[test_mrr['lookup_rank_related'].isna()].shape

(66, 3)

In [76]:
test_mrr_exact = test_mrr[test_mrr['lookup_rank'].notna()]
test_mrr_exact = test_mrr_exact.drop(['lookup_rank_related'], axis=1)
test_mrr_exact.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1585 entries, 0 to 1654
Data columns (total 2 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   FAERS_drug_match_indi  1585 non-null   object 
 1   lookup_rank            1585 non-null   float64
dtypes: float64(1), object(1)
memory usage: 37.1+ KB


In [77]:
test_mrr_rel = test_mrr[test_mrr['lookup_rank_related'].notna()]
test_mrr_rel = test_mrr_rel.drop(['lookup_rank'], axis=1)
test_mrr_rel.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1589 entries, 0 to 1654
Data columns (total 2 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   FAERS_drug_match_indi  1589 non-null   object 
 1   lookup_rank_related    1589 non-null   float64
dtypes: float64(1), object(1)
memory usage: 37.2+ KB


In [78]:
exact_reciprocal = 1/test_mrr_exact['lookup_rank']
test_mrr_exact['reciprocal_rank'] = exact_reciprocal
test_mrr_exact.head()

Unnamed: 0,FAERS_drug_match_indi,lookup_rank,reciprocal_rank
0,FLAXSEED OIL [LINUM USITATISSIMUM OIL] PROPHYL...,2.0,0.5
1,CHORDYCEPS,2.0,0.5
2,CRANBERRY PLUS [JUNIPERUS COMMUNIS;VACCINIUM M...,1.0,1.0
3,GREEN TEA 150MG,1.0,1.0
4,ODOR FREE GARLIC?X,2.0,0.5


In [79]:
##get the mean of reciprocal ranks for exact matches
test_mrr_exact.reciprocal_rank.mean()

0.7309989484752891

In [80]:
test_mrr_exact.reciprocal_rank.median()

0.5

In [81]:
test_mrr_exact.reciprocal_rank.std()

0.2555915467881875

In [82]:
rel_reciprocal = 1/test_mrr_rel['lookup_rank_related']
test_mrr_rel['reciprocal_rank'] = rel_reciprocal
test_mrr_rel.head()

Unnamed: 0,FAERS_drug_match_indi,lookup_rank_related,reciprocal_rank
0,FLAXSEED OIL [LINUM USITATISSIMUM OIL] PROPHYL...,1.0,1.0
1,CHORDYCEPS,1.0,1.0
2,CRANBERRY PLUS [JUNIPERUS COMMUNIS;VACCINIUM M...,1.0,1.0
3,GREEN TEA 150MG,1.0,1.0
4,ODOR FREE GARLIC?X,1.0,1.0


In [83]:
test_mrr_rel.reciprocal_rank.mean()

0.9054122089364379

In [84]:
test_mrr_rel.reciprocal_rank.median()

1.0

In [85]:
test_mrr_rel.reciprocal_rank.std()

0.20205460446171805

### Scratch code blocks below

In [43]:
siamese_model.layers[0]

<keras.engine.input_layer.InputLayer at 0x7f7183cbd400>

In [74]:
##try to get embedding weights - this gives from original model not trained (I think)
embeddings_weights = embedding.get_weights()[0]

In [48]:
for layer in siamese_model.layers:
    print(layer)

<keras.engine.input_layer.InputLayer object at 0x7f7183cbd400>
<keras.engine.input_layer.InputLayer object at 0x7f7183cbd1c0>
<keras.engine.functional.Functional object at 0x7f7183cbd9a0>
<keras.layers.core.lambda_layer.Lambda object at 0x7f7183cbdaf0>
<keras.layers.core.dense.Dense object at 0x7f7170406730>
