In [1]:
import numpy as np
import pandas as pd
import pickle
#import matplotlib.pyplot as plt
#import seaborn as sns
from IPython.display import display, clear_output
import re
'''import skopt
from skopt import BayesSearchCV
from skopt import gp_minimize, forest_minimize
from skopt.space import Real, Categorical, Integer
from skopt.plots import plot_convergence
from skopt.plots import plot_objective, plot_evaluations
from skopt.plots import plot_histogram, plot_objective_2D
from skopt.utils import use_named_args'''
from tqdm import tqdm
#!pip install scikit-optimize

In [2]:
#if starting from preprocess, load the drugs directly
df = pd.read_csv('data/NP_FAERS_mapped_20220215.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5358 entries, 0 to 5357
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   FAERS_drug_match  5358 non-null   object
 1   lookup_value      5358 non-null   object
dtypes: object(2)
memory usage: 83.8+ KB


In [3]:
#normalise both name columns to upper case (overwrites df in place)
for column in ('FAERS_drug_match', 'lookup_value'):
    df[column] = df[column].str.upper()

In [4]:
import tensorflow as tf
import string
import random
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

In [5]:
def add_noise(w, percent):
  '''Return a copy of string w with random character-level noise.

  int(percent * len(w)) positions are drawn (with replacement) from the
  indices of the original string. For each drawn position a uniform number
  in [0, 1) picks the operation:
    <= 0.3333  replace the character at that position with a random uppercase letter
    <= 0.6667  delete the character at that position
    otherwise  insert a random uppercase letter before that position
  Positions are not re-drawn after the string length changes, so later
  operations act on the already modified string.
  '''
  n_edits = int(percent * len(w))
  for idx in random.choices(range(len(w)), k=n_edits):
    roll = random.random()
    head = w[:idx]
    if roll <= 0.3333:
      # edit
      w = head + random.choice(string.ascii_uppercase) + w[idx+1:]
    elif roll <= 0.6667:
      # delete
      w = head + w[idx+1:]
    else:
      # add (random.random() is always below 1, so this is the remaining case)
      w = head + random.choice(string.ascii_uppercase) + w[idx:]
  return w

def clean(text):
    """Normalise a name to upper-case ASCII letters separated by single spaces.

    Every character other than a-z, A-Z and the space is removed, runs of
    spaces are collapsed into one, surrounding whitespace is stripped and the
    result is upper-cased.
    """
    letters_and_spaces = re.sub('[^a-zA-Z ]', '', text)
    single_spaced = re.sub(' +', ' ', letters_and_spaces).strip()
    return single_spaced.upper()
def clean_dataset(data):
  """Apply clean() to both name columns of a pairs DataFrame.

  Returns two lists in row order: the cleaned FAERS_drug_match values and
  the cleaned lookup_value values.
  """
  x = [clean(raw) for raw in data.FAERS_drug_match]
  y = [clean(raw) for raw in data.lookup_value]
  return x,y

def encode_dataset(x,y):
  """Integer-encode two lists of cleaned strings character by character.

  'A'..'Z' map to 1..26 and the space maps to 27; any other character raises
  KeyError. Returns (encoded_x, encoded_y), each a list of lists of ints.
  """
  alphabet = string.ascii_uppercase + " "
  codes = dict(zip(alphabet, range(1, len(alphabet) + 1)))

  def encode(words):
    return [[codes[ch] for ch in word] for word in words]

  return encode(x), encode(y)

def clean_encode_padding(q, maxlen):
  """Clean, integer-encode and pad a single query string.

  q is normalised with clean(), each character is mapped to its code
  ('A'..'Z' -> 1..26, space -> 27) and the sequence is post-padded with
  zeros to maxlen.

  Returns the pad_sequences output for a one-element batch, i.e. one row
  holding the padded query.
  """
  q = clean(q)
  encode_dict = {l:i+1 for i,l in enumerate(string.ascii_uppercase + " ")}
  # pad_sequences expects a list of sequences; the single encoded query is
  # wrapped in an outer list because a flat list of ints is rejected with
  # ValueError ("`sequences` must be a list of iterables").
  return tf.keras.preprocessing.sequence.pad_sequences(
    [[encode_dict[m] for m in q]], padding="post", maxlen=maxlen)

def padding_dataset(X,Y,maxlen):
  """Post-pad two lists of encoded sequences to a common length.

  Both X and Y are passed through pad_sequences with padding="post" and the
  given maxlen. Returns (padded_x, padded_y).
  """
  pad = tf.keras.preprocessing.sequence.pad_sequences
  padded_y = pad(Y, padding="post", maxlen=maxlen)
  padded_x = pad(X, padding="post", maxlen=maxlen)
  return padded_x, padded_y

def cosine_distance(vects):
    """Return 1 - cosine similarity between paired rows of two tensors.

    vects is a pair (x, y). The dot product and both norms are reduced over
    axis 1 with keepdims=True, so the reduced axis is kept with size 1.
    No epsilon is added: a zero-norm row divides by zero.
    """
    left, right = vects
    dot = tf.reduce_sum(tf.multiply(left, right), axis=1, keepdims=True)
    norms = tf.norm(left, axis=1, keepdims=True) * tf.norm(right, axis=1, keepdims=True)
    return 1 - dot / norms

def loss(margin=1):
    """Return a contrastive loss function closed over the given margin.

    The returned function computes
        mean((1 - y_true) * y_pred**2 + y_true * max(margin - y_pred, 0)**2)
    so pairs labelled 0 are penalised by the squared prediction and pairs
    labelled 1 are penalised by how far the prediction falls below margin.
    """
    def contrastive_loss(y_true, y_pred):
        shortfall = tf.math.maximum(margin - (y_pred), 0)
        per_pair = (1 - y_true) * tf.math.square(y_pred) + (y_true) * tf.math.square(shortfall)
        return tf.math.reduce_mean(per_pair)

    return contrastive_loss

def plt_metric(history, metric, title, has_valid=True):
    """Plots the given 'metric' from 'history'.

    Arguments:
        history: history attribute of History object returned from Model.fit.
        metric: Metric to plot, a string value present as key in 'history'.
        title: A string to be used as title of plot.
        has_valid: Boolean, true if valid data was passed to Model.fit else false.

    Returns:
        None.
    """
    # The notebook-level matplotlib import is commented out, so the module is
    # imported here to keep this helper callable without a NameError on `plt`.
    import matplotlib.pyplot as plt

    plt.plot(history[metric])
    if has_valid:
        plt.plot(history["val_" + metric])
        plt.legend(["train", "validation"], loc="upper left")
    plt.title(title)
    plt.ylabel(metric)
    plt.xlabel("epoch")
    plt.show()


In [6]:
train, test = train_test_split(df, test_size=0.20, random_state=42)

In [7]:
train.shape

(4286, 2)

In [8]:
test.shape

(1072, 2)

In [9]:
#padding length = maxlen
maxlen = 400

In [10]:
# Clean, integer-encode and post-pad the held-out test pairs to maxlen.
xtest, ytest = clean_dataset(test)
Xtest, Ytest = encode_dataset(xtest,ytest)
padded_xTest, padded_yTest = padding_dataset(Xtest,Ytest,maxlen)

In [11]:
padded_xTest.shape

(1072, 400)

In [12]:
#without noise 
x, y = clean_dataset(train)

In [13]:
len(y)

4286

In [14]:
encode_dict = {l:i+1 for i,l in enumerate(string.ascii_uppercase + " ")}

In [15]:
# Reuse the shared helper instead of repeating its comprehension here, so
# the train and test splits are guaranteed to be encoded the same way.
Xtrain, Ytrain = encode_dataset(x, y)

In [16]:
print(len(Xtrain))
print(len(Ytrain))
# Count distinct targets from the cleaned strings rather than from Ytrain:
# Ytrain is a ragged list of lists, which np.unique can only process by
# coercing it to an object array (deprecated, and an error on newer NumPy).
# The character encoding is one-to-one, so the count is the same.
np.unique(y).shape

4286
4286


  ar = np.asanyarray(ar)


(125,)

In [17]:
# Reuse the shared helper so train and test are padded with identical settings.
padded_x, padded_y = padding_dataset(Xtrain, Ytrain, maxlen)

print(padded_y)

[[19  5 18 ...  0  0  0]
 [15 16  8 ...  0  0  0]
 [ 3  9 14 ...  0  0  0]
 ...
 [ 1 12 12 ...  0  0  0]
 [ 1 12 12 ...  0  0  0]
 [ 8  5 13 ...  0  0  0]]


In [18]:
len(encode_dict)

27

In [19]:
# One row per distinct cleaned lookup value found in the training split.
# Only the label column is filled here; the two sequence columns start empty
# and are populated by later cells.
dUnique_df = pd.DataFrame(columns = ['dUnique_label','dUnique_seq', 'dUnique_seq_padded'])
dUnique_df['dUnique_label'] = np.unique(y)
dUnique_df.head()

Unnamed: 0,dUnique_label,dUnique_seq,dUnique_seq_padded
0,ACTAEA RACEMOSA,,
1,AESCULUS HIPPOCASTANUM,,
2,ALLIUM SATIVUM,,
3,ALOE VERA,,
4,ANGELICA SINENSIS,,


In [20]:
dUnique_seq_list = [[encode_dict[m] for m in n] for n in dUnique_df['dUnique_label'].tolist()]
len(dUnique_seq_list)

125

In [21]:
# Store the encoded sequence of each unique label in its row, one cell at a
# time, as a NumPy array.
for i in range(len(dUnique_df.index)):
    dUnique_df.at[i, 'dUnique_seq'] = np.array(dUnique_seq_list[i])
dUnique_df.head()

Unnamed: 0,dUnique_label,dUnique_seq,dUnique_seq_padded
0,ACTAEA RACEMOSA,"[1, 3, 20, 1, 5, 1, 27, 18, 1, 3, 5, 13, 15, 1...",
1,AESCULUS HIPPOCASTANUM,"[1, 5, 19, 3, 21, 12, 21, 19, 27, 8, 9, 16, 16...",
2,ALLIUM SATIVUM,"[1, 12, 12, 9, 21, 13, 27, 19, 1, 20, 9, 22, 2...",
3,ALOE VERA,"[1, 12, 15, 5, 27, 22, 5, 18, 1]",
4,ANGELICA SINENSIS,"[1, 14, 7, 5, 12, 9, 3, 1, 27, 19, 9, 14, 5, 1...",


In [22]:
#setup negative pairs
dfneg = pd.read_csv('data/NP_FAERS_negative_pairs_20220222.csv')
dfneg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9242 entries, 0 to 9241
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   FAERS_drug_match  9242 non-null   object
 1   lookup_value      9242 non-null   object
dtypes: object(2)
memory usage: 144.5+ KB


In [23]:
#encode negative pairs and add padding
# dfneg is the negative-pairs CSV loaded above; it goes through the same
# clean -> encode -> pad steps (and the same maxlen) as the other pair sets.
xneg, yneg = clean_dataset(dfneg)
Xneg, Yneg = encode_dataset(xneg,yneg)
padded_xneg, padded_yneg = padding_dataset(Xneg,Yneg,maxlen)

In [22]:
dUnique = tf.keras.preprocessing.sequence.pad_sequences(
    list(dUnique_df['dUnique_seq']), padding="post", maxlen=maxlen)

In [23]:
np_unique = dUnique_df.dUnique_label.tolist()
len(np_unique)

125

In [26]:
#add positive pairs: every (FAERS string, lookup value) row of the training split, labelled 1
x1TrainRNN = list(padded_x)
x2TrainRNN = list(padded_y)
yTrainRNN = [1] * len(padded_x)
print(len(x1TrainRNN), len(x2TrainRNN), len(yTrainRNN))

4286 4286 4286


In [27]:
train_res = train.reset_index()
train_res.head()

Unnamed: 0,index,FAERS_drug_match,lookup_value
0,3956,SERENOA REPENS/SERENOA REPENS EXTRACT/SERENOA ...,SERENOA REPENS
1,4408,CORDYCEPS,OPHIOCORDYCEPS SINENSIS
2,120,CINNAMON (CINNAMOUM VERUM) (CAPSULES),CINNAMON
3,3311,CHROMIUM PICOLINATE WITH GREEN TEA,CAMELLIA SINENSIS
4,1921,BARLEY.,BARLEY GRASS


In [28]:
#take negative pairs from training data
# For every training row, draw 4 candidate labels at random (with replacement)
# and keep those that are not a known match of that FAERS string.
#
# np_unique holds *cleaned* labels, so the known matches are cleaned as well
# before comparing; otherwise a true match such as SCRUB-PALMETTO vs
# SCRUBPALMETTO would be emitted as a negative pair.
# The matches are collected once per FAERS string instead of re-scanning the
# whole frame inside the sampling loop.
positives_by_name = {}
for name, target in zip(train_res['FAERS_drug_match'], train_res['lookup_value']):
    positives_by_name.setdefault(name, set()).add(clean(target))

faers_match = []
lookup = []
for np_name in train_res['FAERS_drug_match']:
    np_match = positives_by_name[np_name]
    for j in random.choices(range(len(np_unique)), k=4):
        np_temp = np_unique[j]
        if np_temp not in np_match:
            faers_match.append(np_name)
            lookup.append(np_temp)
len(faers_match)

16916

In [29]:
# Put the sampled negative pairs into a frame with the same two columns as the
# other pair sets, then clean, encode and pad them with the shared helpers.
dfneg2 = pd.DataFrame(columns=['FAERS_drug_match', 'lookup_value'])
dfneg2['FAERS_drug_match'] = faers_match
dfneg2['lookup_value'] = lookup
xneg2, yneg2 = clean_dataset(dfneg2)
Xneg2, Yneg2 = encode_dataset(xneg2,yneg2)
padded_xneg2, padded_yneg2 = padding_dataset(Xneg2,Yneg2,maxlen)

In [30]:
##add negative pairs from training data (label 0)
# appends to the existing lists, so re-running this cell adds the pairs again
yTrainRNN.extend([0] * len(padded_xneg2))
x1TrainRNN.extend(padded_xneg2)
x2TrainRNN.extend(padded_yneg2)
print(len(x1TrainRNN), len(x2TrainRNN), len(yTrainRNN))

21202 21202 21202


In [31]:
##add negative pairs from reference set (label 0)
# appends to the existing lists, so re-running this cell adds the pairs again
yTrainRNN.extend([0] * len(padded_xneg))
x1TrainRNN.extend(padded_xneg)
x2TrainRNN.extend(padded_yneg)
print(len(x1TrainRNN), len(x2TrainRNN), len(yTrainRNN))

30444 30444 30444


In [24]:
# Attach the padded encoding of every unique label (row i of dUnique) to the
# matching row of dUnique_df, one cell at a time.
for i in range(len(dUnique_df.index)):
    dUnique_df.at[i, 'dUnique_seq_padded'] = dUnique[i]
dUnique_df.head()

Unnamed: 0,dUnique_label,dUnique_seq,dUnique_seq_padded
0,ACTAEA RACEMOSA,"[1, 3, 20, 1, 5, 1, 27, 18, 1, 3, 5, 13, 15, 1...","[1, 3, 20, 1, 5, 1, 27, 18, 1, 3, 5, 13, 15, 1..."
1,AESCULUS HIPPOCASTANUM,"[1, 5, 19, 3, 21, 12, 21, 19, 27, 8, 9, 16, 16...","[1, 5, 19, 3, 21, 12, 21, 19, 27, 8, 9, 16, 16..."
2,ALLIUM SATIVUM,"[1, 12, 12, 9, 21, 13, 27, 19, 1, 20, 9, 22, 2...","[1, 12, 12, 9, 21, 13, 27, 19, 1, 20, 9, 22, 2..."
3,ALOE VERA,"[1, 12, 15, 5, 27, 22, 5, 18, 1]","[1, 12, 15, 5, 27, 22, 5, 18, 1, 0, 0, 0, 0, 0..."
4,ANGELICA SINENSIS,"[1, 14, 7, 5, 12, 9, 3, 1, 27, 19, 9, 14, 5, 1...","[1, 14, 7, 5, 12, 9, 3, 1, 27, 19, 9, 14, 5, 1..."


In [33]:
# Split the three parallel pair lists 80/20 into train and validation parts
# with a fixed random_state, keeping the lists aligned with each other.
x1TrainRnnS, x1ValRnnS, x2TrainRnnS, x2ValRnnS, yTrainRnnS, yValRnnS = train_test_split(x1TrainRNN, x2TrainRNN, yTrainRNN, test_size=0.20, random_state=42)

## Build model, load weights and evaluate on test data

In [25]:
def build_model2(model_type, embedding_dim, num_rnn_node, num_dense_node, num_layer, activation_fn, learning_rate, optimizer, margin):
    """Build and compile the Siamese sequence-matching network.

    A shared embedding network (Embedding -> LSTM or GRU -> stack of Dense
    layers) encodes both inputs; the two encodings are compared with
    cosine_distance and the result is passed through one sigmoid unit.
    Input length is taken from the notebook-level `maxlen`.

    Arguments:
        model_type: "lstm" or "gru", the recurrent layer of the encoder.
        embedding_dim: Output size of the character embedding.
        num_rnn_node: Number of units in the recurrent layer.
        num_dense_node: Units of the first Dense layer; halved for each further layer.
        num_layer: Number of Dense layers stacked after the recurrent layer.
        activation_fn: Activation used by those Dense layers.
        learning_rate: Learning rate passed to the optimizer.
        optimizer: "Adam" or "RMSprop".
        margin: Margin forwarded to loss().

    Returns:
        The compiled two-input tf.keras.Model.

    Raises:
        ValueError: If model_type or optimizer is not one of the supported names.
    """
    # Validate before creating any layer: an unknown model_type would silently
    # skip the recurrent layer, and an unknown optimizer would leave `opt`
    # unbound at compile time.
    if model_type not in ("lstm", "gru"):
        raise ValueError(f"model_type must be 'lstm' or 'gru', got {model_type!r}")
    if optimizer not in ("Adam", "RMSprop"):
        raise ValueError(f"optimizer must be 'Adam' or 'RMSprop', got {optimizer!r}")

    input_x = tf.keras.layers.Input(shape=(maxlen,))
    input_1 = tf.keras.layers.Input(shape=(maxlen,))
    input_2 = tf.keras.layers.Input(shape=(maxlen,))
    # 28 = 27 character codes (A-Z and space) plus 0, which is the padding
    # value and is masked out.
    embedding = tf.keras.layers.Embedding(input_dim=28, output_dim=embedding_dim, mask_zero=True)
    x = embedding(input_x)

    if model_type == "lstm":
        x = tf.keras.layers.LSTM(num_rnn_node)(x)
    else:
        x = tf.keras.layers.GRU(num_rnn_node)(x)

    num = num_dense_node
    for _ in range(num_layer):
        x = tf.keras.layers.Dense(num, activation=activation_fn)(x)
        # floor division keeps the unit count integral for integer inputs
        num //= 2

    embedding_network = tf.keras.Model(input_x, x)

    # Both towers call the same model, so they share all weights.
    tower_1 = embedding_network(input_1)
    tower_2 = embedding_network(input_2)

    merge_layer = tf.keras.layers.Lambda(cosine_distance)([tower_1, tower_2])
    output_layer = tf.keras.layers.Dense(1, activation="sigmoid")(merge_layer)
    contr = tf.keras.Model(inputs=[input_1, input_2], outputs=output_layer)

    if optimizer == "Adam":
        opt = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    else:
        opt = tf.keras.optimizers.RMSprop(learning_rate=learning_rate)
    contr.compile(loss=loss(margin= margin), optimizer=opt, metrics=["accuracy"])
    return contr

# Positional arguments, in order: model_type="lstm", embedding_dim=256,
# num_rnn_node=512, num_dense_node=256, num_layer=1, activation_fn="tanh",
# learning_rate=2e-4, optimizer="Adam", margin=0.8.
# Training is left commented out below; only the architecture is built here.
model = build_model2("lstm", 256, 512, 256, 1, "tanh", 2e-4, "Adam", 0.8)
# history = model.fit(x=[np.array(x1TrainRnnS), np.array(x2TrainRnnS)],
#                         y=np.array(yTrainRnnS, dtype=np.float32),
#                         epochs=30,
#                         batch_size=4,
#                         validation_data=([np.array(x1ValRnnS), np.array(x2ValRnnS)], np.array(yValRnnS, dtype=np.float32)))

2022-03-02 17:52:44.638220: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-03-02 17:52:45.592280: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 9648 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 Ti, pci bus id: 0000:af:00.0, compute capability: 7.5


In [26]:
model.load_weights("exp3/alstm-22-0.01.hdf5")

In [27]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 400)]        0           []                               
                                                                                                  
 input_3 (InputLayer)           [(None, 400)]        0           []                               
                                                                                                  
 model (Functional)             (None, 256)          1713408     ['input_2[0][0]',                
                                                                  'input_3[0][0]']                
                                                                                                  
 lambda (Lambda)                (None, 1)            0           ['model[0][0]',            

In [None]:
##Evaluating on test data - NP names only

In [28]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1072 entries, 4003 to 1295
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   FAERS_drug_match  1072 non-null   object
 1   lookup_value      1072 non-null   object
dtypes: object(2)
memory usage: 25.1+ KB


In [29]:
test_eval = pd.DataFrame(columns = ['FAERS_drug_match', 'lookup_value', 'rank1_drug', 'rank2_drug', 'rank3_drug', 'rank4_drug', 'rank5_drug', 'lookup_rank'])

In [30]:
test_res = test.reset_index()
test_res.head()

Unnamed: 0,index,FAERS_drug_match,lookup_value
0,4003,ST. JOHN'S WORT ^HERRON^ (HYPERICUM PERFORATUM),HYPERICUM PERFORATUM
1,1729,CORDYCEPS,CORDYCEPS
2,401,FLAXSEEDS OIL,FLAX SEED
3,1242,EXTRACT OF SERENOA REPENS (PERMIXON),SCRUB-PALMETTO
4,3406,GUARANA (GUARANA),PAULLINIA CUPANA


In [31]:
model.predict([np.tile(padded_xTest[0,:], (dUnique.shape[0],1)), dUnique])

2022-03-02 17:53:14.941308: I tensorflow/stream_executor/cuda/cuda_dnn.cc:366] Loaded cuDNN version 8302


array([[1.22989609e-03],
       [5.26903510e-01],
       [9.51801986e-03],
       [1.18690403e-03],
       [6.57827943e-04],
       [9.00198240e-03],
       [3.18171486e-04],
       [7.94509426e-03],
       [9.94353276e-03],
       [3.62111605e-03],
       [2.87783612e-03],
       [3.52321588e-03],
       [6.80907280e-04],
       [1.22805266e-02],
       [1.92781873e-02],
       [2.09858920e-02],
       [3.50466222e-02],
       [7.81174749e-04],
       [7.67544890e-03],
       [1.16587825e-01],
       [9.83013958e-03],
       [2.90058404e-02],
       [2.31919102e-02],
       [1.90664455e-02],
       [4.65164240e-03],
       [4.86525148e-03],
       [1.06993485e-02],
       [3.23094870e-03],
       [1.07805850e-02],
       [3.49118002e-02],
       [3.32882884e-03],
       [9.33429459e-04],
       [6.83172839e-03],
       [5.59045514e-03],
       [1.38394041e-02],
       [1.05949091e-02],
       [1.83650907e-02],
       [1.59396641e-02],
       [1.01413336e-02],
       [1.18392222e-02],


In [34]:
#find the top_k (here 5) highest-scoring candidate labels for every test string
# Each test sequence is paired with all unique labels, scored by the model and
# the labels are ordered by descending score. Rows are collected as records
# and the frame is built once, instead of growing it cell by cell.
top_k = 5
rank_columns = [f'rank{position}_drug' for position in range(1, top_k + 1)]
candidate_labels = dUnique_df['dUnique_label'].tolist()
n_candidates = dUnique.shape[0]

records = []
for i in range(padded_xTest.shape[0]):
    predicts = model.predict([np.tile(padded_xTest[i,:], (n_candidates,1)), dUnique])
    argsort = np.argsort(-predicts.flatten())
    record = {
        'FAERS_drug_match': test_res.at[i, 'FAERS_drug_match'],
        'lookup_value': test_res.at[i, 'lookup_value'],
    }
    for column, candidate_index in zip(rank_columns, argsort[:top_k]):
        record[column] = candidate_labels[candidate_index]
    records.append(record)

# lookup_rank has no value yet; it is assigned in a later cell.
test_eval = pd.DataFrame(records, columns=['FAERS_drug_match', 'lookup_value'] + rank_columns + ['lookup_rank'])
test_eval.head()

Unnamed: 0,FAERS_drug_match_indi,lookup_value,rank1_drug,rank2_drug,rank3_drug,rank4_drug,rank5_drug,lookup_rank,FAERS_drug_match
0,,HYPERICUM PERFORATUM,HYPERICUM PERFORATUM,ST JOHNSWORT,AESCULUS HIPPOCASTANUM,HORSECHESTNUT,CATSCLAW,,ST. JOHN'S WORT ^HERRON^ (HYPERICUM PERFORATUM)
1,,CORDYCEPS,CORDYCEPS,OPHIOCORDYCEPS SINENSIS,KAVA,PIPER METHYSTICUM,BUTCHERSBROOM,,CORDYCEPS
2,,FLAX SEED,LINUM USITATISSIMUM,FLAX SEED,EVENING PRIMROSE OIL,OENOTHERA BIENNIS,PANAX GINSENG,,FLAXSEEDS OIL
3,,SCRUB-PALMETTO,SERENOA REPENS,SCRUBPALMETTO,ST JOHNSWORT,HYPERICUM PERFORATUM,PANAX GINSENG,,EXTRACT OF SERENOA REPENS (PERMIXON)
4,,PAULLINIA CUPANA,PAULLINIA CUPANA,GUARANA,KAVA,ARCTIUM LAPPA,PIPER METHYSTICUM,,GUARANA (GUARANA)


In [35]:
# Record at which of the five ranked positions the cleaned lookup value
# appears; rows where it appears nowhere keep their empty lookup_rank.
rank_columns = ['rank1_drug', 'rank2_drug', 'rank3_drug', 'rank4_drug', 'rank5_drug']
for i in range(len(test_eval.index)):
    lookup = test_eval.at[i, 'lookup_value']
    lookup_clean = clean(lookup)
    for position, column in enumerate(rank_columns, start=1):
        if lookup_clean == test_eval.at[i, column]:
            test_eval.at[i, 'lookup_rank'] = position
            break
test_eval.head()

Unnamed: 0,FAERS_drug_match_indi,lookup_value,rank1_drug,rank2_drug,rank3_drug,rank4_drug,rank5_drug,lookup_rank,FAERS_drug_match
0,,HYPERICUM PERFORATUM,HYPERICUM PERFORATUM,ST JOHNSWORT,AESCULUS HIPPOCASTANUM,HORSECHESTNUT,CATSCLAW,1,ST. JOHN'S WORT ^HERRON^ (HYPERICUM PERFORATUM)
1,,CORDYCEPS,CORDYCEPS,OPHIOCORDYCEPS SINENSIS,KAVA,PIPER METHYSTICUM,BUTCHERSBROOM,1,CORDYCEPS
2,,FLAX SEED,LINUM USITATISSIMUM,FLAX SEED,EVENING PRIMROSE OIL,OENOTHERA BIENNIS,PANAX GINSENG,2,FLAXSEEDS OIL
3,,SCRUB-PALMETTO,SERENOA REPENS,SCRUBPALMETTO,ST JOHNSWORT,HYPERICUM PERFORATUM,PANAX GINSENG,2,EXTRACT OF SERENOA REPENS (PERMIXON)
4,,PAULLINIA CUPANA,PAULLINIA CUPANA,GUARANA,KAVA,ARCTIUM LAPPA,PIPER METHYSTICUM,1,GUARANA (GUARANA)


In [36]:
test_eval.to_csv("evaluation/test_siamese_evaluation_lstm_model_np_name.csv", index=False)

In [37]:
##add related mappings rank to test set evaluation

In [38]:
vocab = pd.read_csv('data/lb_to_common_names.csv')
vocab.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 958 entries, 0 to 957
Data columns (total 4 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   latin_binomial        958 non-null    object
 1   common_name           958 non-null    object
 2   latin_binomial_clean  958 non-null    object
 3   common_name_clean     958 non-null    object
dtypes: object(4)
memory usage: 30.1+ KB


In [39]:
test_eval = pd.read_csv('evaluation/test_siamese_evaluation_lstm_model_np_name.csv')
test_eval.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1072 entries, 0 to 1071
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   FAERS_drug_match_indi  0 non-null      float64
 1   lookup_value           1072 non-null   object 
 2   rank1_drug             1072 non-null   object 
 3   rank2_drug             1072 non-null   object 
 4   rank3_drug             1072 non-null   object 
 5   rank4_drug             1072 non-null   object 
 6   rank5_drug             1072 non-null   object 
 7   lookup_rank            1022 non-null   float64
 8   FAERS_drug_match       1072 non-null   object 
dtypes: float64(2), object(7)
memory usage: 75.5+ KB


In [40]:
test_eval = test_eval.drop(['FAERS_drug_match_indi'], axis=1)

In [41]:
test_eval['lookup_rank_related'] = np.nan
test_eval.head()

Unnamed: 0,lookup_value,rank1_drug,rank2_drug,rank3_drug,rank4_drug,rank5_drug,lookup_rank,FAERS_drug_match,lookup_rank_related
0,HYPERICUM PERFORATUM,HYPERICUM PERFORATUM,ST JOHNSWORT,AESCULUS HIPPOCASTANUM,HORSECHESTNUT,CATSCLAW,1.0,ST. JOHN'S WORT ^HERRON^ (HYPERICUM PERFORATUM),
1,CORDYCEPS,CORDYCEPS,OPHIOCORDYCEPS SINENSIS,KAVA,PIPER METHYSTICUM,BUTCHERSBROOM,1.0,CORDYCEPS,
2,FLAX SEED,LINUM USITATISSIMUM,FLAX SEED,EVENING PRIMROSE OIL,OENOTHERA BIENNIS,PANAX GINSENG,2.0,FLAXSEEDS OIL,
3,SCRUB-PALMETTO,SERENOA REPENS,SCRUBPALMETTO,ST JOHNSWORT,HYPERICUM PERFORATUM,PANAX GINSENG,2.0,EXTRACT OF SERENOA REPENS (PERMIXON),
4,PAULLINIA CUPANA,PAULLINIA CUPANA,GUARANA,KAVA,ARCTIUM LAPPA,PIPER METHYSTICUM,1.0,GUARANA (GUARANA),


In [42]:
#find related mappings to lookup value in predicted values 
# For each row, translate the lookup value to its counterpart in `vocab`
# (latin binomial -> common name, or common name -> latin binomial; only the
# first vocab hit is used) and find the first ranked position holding that
# counterpart. lookup_rank_related is the better (smaller) of that position
# and the exact lookup_rank.
#
# lookup_rank is NaN when the exact name was not among the ranked candidates.
# NaN is truthy and compares False against every number, so it has to be
# tested explicitly with np.isnan; a plain `if lookup_rank:` lets NaN through
# and the related rank would be overwritten with NaN.
rank_columns = ['rank1_drug', 'rank2_drug', 'rank3_drug', 'rank4_drug', 'rank5_drug']
for i in range(len(test_eval.index)):
    lookup = test_eval.at[i, 'lookup_value']
    lookup_rank = test_eval.at[i, 'lookup_rank']
    lookup_clean = clean(lookup)
    lb_res = vocab.loc[vocab['latin_binomial_clean'] == lookup_clean]
    common_res = vocab.loc[vocab['common_name_clean'] == lookup_clean]
    lookup_result = ''
    if len(lb_res) > 0:
        lookup_result = lb_res.common_name_clean.values[0]
    elif len(common_res) > 0:
        lookup_result = common_res.latin_binomial_clean.values[0]
    if lookup_result == '':
        continue
    for related_rank, column in enumerate(rank_columns, start=1):
        if lookup_result == test_eval.at[i, column]:
            if np.isnan(lookup_rank) or related_rank < lookup_rank:
                test_eval.at[i, 'lookup_rank_related'] = related_rank
            else:
                test_eval.at[i, 'lookup_rank_related'] = lookup_rank
            break

test_eval.head()

Unnamed: 0,lookup_value,rank1_drug,rank2_drug,rank3_drug,rank4_drug,rank5_drug,lookup_rank,FAERS_drug_match,lookup_rank_related
0,HYPERICUM PERFORATUM,HYPERICUM PERFORATUM,ST JOHNSWORT,AESCULUS HIPPOCASTANUM,HORSECHESTNUT,CATSCLAW,1.0,ST. JOHN'S WORT ^HERRON^ (HYPERICUM PERFORATUM),
1,CORDYCEPS,CORDYCEPS,OPHIOCORDYCEPS SINENSIS,KAVA,PIPER METHYSTICUM,BUTCHERSBROOM,1.0,CORDYCEPS,1.0
2,FLAX SEED,LINUM USITATISSIMUM,FLAX SEED,EVENING PRIMROSE OIL,OENOTHERA BIENNIS,PANAX GINSENG,2.0,FLAXSEEDS OIL,1.0
3,SCRUB-PALMETTO,SERENOA REPENS,SCRUBPALMETTO,ST JOHNSWORT,HYPERICUM PERFORATUM,PANAX GINSENG,2.0,EXTRACT OF SERENOA REPENS (PERMIXON),1.0
4,PAULLINIA CUPANA,PAULLINIA CUPANA,GUARANA,KAVA,ARCTIUM LAPPA,PIPER METHYSTICUM,1.0,GUARANA (GUARANA),1.0


In [43]:
#post process: where no related rank was found, fall back to the exact lookup rank
# fillna aligns on the index and leaves a row empty only when both ranks are
# missing. It replaces a row loop whose `if lookup_rank:` guard never filtered
# anything, because NaN is truthy.
test_eval['lookup_rank_related'] = test_eval['lookup_rank_related'].fillna(test_eval['lookup_rank'])
test_eval.head()

Unnamed: 0,lookup_value,rank1_drug,rank2_drug,rank3_drug,rank4_drug,rank5_drug,lookup_rank,FAERS_drug_match,lookup_rank_related
0,HYPERICUM PERFORATUM,HYPERICUM PERFORATUM,ST JOHNSWORT,AESCULUS HIPPOCASTANUM,HORSECHESTNUT,CATSCLAW,1.0,ST. JOHN'S WORT ^HERRON^ (HYPERICUM PERFORATUM),1.0
1,CORDYCEPS,CORDYCEPS,OPHIOCORDYCEPS SINENSIS,KAVA,PIPER METHYSTICUM,BUTCHERSBROOM,1.0,CORDYCEPS,1.0
2,FLAX SEED,LINUM USITATISSIMUM,FLAX SEED,EVENING PRIMROSE OIL,OENOTHERA BIENNIS,PANAX GINSENG,2.0,FLAXSEEDS OIL,1.0
3,SCRUB-PALMETTO,SERENOA REPENS,SCRUBPALMETTO,ST JOHNSWORT,HYPERICUM PERFORATUM,PANAX GINSENG,2.0,EXTRACT OF SERENOA REPENS (PERMIXON),1.0
4,PAULLINIA CUPANA,PAULLINIA CUPANA,GUARANA,KAVA,ARCTIUM LAPPA,PIPER METHYSTICUM,1.0,GUARANA (GUARANA),1.0


In [44]:
test_eval.to_csv('evaluation/test_siamese_evaluation_related_lstm_model_np_name.csv', index=False)

In [2]:
###compute MRR from test set evaluation
test_eval = pd.read_csv('evaluation/test_siamese_evaluation_related_lstm_model_np_name.csv')

In [45]:
test_mrr = test_eval[['FAERS_drug_match', 'lookup_rank', 'lookup_rank_related']]
test_mrr.head()

Unnamed: 0,FAERS_drug_match,lookup_rank,lookup_rank_related
0,ST. JOHN'S WORT ^HERRON^ (HYPERICUM PERFORATUM),1.0,1.0
1,CORDYCEPS,1.0,1.0
2,FLAXSEEDS OIL,2.0,1.0
3,EXTRACT OF SERENOA REPENS (PERMIXON),2.0,1.0
4,GUARANA (GUARANA),1.0,1.0


In [46]:
test_mrr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1072 entries, 0 to 1071
Data columns (total 3 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   FAERS_drug_match     1072 non-null   object 
 1   lookup_rank          1022 non-null   float64
 2   lookup_rank_related  1026 non-null   float64
dtypes: float64(2), object(1)
memory usage: 25.2+ KB


In [47]:
#number of 0 relevant results
test_mrr.loc[test_mrr['lookup_rank'].isna()].shape

(50, 3)

In [48]:
test_mrr.loc[test_mrr['lookup_rank_related'].isna()].shape

(46, 3)

In [49]:
# Keep only the rows where the exact lookup value received a rank.
# NOTE(review): rows without a rank are dropped here, so every statistic
# computed from this frame ignores the misses - confirm this is intended.
found_exact = test_mrr['lookup_rank'].notna()
test_mrr_exact = test_mrr.loc[found_exact].drop(columns=['lookup_rank_related'])
test_mrr_exact.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1022 entries, 0 to 1071
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   FAERS_drug_match  1022 non-null   object 
 1   lookup_rank       1022 non-null   float64
dtypes: float64(1), object(1)
memory usage: 24.0+ KB


In [50]:
# Keep only the rows where the lookup value or its related name received a rank.
# NOTE(review): rows without a rank are dropped here, so every statistic
# computed from this frame ignores the misses - confirm this is intended.
found_related = test_mrr['lookup_rank_related'].notna()
test_mrr_rel = test_mrr.loc[found_related].drop(columns=['lookup_rank'])
test_mrr_rel.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1026 entries, 0 to 1071
Data columns (total 2 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   FAERS_drug_match     1026 non-null   object 
 1   lookup_rank_related  1026 non-null   float64
dtypes: float64(1), object(1)
memory usage: 24.0+ KB


In [51]:
exact_reciprocal = 1/test_mrr_exact['lookup_rank']
test_mrr_exact['reciprocal_rank'] = exact_reciprocal
test_mrr_exact.head()

Unnamed: 0,FAERS_drug_match,lookup_rank,reciprocal_rank
0,ST. JOHN'S WORT ^HERRON^ (HYPERICUM PERFORATUM),1.0,1.0
1,CORDYCEPS,1.0,1.0
2,FLAXSEEDS OIL,2.0,0.5
3,EXTRACT OF SERENOA REPENS (PERMIXON),2.0,0.5
4,GUARANA (GUARANA),1.0,1.0


In [52]:
##get the mean of reciprocal ranks for exact matches
test_mrr_exact.reciprocal_rank.mean()

0.7299739073711676

In [53]:
#get median and stdev
test_mrr_exact.lookup_rank.median()

2.0

In [54]:
test_mrr_exact.reciprocal_rank.median()

0.5

In [55]:
test_mrr_exact.lookup_rank.std()

0.6317946754922137

In [14]:
test_mrr_exact.reciprocal_rank.std()

0.26693768780742705

In [56]:
rel_reciprocal = 1/test_mrr_rel['lookup_rank_related']
test_mrr_rel['reciprocal_rank'] = rel_reciprocal
test_mrr_rel.head()

Unnamed: 0,FAERS_drug_match,lookup_rank_related,reciprocal_rank
0,ST. JOHN'S WORT ^HERRON^ (HYPERICUM PERFORATUM),1.0,1.0
1,CORDYCEPS,1.0,1.0
2,FLAXSEEDS OIL,1.0,1.0
3,EXTRACT OF SERENOA REPENS (PERMIXON),1.0,1.0
4,GUARANA (GUARANA),1.0,1.0


In [57]:
test_mrr_rel.reciprocal_rank.mean()

0.9265107212475634

In [58]:
test_mrr_rel.reciprocal_rank.median()

1.0

In [59]:
test_mrr_rel.reciprocal_rank.std()

0.18510530964102687

In [60]:
test_mrr_rel.lookup_rank_related.median()

1.0

In [61]:
test_mrr_rel.lookup_rank_related.std()

0.5098683830938465