# Imports & Globals

In [1]:
import numpy as np
import pandas as pd
import pickle
from yaml import safe_load

import re
import string
from typing import Literal

#import matplotlib.pyplot as plt
#import seaborn as sns
# %matplotlib inline

import tensorflow as tf
from tensorflow.python.ops.numpy_ops import np_config
np_config.enable_numpy_behavior()

# tf.enable_eager_execution()
# tf.executing_eagerly()


In [2]:
with open('LSTM65.yaml', 'r') as file:
    model_config = safe_load(file)

In [3]:
#Maximum sequence length including padding
global MAXLEN
MAXLEN = model_config['MAXLEN']

In [4]:
global encode_dict 
encode_dict = {l:i for i,l in enumerate(string.ascii_uppercase + " ", 1)}

## Data Processing Functions

In [5]:
def add_noise(w: str, percent: float = 0.1) -> str:
    ''' Adds a specified proportion of noise to a string.
    
    Expects a string and a number stating the percent of noise to add to this string.
    The string is modified by editing, deleting, or adding characters in/to the string.
    The modification to perform is determined randomly by generating a random number from an uniform distribution [0,1].
    If the number is < 1/3 edit one position with new random character.
    If the number is < 2/3 delete one position.
    Finally, if the number is > 2/3 add one random character. 
    
    Parameters
    ----------
    w : str
        The string to add noise to.
    
    percent: float, defaults to 10% if not specified
        Percentange representing the proportion of noise to add to the string.
        
        
    Returns
    -------
    w : str
        Modified string with noise added.
    '''  
    positions = random.choices(range(len(w)), k=int(percent*len(w)))
    print("Adding noise to", int(percent*len(w)), "% of the string")
    for p in positions:
        r = random.uniform(0,1)
        
        # if <1/3 edit one position with new random character, # else if <2/3 delete one position, else add one random character 
        if r <= 0.3333: # edit
            w = w[:p] + random.choice(string.ascii_uppercase) + w[p+1:]
        elif r<= 0.6667: # delete
            w = w[:p] + w[p+1:]
        else: # add
            w = w[:p] + random.choice(string.ascii_uppercase) + w[p:]
    return w

In [6]:
def clean(text: str) -> str:
    '''Removes all the non-ascii and special characters from a string and returns the string's alphabetichal characters with spaces.
    
    Expects a string to be cleaned and removes all the non-ascii and special characters. 
    This is done by applying a substitution to regex matches
    Returns the cleaned string containing uppercased versions of the characters.
    
    Parameters
    ----------
    text : str
        
    Returns
    -------
    text : str
    '''
    regex = re.compile('[^a-zA-Z ]')
    r = regex.sub('', text)
    result = re.sub(' +', ' ', r)
    result = result.strip()
    return result.upper()

In [7]:
def encode(x: string) -> [int]:
    '''Applies the encoding function to a given value.
    
    Returns these string enconded into an array containing an integer mapping to each character and space (1-66) separately.
    
    Parameters
    ----------
    x : string
        
      
    Returns
    -------
    x : [int]
        Returns the encoded string.

    '''
    return list(map(encode_dict.get, x))

In [8]:
def padding(x: [int], maxlen: int = MAXLEN) -> [int]:
    '''Applies the padding function to the encoded sequence.
    
    Returns the enconded Series padded.
    
    Parameters
    ----------
    x : [int]
       Encoded character sequence.
      
    Returns
    -------
    x : [int]
        Returns the padded encoded character sequence.
    '''
    return x + ([0]* (maxlen-len(x)))

------------------------------------------------

# Data loading and preprocessing

# Pickled Datasets

In [9]:
dUnique_df = pd.read_pickle("../data/dUnique_df.pkl")
dfneg2 = pd.read_pickle("../data/dfneg2.pkl")
test = pd.read_pickle("../data/test.pkl")
validate = pd.read_pickle("../data/validate.pkl")

--------------------------------

# Build model, load weights and evaluate on test data

In [10]:
@tf.function(jit_compile=True)  # The decorator converts `cosine_similarity` into a tensolflow `Function`.
def cosine_similarity(vects: tf.TensorArray) -> tf.TensorArray:
    '''Cosine similarity to be calculated as sum(x*y)/(sqrt(sum(x))*sqrt(sum(y))).
    This is achieved through Tensorflow functions to retain performance.
    
    Parameters
    ----------
    vects: tf.TensorArray
        
    
    Returns
    -------
    cosine_distance: tf.TensorArray
       The result of the cosine similarity between the vectors.    
    '''
    x, y = vects
    return tf.math.divide(tf.reduce_sum(tf.multiply(x,y), axis=1, keepdims=True), tf.multiply(tf.norm(x, ord=2, axis=1, keepdims=True), tf.norm(y, ord=2, axis=1, keepdims=True)))

In [11]:
@tf.function(jit_compile=True)  # The decorator converts `cosine_distance` into a tensolflow `Function`.
def cosine_distance(vects: tf.TensorArray) -> tf.TensorArray:
    '''Cosine distance to be calculated as 1-(cosine similarity).
    Where cosine similarity equals sum(x*y)/(sqrt(sum(x))*sqrt(sum(y))).
    This is achieved through Tensorflow functions to retain performance.
    
    Parameters
    ----------
    vects: tf.TensorArray
        
    
    Returns
    -------
    cosine_distance: tf.TensorArray
        The result of 1-cosine similarity between the vectors. 
    '''
    x, y = vects
    return tf.math.subtract(tf.constant([1.0]), tf.math.divide(tf.reduce_sum(tf.multiply(x,y), axis=1, keepdims=True), tf.multiply(tf.norm(x, ord=2, axis=1, keepdims=True), tf.norm(y, ord=2, axis=1, keepdims=True))))

In [12]:
class ContrastiveLoss(tf.keras.losses.Loss):
    '''Returns a value between 0 and 1 representing the average error of the y_pred vector by comparing it to the y_true.
    '''  
    def __init__(self, margin: tf.float32 = 1.0):
        super(ContrastiveLoss, self).__init__()
        self.name = 'ContrastiveLoss'
        self.margin = tf.constant(margin)

    @tf.function(jit_compile=True)  # The decorator converts `loss` into a tensolflow `Function`.
    def call(self, y_true: tf.TensorArray, y_pred: tf.TensorArray) -> tf.Tensor:
        return tf.math.reduce_mean((1 - y_true) * tf.math.square(y_pred) + (y_true) * tf.math.square(tf.math.maximum(self.margin - (y_pred), 0.0)), axis = -1)

In [13]:
def build_model2(model_type: Literal["lstm", "gru"], embedding_dim: int, num_rnn_node: int, num_dense_node: int, num_layer: int, activation_fn: str, learning_rate: float, optimizer_fn: Literal["Adam", "RMSprop", "SGD"], margin: float, output_activation: str) -> tf.keras.Model: 
    '''Specifies the architecture of the model to be trained.
    '''
    input_x = tf.keras.layers.Input(MAXLEN)
    input_1 = tf.keras.layers.Input(MAXLEN)
    input_2 = tf.keras.layers.Input(MAXLEN)
    embedding = tf.keras.layers.Embedding(input_dim=28, output_dim=embedding_dim, mask_zero=True)
    x = embedding(input_x)
    x = tf.keras.layers.BatchNormalization()(x)
    
    match model_type:
        case "lstm":
            x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(num_rnn_node))(x)
        case "gru":
            x = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(num_rnn_node))(x)
        case "rnn":
            x = tf.keras.layers.Bidirectional(tf.keras.layers.RNN(num_rnn_node))(x)
 
    num = num_dense_node
    for _ in range(num_layer):
        x = tf.keras.layers.Dense(num, activation=activation_fn)(x)
        num /= 2
        
    embedding_network = tf.keras.Model(input_x, x)

    tower_1 = embedding_network(input_1)
    tower_2 = embedding_network(input_2)

    merge_layer = tf.keras.layers.Lambda(cosine_similarity)([tower_1, tower_2])
    normal_layer = tf.keras.layers.BatchNormalization()(merge_layer)
    output_layer = tf.keras.layers.Dense(1, activation="hard_sigmoid")(normal_layer)
    contr = tf.keras.Model(inputs=[input_1, input_2], outputs=output_layer)
    
    match optimizer_fn:
        case "Adam":
            opt = tf.keras.optimizers.Adam(learning_rate=learning_rate)
        case "RMSprop":                
            opt = tf.keras.optimizers.RMSprop(learning_rate=learning_rate)
        case "SGD":                
            opt = tf.keras.optimizers.SGD(learning_rate=learning_rate)

    contr.compile(loss=ContrastiveLoss(margin = margin), optimizer=opt, metrics=["accuracy"])
    
    return contr

In [14]:
# model = build_model2(model_type = "lstm", embedding_dim = 256, num_rnn_node = 248, num_dense_node = 124, num_layer = 1, activation_fn = "tanh", learning_rate = 2e-4, optimizer= "Adam", margin = 0.8)

In [15]:
initial_learning_rate = model_config['Learning_Rate']
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate,
    decay_steps=1000,
    decay_rate=0.90,
    staircase=True)

model = build_model2(
    model_type =  model_config['Model_Type'], 
    embedding_dim = model_config['Embedding_Dimensions'], 
    num_rnn_node =  model_config['Number_RNN_Nodes'],
    num_dense_node =  model_config['Number_Dense_Nodes'], 
    num_layer =  model_config['Number_Layers'], 
    activation_fn =  model_config['Activation_Function'],
    learning_rate = lr_schedule,
    optimizer_fn = model_config['Optimizer'],
    margin =  model_config['Margin'],
    output_activation = model_config['Output_Activation']
)
model.summary()

2022-08-11 12:13:36.536531: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-11 12:13:36.561307: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-11 12:13:36.561477: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-11 12:13:36.561974: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropri

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 65)]         0           []                               
                                                                                                  
 input_3 (InputLayer)           [(None, 65)]         0           []                               
                                                                                                  
 model (Functional)             (None, 100)          109220      ['input_2[0][0]',                
                                                                  'input_3[0][0]']                
                                                                                                  
 lambda (Lambda)                (None, 1)            0           ['model[0][0]',            

In [16]:
# checkpoint_filepath = "../ModelCheckpointSaves/maxlen-65/BEST_LSTM65/Wed-Aug-10-2022/03-25AM"
checkpoint_filepath = "../ModelCheckpointSaves/maxlen-65/BEST_LSTM65/Thu-Aug-11-2022/03-46AM"
checkpoint_filepath

'../ModelCheckpointSaves/maxlen-65/BEST_LSTM65/Thu-Aug-11-2022/03-46AM'

In [17]:
# model.load_weights("../exp3-hyperparameter-tuning/alstm-22-0.01.hdf5")
model.load_weights(checkpoint_filepath)
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 65)]         0           []                               
                                                                                                  
 input_3 (InputLayer)           [(None, 65)]         0           []                               
                                                                                                  
 model (Functional)             (None, 100)          109220      ['input_2[0][0]',                
                                                                  'input_3[0][0]']                
                                                                                                  
 lambda (Lambda)                (None, 1)            0           ['model[0][0]',            

---------------------------------------------------------------

# Evaluation

In [18]:
def preprocessInput(filename: str, **kwargs) -> pd.DataFrame:
    '''Preprocess CSV file into a Pandas DataFrame.
    
    Expects the file name or path of a csv file with named columns containing strings representing product names.
    Returns a Pandas Dataframe containing uppercased versions of the strings on each cell.
    
    Parameters
    ----------
    filename : str
        
    Returns
    -------
    df : Pandas DataFrame
    '''  
    df = pd.read_csv(filename, **kwargs)
    df.dropna()
    print(df.info())
    print("Processing file: ---------------------------------------")
    original_count = len(df.index)
    print("Dropping sequences longer than the maxlen:")
    for column in df.columns:
        print(column)
        df.drop(df[df[column].apply(len).gt(MAXLEN)].index, inplace = True)
    new_count = len(df.index)
    print("\tDropped", original_count - new_count, "that exceeded the maximum sequence length.")
    # Uppercase all values
    print("\tUppercasing string sequences.")
    df = df.applymap(lambda x: str.upper(x))
    print("Done processing: --------------------------------------")
    print(df.info())
    return df

## Unique target labels

In [19]:
dUnique_df.head()

Unnamed: 0,dUnique_label,dUnique_seq_padded
0,ASHWAGANDA,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ..."
1,BUTCHERSBROOM,"[2, 21, 20, 3, 8, 5, 18, 19, 2, 18, 15, 15, 13..."
2,CATSCLAW,"[3, 1, 20, 19, 3, 12, 1, 23, 0, 0, 0, 0, 0, 0,..."
3,CINNAMON,"[3, 9, 14, 14, 1, 13, 15, 14, 0, 0, 0, 0, 0, 0..."
4,FENUGREEK,"[6, 5, 14, 21, 7, 18, 5, 5, 11, 0, 0, 0, 0, 0,..."


## True Positives

In [20]:
fName = '../data/NP_FAERS_mapped_20220215.csv'
matches = preprocessInput(fName)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5358 entries, 0 to 5357
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   FAERS_drug_match  5358 non-null   object
 1   lookup_value      5358 non-null   object
dtypes: object(2)
memory usage: 83.8+ KB
None
Processing file: ---------------------------------------
Dropping sequences longer than the maxlen:
FAERS_drug_match
lookup_value
	Dropped 374 that exceeded the maximum sequence length.
	Uppercasing string sequences.
Done processing: --------------------------------------
<class 'pandas.core.frame.DataFrame'>
Int64Index: 4984 entries, 1 to 5357
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   FAERS_drug_match  4984 non-null   object
 1   lookup_value      4984 non-null   object
dtypes: object(2)
memory usage: 116.8+ KB
None


In [21]:
matches["Processed_FAERS_drug_match"] = matches.FAERS_drug_match.apply(clean).apply(encode).apply(padding)
matches["Processed_lookup_value"] = matches.lookup_value.apply(clean).apply(encode).apply(padding)

In [22]:
matches.head()

Unnamed: 0,FAERS_drug_match,lookup_value,Processed_FAERS_drug_match,Processed_lookup_value
1,ASHWAGANDHA,ASHWAGANDA,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 0, 0, 0, ...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ..."
2,"ASHWAGANDHA,",ASHWAGANDA,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 0, 0, 0, ...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ..."
3,ASHWAGANDHA /01660201/,ASHWAGANDA,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 0, 0, 0, ...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ..."
4,ASHWAGANDHA /01660201/,ASHWAGANDA,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 0, 0, 0, ...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ..."
6,ASHWAGANDHA EXTRACT,ASHWAGANDA,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 27, 5, 24...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ..."


### ASHWAGANDA

In [23]:
matches.loc[1, "FAERS_drug_match"]

'ASHWAGANDHA'

In [24]:
predicts = model.predict([np.tile(matches.loc[1, "Processed_FAERS_drug_match"], (dUnique_df['dUnique_seq_padded'].shape[0],1)), np.stack(dUnique_df['dUnique_seq_padded'])]).astype('float32')
top5 = predicts.flatten().argsort()

2022-08-11 12:13:43.495555: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8401
2022-08-11 12:13:43.601089: I tensorflow/compiler/xla/service/service.cc:170] XLA service 0x7f36846636b0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2022-08-11 12:13:43.601109: I tensorflow/compiler/xla/service/service.cc:178]   StreamExecutor device (0): NVIDIA GeForce RTX 2070 with Max-Q Design, Compute Capability 7.5
2022-08-11 12:13:43.603714: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:263] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2022-08-11 12:13:43.656947: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory




2022-08-11 12:13:43.711738: I tensorflow/compiler/jit/xla_compilation_cache.cc:478] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


In [25]:
# Top-5 smalles distances
{
    "rank1": dUnique_df.iloc[top5[-1]]['dUnique_label'],
    "rank2": dUnique_df.iloc[top5[-2]]['dUnique_label'],
    "rank3": dUnique_df.iloc[top5[-3]]['dUnique_label'],
    "rank4": dUnique_df.iloc[top5[-4]]['dUnique_label'],
    "rank5": dUnique_df.iloc[top5[-5]]['dUnique_label'],
}

{'rank1': 'ASHWAGANDA',
 'rank2': 'WITHANIA SOMNIFERA',
 'rank3': 'PAUSINYSTALIA JOHIMBE',
 'rank4': 'MORINGA',
 'rank5': 'PANAX GINSENG'}

### Echinacea

In [26]:
dfneg2[dfneg2["FAERS_drug_match"].str.contains("ECHINACEA")].head(2)

Unnamed: 0,FAERS_drug_match,lookup_value
0,ECHINACEA PRN,MACA
1,ECHINACEA PRN,RED YEAST RICE


In [27]:
i = 0
clean(dfneg2["FAERS_drug_match"][i])

'ECHINACEA PRN'

In [28]:
dUnique_df[dUnique_df["dUnique_label"] == "ECHINACEA"]

Unnamed: 0,dUnique_label,dUnique_seq_padded
20,ECHINACEA,"[5, 3, 8, 9, 14, 1, 3, 5, 1, 0, 0, 0, 0, 0, 0,..."


In [29]:
predicts = model.predict([np.tile(padding(encode(clean(dfneg2["FAERS_drug_match"][i]))), (dUnique_df['dUnique_seq_padded'].shape[0],1)), np.stack(dUnique_df['dUnique_seq_padded'])]).astype('float32')
top5 = predicts.flatten().argsort()



In [30]:
# Top-5 smalles distances
m = max(top5)
{
    "rank1": dUnique_df.iloc[top5[-1]]['dUnique_label'],
    "rank2": dUnique_df.iloc[top5[-2]]['dUnique_label'],
    "rank3": dUnique_df.iloc[top5[-3]]['dUnique_label'],
    "rank4": dUnique_df.iloc[top5[-4]]['dUnique_label'],
    "rank5": dUnique_df.iloc[top5[-5]]['dUnique_label'],
}

{'rank1': 'ECHINACEA',
 'rank2': 'ECHINACEA PURPUREA',
 'rank3': 'WOODLAND HAWTHORN',
 'rank4': 'CRATAEGUS LAEVIGATA',
 'rank5': 'WHEAT GRASS'}

### Cranberry

In [31]:
dfneg2[dfneg2["FAERS_drug_match"].str.contains("CRANBERRY")].head(2)

Unnamed: 0,FAERS_drug_match,lookup_value
8,AZO CRANBERRY URINARY TRACT,HARPAGOPHYTUM PROCUMBENS
9,AZO CRANBERRY URINARY TRACT,AESCULUS HIPPOCASTANUM


In [32]:
i = 8
clean(dfneg2["FAERS_drug_match"][8])

'AZO CRANBERRY URINARY TRACT'

In [33]:
predicts = model.predict([np.tile(padding(encode(clean(dfneg2["FAERS_drug_match"][i]))), (dUnique_df['dUnique_seq_padded'].shape[0],1)), np.stack(dUnique_df['dUnique_seq_padded'])]).astype('float32')
top5 = predicts.flatten().argsort()
# argsort = predicts.flatten().argsort()



In [34]:
# Top-5 smalles distances
m = max(top5)
{
    "rank1": dUnique_df.iloc[top5[-1]]['dUnique_label'],
    "rank2": dUnique_df.iloc[top5[-2]]['dUnique_label'],
    "rank3": dUnique_df.iloc[top5[-3]]['dUnique_label'],
    "rank4": dUnique_df.iloc[top5[-4]]['dUnique_label'],
    "rank5": dUnique_df.iloc[top5[-5]]['dUnique_label'],
}

{'rank1': 'CRANBERRY',
 'rank2': 'VACCINIUM MACROCARPON',
 'rank3': 'HYPERICUM PERFORATUM',
 'rank4': 'ST JOHNSWORT',
 'rank5': 'TURMERIC'}

## True  Negatives

In [35]:
dfneg2.head()

Unnamed: 0,FAERS_drug_match,lookup_value
0,ECHINACEA PRN,MACA
1,ECHINACEA PRN,RED YEAST RICE
2,ECHINACEA PRN,LYCIUM BARBARUM
3,ECHINACEA PRN,SAMBUCUS NIGRA
4,KYOLIC AGED GARLIC EXTRACT,CHAMOMILE


# Evaluating on test data - NP names only

In [36]:
vocab = pd.read_csv('../data/lb_to_common_names.csv')
vocab.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 958 entries, 0 to 957
Data columns (total 4 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   latin_binomial        958 non-null    object
 1   common_name           958 non-null    object
 2   latin_binomial_clean  958 non-null    object
 3   common_name_clean     958 non-null    object
dtypes: object(4)
memory usage: 30.1+ KB


In [37]:
test = test.assign(rank1_drug="", rank2_drug="", rank3_drug="", rank4_drug="", rank5_drug="", rank1_distance=np.Inf, rank2_distance=np.Inf, rank3_distance=np.Inf, rank4_distance=np.Inf, rank5_distance=np.Inf, lookup_rank= np.Inf, lookup_rank_related = np.Inf)
test.head()

Unnamed: 0,FAERS_drug_match,lookup_value,Processed_FAERS_drug_match,Processed_lookup_value,Match,rank1_drug,rank2_drug,rank3_drug,rank4_drug,rank5_drug,rank1_distance,rank2_distance,rank3_distance,rank4_distance,rank5_distance,lookup_rank,lookup_rank_related
39833,CAT CLW,CATSCLAW,"[3, 1, 20, 27, 3, 12, 23, 0, 0, 0, 0, 0, 0, 0,...","[3, 1, 20, 19, 3, 12, 1, 23, 0, 0, 0, 0, 0, 0,...",1,,,,,,inf,inf,inf,inf,inf,inf,inf
35009,GYMNEMA HERBALTEAU,MACA,"[7, 25, 13, 14, 5, 13, 1, 27, 8, 5, 18, 2, 1, ...","[13, 1, 3, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",1,,,,,,inf,inf,inf,inf,inf,inf,inf
4045,CRANBERRY AND VITAMIN C,VACCINIUM MACROCARPON,"[3, 18, 1, 14, 2, 5, 18, 18, 25, 27, 1, 14, 4,...","[22, 1, 3, 3, 9, 14, 9, 21, 13, 27, 13, 1, 3, ...",1,,,,,,inf,inf,inf,inf,inf,inf,inf
24983,CITRUS X PARADISI JUICE,TANACETUM PARTHENIUM,"[3, 9, 20, 18, 21, 19, 27, 24, 27, 16, 1, 18, ...","[20, 1, 14, 1, 3, 5, 20, 21, 13, 27, 16, 1, 18...",0,,,,,,inf,inf,inf,inf,inf,inf,inf
7974,PAEONIAE RADIX LICORICE EXTRACT,RED YEAST RICE,"[16, 1, 5, 15, 14, 9, 1, 5, 27, 18, 1, 4, 9, 2...","[18, 5, 4, 27, 25, 5, 1, 19, 20, 27, 18, 9, 3,...",0,,,,,,inf,inf,inf,inf,inf,inf,inf


# Evaluation of drug name predictions
### Find ranks 1-n from the predicted similarities for the test data

In [38]:
def find_ranks(model: tf.keras.Model, df: pd.DataFrame, related_rank: bool = False) -> pd.DataFrame:
    """For each row in the test set (input), use the model to predict if the 'FAERS_drug_match' entry matches any of the 'lookup_value' entries.
       This is done at the encoded sequence level for both name all unique drugnames
       
         Parameters
    ----------
    model : tf.keras.Model
        A Keras model based Siamese Network that takes three inputs. 
        Namely, two input sequeces and a third input binary target specifying wether the two sequeces match.
    y : pd.Series
        A pandas Series containing the clean encoded 'lookup_value' column.
      
    Returns
    -------
    x : pd.Series
        Returns the padded 'FAERS_drug_match' series.
    y : pd.Series 
        Returns the padded 'lookup_value' series.
    
    """
    
    print("Using column: ", df.columns[2])
    for i in df.index:
        predicts = model.predict([np.tile(df.loc[i, "Processed_FAERS_drug_match"], (dUnique_df['dUnique_seq_padded'].shape[0],1)), np.stack(dUnique_df['dUnique_seq_padded'])]).astype('float32')
        argsort = predicts.flatten().argsort()
        # Top-5 smalles distances
        df.at[i, 'rank1_drug'] = dUnique_df['dUnique_label'][argsort[-1]]  
        df.at[i, 'rank2_drug'] = dUnique_df['dUnique_label'][argsort[-2]]
        df.at[i, 'rank3_drug'] = dUnique_df['dUnique_label'][argsort[-3]]
        df.at[i, 'rank4_drug'] = dUnique_df['dUnique_label'][argsort[-4]]
        df.at[i, 'rank5_drug'] = dUnique_df['dUnique_label'][argsort[-5]]
        df.at[i, 'rank1_distance'] = predicts[argsort[-1]]  
        df.at[i, 'rank2_distance'] = predicts[argsort[-2]]
        df.at[i, 'rank3_distance'] = predicts[argsort[-3]]
        df.at[i, 'rank4_distance'] = predicts[argsort[-4]]
        df.at[i, 'rank5_distance'] = predicts[argsort[-5]]
        if related_rank:
            # Does any of them match
            lookup_clean = df.loc[i]['lookup_value']
            predicted_rank = df.loc[i][['rank1_drug', 'rank2_drug', 'rank3_drug', 'rank4_drug', 'rank5_drug']].eq(lookup_clean).to_numpy().nonzero()

            lookup_rank = np.Inf    
            if len(predicted_rank[0]) > 0 :
                lookup_rank = predicted_rank[0][0] + 1
                df.loc[i, 'lookup_rank'] = lookup_rank

            # Let's compare to latin binomial
            lb_res = vocab.loc[vocab['latin_binomial_clean'] == lookup_clean]
            common_res = vocab.loc[vocab['common_name_clean'] == lookup_clean]
            lookup_result = ''
            if len(lb_res) > 0:
                lookup_result = lb_res.common_name_clean.values[0]
            elif len(common_res) > 0:
                lookup_result = common_res.latin_binomial_clean.values[0]

            related_rank = np.Inf
            if lookup_result != '':
                annotated_rank = df.loc[i][['rank1_drug', 'rank2_drug', 'rank3_drug', 'rank4_drug', 'rank5_drug']].eq(lookup_result).to_numpy().nonzero()
                if len(annotated_rank[0]) > 0: 
                    related_rank = annotated_rank[0][0] + 1

            #find related mappings to lookup value in predicted values 
            df.loc[i, 'lookup_rank_related'] = min(lookup_rank, related_rank)

    return subsample

## Assing ranks to the matching 
matches are assigned their corresponding rank
non-matches are left null

In [39]:
subsample = test[test["Match"] == 1].sample(n=50, ignore_index=True)
predicted = find_ranks(model, subsample, True)

Using column:  Processed_FAERS_drug_match


In [40]:
predicted.head(20)

Unnamed: 0,FAERS_drug_match,lookup_value,Processed_FAERS_drug_match,Processed_lookup_value,Match,rank1_drug,rank2_drug,rank3_drug,rank4_drug,rank5_drug,rank1_distance,rank2_distance,rank3_distance,rank4_distance,rank5_distance,lookup_rank,lookup_rank_related
0,CRLANBOERRY CRANBERRY,CRANBERRY,"[3, 18, 12, 1, 14, 2, 15, 5, 18, 18, 25, 27, 3...","[3, 18, 1, 14, 2, 5, 18, 18, 25, 0, 0, 0, 0, 0...",1,CRANBERRY,VACCINIUM MACROCARPON,LYCIUM BARBARUM,STEVIA REBAUDIANA,ELDERBERRY,1.0,1.0,0.107672,0.0,0.0,1.0,1.0
1,MIL FIZZLE,SILYBUM MARIANUM,"[13, 9, 12, 27, 6, 9, 26, 26, 12, 5, 0, 0, 0, ...","[19, 9, 12, 25, 2, 21, 13, 27, 13, 1, 18, 9, 1...",1,SILYBUM MARIANUM,FLAX SEED,LINUM USITATISSIMUM,MILK THISTLE,GINGER,1.0,1.0,1.0,1.0,0.36429,1.0,1.0
2,FLAX SEED CAG,FLAX SEED,"[6, 12, 1, 24, 27, 19, 5, 5, 4, 27, 3, 1, 7, 0...","[6, 12, 1, 24, 27, 19, 5, 5, 4, 0, 0, 0, 0, 0,...",1,FLAX SEED,LINUM USITATISSIMUM,OCIMUM TENUIFLORUM,GREEN TEA,CAMELLIA SINENSIS,1.0,1.0,0.175437,0.084084,0.034021,1.0,1.0
3,ALLIUMSATYVURMALLIUM ATIVUM,ALLIUM SATIVUM,"[1, 12, 12, 9, 21, 13, 19, 1, 20, 25, 22, 21, ...","[1, 12, 12, 9, 21, 13, 27, 19, 1, 20, 9, 22, 2...",1,GARLIC,ALLIUM SATIVUM,REISHI,TARAXACUM OFFICINALE,SWALLOWWORT,1.0,1.0,0.542126,0.458511,0.280641,2.0,1.0
4,EVENING RIMOTSJEP OIL OENOTHRA BKIENNIS OIL,EVENING PRIMROSE OIL,"[5, 22, 5, 14, 9, 14, 7, 27, 18, 9, 13, 15, 20...","[5, 22, 5, 14, 9, 14, 7, 27, 16, 18, 9, 13, 18...",1,EVENING PRIMROSE OIL,OENOTHERA BIENNIS,SCRUBPALMETTO,GINGER,RUSCUS ACULEATUS,1.0,1.0,0.210953,0.087281,0.058858,1.0,1.0
5,CRANBER RY CONWCENTREATE ORAL,VACCINIUM MACROCARPON,"[3, 18, 1, 14, 2, 5, 18, 27, 18, 25, 27, 3, 15...","[22, 1, 3, 3, 9, 14, 9, 21, 13, 27, 13, 1, 3, ...",1,CRANBERRY,VACCINIUM MACROCARPON,STEVIA REBAUDIANA,ELDERBERRY,HORSETAIL,1.0,1.0,0.0,0.0,0.0,2.0,1.0
6,ECHINCHEA,ECHINACEA,"[5, 3, 8, 9, 14, 3, 8, 5, 1, 0, 0, 0, 0, 0, 0,...","[5, 3, 8, 9, 14, 1, 3, 5, 1, 0, 0, 0, 0, 0, 0,...",1,ECHINACEA PURPUREA,ECHINACEA,WOODLAND HAWTHORN,MATRICARIA CHAMOMILLA,IVY LEAF,1.0,1.0,0.334851,0.250624,0.0,2.0,2.0
7,CYSANTHEMUMPARTHENIUM,FEVERFEW,"[3, 25, 19, 1, 14, 20, 8, 5, 13, 21, 13, 16, 1...","[6, 5, 22, 5, 18, 6, 5, 23, 0, 0, 0, 0, 0, 0, ...",1,FEVERFEW,TANACETUM PARTHENIUM,UNCARIA TOMENTOSA,IVY LEAF,URTICA DIOICA,1.0,1.0,0.991032,0.409947,0.318095,1.0,1.0
8,BOSWELLI SERRAA,BOSWELLIA,"[2, 15, 19, 23, 5, 12, 12, 9, 27, 19, 5, 18, 1...","[2, 15, 19, 23, 5, 12, 12, 9, 1, 0, 0, 0, 0, 0...",1,BOSWELLIA,BOSWELLIA SERRATA,ALOE VERA,SWALLOWWORT,OLEA EUROPAEA,1.0,1.0,0.484877,0.369754,0.271316,1.0,1.0
9,FLAXSEED GRANULES,LINUM USITATISSIMUM,"[6, 12, 1, 24, 19, 5, 5, 4, 27, 7, 18, 1, 14, ...","[12, 9, 14, 21, 13, 27, 21, 19, 9, 20, 1, 20, ...",1,LINUM USITATISSIMUM,FLAX SEED,SCRUBPALMETTO,STEVIA REBAUDIANA,IVY LEAF,1.0,1.0,0.178301,0.0,0.0,1.0,1.0


# Add related mappings rank to test set evaluation

In [41]:
test = find_ranks(model, test[test["Match"]==1],  True)

Using column:  Processed_FAERS_drug_match




















































----------------------------------------------------

In [42]:
if "test" not in locals():
    test = pd.read_csv('../evaluation/test_siamese_evaluation_lstm_model_np_name.csv')
    test.info()

In [43]:
test_mrr = test[['FAERS_drug_match', 'lookup_rank', 'lookup_rank_related']]
test_mrr.head()

Unnamed: 0,FAERS_drug_match,lookup_rank,lookup_rank_related
0,CRLANBOERRY CRANBERRY,1.0,1.0
1,MIL FIZZLE,1.0,1.0
2,FLAX SEED CAG,1.0,1.0
3,ALLIUMSATYVURMALLIUM ATIVUM,2.0,1.0
4,EVENING RIMOTSJEP OIL OENOTHRA BKIENNIS OIL,1.0,1.0


In [44]:
test_mrr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 3 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   FAERS_drug_match     50 non-null     object 
 1   lookup_rank          50 non-null     float64
 2   lookup_rank_related  50 non-null     float64
dtypes: float64(2), object(1)
memory usage: 1.3+ KB


In [45]:
#number of 0 relevant results
test_mrr.loc[test_mrr['lookup_rank'].isna()].shape

(0, 3)

In [46]:
test_mrr.loc[test_mrr['lookup_rank_related'].isna()].shape

(0, 3)

In [47]:
test_mrr_exact = test_mrr[test_mrr['lookup_rank'].notna()]
test_mrr_exact = test_mrr_exact.drop(['lookup_rank_related'], axis=1)
test_mrr_exact.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50 entries, 0 to 49
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   FAERS_drug_match  50 non-null     object 
 1   lookup_rank       50 non-null     float64
dtypes: float64(1), object(1)
memory usage: 1.2+ KB


In [48]:
test_mrr_rel = test_mrr[test_mrr['lookup_rank_related'].notna()]
test_mrr_rel = test_mrr_rel.drop(['lookup_rank'], axis=1)
test_mrr_rel.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50 entries, 0 to 49
Data columns (total 2 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   FAERS_drug_match     50 non-null     object 
 1   lookup_rank_related  50 non-null     float64
dtypes: float64(1), object(1)
memory usage: 1.2+ KB


In [49]:
exact_reciprocal = 1/test_mrr_exact['lookup_rank']
test_mrr_exact['reciprocal_rank'] = exact_reciprocal
test_mrr_exact.head()

Unnamed: 0,FAERS_drug_match,lookup_rank,reciprocal_rank
0,CRLANBOERRY CRANBERRY,1.0,1.0
1,MIL FIZZLE,1.0,1.0
2,FLAX SEED CAG,1.0,1.0
3,ALLIUMSATYVURMALLIUM ATIVUM,2.0,0.5
4,EVENING RIMOTSJEP OIL OENOTHRA BKIENNIS OIL,1.0,1.0


In [50]:
##get the mean of reciprocal ranks for exact matches
test_mrr_exact.reciprocal_rank.mean()

0.6766666666666667

In [51]:
#get median and stdev
test_mrr_exact.lookup_rank.median()

2.0

In [52]:
test_mrr_exact.reciprocal_rank.median()

0.5

In [53]:
test_mrr_exact.lookup_rank.std()

0.8964783708421274

In [54]:
test_mrr_exact.reciprocal_rank.std()

0.2875141719863049

In [55]:
rel_reciprocal = 1/test_mrr_rel['lookup_rank_related']
test_mrr_rel['reciprocal_rank'] = rel_reciprocal
test_mrr_rel.head()

Unnamed: 0,FAERS_drug_match,lookup_rank_related,reciprocal_rank
0,CRLANBOERRY CRANBERRY,1.0,1.0
1,MIL FIZZLE,1.0,1.0
2,FLAX SEED CAG,1.0,1.0
3,ALLIUMSATYVURMALLIUM ATIVUM,1.0,1.0
4,EVENING RIMOTSJEP OIL OENOTHRA BKIENNIS OIL,1.0,1.0


In [56]:
test_mrr_rel.reciprocal_rank.mean()

0.8783333333333333

In [57]:
test_mrr_rel.reciprocal_rank.median()

1.0

In [58]:
test_mrr_rel.reciprocal_rank.std()

0.2352267685725016

In [59]:
test_mrr_rel.lookup_rank_related.median()

1.0

In [60]:
test_mrr_rel.lookup_rank_related.std()

0.6468132241526727

--------------------------------------

# Translation tests

In [61]:
translation = '../data/translation_test_nps_202203171038.csv'

In [62]:
# test["Processed_np_name"] = test.np_name.apply(clean).apply(encode).apply(padding)

In [63]:
# test = preprocessInput(unmapped, converters = {"drug_name_original":str}, skip_blank_lines=True, na_filter=True, na_values="")
# test["Processed_drug_name_original"] = test[test.columns[0]].apply(clean).apply(encode).apply(padding)