# Imports & Globals

In [1]:
import numpy as np
import pandas as pd
import pickle
from yaml import safe_load

import re
import string
from typing import Literal

#import matplotlib.pyplot as plt
#import seaborn as sns
# %matplotlib inline

In [2]:
import tensorflow as tf
#@title Versions:
print("tf.version: ", tf.version.VERSION)
print("tf.keras.version: ", tf.keras.__version__)

tf.version:  2.9.1
tf.keras.version:  2.9.0


In [3]:
# from tensorflow.python.ops.numpy_ops import np_config
# np_config.enable_numpy_behavior()
# tf.enable_eager_execution()
tf.executing_eagerly()

True

In [4]:
with open('LSTM65.yaml', 'r') as file:
    model_config = safe_load(file)

In [5]:
#Maximum sequence length including padding
global MAXLEN
MAXLEN = model_config['MAXLEN']

In [6]:
global encode_dict 
encode_dict = {l:i for i,l in enumerate(string.ascii_uppercase + " ", 1)}

## Data Processing Functions

In [7]:
def add_noise(w: str, percent: float = 0.1) -> str:
    ''' Adds a specified proportion of noise to a string.
    
    Expects a string and a number stating the percent of noise to add to this string.
    The string is modified by editing, deleting, or adding characters in/to the string.
    The modification to perform is determined randomly by generating a random number from an uniform distribution [0,1].
    If the number is < 1/3 edit one position with new random character or space.
    If the number is < 2/3 delete one position.
    Finally, if the number is > 2/3 add one random character or space.
    
    In order to retain the length of the sequence compliant with the maximum sequence length,
    additional processing has been added such that sequences that reach the maximum sequence length
    can only be modified by removing or swapping characters.
    
    Parameters
    ----------
    w : str
        The string to add noise to.
    
    percent: float, defaults to 10% if not specified
        Percentange representing the proportion of noise to add to the string.
        
    Returns
    -------
    w : str
        Modified string with noise added.
    '''  
    positions = random.choices(range(len(w)), k=int(percent*len(w)))
#     print("Adding noise to", int(percent*len(w)), "% of the string")
    for p in positions:
        r = random.uniform(0,1)
        if len(w) < MAXLEN:
            # if <1/3 edit one position with new random character, # else if <2/3 delete one position, else add one random character 
            if r <= 0.3333: # edit
                w = w[:p] + random.choice(string.ascii_uppercase + " ") + w[p+1:]
            elif r<= 0.6667: # delete
                w = w[:p] + w[p+1:]
            else: # add
                w = w[:p] + random.choice(string.ascii_uppercase + " ") + w[p:]
        else:
            if r <= 0.5: # edit
                w = w[:p] + random.choice(string.ascii_uppercase + " ") + w[p+1:]
            else: # delete
                w = w[:p] + w[p+1:]
            
    return w

In [8]:
def clean(text: str) -> str:
    '''Removes all the non-ascii and special characters from a string and returns the string's alphabetichal characters with spaces.
    
    Expects a string to be cleaned and removes all the non-ascii and special characters. 
    This is done by applying a substitution to regex matches
    Returns the cleaned string containing uppercased versions of the characters.
    
    Parameters
    ----------
    text : str
        
    Returns
    -------
    text : str
    '''
    regex = re.compile('[^a-zA-Z ]')
    r = regex.sub('', text)
    result = re.sub(' +', ' ', r)
    result = result.strip()
    return result.upper()

In [9]:
def encode(x: string) -> [int]:
    '''Applies the encoding function to a given value.
    
    Returns these string enconded into an array containing an integer mapping to each character and space (1-66) separately.
    
    Parameters
    ----------
    x : string
        
      
    Returns
    -------
    x : [int]
        Returns the encoded string.

    '''
    return list(map(encode_dict.get, x))

In [10]:
def padding(x: [int], maxlen: int = MAXLEN) -> [int]:
    '''Applies the padding function to the encoded sequence.
    
    Returns the enconded Series padded.
    
    Parameters
    ----------
    x : [int]
       Encoded character sequence.
      
    Returns
    -------
    x : [int]
        Returns the padded encoded character sequence.
    '''
    return x + ([0]* (maxlen-len(x)))

In [11]:
def preprocessInput(filename: str, maxlen: int = MAXLEN, **kwargs) -> pd.DataFrame:
    '''Preprocess CSV file into a Pandas DataFrame.
    
    Expects the file name or path of a csv file with named columns containing strings representing product names.
    It then removes the sequences with length greater than the maximun sequence length, cleans the sequences and
    uppercases them, and it finally drops any duplicates that might have arrisen from this processing.
    Returns a Pandas Dataframe containing unique cleaned and uppercased versions of the strings on each cell.
    
    Parameters
    ----------
    filename : str
        
    Returns
    -------
    df : Pandas DataFrame
    '''  
    df = pd.read_csv(filename)
    print(df.info())
    
    print("Processing file: ----------------------------------------")
    
    original_count = len(df.index)
    print("Dropping sequences longer than the maxlen:")
    for column in df.columns:
        df.drop(df[df[column].apply(len).gt(maxlen)].index, inplace = True)
    new_count = len(df.index)
    print("\tDropped", original_count - new_count, "that exceeded the maximum sequence length.")
    
    print("\tCleaning string sequences:")
    df = df.applymap(clean)
    
    print("\tUppercasing string sequences:")
    df = df.applymap(lambda x: str.upper(x))
    
    print("Dropping duplicate sequences:")
    original_count = len(df.index)
    df.drop_duplicates(ignore_index=True, inplace=True)
    new_count = len(df.index)
    print("\tDropped", original_count - new_count, "duplicate sequences.")
    
    print("Done processing: ---------------------------------------")
    print(df.info())
    return df

------------------------------------------------

# Data loading and preprocessing

# Pickled Datasets

In [12]:
dUnique_df = pd.read_pickle("../data/dUnique_df.pkl")
dfneg2 = pd.read_pickle("../data/dfneg2.pkl")
test = pd.read_pickle("../data/test.pkl")
validate = pd.read_pickle("../data/validate.pkl")

--------------------------------

# Build model, load weights and evaluate on test data

In [13]:
class CosineSimilarity(tf.keras.layers.Layer):
    '''Cosine similarity to be calculated as sum(x*y)/(sqrt(sum(x))*sqrt(sum(y))).
    This is achieved through Tensorflow functions to retain performance.
    
    Parameters
    ----------
    vects: tf.TensorArray
    
    Returns
    -------
    cosine_similarity: tf.TensorArray
       The result of the cosine similarity between the vectors.    
    '''
    __name__ = 'CosineSimilarity'
    def __init__(self, **kwargs):
        super(CosineSimilarity, self).__init__()
       
    @tf.function(jit_compile=True)  # The decorator converts `cosine_similarity` into a tensolflow `Function`.
    def call(self, vects: tf.TensorArray) -> tf.TensorArray:
        x, y = vects
        return tf.math.divide(tf.reduce_sum(tf.multiply(x,y), axis=1, keepdims=True), tf.multiply(tf.norm(x, ord=2, axis=1, keepdims=True), tf.norm(y, ord=2, axis=1, keepdims=True)))

    def get_config(self):
        return super(CosineSimilarity, self).get_config()
    
    @classmethod
    def from_config(cls, config):
        return cls(**config)

In [14]:
class ContrastiveLoss(tf.keras.losses.Loss):
    '''Returns a value between 0 and 1 representing the average error of the y_pred vector by comparing it to the y_true.
    '''
    __name__ = 'ContrastiveLoss'
    def __init__(self, margin: tf.float32 = 1.0, **kwargs):
        super(ContrastiveLoss, self).__init__()
        self.margin = tf.constant(margin)
        
    @tf.function(jit_compile=True)  # The decorator converts `loss` into a tensolflow `Function`.
    def call(self, y_true: tf.TensorArray, y_pred: tf.TensorArray) -> tf.Tensor:
        return tf.math.reduce_mean((1 - y_true) * tf.math.square(y_pred) + (y_true) * tf.math.square(tf.math.maximum(self.margin - (y_pred), 0.0)), axis = -1)
    
    def get_config(self):
        config = super(ContrastiveLoss, self).get_config()
        config.update({
            "margin": str(self.margin)
        })
        return config
    
    @classmethod
    def from_config(cls, config):
        return cls(**config)

In [15]:
model = tf.keras.models.load_model(
    './saved_models/{}'.format(model_config["Model_Name"]), 
    custom_objects = {
        'CosineSimilarity': CosineSimilarity,
        'ContrastiveLoss': ContrastiveLoss
    }
)
model.summary()

2022-08-11 18:24:25.486941: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-11 18:24:25.513020: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-11 18:24:25.513180: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-11 18:24:25.513598: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropri

2022-08-11 18:24:32.349048: W tensorflow/core/common_runtime/graph_constructor.cc:805] Node 'cond/while' has 13 outputs but the _output_shapes attribute specifies shapes for 46 outputs. Output shapes may be inaccurate.
2022-08-11 18:24:32.358170: W tensorflow/core/common_runtime/graph_constructor.cc:805] Node 'cond' has 5 outputs but the _output_shapes attribute specifies shapes for 46 outputs. Output shapes may be inaccurate.
2022-08-11 18:24:33.032411: W tensorflow/core/common_runtime/graph_constructor.cc:805] Node 'cond/while' has 13 outputs but the _output_shapes attribute specifies shapes for 46 outputs. Output shapes may be inaccurate.
2022-08-11 18:24:33.042269: W tensorflow/core/common_runtime/graph_constructor.cc:805] Node 'cond' has 5 outputs but the _output_shapes attribute specifies shapes for 46 outputs. Output shapes may be inaccurate.
2022-08-11 18:24:33.117571: W tensorflow/core/common_runtime/graph_constructor.cc:805] Node 'cond' has 5 outputs but the _output_shapes at

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 65)]         0           []                               
                                                                                                  
 input_3 (InputLayer)           [(None, 65)]         0           []                               
                                                                                                  
 model (Functional)             (None, 100)          109220      ['input_2[0][0]',                
                                                                  'input_3[0][0]']                
                                                                                                  
 lambda (Lambda)                (None, 1)            0           ['model[0][0]',            

---------------------------------------------------------------

# Evaluation

## Unique target labels

In [16]:
dUnique_df.head()

Unnamed: 0,dUnique_label,dUnique_seq_padded
0,ASHWAGANDA,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ..."
1,BUTCHERSBROOM,"[2, 21, 20, 3, 8, 5, 18, 19, 2, 18, 15, 15, 13..."
2,CATSCLAW,"[3, 1, 20, 19, 3, 12, 1, 23, 0, 0, 0, 0, 0, 0,..."
3,CINNAMON,"[3, 9, 14, 14, 1, 13, 15, 14, 0, 0, 0, 0, 0, 0..."
4,FENUGREEK,"[6, 5, 14, 21, 7, 18, 5, 5, 11, 0, 0, 0, 0, 0,..."


## True Positives

In [17]:
fName = '../data/NP_FAERS_mapped_20220215.csv'
matches = preprocessInput(fName)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5358 entries, 0 to 5357
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   FAERS_drug_match  5358 non-null   object
 1   lookup_value      5358 non-null   object
dtypes: object(2)
memory usage: 83.8+ KB
None
Processing file: ----------------------------------------
Dropping sequences longer than the maxlen:
	Dropped 374 that exceeded the maximum sequence length.
	Cleaning string sequences:
	Uppercasing string sequences:
Dropping duplicate sequences:
	Dropped 482 duplicate sequences.
Done processing: ---------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4502 entries, 0 to 4501
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   FAERS_drug_match  4502 non-null   object
 1   lookup_value      4502 non-null   object
dtypes: object(2)
memory usage: 70.5+

In [18]:
matches["Processed_FAERS_drug_match"] = matches.FAERS_drug_match.apply(clean).apply(encode).apply(padding)
matches["Processed_lookup_value"] = matches.lookup_value.apply(clean).apply(encode).apply(padding)

In [19]:
matches.head()

Unnamed: 0,FAERS_drug_match,lookup_value,Processed_FAERS_drug_match,Processed_lookup_value
0,ASHWAGANDHA,ASHWAGANDA,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 0, 0, 0, ...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ..."
1,ASHWAGANDHA EXTRACT,ASHWAGANDA,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 27, 5, 24...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ..."
2,ASHWAGANDHA ROOT EXTRACT,ASHWAGANDA,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 27, 18, 1...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ..."
3,ASHWAGANDHA WITHANIA SOMNIFERA,ASHWAGANDA,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 27, 23, 9...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ..."
4,ASHWAGANDHA WITHANIA SOMNIFERA ROOT,ASHWAGANDA,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 27, 23, 9...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ..."


### ASHWAGANDA

In [20]:
matches.loc[1, "FAERS_drug_match"]

'ASHWAGANDHA EXTRACT'

In [21]:
predicts = model.predict([np.tile(matches.loc[1, "Processed_FAERS_drug_match"], (dUnique_df['dUnique_seq_padded'].shape[0],1)), np.stack(dUnique_df['dUnique_seq_padded'])]).astype('float32')
top5 = predicts.flatten().argsort()

2022-08-11 18:24:41.421708: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8401
2022-08-11 18:24:41.536125: I tensorflow/compiler/xla/service/service.cc:170] XLA service 0x7f80e401b930 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2022-08-11 18:24:41.536145: I tensorflow/compiler/xla/service/service.cc:178]   StreamExecutor device (0): NVIDIA GeForce RTX 2070 with Max-Q Design, Compute Capability 7.5
2022-08-11 18:24:41.539019: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:263] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2022-08-11 18:24:41.596873: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory




2022-08-11 18:24:41.646055: I tensorflow/compiler/jit/xla_compilation_cache.cc:478] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


In [22]:
# Top-5 smalles distances
{
    "rank1": dUnique_df.iloc[top5[-1]]['dUnique_label'],
    "rank2": dUnique_df.iloc[top5[-2]]['dUnique_label'],
    "rank3": dUnique_df.iloc[top5[-3]]['dUnique_label'],
    "rank4": dUnique_df.iloc[top5[-4]]['dUnique_label'],
    "rank5": dUnique_df.iloc[top5[-5]]['dUnique_label'],
}

{'rank1': 'ASHWAGANDA',
 'rank2': 'WITHANIA SOMNIFERA',
 'rank3': 'WOODLAND HAWTHORN',
 'rank4': 'PANAX GINSENG',
 'rank5': 'CRATAEGUS LAEVIGATA'}

### Echinacea

In [23]:
Echinacea = dfneg2[dfneg2["FAERS_drug_match"].str.contains("ECHINACEA")].head(1)
Echinacea

Unnamed: 0,FAERS_drug_match,lookup_value,Processed_FAERS_drug_match,Processed_lookup_value,Match
393,ECHINACEA TEA ECHINACEA PURPUREA,GREEN TEA,"[5, 3, 8, 9, 14, 1, 3, 5, 1, 27, 20, 5, 1, 27,...","[7, 18, 5, 5, 14, 27, 20, 5, 1, 0, 0, 0, 0, 0,...",0


In [24]:
i = Echinacea.index.values[0]
clean(dfneg2["FAERS_drug_match"][i])

'ECHINACEA TEA ECHINACEA PURPUREA'

In [26]:
predicts = model.predict([np.tile(padding(encode(clean(dfneg2["FAERS_drug_match"][i]))), (dUnique_df['dUnique_seq_padded'].shape[0],1)), np.stack(dUnique_df['dUnique_seq_padded'])]).astype('float32')
top5 = predicts.flatten().argsort()



In [27]:
# Top-5 smalles distances
m = max(top5)
{
    "rank1": dUnique_df.iloc[top5[-1]]['dUnique_label'],
    "rank2": dUnique_df.iloc[top5[-2]]['dUnique_label'],
    "rank3": dUnique_df.iloc[top5[-3]]['dUnique_label'],
    "rank4": dUnique_df.iloc[top5[-4]]['dUnique_label'],
    "rank5": dUnique_df.iloc[top5[-5]]['dUnique_label'],
}

{'rank1': 'ECHINACEA',
 'rank2': 'ECHINACEA PURPUREA',
 'rank3': 'CURCUMA LONGA',
 'rank4': 'CHLORELLA VULGARIS',
 'rank5': 'TARAXACUM OFFICINALE'}

### Cranberry

In [28]:
cranberry = dUnique_df[dUnique_df["dUnique_label"].str.contains("CRANBERRY")].head(1)
cranberry

Unnamed: 0,dUnique_label,dUnique_seq_padded
58,CRANBERRY,"[3, 18, 1, 14, 2, 5, 18, 18, 25, 0, 0, 0, 0, 0..."


In [29]:
i = cranberry.index.values[0]
clean(dUnique_df["dUnique_label"][i])

'CRANBERRY'

In [30]:
predicts = model.predict([np.tile(padding(encode(clean(dUnique_df["dUnique_label"][i]))), (dUnique_df['dUnique_seq_padded'].shape[0],1)), np.stack(dUnique_df['dUnique_seq_padded'])]).astype('float32')
top5 = predicts.flatten().argsort()
# argsort = predicts.flatten().argsort()



In [31]:
# Top-5 smalles distances
m = max(top5)
{
    "rank1": dUnique_df.iloc[top5[-1]]['dUnique_label'],
    "rank2": dUnique_df.iloc[top5[-2]]['dUnique_label'],
    "rank3": dUnique_df.iloc[top5[-3]]['dUnique_label'],
    "rank4": dUnique_df.iloc[top5[-4]]['dUnique_label'],
    "rank5": dUnique_df.iloc[top5[-5]]['dUnique_label'],
}

{'rank1': 'CRANBERRY',
 'rank2': 'VACCINIUM MACROCARPON',
 'rank3': 'STEVIA REBAUDIANA',
 'rank4': 'ELDERBERRY',
 'rank5': 'HORSETAIL'}

## True  Negatives

In [32]:
dfneg2.head()

Unnamed: 0,FAERS_drug_match,lookup_value,Processed_FAERS_drug_match,Processed_lookup_value,Match
0,ANUSOLHC BENZYL BENZOATEBISMUTH HYDROXIDE,CINNAMON,"[1, 14, 21, 19, 15, 12, 8, 3, 27, 2, 5, 14, 26...","[3, 9, 14, 14, 1, 13, 15, 14, 0, 0, 0, 0, 0, 0...",0
1,ASCABIOL BENZYL BENZOATE,CINNAMON,"[1, 19, 3, 1, 2, 9, 15, 12, 27, 2, 5, 14, 26, ...","[3, 9, 14, 14, 1, 13, 15, 14, 0, 0, 0, 0, 0, 0...",0
2,CASSIA,CINNAMON,"[3, 1, 19, 19, 9, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[3, 9, 14, 14, 1, 13, 15, 14, 0, 0, 0, 0, 0, 0...",0
3,CASSIA ACUTIFOLIA,CINNAMON,"[3, 1, 19, 19, 9, 1, 27, 1, 3, 21, 20, 9, 6, 1...","[3, 9, 14, 14, 1, 13, 15, 14, 0, 0, 0, 0, 0, 0...",0
4,CASSIA ALATA,CINNAMON,"[3, 1, 19, 19, 9, 1, 27, 1, 12, 1, 20, 1, 0, 0...","[3, 9, 14, 14, 1, 13, 15, 14, 0, 0, 0, 0, 0, 0...",0


# Evaluating on test data - NP names only

In [33]:
vocab = pd.read_csv('../data/lb_to_common_names.csv')
vocab.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 958 entries, 0 to 957
Data columns (total 4 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   latin_binomial        958 non-null    object
 1   common_name           958 non-null    object
 2   latin_binomial_clean  958 non-null    object
 3   common_name_clean     958 non-null    object
dtypes: object(4)
memory usage: 30.1+ KB


In [34]:
test = test.assign(rank1_drug="", rank2_drug="", rank3_drug="", rank4_drug="", rank5_drug="", rank1_distance=np.Inf, rank2_distance=np.Inf, rank3_distance=np.Inf, rank4_distance=np.Inf, rank5_distance=np.Inf, lookup_rank= np.Inf, lookup_rank_related = np.Inf)
test.head()

Unnamed: 0,FAERS_drug_match,lookup_value,Processed_FAERS_drug_match,Processed_lookup_value,Match,rank1_drug,rank2_drug,rank3_drug,rank4_drug,rank5_drug,rank1_distance,rank2_distance,rank3_distance,rank4_distance,rank5_distance,lookup_rank,lookup_rank_related
733,CURCUMA ZEDOARIA,KARCURA,"[3, 21, 18, 3, 21, 13, 1, 27, 26, 5, 4, 15, 1,...","[11, 1, 18, 3, 21, 18, 1, 0, 0, 0, 0, 0, 0, 0,...",1,,,,,,inf,inf,inf,inf,inf,inf,inf
22364,HORSERADISH,ANGELICA SINENSIS,"[8, 15, 18, 19, 5, 18, 1, 4, 9, 19, 8, 0, 0, 0...","[1, 14, 7, 5, 12, 9, 3, 1, 27, 19, 9, 14, 5, 1...",0,,,,,,inf,inf,inf,inf,inf,inf,inf
38149,BILOORGAGNICS CRANBERRY PUS,VACCINIUM MACROCARPON,"[2, 9, 12, 15, 15, 18, 7, 1, 7, 14, 9, 3, 19, ...","[22, 1, 3, 3, 9, 14, 9, 21, 13, 27, 13, 1, 3, ...",1,,,,,,inf,inf,inf,inf,inf,inf,inf
18046,ALLIUM SATIVUM GARLICIN CARDIO,BOSWELLIA,"[1, 12, 12, 9, 21, 13, 27, 19, 1, 20, 9, 22, 2...","[2, 15, 19, 23, 5, 12, 12, 9, 1, 0, 0, 0, 0, 0...",0,,,,,,inf,inf,inf,inf,inf,inf,inf
7650,MALIC,GARLIC,"[13, 1, 12, 9, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[7, 1, 18, 12, 9, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0...",0,,,,,,inf,inf,inf,inf,inf,inf,inf


# Evaluation of drug name predictions
### Find ranks 1-n from the predicted similarities for the test data

In [35]:
def find_ranks(model: tf.keras.Model, df: pd.DataFrame, related_rank: bool = False) -> pd.DataFrame:
    """For each row in the test set (input), use the model to predict if the 'FAERS_drug_match' entry matches any of the 'lookup_value' entries.
       This is done at the encoded sequence level for both name all unique drugnames
       
         Parameters
    ----------
    model : tf.keras.Model
        A Keras model based Siamese Network that takes three inputs. 
        Namely, two input sequeces and a third input binary target specifying wether the two sequeces match.
    y : pd.Series
        A pandas Series containing the clean encoded 'lookup_value' column.
      
    Returns
    -------
    x : pd.Series
        Returns the padded 'FAERS_drug_match' series.
    y : pd.Series 
        Returns the padded 'lookup_value' series.
    
    """
    
    print("Using column: ", df.columns[2])
    for i in df.index:
        predicts = model.predict([np.tile(df.loc[i, "Processed_FAERS_drug_match"], (dUnique_df['dUnique_seq_padded'].shape[0],1)), np.stack(dUnique_df['dUnique_seq_padded'])]).astype('float32')
        argsort = predicts.flatten().argsort()
        # Top-5 smalles distances
        df.at[i, 'rank1_drug'] = dUnique_df['dUnique_label'][argsort[-1]]  
        df.at[i, 'rank2_drug'] = dUnique_df['dUnique_label'][argsort[-2]]
        df.at[i, 'rank3_drug'] = dUnique_df['dUnique_label'][argsort[-3]]
        df.at[i, 'rank4_drug'] = dUnique_df['dUnique_label'][argsort[-4]]
        df.at[i, 'rank5_drug'] = dUnique_df['dUnique_label'][argsort[-5]]
        df.at[i, 'rank1_distance'] = predicts[argsort[-1]]  
        df.at[i, 'rank2_distance'] = predicts[argsort[-2]]
        df.at[i, 'rank3_distance'] = predicts[argsort[-3]]
        df.at[i, 'rank4_distance'] = predicts[argsort[-4]]
        df.at[i, 'rank5_distance'] = predicts[argsort[-5]]
        if related_rank:
            # Does any of them match
            lookup_clean = df.loc[i]['lookup_value']
            predicted_rank = df.loc[i][['rank1_drug', 'rank2_drug', 'rank3_drug', 'rank4_drug', 'rank5_drug']].eq(lookup_clean).to_numpy().nonzero()

            lookup_rank = np.Inf    
            if len(predicted_rank[0]) > 0 :
                lookup_rank = predicted_rank[0][0] + 1
                df.loc[i, 'lookup_rank'] = lookup_rank

            # Let's compare to latin binomial
            lb_res = vocab.loc[vocab['latin_binomial_clean'] == lookup_clean]
            common_res = vocab.loc[vocab['common_name_clean'] == lookup_clean]
            lookup_result = ''
            if len(lb_res) > 0:
                lookup_result = lb_res.common_name_clean.values[0]
            elif len(common_res) > 0:
                lookup_result = common_res.latin_binomial_clean.values[0]

            related_rank = np.Inf
            if lookup_result != '':
                annotated_rank = df.loc[i][['rank1_drug', 'rank2_drug', 'rank3_drug', 'rank4_drug', 'rank5_drug']].eq(lookup_result).to_numpy().nonzero()
                if len(annotated_rank[0]) > 0: 
                    related_rank = annotated_rank[0][0] + 1

            #find related mappings to lookup value in predicted values 
            df.loc[i, 'lookup_rank_related'] = min(lookup_rank, related_rank)

    return subsample

## Assing ranks to the matching 
matches are assigned their corresponding rank
non-matches are left null

In [36]:
subsample = test[test["Match"] == 1].sample(n=50, ignore_index=True)
predicted = find_ranks(model, subsample, True)

Using column:  Processed_FAERS_drug_match


In [37]:
predicted.head(20)

Unnamed: 0,FAERS_drug_match,lookup_value,Processed_FAERS_drug_match,Processed_lookup_value,Match,rank1_drug,rank2_drug,rank3_drug,rank4_drug,rank5_drug,rank1_distance,rank2_distance,rank3_distance,rank4_distance,rank5_distance,lookup_rank,lookup_rank_related
0,LAEGAL MARIJUAN,HEMP EXTRACT,"[12, 1, 5, 7, 1, 12, 27, 13, 1, 18, 9, 10, 21,...","[8, 5, 13, 16, 27, 5, 24, 20, 18, 1, 3, 20, 0,...",1,CANNABIS SATIVA,HEMP EXTRACT,WOOD SPIDER,HARPAGOPHYTUM PROCUMBENS,ANGELICA SINENSIS,1.0,1.0,0.805742,0.737934,0.553547,2.0,1.0
1,APAYA SEEDS ASZWAZANDHA,ASHWAGANDA,"[1, 16, 1, 25, 1, 27, 19, 5, 5, 4, 19, 27, 1, ...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ...",1,ASHWAGANDA,WITHANIA SOMNIFERA,CANNABIS SATIVA,HEMP EXTRACT,WOODLAND HAWTHORN,1.0,1.0,0.93241,0.741244,0.561074,1.0,1.0
2,SERENOA REPENS PERMIXON,SERENOA REPENS,"[19, 5, 18, 5, 14, 15, 1, 27, 18, 5, 16, 5, 14...","[19, 5, 18, 5, 14, 15, 1, 27, 18, 5, 16, 5, 14...",1,SERENOA REPENS,OENOTHERA BIENNIS,SCRUBPALMETTO,FEVERFEW,TANACETUM PARTHENIUM,1.0,1.0,1.0,0.903652,0.782204,1.0,1.0
3,AWM PALMETTOSELENIUMLIBCOPENE,SERENOA REPENS,"[1, 23, 13, 27, 16, 1, 12, 13, 5, 20, 20, 15, ...","[19, 5, 18, 5, 14, 15, 1, 27, 18, 5, 16, 5, 14...",1,SERENOA REPENS,SCRUBPALMETTO,OENOTHERA BIENNIS,MACA,TANACETUM PARTHENIUM,1.0,1.0,0.976923,0.81473,0.643705,1.0,1.0
4,GREEN TEA COMLET,GREEN TEA,"[7, 18, 5, 5, 14, 27, 20, 5, 1, 27, 3, 15, 13,...","[7, 18, 5, 5, 14, 27, 20, 5, 1, 0, 0, 0, 0, 0,...",1,GREEN TEA,CAMELLIA SINENSIS,BLACK COHOSH,STEVIA REBAUDIANA,IVY LEAF,1.0,1.0,0.06042,0.0,0.0,1.0,1.0
5,FLAX OILFISH OIL,FLAX SEED,"[6, 12, 1, 24, 27, 15, 9, 12, 6, 9, 19, 8, 27,...","[6, 12, 1, 24, 27, 19, 5, 5, 4, 0, 0, 0, 0, 0,...",1,FLAX SEED,LINUM USITATISSIMUM,CAMELLIA SINENSIS,STEVIA REBAUDIANA,GARCINIA,1.0,1.0,0.082101,0.0,0.0,1.0,1.0
6,ACV PLUS WITH GREEN TEA,APPLE CIDER VINEGAR,"[1, 3, 22, 27, 16, 12, 21, 19, 27, 23, 9, 20, ...","[1, 16, 16, 12, 5, 27, 3, 9, 4, 5, 18, 27, 22,...",1,MALUS DOMESTICA,APPLE CIDER VINEGAR,CAMELLIA SINENSIS,GREEN TEA,SAMBUCUS NIGRA,1.0,1.0,1.0,1.0,0.276322,2.0,1.0
7,CBD YEMP FLOER CBDTHC,HEMP EXTRACT,"[3, 2, 4, 27, 25, 5, 13, 16, 27, 6, 12, 15, 5,...","[8, 5, 13, 16, 27, 5, 24, 20, 18, 1, 3, 20, 0,...",1,CANNABIS SATIVA,CHAMOMILE,HEMP EXTRACT,BLACK CUMIN,UNCARIA TOMENTOSA,1.0,0.477712,0.458954,0.41226,0.173214,3.0,1.0
8,FLAXSEEN GRANULIS,FLAX SEED,"[6, 12, 1, 24, 19, 5, 5, 14, 27, 7, 18, 1, 14,...","[6, 12, 1, 24, 27, 19, 5, 5, 4, 0, 0, 0, 0, 0,...",1,PANAX GINSENG,LINUM USITATISSIMUM,VALERIANA OFFICINALIS,CAMELLIA SINENSIS,TANACETUM PARTHENIUM,1.0,0.867634,0.706903,0.354993,0.297593,inf,2.0
9,CINNAMDN W CHROMIPUM,CINNAMOMUM VERUM,"[3, 9, 14, 14, 1, 13, 4, 14, 27, 23, 27, 3, 8,...","[3, 9, 14, 14, 1, 13, 15, 13, 21, 13, 27, 22, ...",1,CINNAMOMUM VERUM,CINNAMON,HEMP EXTRACT,CANNABIS SATIVA,MITRAGYNA SPECIOSA,1.0,1.0,0.687005,0.597143,0.266563,1.0,1.0


# Add related mappings rank to test set evaluation

In [38]:
test = find_ranks(model, test[test["Match"]==1],  True)

Using column:  Processed_FAERS_drug_match




















































----------------------------------------------------

In [39]:
if "test" not in locals():
    test = pd.read_csv('../evaluation/test_siamese_evaluation_lstm_model_np_name.csv')
    test.info()

In [40]:
test_mrr = test[['FAERS_drug_match', 'lookup_rank', 'lookup_rank_related']]
test_mrr.head()

Unnamed: 0,FAERS_drug_match,lookup_rank,lookup_rank_related
0,LAEGAL MARIJUAN,2.0,1.0
1,APAYA SEEDS ASZWAZANDHA,1.0,1.0
2,SERENOA REPENS PERMIXON,1.0,1.0
3,AWM PALMETTOSELENIUMLIBCOPENE,1.0,1.0
4,GREEN TEA COMLET,1.0,1.0


In [41]:
test_mrr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 3 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   FAERS_drug_match     50 non-null     object 
 1   lookup_rank          50 non-null     float64
 2   lookup_rank_related  50 non-null     float64
dtypes: float64(2), object(1)
memory usage: 1.3+ KB


In [42]:
#number of 0 relevant results
test_mrr.loc[test_mrr['lookup_rank'].isna()].shape

(0, 3)

In [43]:
test_mrr.loc[test_mrr['lookup_rank_related'].isna()].shape

(0, 3)

In [44]:
test_mrr_exact = test_mrr[test_mrr['lookup_rank'].notna()]
test_mrr_exact = test_mrr_exact.drop(['lookup_rank_related'], axis=1)
test_mrr_exact.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50 entries, 0 to 49
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   FAERS_drug_match  50 non-null     object 
 1   lookup_rank       50 non-null     float64
dtypes: float64(1), object(1)
memory usage: 1.2+ KB


In [45]:
test_mrr_rel = test_mrr[test_mrr['lookup_rank_related'].notna()]
test_mrr_rel = test_mrr_rel.drop(['lookup_rank'], axis=1)
test_mrr_rel.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50 entries, 0 to 49
Data columns (total 2 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   FAERS_drug_match     50 non-null     object 
 1   lookup_rank_related  50 non-null     float64
dtypes: float64(1), object(1)
memory usage: 1.2+ KB


In [46]:
exact_reciprocal = 1/test_mrr_exact['lookup_rank']
test_mrr_exact['reciprocal_rank'] = exact_reciprocal
test_mrr_exact.head()

Unnamed: 0,FAERS_drug_match,lookup_rank,reciprocal_rank
0,LAEGAL MARIJUAN,2.0,0.5
1,APAYA SEEDS ASZWAZANDHA,1.0,1.0
2,SERENOA REPENS PERMIXON,1.0,1.0
3,AWM PALMETTOSELENIUMLIBCOPENE,1.0,1.0
4,GREEN TEA COMLET,1.0,1.0


In [47]:
##get the mean of reciprocal ranks for exact matches
test_mrr_exact.reciprocal_rank.mean()

0.7006666666666667

In [48]:
#get median and stdev
test_mrr_exact.lookup_rank.median()

2.0

In [49]:
test_mrr_exact.reciprocal_rank.median()

0.5

In [50]:
test_mrr_exact.lookup_rank.std()

nan

In [51]:
test_mrr_exact.reciprocal_rank.std()

0.2923429477394154

In [52]:
rel_reciprocal = 1/test_mrr_rel['lookup_rank_related']
test_mrr_rel['reciprocal_rank'] = rel_reciprocal
test_mrr_rel.head()

Unnamed: 0,FAERS_drug_match,lookup_rank_related,reciprocal_rank
0,LAEGAL MARIJUAN,1.0,1.0
1,APAYA SEEDS ASZWAZANDHA,1.0,1.0
2,SERENOA REPENS PERMIXON,1.0,1.0
3,AWM PALMETTOSELENIUMLIBCOPENE,1.0,1.0
4,GREEN TEA COMLET,1.0,1.0


In [53]:
test_mrr_rel.reciprocal_rank.mean()

0.8633333333333333

In [54]:
test_mrr_rel.reciprocal_rank.median()

1.0

In [55]:
test_mrr_rel.reciprocal_rank.std()

0.23495548861997897

In [56]:
test_mrr_rel.lookup_rank_related.median()

1.0

In [57]:
test_mrr_rel.lookup_rank_related.std()

0.5439837932759934

--------------------------------------

# Translation tests

In [58]:
translation = '../data/translation_test_nps_202203171038.csv'

In [59]:
# test["Processed_np_name"] = test.np_name.apply(clean).apply(encode).apply(padding)

In [60]:
# test = preprocessInput(unmapped, converters = {"drug_name_original":str}, skip_blank_lines=True, na_filter=True, na_values="")
# test["Processed_drug_name_original"] = test[test.columns[0]].apply(clean).apply(encode).apply(padding)