# Imports & Globals

In [1]:
import numpy as np
import pandas as pd
import pickle

import re
import string

#import matplotlib.pyplot as plt
#import seaborn as sns
# %matplotlib inline

import tensorflow as tf
from tensorflow.python.ops.numpy_ops import np_config
np_config.enable_numpy_behavior()

# tf.enable_eager_execution()
# tf.executing_eagerly()


In [2]:
global encode_dict 
encode_dict = {l:i for i,l in enumerate(string.ascii_uppercase + " ", 1)}

In [3]:
#Maximum sequence length including padding
global MAXLEN
MAXLEN = 400

## Data Processing Functions

In [4]:
def add_noise(w: str, percent: float = 0.1) -> str:
    ''' Adds a specified proportion of noise to a string.
    
    Expects a string and a number stating the percent of noise to add to this string.
    The string is modified by editing, deleting, or adding characters in/to the string.
    The modification to perform is determined randomly by generating a random number from an uniform distribution [0,1].
    If the number is < 1/3 edit one position with new random character.
    If the number is < 2/3 delete one position.
    Finally, if the number is > 2/3 add one random character. 
    
    Parameters
    ----------
    w : str
        The string to add noise to.
    
    percent: float, defaults to 10% if not specified
        Percentange representing the proportion of noise to add to the string.
        
        
    Returns
    -------
    w : str
        Modified string with noise added.
    '''  
    positions = random.choices(range(len(w)), k=int(percent*len(w)))
    print("Adding noise to", int(percent*len(w)), "% of the string")
    for p in positions:
        r = random.uniform(0,1)
        
        # if <1/3 edit one position with new random character, # else if <2/3 delete one position, else add one random character 
        if r <= 0.3333: # edit
            w = w[:p] + random.choice(string.ascii_uppercase) + w[p+1:]
        elif r<= 0.6667: # delete
            w = w[:p] + w[p+1:]
        else: # add
            w = w[:p] + random.choice(string.ascii_uppercase) + w[p:]
    return w

In [5]:
def clean(text: str) -> str:
    '''Removes all the non-ascii and special characters from a string and returns the string's alphabetichal characters with spaces.
    
    Expects a string to be cleaned and removes all the non-ascii and special characters. 
    This is done by applying a substitution to regex matches
    Returns the cleaned string containing uppercased versions of the characters.
    
    Parameters
    ----------
    text : str
        
    Returns
    -------
    text : str
    '''
    regex = re.compile('[^a-zA-Z ]')
    r = regex.sub('', text)
    result = re.sub(' +', ' ', r)
    result = result.strip()
    return result.upper()

In [6]:
def encode(x: string) -> [int]:
    '''Applies the encoding function to a given value.
    
    Returns these string enconded into an array containing an integer mapping to each character and space (1-66) separately.
    
    Parameters
    ----------
    x : string
        
      
    Returns
    -------
    x : [int]
        Returns the encoded string.

    '''
    return list(map(encode_dict.get, x))

In [7]:
def padding(x: [int], maxlen: int = MAXLEN) -> [int]:
    '''Applies the padding function to the encoded sequence.
    
    Returns the enconded Series padded.
    
    Parameters
    ----------
    x : [int]
       Encoded character sequence.
      
    Returns
    -------
    x : [int]
        Returns the padded encoded character sequence.
    '''
    return x + ([0]* (maxlen-len(x)))

------------------------------------------------

# Data loading and preprocessing

## Add additional negative pairs 
Generated from training data using generate_negative_pairs

In [11]:
# yTrainRNN.extend([0] * len(padded_xneg2))
# x1TrainRNN.extend(padded_xneg2)
# x2TrainRNN.extend(padded_yneg2)
# print(len(x1TrainRNN), len(x2TrainRNN), len(yTrainRNN))

In [12]:
# x1TrainRnnS, x1ValRnnS, x2TrainRnnS, x2ValRnnS, yTrainRnnS, yValRnnS = train_test_split(x1TrainRNN, x2TrainRNN, yTrainRNN, test_size=0.20, random_state=42)

In [13]:
# len(x1TrainRnnS)

# Load data from csv

In [14]:
if "x1TrainRnnS" not in locals():
    train = pd.read_pickle("../data/proccesed_train_set.pkl")
    x1TrainRnnS, x2TrainRnnS = train[["x1", "x2"]].applymap(lambda x: np.asarray(x).astype('float32')).values.T
    yTrainRnnS = train["y"].astype('float32')
    test = pd.read_pickle("../data/proccesed_test_set.pkl")
    x1ValRnnS, x2ValRnnS = test[["x1", "x2"]].applymap(lambda x: np.asarray(x).astype('float32')).values.T
    yValRnnS = test["y"].astype('float32')
    del train
    del test

In [17]:
dUnique_df = pd.read_pickle("../data/dUnique_df.pkl")
dfneg2 = pd.read_pickle("../data/dfneg2.pkl")

--------------------------------

# Build model, load weights and evaluate on test data

In [29]:
@tf.function(jit_compile=True)  # The decorator converts `cosine_similarity` into a tensolflow `Function`.
def cosine_similarity(vects: tf.TensorArray) -> tf.TensorArray:
    '''Cosine similarity to be calculated as sum(x*y)/(sqrt(sum(x))*sqrt(sum(y))).
    This is achieved through Tensorflow functions to retain performance.
    
    Parameters
    ----------
    vects: tf.TensorArray
        
    
    Returns
    -------
    cosine_distance: tf.TensorArray
       The result of the cosine similarity between the vectors.    
    '''
    x, y = vects
    return tf.math.divide(tf.reduce_sum(tf.multiply(x,y), axis=1, keepdims=True), tf.multiply(tf.norm(x, ord=2, axis=1, keepdims=True), tf.norm(y, ord=2, axis=1, keepdims=True)))

In [30]:
@tf.function(jit_compile=True)  # The decorator converts `cosine_distance` into a tensolflow `Function`.
def cosine_distance(vects: tf.TensorArray) -> tf.TensorArray:
    '''Cosine distance to be calculated as 1-(cosine similarity).
    Where cosine similarity equals sum(x*y)/(sqrt(sum(x))*sqrt(sum(y))).
    This is achieved through Tensorflow functions to retain performance.
    
    Parameters
    ----------
    vects: tf.TensorArray
        
    
    Returns
    -------
    cosine_distance: tf.TensorArray
        The result of 1-cosine similarity between the vectors. 
    '''
    x, y = vects
    return tf.math.subtract(tf.constant([1.0]), tf.math.divide(tf.reduce_sum(tf.multiply(x,y), axis=1, keepdims=True), tf.multiply(tf.norm(x, ord=2, axis=1, keepdims=True), tf.norm(y, ord=2, axis=1, keepdims=True))))

In [36]:
class ContrastiveLoss(tf.keras.losses.Loss):
    def __init__(self, margin: tf.float32 = 1.0):
        super(ContrastiveLoss, self).__init__()
        self.name = 'ContrastiveLoss'
        self.margin = tf.constant(margin)

    @tf.function(jit_compile=True)  # The decorator converts `loss` into a tensolflow `Function`.
    def call(self, y_true: tf.TensorArray, y_pred: tf.TensorArray) -> tf.Tensor:
        return tf.math.reduce_mean((1 - y_true) * tf.math.square(y_pred) + (y_true) * tf.math.square(tf.math.maximum(self.margin - (y_pred), 0.0)), axis = -1)

In [37]:
def build_model2(model_type, embedding_dim, num_rnn_node, num_dense_node, num_layer, activation_fn, learning_rate, optimizer, margin):
    input_x = tf.keras.layers.Input(MAXLEN)
    input_1 = tf.keras.layers.Input(MAXLEN)
    input_2 = tf.keras.layers.Input(MAXLEN)
    embedding = tf.keras.layers.Embedding(input_dim=28, output_dim=embedding_dim, mask_zero=True)
    x = embedding(input_x)
    x = tf.keras.layers.BatchNormalization()(x)
    
    if model_type == "lstm":
        x = tf.keras.layers.LSTM(num_rnn_node)(x)
    elif model_type=="gru":
        x = tf.keras.layers.GRU(num_rnn_node)(x)
 
    num = num_dense_node
    for _ in range(num_layer):
        x = tf.keras.layers.Dense(num, activation=activation_fn)(x)
        num /= 2
        
    embedding_network = tf.keras.Model(input_x, x)

    tower_1 = embedding_network(input_1)
    tower_2 = embedding_network(input_2)

    merge_layer = tf.keras.layers.Lambda(cosine_similarity)([tower_1, tower_2])
    normal_layer = tf.keras.layers.BatchNormalization()(merge_layer)
    output_layer = tf.keras.layers.Dense(1, activation="sigmoid")(normal_layer)
    contr = tf.keras.Model(inputs=[input_1, input_2], outputs=output_layer)
    
    if optimizer == "Adam":
        opt = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    elif optimizer =="RMSprop":                
        opt = tf.keras.optimizers.RMSprop(learning_rate=learning_rate)
    elif optimizer =="SGD":                
        opt = tf.keras.optimizers.SGD(learning_rate=learning_rate)

    contr.compile(loss=ContrastiveLoss(margin = margin), optimizer=opt, metrics=["accuracy"])
    
    return contr

In [38]:
# model = build_model2(model_type = "lstm", embedding_dim = 256, num_rnn_node = 248, num_dense_node = 124, num_layer = 1, activation_fn = "tanh", learning_rate = 2e-4, optimizer= "Adam", margin = 0.8)

In [39]:
model = build_model2(
    model_type = "lstm", 
    embedding_dim = 10, 
    num_rnn_node = MAXLEN//2, 
    num_dense_node = 256, 
    num_layer = 1, 
    activation_fn = "tanh",
    learning_rate = 0.001,
    optimizer= "Adam",
    margin = 1.0
)
model.summary()

Model: "model_8"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_14 (InputLayer)          [(None, 400)]        0           []                               
                                                                                                  
 input_15 (InputLayer)          [(None, 400)]        0           []                               
                                                                                                  
 model_7 (Functional)           (None, 256)          220576      ['input_14[0][0]',               
                                                                  'input_15[0][0]']               
                                                                                                  
 lambda_3 (Lambda)              (None, 1)            0           ['model_7[0][0]',          

In [40]:
checkpoint_filepath = "../ModelCheckpointSaves/maxlen-400/"
# model.load_weights("../exp3-hyperparameter-tuning/alstm-22-0.01.hdf5")
model.load_weights(checkpoint_filepath)
model.summary()

Model: "model_8"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_14 (InputLayer)          [(None, 400)]        0           []                               
                                                                                                  
 input_15 (InputLayer)          [(None, 400)]        0           []                               
                                                                                                  
 model_7 (Functional)           (None, 256)          220576      ['input_14[0][0]',               
                                                                  'input_15[0][0]']               
                                                                                                  
 lambda_3 (Lambda)              (None, 1)            0           ['model_7[0][0]',          

---------------------------------------------------------------

# Evaluation

In [9]:
def preprocessInput(filename: str) -> pd.DataFrame:
    '''Preprocess CSV file into a Pandas DataFrame.
    
    Expects the file name or path of a csv file with named columns containing strings representing product names.
    Returns a Pandas Dataframe containing uppercased versions of the strings on each cell.
    
    Parameters
    ----------
    filename : str
        
    Returns
    -------
    df : Pandas DataFrame
    '''  
    df = pd.read_csv(filename)
    print(df.info())
    
    print("Processing file: ---------------------------------------")
    original_count = len(df.index)
    print("Dropping sequences longer than the maxlen:")
    for column in df.columns:
        df.drop(df[df[column].apply(len).gt(MAXLEN)].index, inplace = True)
    new_count = len(df.index)
    print("\tDropped", original_count - new_count, "that exceeded the maximum sequence length.")
    # Uppercase all values
    print("\tUppercasing string sequences.")
    df = df.applymap(lambda x: str.upper(x))
    print("Done processing: --------------------------------------")
    print(df.info())
    return df

## Unique target labels

In [10]:
dUnique_df.head()

NameError: name 'dUnique_df' is not defined

## True Positives

In [124]:
fName = '../data/NP_FAERS_mapped_20220215.csv'
matches = preprocessInput(fName)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5358 entries, 0 to 5357
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   FAERS_drug_match  5358 non-null   object
 1   lookup_value      5358 non-null   object
dtypes: object(2)
memory usage: 83.8+ KB
None
Processing file: ---------------------------------------
Dropping sequences longer than the maxlen:
	Dropped 6 that exceeded the maximum sequence length.
	Uppercasing string sequences.
Done processing: --------------------------------------
<class 'pandas.core.frame.DataFrame'>
Int64Index: 5352 entries, 0 to 5357
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   FAERS_drug_match  5352 non-null   object
 1   lookup_value      5352 non-null   object
dtypes: object(2)
memory usage: 125.4+ KB
None


In [129]:
matches["Processed_FAERS_drug_match"] = matches.FAERS_drug_match.apply(clean).apply(encode).apply(padding)
matches["Processed_lookup_value"] = matches.lookup_value.apply(clean).apply(encode).apply(padding)

In [133]:
matches.head()

Unnamed: 0,FAERS_drug_match,lookup_value,Processed_FAERS_drug_match,Processed_lookup_value
0,"ADRENAL HEALTH (BRAND) = (SIBERIAN RHODIOLA, R...",ASHWAGANDA,"[1, 4, 18, 5, 14, 1, 12, 27, 8, 5, 1, 12, 20, ...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ..."
1,ASHWAGANDHA,ASHWAGANDA,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 0, 0, 0, ...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ..."
2,"ASHWAGANDHA,",ASHWAGANDA,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 0, 0, 0, ...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ..."
3,ASHWAGANDHA /01660201/,ASHWAGANDA,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 0, 0, 0, ...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ..."
4,ASHWAGANDHA /01660201/,ASHWAGANDA,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 0, 0, 0, ...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ..."


### ASHWAGANDA

In [134]:
matches.loc[0, "FAERS_drug_match"]

'ADRENAL HEALTH (BRAND) = (SIBERIAN RHODIOLA, RHODIOLA, HOLY BASIL, SCHISANDRA BERRY, ASHWAGANDHA ROOT) PHOSPHORYLATED SERINE'

In [136]:
predicts = model.predict([np.tile(matches.loc[0, "Processed_FAERS_drug_match"], (dUnique_df['dUnique_seq_padded'].shape[0],1)), np.stack(dUnique_df['dUnique_seq_padded'])])
argsort = predicts.flatten().argsort()



In [137]:
# Top-5 smalles distances
{
    "rank1": dUnique_df['dUnique_label'][argsort == 0].item(),
    "rank2": dUnique_df['dUnique_label'][argsort == 1].item(),
    "rank3": dUnique_df['dUnique_label'][argsort == 2].item(),
    "rank4": dUnique_df['dUnique_label'][argsort == 3].item(),
    "rank5": dUnique_df['dUnique_label'][argsort == 4].item(),
}

{'rank1': 'ASHWAGANDA',
 'rank2': 'EQUISETUM HYEMALE',
 'rank3': 'CATSCLAW',
 'rank4': 'MORINGA OLEIFERA',
 'rank5': 'GARLIC'}

### Echinacea

In [119]:
clean(dfneg2["FAERS_drug_match"][10])

'ECHINACEA ECHINACEA PURPUREA'

In [148]:
dUnique_df[dUnique_df["dUnique_label"] == "ECHINACEA"]

Unnamed: 0,dUnique_label,dUnique_seq,dUnique_seq_padded
6,ECHINACEA,"[5, 3, 8, 9, 14, 1, 3, 5, 1]","[5, 3, 8, 9, 14, 1, 3, 5, 1, 0, 0, 0, 0, 0, 0,..."


In [154]:
predicts = model.predict([np.tile(padding(encode(clean(dfneg2["FAERS_drug_match"][10]))), (dUnique_df['dUnique_seq_padded'].shape[0],1)), np.stack(dUnique_df['dUnique_seq_padded'])])
argsort = predicts.flatten().argsort()
# Top-5 smalles distances
{
    "rank1": dUnique_df['dUnique_label'][argsort == 0].item(),
    "rank2": dUnique_df['dUnique_label'][argsort == 1].item(),
    "rank3": dUnique_df['dUnique_label'][argsort == 2].item(),
    "rank4": dUnique_df['dUnique_label'][argsort == 3].item(),
    "rank5": dUnique_df['dUnique_label'][argsort == 4].item(),
}



{'rank1': 'TANACETUM PARTHENIUM',
 'rank2': 'MATRICARIA CHAMOMILLA',
 'rank3': 'CRANBERRY',
 'rank4': 'SLIPPERY ELM',
 'rank5': 'CITRUS PARADISI'}

### Cranberry

In [141]:
clean(dfneg2["FAERS_drug_match"][0])

'NATURES BOUNTY CRANBERRYTHERAPY'

In [140]:
predicts = model.predict([np.tile(padding(encode(clean(dfneg2["FAERS_drug_match"][0]))), (dUnique_df['dUnique_seq_padded'].shape[0],1)), np.stack(dUnique_df['dUnique_seq_padded'])])
argsort = predicts.flatten().argsort()
# Top-5 smalles distances
{
    "rank1": dUnique_df['dUnique_label'][argsort == 0].item(),
    "rank2": dUnique_df['dUnique_label'][argsort == 1].item(),
    "rank3": dUnique_df['dUnique_label'][argsort == 2].item(),
    "rank4": dUnique_df['dUnique_label'][argsort == 3].item(),
    "rank5": dUnique_df['dUnique_label'][argsort == 4].item(),
}

{'rank1': 'CRANBERRY',
 'rank2': 'FOENICULUM VULGARE',
 'rank3': 'GARCINIA GUMMI',
 'rank4': 'CAMELLIA SINENSIS',
 'rank5': 'CRATAEGUS LAEVIGATA'}

## True  Negatives

In [44]:
dfneg2.head()

Unnamed: 0,FAERS_drug_match,lookup_value
0,NATURES BOUNTY CRANBERRY-THERAPY,HORSECHESTNUT
1,NATURES BOUNTY CRANBERRY-THERAPY,CITRUS PARADISI
2,NATURES BOUNTY CRANBERRY-THERAPY,BLACK COHOSH
3,NATURES BOUNTY CRANBERRY-THERAPY,ACTAEA RACEMOSA
4,"SPASMO-URGENIN [ECHINACEA ANGUSTIFOLIA,SERENOA...",GUARANA


# Evaluating on test data - NP names only

In [164]:
vocab = pd.read_csv('../data/lb_to_common_names.csv')
vocab.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 958 entries, 0 to 957
Data columns (total 4 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   latin_binomial        958 non-null    object
 1   common_name           958 non-null    object
 2   latin_binomial_clean  958 non-null    object
 3   common_name_clean     958 non-null    object
dtypes: object(4)
memory usage: 30.1+ KB


In [12]:
translation = '../data/translation_test_nps_202203171038.csv'
# test = pd.read_csv(translation).applymap(clean)
test = preprocessInput(translation)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5950 entries, 0 to 5949
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   np_name  5950 non-null   object
dtypes: object(1)
memory usage: 46.6+ KB
None
Processing file: ---------------------------------------
Dropping sequences longer than the maxlen:
	Dropped 0 that exceeded the maximum sequence length.
	Uppercasing string sequences.
Done processing: --------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5950 entries, 0 to 5949
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   np_name  5950 non-null   object
dtypes: object(1)
memory usage: 46.6+ KB
None


In [247]:
# original_count = len(test.index)
# print("Dropping sequences longer than the maxlen:")
# test.drop(test[test[test.columns[0]].apply(len).gt(MAXLEN)].index, inplace = True)
# new_count = len(test.index)
# print("\tDropped", original_count - new_count, "that exceeded the maximum sequence length.")

Dropping sequences longer than the maxlen:
	Dropped 0 that exceeded the maximum sequence length.


In [249]:
test["Processed_np_name"] = test.np_name.apply(clean).apply(encode).apply(padding)

In [250]:
test = test.assign(rank1_drug="", rank2_drug="", rank3_drug="", rank4_drug="", rank5_drug="", lookup_rank= np.Inf, lookup_rank_related = np.Inf)
test.head()

Unnamed: 0,np_name,Processed_np_name,rank1_drug,rank2_drug,rank3_drug,rank4_drug,rank5_drug,lookup_rank,lookup_rank_related
0,XTNNP,"[24, 20, 14, 14, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0...",,,,,,inf,inf
1,NWLMKV,"[14, 23, 12, 13, 11, 22, 0, 0, 0, 0, 0, 0, 0, ...",,,,,,inf,inf
2,WZMFCEA,"[23, 26, 13, 6, 3, 5, 1, 0, 0, 0, 0, 0, 0, 0, ...",,,,,,inf,inf
3,AARONSROD,"[1, 1, 18, 15, 14, 19, 18, 15, 4, 0, 0, 0, 0, ...",,,,,,inf,inf
4,AARONS ROD WHOLE,"[1, 1, 18, 15, 14, 19, 27, 18, 15, 4, 27, 23, ...",,,,,,inf,inf


# Evaluation of drug name predictions
### Find ranks 1-n from the predicted similarities for the test data

In [251]:
test.head()

Unnamed: 0,np_name,Processed_np_name,rank1_drug,rank2_drug,rank3_drug,rank4_drug,rank5_drug,lookup_rank,lookup_rank_related
0,XTNNP,"[24, 20, 14, 14, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0...",,,,,,inf,inf
1,NWLMKV,"[14, 23, 12, 13, 11, 22, 0, 0, 0, 0, 0, 0, 0, ...",,,,,,inf,inf
2,WZMFCEA,"[23, 26, 13, 6, 3, 5, 1, 0, 0, 0, 0, 0, 0, 0, ...",,,,,,inf,inf
3,AARONSROD,"[1, 1, 18, 15, 14, 19, 18, 15, 4, 0, 0, 0, 0, ...",,,,,,inf,inf
4,AARONS ROD WHOLE,"[1, 1, 18, 15, 14, 19, 27, 18, 15, 4, 27, 23, ...",,,,,,inf,inf


In [276]:
def find_ranks(model: tf.keras.Model, related_rank: bool = False) -> pd.DataFrame:
    """For each row in the test set (input), use the model to predict if the 'FAERS_drug_match' entry matches any of the 'lookup_value' entries.
       This is done at the encoded sequence level for both name all unique drugnames
       
         Parameters
    ----------
    model : tf.keras.Model
        A Keras model based Siamese Network that takes three inputs. 
        Namely, two input sequeces and a third input binary target specifying wether the two sequeces match.
    y : pd.Series
        A pandas Series containing the clean encoded 'lookup_value' column.
      
    Returns
    -------
    x : pd.Series
        Returns the padded 'FAERS_drug_match' series.
    y : pd.Series 
        Returns the padded 'lookup_value' series.
    
    """
    subsample = test.sample(frac=0.01)
    for i in subsample.index:
        predicts = model.predict([np.tile(subsample.loc[i, "Processed_np_name"], (dUnique_df['dUnique_seq_padded'].shape[0],1)), np.stack(dUnique_df['dUnique_seq_padded'])])
        argsort = predicts.flatten().argsort()
        # Top-5 smalles distances
        subsample.at[i, 'rank1_drug'] = dUnique_df['dUnique_label'][argsort == 0].values[0]
        subsample.at[i, 'rank2_drug'] = dUnique_df['dUnique_label'][argsort == 1].values[0]
        subsample.at[i, 'rank3_drug'] = dUnique_df['dUnique_label'][argsort == 2].values[0]
        subsample.at[i, 'rank4_drug'] = dUnique_df['dUnique_label'][argsort == 3].values[0]
        subsample.at[i, 'rank5_drug'] = dUnique_df['dUnique_label'][argsort == 4].values[0]
        if related_rank:
            # Does any of them match
            lookup_clean = subsample.loc[i]['lookup_value']
            predicted_rank = subsample.loc[i][['rank1_drug', 'rank2_drug', 'rank3_drug', 'rank4_drug', 'rank5_drug']].eq(lookup_clean).to_numpy().nonzero()

            lookup_rank = np.Inf    
            if len(predicted_rank[0]) > 0 :
                lookup_rank = predicted_rank[0][0] + 1
                test.loc[i, 'lookup_rank'] = lookup_rank

            # Let's compare to latin binomial
            lb_res = vocab.loc[vocab['latin_binomial_clean'] == lookup_clean]
            common_res = vocab.loc[vocab['common_name_clean'] == lookup_clean]
            lookup_result = ''
            if len(lb_res) > 0:
                lookup_result = lb_res.common_name_clean.values[0]
            elif len(common_res) > 0:
                lookup_result = common_res.latin_binomial_clean.values[0]

            related_rank = np.Inf
            if lookup_result != '':
                annotated_rank = test.loc[i][['rank1_drug', 'rank2_drug', 'rank3_drug', 'rank4_drug', 'rank5_drug']].eq(lookup_result).to_numpy().nonzero()
                if len(annotated_rank[0]) > 0: 
                    related_rank = annotated_rank[0][0] + 1


            #find related mappings to lookup value in predicted values 
            test.loc[i, 'lookup_rank_related'] = min(lookup_rank, related_rank)
    
    test.head()
    return subsample

In [277]:
# i = padded_xTest.index[0]
# predicts = model.predict([np.tile(padded_xTest.loc[i], (dUnique_df['dUnique_seq_padded'].shape[0],1)), np.stack(dUnique_df['dUnique_seq_padded'])])
# argsort = predicts.flatten().argsort()
# predicts.flatten()
# argsort
# dUnique_df[['dUnique_label','dUnique_seq_padded']]
# dUnique_df.iloc[6][['dUnique_label','dUnique_seq_padded']]
# dUnique_df['dUnique_label'][np.where(argsort == 3)[0][0]]

## Assing ranks to the matching 
matches are assigned their corresponding rank
non-matches are left null

In [278]:
predicted = find_ranks(model, False)



In [288]:
dUnique_df[dUnique_df["dUnique_label"] == "Dill".upper()]

Unnamed: 0,dUnique_label,dUnique_seq,dUnique_seq_padded


In [287]:
dUnique_df[dUnique_df["dUnique_label"] == "Anethum graveolens".upper()]

Unnamed: 0,dUnique_label,dUnique_seq,dUnique_seq_padded


In [289]:
predicted.head(20)

Unnamed: 0,np_name,Processed_np_name,rank1_drug,rank2_drug,rank3_drug,rank4_drug,rank5_drug,lookup_rank,lookup_rank_related
4935,SELINUM ANETHUM WHOLE,"[19, 5, 12, 9, 14, 21, 13, 27, 1, 14, 5, 20, 8...",WITHANIA SOMNIFERA,STEVIA,HORDEUM VULGARE,GUARANA,PANAX GINSENG,inf,inf
4401,PSEUDOTSUGA VANCOUVERENSIS WHOLE,"[16, 19, 5, 21, 4, 15, 20, 19, 21, 7, 1, 27, 2...",CORDYCEPS,STEVIA,CITRUS PARADISI,KRATOM,MILK THISTLE,inf,inf
3661,MELEGUETA PEPPER WHOLE,"[13, 5, 12, 5, 7, 21, 5, 20, 1, 27, 16, 5, 16,...",CHLORELLA VULGARIS,CINNAMON,GARCINIA GUMMI,GOJI BERRY,VALERIANA OFFICINALIS,inf,inf
1685,CITRUS X AURANTIUM L,"[3, 9, 20, 18, 21, 19, 27, 24, 27, 1, 21, 18, ...",WITHANIA SOMNIFERA,PAUSINYSTALIA JOHIMBE,WOOD SPIDER,RHODIOLA ROSEA,SILYBUM MARIANUM,inf,inf
4306,POGOSTEMON PATCHOULY WHOLE,"[16, 15, 7, 15, 19, 20, 5, 13, 15, 14, 27, 16,...",TRITICUM AESTIVUM,MARRUBIUM VULGARE,OLIVE LEAF,BLACK CUMIN,LEPIDIUM MEYENII,inf,inf
3327,LEUCONYMPHAEA ODORATA WHOLE,"[12, 5, 21, 3, 15, 14, 25, 13, 16, 8, 1, 5, 1,...",MARRUBIUM VULGARE,BEET ROOT,SLIPPERY ELM,FENNEL,CRATAEGUS LAEVIGATA,inf,inf
1711,CITRUS X PARADISI,"[3, 9, 20, 18, 21, 19, 27, 24, 27, 16, 1, 18, ...",SERENOA REPENS,FOENICULUM VULGARE,STEVIA REBAUDIANA,HEMP EXTRACT,RUSCUS ACULEATUS,inf,inf
1126,CAPRIFOLIUM JAPONICUM WHOLE,"[3, 1, 16, 18, 9, 6, 15, 12, 9, 21, 13, 27, 10...",STEVIA REBAUDIANA,HORSETAIL,FEVERFEW,IVY LEAF,UNCARIA TOMENTOSA,inf,inf
956,BUPLEURUM VANHEURCKII WHOLE,"[2, 21, 16, 12, 5, 21, 18, 21, 13, 27, 22, 1, ...",FEVERFEW,STEVIA,ASHWAGANDA,BUTCHERSBROOM,TANACETUM PARTHENIUM,inf,inf
612,BERBERIS AUREA WHOLE,"[2, 5, 18, 2, 5, 18, 9, 19, 27, 1, 21, 18, 5, ...",CANNABIS SATIVA,STEVIA,WOOD SPIDER,PIPER METHYSTICUM,BLACK CUMIN,inf,inf


In [None]:
# test.to_csv("../evaluation/test_siamese_evaluation_lstm_model_np_name.csv", index=False)

# Add related mappings rank to test set evaluation

In [None]:
if "test" not in locals():
    test = pd.read_csv('../evaluation/test_siamese_evaluation_lstm_model_np_name.csv')
    test.info()

In [None]:
test_mrr = test[['FAERS_drug_match', 'lookup_rank', 'lookup_rank_related']]
test_mrr.head()

In [None]:
test_mrr.info()

In [None]:
#number of 0 relevant results
test_mrr.loc[test_mrr['lookup_rank'].isna()].shape

In [None]:
test_mrr.loc[test_mrr['lookup_rank_related'].isna()].shape

In [None]:
test_mrr_exact = test_mrr[test_mrr['lookup_rank'].notna()]
test_mrr_exact = test_mrr_exact.drop(['lookup_rank_related'], axis=1)
test_mrr_exact.info()

In [None]:
test_mrr_rel = test_mrr[test_mrr['lookup_rank_related'].notna()]
test_mrr_rel = test_mrr_rel.drop(['lookup_rank'], axis=1)
test_mrr_rel.info()

In [None]:
exact_reciprocal = 1/test_mrr_exact['lookup_rank']
test_mrr_exact['reciprocal_rank'] = exact_reciprocal
test_mrr_exact.head()

In [None]:
##get the mean of reciprocal ranks for exact matches
test_mrr_exact.reciprocal_rank.mean()

In [None]:
#get median and stdev
test_mrr_exact.lookup_rank.median()

In [None]:
test_mrr_exact.reciprocal_rank.median()

In [None]:
test_mrr_exact.lookup_rank.std()

In [None]:
test_mrr_exact.reciprocal_rank.std()

In [None]:
rel_reciprocal = 1/test_mrr_rel['lookup_rank_related']
test_mrr_rel['reciprocal_rank'] = rel_reciprocal
test_mrr_rel.head()

In [None]:
test_mrr_rel.reciprocal_rank.mean()

In [None]:
test_mrr_rel.reciprocal_rank.median()

In [None]:
test_mrr_rel.reciprocal_rank.std()

In [None]:
test_mrr_rel.lookup_rank_related.median()

In [None]:
test_mrr_rel.lookup_rank_related.std()

--------------------------------------

# Average NP name length 

In [None]:
main_dataset = preprocessInput(fName)

In [None]:
stats = main_dataset['FAERS_drug_match'].apply(len).describe()
stats

In [None]:
stats["mean"] + stats["std"] * 2

In [None]:
main_dataset['lookup_value'].apply(len).describe()

In [None]:
main_dataset['FAERS_drug_match'].apply(len).sort_values(ascending=False)[0:10]

In [None]:
main_dataset['FAERS_drug_match'].apply(len).gt(80).describe()

In [None]:
print("Gt 65:", 5358 - 4984)
print("Gt 70:", 5358 - 5212)
print("Gt 80:", 5358 - 5238)

In [None]:
%matplotlib inline
main_dataset['FAERS_drug_match'].apply(len).hist(bins=15)

In [None]:
main_dataset.iloc[5132]['FAERS_drug_match']

In [None]:
main_dataset.iloc[5132]['FAERS_drug_match']

In [None]:
dfneg2['FAERS_drug_match'].apply(len).describe()

In [None]:
dfneg2['lookup_value'].apply(len).describe()

In [None]:
dUnique_df['dUnique_label'].apply(len).describe()

In [None]:
dUnique_df['dUnique_seq'].apply(len).describe()

In [None]:
dfneg2['FAERS_drug_match'].apply(len).idxmax()

In [None]:
dfneg2.iloc[8183]['FAERS_drug_match']