# Imports & Globals

In [1]:
import numpy as np
import pandas as pd
import pickle
from yaml import safe_load
from difflib import get_close_matches
from rapidfuzz.distance import Levenshtein

import re
import string
from typing import Literal

#import matplotlib.pyplot as plt
#import seaborn as sns
# %matplotlib inline

In [2]:
import tensorflow as tf
#@title Versions:
print("tf.version: ", tf.version.VERSION)
print("tf.keras.version: ", tf.keras.__version__)

tf.version:  2.9.1
tf.keras.version:  2.9.0


In [3]:
# from tensorflow.python.ops.numpy_ops import np_config
# np_config.enable_numpy_behavior()
# tf.enable_eager_execution()

In [4]:
# Check that GPU is available: cf. https://colab.research.google.com/notebooks/gpu.ipynb
# assert(tf.test.gpu_device_name())

# tf.keras.backend.clear_session()
# tf.config.optimizer.set_jit(True) # Enable XLA.

In [5]:
tf.executing_eagerly()

True

In [6]:
with open('LSTM65.yaml', 'r') as file:
    model_config = safe_load(file)

In [7]:
#Maximum sequence length including padding
global MAXLEN
MAXLEN = model_config['MAXLEN']

In [8]:
global encode_dict 
encode_dict = {l:i for i,l in enumerate(string.ascii_uppercase + " ", 1)}

## Data Processing Functions

In [9]:
def add_noise(w: str, percent: float = 0.1) -> str:
    ''' Adds a specified proportion of noise to a string.
    
    Expects a string and a number stating the percent of noise to add to this string.
    The string is modified by editing, deleting, or adding characters in/to the string.
    The modification to perform is determined randomly by generating a random number from an uniform distribution [0,1].
    If the number is < 1/3 edit one position with new random character or space.
    If the number is < 2/3 delete one position.
    Finally, if the number is > 2/3 add one random character or space.
    
    In order to retain the length of the sequence compliant with the maximum sequence length,
    additional processing has been added such that sequences that reach the maximum sequence length
    can only be modified by removing or swapping characters.
    
    Parameters
    ----------
    w : str
        The string to add noise to.
    
    percent: float, defaults to 10% if not specified
        Percentange representing the proportion of noise to add to the string.
        
    Returns
    -------
    w : str
        Modified string with noise added.
    '''  
    positions = random.choices(range(len(w)), k=int(percent*len(w)))
#     print("Adding noise to", int(percent*len(w)), "% of the string")
    for p in positions:
        r = random.uniform(0,1)
        if len(w) < MAXLEN:
            # if <1/3 edit one position with new random character, # else if <2/3 delete one position, else add one random character 
            if r <= 0.3333: # edit
                w = w[:p] + random.choice(string.ascii_uppercase + " ") + w[p+1:]
            elif r<= 0.6667: # delete
                w = w[:p] + w[p+1:]
            else: # add
                w = w[:p] + random.choice(string.ascii_uppercase + " ") + w[p:]
        else:
            if r <= 0.5: # edit
                w = w[:p] + random.choice(string.ascii_uppercase + " ") + w[p+1:]
            else: # delete
                w = w[:p] + w[p+1:]
            
    return w

In [10]:
def clean(text: str) -> str:
    '''Removes all the non-ascii and special characters from a string and returns the string's alphabetichal characters with spaces.
    
    Expects a string to be cleaned and removes all the non-ascii and special characters. 
    This is done by applying a substitution to regex matches
    Returns the cleaned string containing uppercased versions of the characters.
    
    Parameters
    ----------
    text : str
        
    Returns
    -------
    text : str
    '''
    regex = re.compile('[^a-zA-Z ]')
    r = regex.sub('', text)
    result = re.sub(' +', ' ', r)
    result = result.strip()
    return result.upper()

In [11]:
def encode(x: string) -> [int]:
    '''Applies the encoding function to a given value.
    
    Returns these string enconded into an array containing an integer mapping to each character and space (1-66) separately.
    
    Parameters
    ----------
    x : string
        
      
    Returns
    -------
    x : [int]
        Returns the encoded string.

    '''
    return list(map(encode_dict.get, x))

In [12]:
def padding(x: [int], maxlen: int = MAXLEN) -> [int]:
    '''Applies the padding function to the encoded sequence.
    
    Returns the enconded Series padded.
    
    Parameters
    ----------
    x : [int]
       Encoded character sequence.
      
    Returns
    -------
    x : [int]
        Returns the padded encoded character sequence.
    '''
    return x + ([0]* (maxlen-len(x)))

In [13]:
def preprocessInput(filename: str, maxlen: int = MAXLEN, **kwargs) -> pd.DataFrame:
    '''Preprocess CSV file into a Pandas DataFrame.
    
    Expects the file name or path of a csv file with named columns containing strings representing product names.
    It then removes the sequences with length greater than the maximun sequence length, cleans the sequences and
    uppercases them, and it finally drops any duplicates that might have arrisen from this processing.
    Returns a Pandas Dataframe containing unique cleaned and uppercased versions of the strings on each cell.
    
    Parameters
    ----------
    filename : str
        
    Returns
    -------
    df : Pandas DataFrame
    '''  
    df = pd.read_csv(filename, **kwargs)
    print(df.info())
    
    
    print("Processing file: ----------------------------------------")
    
    print("Renaming colums:")
    print("\tCurrent names: {}".format(df.columns))
    cols = df.columns.size
    match cols:
        case 1: 
            df.columns = ["x"]
        case 2:
            df.columns = ["x", "y"]
    print("\tNew names: {}".format(df.columns))
    
    original_count = len(df.index)
    print("Dropping row with empty cells:")
    df.dropna(subset=df.columns, inplace=True)
    new_count = len(df.index)
    print("\tDropped", original_count - new_count, "rows with empty cells.")
    
    original_count = len(df.index)
    print("Dropping sequences longer than the maxlen of {}:".format(maxlen))
    for column in df.columns:
        df.drop(df[df[column].apply(len).gt(maxlen)].index, inplace = True)
    new_count = len(df.index)
    print("\tDropped", original_count - new_count, "that exceeded the maximum sequence length.")
    
    print("\tCleaning string sequences.")
    df = df.applymap(clean)
    
    print("\tUppercasing string sequences.")
    df = df.applymap(lambda x: str.upper(x))
    
    print("Dropping duplicate sequences:")
    original_count = len(df.index)
    df.drop_duplicates(ignore_index=True, inplace=True)
    new_count = len(df.index)
    print("\tDropped", original_count - new_count, "duplicate sequences.")
    
    print("Done processing: ---------------------------------------")
    print(df.info())
    return df

In [14]:
def encode_pad_tag(df: pd.DataFrame, match: Literal[0,1], maxlen: int = MAXLEN) -> pd.DataFrame:
    '''It encodes, pads and tags the preprocessed sequences in a Pandas DataFrame.
    
    Expects a pandas dataframe with cleaned and uppercased sequences. It processes the 
    the DataFrame by creating an additional 'Processed_' + current column name for each
    of the columns in the data frame, where each of the sequences in the column get 
    transformed from a string sequence to an encoded sequence and then transformed again 
    by padding the encoded sequences up to the maximun sequence length by 0's as needed. 
    Finally, this function returns this dataframe with both the original and processed columns.
    
    Parameters
    ----------
    df : pd.DataFrame
        DataFrame containing sequences.
    
    match: Literal[0 | 1]
        Tag indicating wether sequences match 1 indicates 'yes' and 0 indicates 'no'. 
        
    maxlen: int
        Dafault to global MAXLEN value. It's used to determine the ammount of padding to add
        to sequences smaller than maxlen.
        
    Returns
    -------
    df : Pandas DataFrame
        A copy of the origininal DataFrame with the processed sequences added as new columns.
    '''  
    print("Encoding and Padding: ----------------------------------")
    for column in df.columns:
        print("\tProcessing {}".format(column))
        df["Processed_" + column] = df[column].apply(lambda string: list(map(encode_dict.get, string))).transform(lambda x: x + ([0]* (maxlen-len(x))))
    print("Tagging: -----------------------------------------------")
    df["Match"] = match
    return df

------------------------------------------------

# Data loading and preprocessing

# Pickled Datasets

In [15]:
dUnique_df = pd.read_pickle("../data/dUnique_df.pkl")
dfneg2 = pd.read_pickle("../data/dfneg2.pkl")
test = pd.read_pickle("../data/test.pkl")
validate = pd.read_pickle("../data/validate.pkl")

--------------------------------

# Build model, load weights and evaluate on test data

In [16]:
class CosineSimilarity(tf.keras.layers.Layer):
    '''Cosine similarity to be calculated as sum(x*y)/(sqrt(sum(x))*sqrt(sum(y))).
    This is achieved through Tensorflow functions to retain performance.
    
    Parameters
    ----------
    vects: tf.TensorArray
    
    Returns
    -------
    cosine_similarity: tf.TensorArray
       The result of the cosine similarity between the vectors.    
    '''
    __name__ = 'CosineSimilarity'
    def __init__(self, **kwargs):
        super(CosineSimilarity, self).__init__()
       
    @tf.function  # The decorator converts `cosine_similarity` into a tensolflow `Function`.
    def call(self, vects: tf.TensorArray) -> tf.TensorArray:
        x, y = vects
        return tf.math.divide(tf.reduce_sum(tf.multiply(x,y), axis=1, keepdims=True), tf.multiply(tf.norm(x, ord=2, axis=1, keepdims=True), tf.norm(y, ord=2, axis=1, keepdims=True)))

    def get_config(self):
        return super(CosineSimilarity, self).get_config()
    
    @classmethod
    def from_config(cls, config):
        return cls(**config)

In [17]:
class ContrastiveLoss(tf.keras.losses.Loss):
    '''Returns a value between 0 and 1 representing the average error of the y_pred vector by comparing it to the y_true.
    '''
    __name__ = 'ContrastiveLoss'
    def __init__(self, margin: tf.float32 = 1.0, **kwargs):
        super(ContrastiveLoss, self).__init__()
        self.margin = tf.constant(margin)
        
    @tf.function  # The decorator converts `loss` into a tensolflow `Function`.
    def call(self, y_true: tf.TensorArray, y_pred: tf.TensorArray) -> tf.Tensor:
        return tf.math.reduce_mean((1 - y_true) * tf.math.square(y_pred) + (y_true) * tf.math.square(tf.math.maximum(self.margin - (y_pred), 0.0)), axis = -1)
    
    def get_config(self):
        config = super(ContrastiveLoss, self).get_config()
        config.update({
            "margin": str(self.margin)
        })
        return config
    
    @classmethod
    def from_config(cls, config):
        return cls(**config)

In [18]:
model = tf.keras.models.load_model(
    './saved_models/{}_extended'.format(model_config["Model_Name"]), 
    custom_objects = {
        'CosineSimilarity': CosineSimilarity,
        'ContrastiveLoss': ContrastiveLoss
    }, 
    compile=True, 
    options=None
)
model.summary()

2022-08-26 13:34:24.287809: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_UNKNOWN: unknown error
2022-08-26 13:34:24.287848: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: Raza
2022-08-26 13:34:24.287853: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: Raza
2022-08-26 13:34:24.287944: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: 515.65.1
2022-08-26 13:34:24.287958: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 515.65.1
2022-08-26 13:34:24.287961: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:310] kernel version seems to match DSO: 515.65.1
2022-08-26 13:34:24.288133: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations

2022-08-26 13:34:30.985373: W tensorflow/core/common_runtime/graph_constructor.cc:805] Node 'cond/while' has 13 outputs but the _output_shapes attribute specifies shapes for 46 outputs. Output shapes may be inaccurate.
2022-08-26 13:34:30.995008: W tensorflow/core/common_runtime/graph_constructor.cc:805] Node 'cond' has 5 outputs but the _output_shapes attribute specifies shapes for 46 outputs. Output shapes may be inaccurate.
2022-08-26 13:34:31.110459: W tensorflow/core/common_runtime/graph_constructor.cc:805] Node 'cond/while' has 13 outputs but the _output_shapes attribute specifies shapes for 46 outputs. Output shapes may be inaccurate.
2022-08-26 13:34:31.120153: W tensorflow/core/common_runtime/graph_constructor.cc:805] Node 'cond' has 5 outputs but the _output_shapes attribute specifies shapes for 46 outputs. Output shapes may be inaccurate.
2022-08-26 13:34:31.464214: W tensorflow/core/common_runtime/graph_constructor.cc:805] Node 'cond' has 5 outputs but the _output_shapes at

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 65)]         0           []                               
                                                                                                  
 input_3 (InputLayer)           [(None, 65)]         0           []                               
                                                                                                  
 model (Functional)             (None, 100)          109220      ['input_2[0][0]',                
                                                                  'input_3[0][0]']                
                                                                                                  
 lambda (Lambda)                (None, 1)            0           ['model[0][0]',            

---------------------------------------------------------------

# Evaluation

## Unique target labels

In [19]:
dUnique_df.head()

Unnamed: 0,dUnique_label,dUnique_seq_padded
0,ASHWAGANDA,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ..."
1,BUTCHERSBROOM,"[2, 21, 20, 3, 8, 5, 18, 19, 2, 18, 15, 15, 13..."
2,CATSCLAW,"[3, 1, 20, 19, 3, 12, 1, 23, 0, 0, 0, 0, 0, 0,..."
3,CINNAMON,"[3, 9, 14, 14, 1, 13, 15, 14, 0, 0, 0, 0, 0, 0..."
4,FENUGREEK,"[6, 5, 14, 21, 7, 18, 5, 5, 11, 0, 0, 0, 0, 0,..."


In [20]:
dUnique_df.size

1578

## True Positives

In [21]:
matches = encode_pad_tag(preprocessInput('../data/NP_FAERS_mapped_20220215.csv'), 1)
matches.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5358 entries, 0 to 5357
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   FAERS_drug_match  5358 non-null   object
 1   lookup_value      5358 non-null   object
dtypes: object(2)
memory usage: 83.8+ KB
None
Processing file: ----------------------------------------
Renaming colums:
	Current names: Index(['FAERS_drug_match', 'lookup_value'], dtype='object')
	New names: Index(['x', 'y'], dtype='object')
Dropping row with empty cells:
	Dropped 0 rows with empty cells.
Dropping sequences longer than the maxlen of 65:
	Dropped 374 that exceeded the maximum sequence length.
	Cleaning string sequences.
	Uppercasing string sequences.
Dropping duplicate sequences:
	Dropped 482 duplicate sequences.
Done processing: ---------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4502 entries, 0 to 4501
Data columns (total 2 columns):
 #   Column

Unnamed: 0,x,y,Processed_x,Processed_y,Match
0,ASHWAGANDHA,ASHWAGANDA,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 0, 0, 0, ...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ...",1
1,ASHWAGANDHA EXTRACT,ASHWAGANDA,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 27, 5, 24...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ...",1
2,ASHWAGANDHA ROOT EXTRACT,ASHWAGANDA,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 27, 18, 1...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ...",1
3,ASHWAGANDHA WITHANIA SOMNIFERA,ASHWAGANDA,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 27, 23, 9...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ...",1
4,ASHWAGANDHA WITHANIA SOMNIFERA ROOT,ASHWAGANDA,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 27, 23, 9...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ...",1


### ASHWAGANDA

In [22]:
matches.loc[1, "x"]

'ASHWAGANDHA EXTRACT'

In [23]:
predicts = model.predict([np.tile(matches.loc[1, "Processed_x"], (dUnique_df['dUnique_seq_padded'].shape[0],1)), np.stack(dUnique_df['dUnique_seq_padded'])]).astype('float32')
top5 = predicts.flatten().argsort()



In [24]:
# Top-5 smalles distances
{
    "rank1": dUnique_df.iloc[top5[-1]]['dUnique_label'],
    "rank2": dUnique_df.iloc[top5[-2]]['dUnique_label'],
    "rank3": dUnique_df.iloc[top5[-3]]['dUnique_label'],
    "rank4": dUnique_df.iloc[top5[-4]]['dUnique_label'],
    "rank5": dUnique_df.iloc[top5[-5]]['dUnique_label'],
}

{'rank1': 'ASHWAGANDA',
 'rank2': 'SERENOA REPENS',
 'rank3': 'SCRUBPALMETTO',
 'rank4': 'PANAX GINSENG',
 'rank5': 'WITHANIA SOMNIFERA'}

### Echinacea

In [25]:
Echinacea = dfneg2[dfneg2["x"].str.contains("ECHINACEA")].head(1)
Echinacea

Unnamed: 0,x,y,Processed_x,Processed_y,Match
393,ECHINACEA TEA ECHINACEA PURPUREA,GREEN TEA,"[5, 3, 8, 9, 14, 1, 3, 5, 1, 27, 20, 5, 1, 27,...","[7, 18, 5, 5, 14, 27, 20, 5, 1, 0, 0, 0, 0, 0,...",0


In [26]:
i = Echinacea.index.values[0]
clean(dfneg2["x"][i])

'ECHINACEA TEA ECHINACEA PURPUREA'

In [27]:
predicts = model.predict([np.tile(padding(encode(clean(dfneg2["x"][i]))), (dUnique_df['dUnique_seq_padded'].shape[0],1)), np.stack(dUnique_df['dUnique_seq_padded'])]).astype('float32')
top5 = predicts.flatten().argsort()



In [28]:
# Top-5 smalles distances
m = max(top5)
{
    "rank1": dUnique_df.iloc[top5[-1]]['dUnique_label'],
    "rank2": dUnique_df.iloc[top5[-2]]['dUnique_label'],
    "rank3": dUnique_df.iloc[top5[-3]]['dUnique_label'],
    "rank4": dUnique_df.iloc[top5[-4]]['dUnique_label'],
    "rank5": dUnique_df.iloc[top5[-5]]['dUnique_label'],
}

{'rank1': 'ECHINACEA PURPUREA',
 'rank2': 'ECHINACEA',
 'rank3': 'BRASSICA BRASSICA OLERACEA',
 'rank4': 'ESCHSCHOLZIA CALIFORNICA',
 'rank5': 'ECHINACEA ANGUSTIFOLIA'}

### Cranberry

In [29]:
cranberry = dUnique_df[dUnique_df["dUnique_label"].str.contains("CRANBERRY")].head(1)
cranberry

Unnamed: 0,dUnique_label,dUnique_seq_padded
58,CRANBERRY,"[3, 18, 1, 14, 2, 5, 18, 18, 25, 0, 0, 0, 0, 0..."


In [30]:
i = cranberry.index.values[0]
clean(dUnique_df["dUnique_label"][i])

'CRANBERRY'

In [31]:
predicts = model.predict([np.tile(padding(encode(clean(dUnique_df["dUnique_label"][i]))), (dUnique_df['dUnique_seq_padded'].shape[0],1)), np.stack(dUnique_df['dUnique_seq_padded'])]).astype('float32')
top5 = predicts.flatten().argsort()
# argsort = predicts.flatten().argsort()



In [32]:
# Top-5 smalles distances
m = max(top5)
{
    "rank1": dUnique_df.iloc[top5[-1]]['dUnique_label'],
    "rank2": dUnique_df.iloc[top5[-2]]['dUnique_label'],
    "rank3": dUnique_df.iloc[top5[-3]]['dUnique_label'],
    "rank4": dUnique_df.iloc[top5[-4]]['dUnique_label'],
    "rank5": dUnique_df.iloc[top5[-5]]['dUnique_label'],
}

{'rank1': 'VACCINIUM MYRSINITES',
 'rank2': 'VACCINIUM MACROCARPON',
 'rank3': 'BARBERRY',
 'rank4': 'CRANBERRY',
 'rank5': 'GINGER'}

## True  Negatives

In [33]:
dfneg2.head()

Unnamed: 0,x,y,Processed_x,Processed_y,Match
0,ANUSOLHC BENZYL BENZOATEBISMUTH HYDROXIDE,CINNAMON,"[1, 14, 21, 19, 15, 12, 8, 3, 27, 2, 5, 14, 26...","[3, 9, 14, 14, 1, 13, 15, 14, 0, 0, 0, 0, 0, 0...",0
1,ASCABIOL BENZYL BENZOATE,CINNAMON,"[1, 19, 3, 1, 2, 9, 15, 12, 27, 2, 5, 14, 26, ...","[3, 9, 14, 14, 1, 13, 15, 14, 0, 0, 0, 0, 0, 0...",0
2,CASSIA,CINNAMON,"[3, 1, 19, 19, 9, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[3, 9, 14, 14, 1, 13, 15, 14, 0, 0, 0, 0, 0, 0...",0
3,CASSIA ACUTIFOLIA,CINNAMON,"[3, 1, 19, 19, 9, 1, 27, 1, 3, 21, 20, 9, 6, 1...","[3, 9, 14, 14, 1, 13, 15, 14, 0, 0, 0, 0, 0, 0...",0
4,CASSIA ALATA,CINNAMON,"[3, 1, 19, 19, 9, 1, 27, 1, 12, 1, 20, 1, 0, 0...","[3, 9, 14, 14, 1, 13, 15, 14, 0, 0, 0, 0, 0, 0...",0


# Evaluating on test data - NP names only

In [34]:
vocab = pd.read_csv('../data/lb_to_common_names.csv')
vocab.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 958 entries, 0 to 957
Data columns (total 4 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   latin_binomial        958 non-null    object
 1   common_name           958 non-null    object
 2   latin_binomial_clean  958 non-null    object
 3   common_name_clean     958 non-null    object
dtypes: object(4)
memory usage: 30.1+ KB


# Create a sample from the test set to evaluate

In [36]:
positive_pairs = test.loc[test["Match"] == 1].sample(n=1000)

# Evaluation of drug name predictions
### Find ranks 1-n from the predicted similarities for the test data

In [35]:
def find_ranks(model: tf.keras.Model, df: pd.DataFrame, find_related_rank: bool = False, report_distances: bool = False) -> pd.DataFrame:
    """For each row in the input dataframe, the model is used to predict the top matching Unique Product Names 
    in 'x' against the entry matches any of the 'y' entries.
    This is done at the encoded sequence level for both name all unique drugnames
       
         Parameters
    ----------
    model : tf.keras.Model
        A Keras model based Siamese Network that takes three inputs. 
        Namely, two input sequeces and a third input binary target specifying wether the two sequeces match.
    
    find_related_rank: bool
        A flag indicating wether to compare the top ranked results against the 'y' and it's potential equivalents or not.
        
    report_distances: bool
        A flag indicating return the distance values of the top ranked results against the 'y'.
      
    Returns
    -------
    df : pd.DataFrame
        Returns the padded 'x', 'y', 'rank1', 'rank2', 'rank3', 'rank4', 'rank5' series.
        And additionally the 'exact_rank' and 'equivalent_rank' series and the 'rank1_distance', 'rank2_distance', 
        'rank3_distance', 'rank4_distance', 'rank5_distance' if requested.
    
    """
    
    print("Using column: ", df.columns[0])
    df.assign(rank1="", rank2="", rank3="", rank4="", rank5="")
    
    if report_distances:
        df.assign(rank1_distance=np.Inf, rank2_distance=np.Inf, rank3_distance=np.Inf, rank4_distance=np.Inf, rank5_distance=np.Inf)
 
    if find_related_rank:
        df.assign(exact_rank= np.Inf, equivalent_rank = np.Inf)
    
    for i in df.index:
        predicts = model.predict([np.tile(df.loc[i, "Processed_"+ df.columns[0]], (dUnique_df['dUnique_seq_padded'].shape[0],1)), np.stack(dUnique_df['dUnique_seq_padded'])]).astype('float32')
        argsort = predicts.flatten().argsort()
        # Top-5 smalles distances
        df.at[i, 'rank1'] = dUnique_df['dUnique_label'][argsort[-1]]  
        df.at[i, 'rank2'] = dUnique_df['dUnique_label'][argsort[-2]]
        df.at[i, 'rank3'] = dUnique_df['dUnique_label'][argsort[-3]]
        df.at[i, 'rank4'] = dUnique_df['dUnique_label'][argsort[-4]]
        df.at[i, 'rank5'] = dUnique_df['dUnique_label'][argsort[-5]]
        
        if report_distances:
            df.at[i, 'rank1_distance'] = predicts[argsort[-1]]  
            df.at[i, 'rank2_distance'] = predicts[argsort[-2]]
            df.at[i, 'rank3_distance'] = predicts[argsort[-3]]
            df.at[i, 'rank4_distance'] = predicts[argsort[-4]]
            df.at[i, 'rank5_distance'] = predicts[argsort[-5]]

        if find_related_rank:
            # Find the top-5 predicted matches
            lookup_clean = clean(df.at[i , df.columns[1]])
            predicted_rank = df.loc[i, ['rank1', 'rank2', 'rank3', 'rank4', 'rank5']].eq(lookup_clean).to_numpy().nonzero()
            
            # Find the top ranking correct match, if not rank is infinity so that 1/inf ~ 0, for the MRR computation.
            lookup_rank = np.Inf    
            if len(predicted_rank[0]) > 0 :
                lookup_rank = predicted_rank[0][0] + 1
            df.loc[i, "exact_rank"] = lookup_rank

            # Find all the equivalent common names and latin binomials relative to the look up value that would be equaly correct.
            equivalent = np.setdiff1d(vocab[["latin_binomial_clean","common_name_clean"]][(vocab["latin_binomial_clean"] == lookup_clean) | (vocab["common_name_clean"] == lookup_clean)].unstack().unique(), lookup_rank)

            # Find the top ranking correct match 
            related_rank = np.Inf
            if len(equivalent) > 0:
                for lookup_result in equivalent:
                    annotated_rank = df.loc[i][['rank1', 'rank2', 'rank3', 'rank4', 'rank5']].eq(lookup_result).to_numpy().nonzero()
                    if len(annotated_rank[0]) > 0: 
                        new_related_rank = annotated_rank[0][0] + 1
                        related_rank = min(related_rank, new_related_rank)

            #find related mappings to lookup value in predicted values 
            df.loc[i, 'equivalent_rank'] = min(lookup_rank, related_rank)
            
    return df

## Assing ranks to the matching 
matches are assigned their corresponding rank
non-matches are left null

In [37]:
predicted = find_ranks(model, positive_pairs,  True, True)
predicted.head(5)

Using column:  x














Unnamed: 0,x,y,Processed_x,Processed_y,Match,rank1,rank2,rank3,rank4,rank5,rank1_distance,rank2_distance,rank3_distance,rank4_distance,rank5_distance,exact_rank,equivalent_rank
42226,CINNOMOR,CINNAMON,"[3, 9, 14, 14, 15, 13, 15, 18, 0, 0, 0, 0, 0, ...","[3, 9, 14, 14, 1, 13, 15, 14, 0, 0, 0, 0, 0, 0...",1,ZINGIBER OFFICINALE,CINNAMON,CINNAMOMUM VERUM,CINNAMOMUM CAMPHORA,GINGER,1.0,1.0,1.0,1.0,0.911233,2.0,2.0
52199,FLAXEEDKTRIMJROSE OIL,FLAX SEED,"[6, 12, 1, 24, 5, 5, 4, 11, 20, 18, 9, 13, 10,...","[6, 12, 1, 24, 27, 19, 5, 5, 4, 0, 0, 0, 0, 0,...",1,GANODERMA LUCIDUM,FLAX SEED,LINUM USITATISSIMUM,SMILAX PSEUDOCHINA,EVENING PRIMROSE OIL,1.0,1.0,1.0,1.0,0.84658,2.0,2.0
58919,VALERIAN DROGPS,VALERIAN,"[22, 1, 12, 5, 18, 9, 1, 14, 27, 4, 18, 15, 7,...","[22, 1, 12, 5, 18, 9, 1, 14, 0, 0, 0, 0, 0, 0,...",1,VERONICA CHAMAEDRYS,VALERIAN,VALERIANA OFFICINALIS,GARDENIA THUNBERGIA,GOJI BERRY,1.0,1.0,1.0,1.0,1.0,2.0,2.0
61029,ELERBERRY SYU,SAMBUCUS NIGRA,"[5, 12, 5, 18, 2, 5, 18, 18, 25, 27, 19, 25, 2...","[19, 1, 13, 2, 21, 3, 21, 19, 27, 14, 9, 7, 18...",1,SAMBUCUS CANADENSIS,SAMBUCUS EBULUS,SAURURUS CERNUUS,ELDERBERRY,SAMBUCUS NIGRA,1.0,1.0,1.0,1.0,1.0,5.0,5.0
1583,TURMRNIC,TURMERIC,"[20, 21, 18, 13, 18, 14, 9, 3, 0, 0, 0, 0, 0, ...","[20, 21, 18, 13, 5, 18, 9, 3, 0, 0, 0, 0, 0, 0...",1,CURCUMA LONGA,TURMERIC,CURCUMA ZEDOARIA,KARCURA,QUERCUS GAMBELII,1.0,1.0,1.0,0.864708,0.817935,2.0,1.0


----------------------------------------------------

# Predicted Match MRR Evaluation

In [38]:
models_mrr = predicted.loc[:,['x', 'exact_rank', 'equivalent_rank']]
models_mrr.loc[:,'exact_reciprocal_rank'] = 1/models_mrr.loc[:, 'exact_rank']
models_mrr.loc[:,'equivalent_reciprocal_rank'] = 1/models_mrr.loc[:, 'equivalent_rank']
models_mrr[['exact_reciprocal_rank', 'equivalent_reciprocal_rank']].describe()

In [41]:
#get median and stdev
models_mrr['exact_rank'].median(), models_mrr['exact_rank'].std()

(3.0, nan)

In [42]:
#get median and stdev
models_mrr['equivalent_rank'].median(), models_mrr['equivalent_rank'].std()

(2.0, nan)

In [43]:
1/models_mrr.size  * models_mrr['exact_reciprocal_rank'].sum()

0.08838333333333333

In [44]:
1/models_mrr.size  * models_mrr['equivalent_reciprocal_rank'].sum()

0.11922666666666668

--------------------------------------

# Comparison with fuzzy string match

In [45]:
def find_Gesalt_fuzzy_lookup_and_related_rank(df: pd.DataFrame, find_related_rank: bool = False) -> pd.DataFrame:
    """For each row in the input data frame, this function utilizes the difflib implementation of fuzzy string match
       to find the top 5 unique natural product names that match the row's 'x' string value.
       
       
    Parameters
    ----------
    df: pd.DataFrame
        A pandas dataframe with the fist column containing 'x' strings to be matched against natural product names.
        Optionally containing Pandas Series with the clean encoded 'y' column.
    
    find_related_rank: bool
        A flag indicating wether to compare the top ranked results against the 'y' and it's potential equivalents or not.
      
    Returns
    -------
    df : pd.DataFrame
        Returns the padded 'x', 'y', 'rank1', 'rank2', 'rank3', 'rank4', 'rank5' seriess.
        And additionally the 'exact_rank' and 'equivalent_rank' series if requested.
    
    """
    
    df = pd.concat(
        [
            df,
            pd.DataFrame(
                df[df.columns[0]].apply(lambda x: get_close_matches(x, dUnique_df["dUnique_label"].to_list(), n=5, cutoff=0.0)).to_list(),
                columns=['rank1', 'rank2', 'rank3', 'rank4', 'rank5'],
                index = df.index
            )
        ], 
        axis=1, 
        join="inner"
    )
    
    if find_related_rank:
        df.assign(lookup_rank= np.Inf, lookup_rank_related = np.Inf)
        for i in df.index:
            # Does any of them match
            lookup_clean = clean(df.at[i , df.columns[1]])
            match_rank = df.loc[i, ['rank1', 'rank2', 'rank3', 'rank4', 'rank5']].eq(lookup_clean).to_numpy().nonzero()


            lookup_rank = np.Inf    
            if len(match_rank[0]) > 0 :
                lookup_rank = match_rank[0][0] + 1
            df.loc[i, 'exact_rank'] = lookup_rank

            equivalent = np.setdiff1d(vocab[["latin_binomial_clean","common_name_clean"]][(vocab["latin_binomial_clean"] == lookup_clean) | (vocab["common_name_clean"] == lookup_clean)].unstack().unique(), lookup_rank)

            related_rank = np.Inf
            if len(equivalent) > 0:
                for lookup_result in equivalent:
                    annotated_rank = df.loc[i][['rank1', 'rank2', 'rank3', 'rank4', 'rank5']].eq(lookup_result).to_numpy().nonzero()
                    new_related_rank = np.Inf
                    if len(annotated_rank[0]) > 0: 
                        new_related_rank = annotated_rank[0][0] + 1
                        related_rank = min(related_rank, new_related_rank)

            #find related mappings to lookup value in predicted values 
            df.loc[i, 'equivalent_rank'] = min(lookup_rank, related_rank)
    return df

In [46]:
def find_Levenshtein_fuzzy_lookup_and_related_rank(df: pd.DataFrame, find_related_rank: bool = False, report_distances: bool = False) -> pd.DataFrame:
    """For each row in the input data frame, this function utilizes the Levenshtein distance to find the 
    top 5 unique natural product names that match the row's 'x' string value.
       
       
    Parameters
    ----------
    df: pd.DataFrame
        A pandas dataframe with the fist column containing 'x' strings to be matched against natural product names.
        Optionally containing Pandas Series with the clean encoded 'y' column.
    
    find_related_rank: bool
        A flag indicating wether to compare the top ranked results against the 'y' and it's potential equivalents or not.
        
    report_distances: bool
        A flag indicating return the distance values of the top ranked results against the 'y'.
      
    Returns
    -------
    df : pd.DataFrame
        Returns the padded 'x', 'y', 'rank1', 'rank2', 'rank3', 'rank4', 'rank5' series.
        And additionally the 'exact_rank' and 'equivalent_rank' series and the 'rank1_distance', 'rank2_distance', 
        'rank3_distance', 'rank4_distance', 'rank5_distance' if requested.
    
    """
    
    print("Using column: {} as input.".format(df.columns[0]))
    
    if find_related_rank:
        print("Using column: {} as target.".format(df.columns[1]))
        df.assign(exact_rank= np.Inf, equivalent_rank = np.Inf)
        
    if report_distances:
        df.assign(rank1_distance=np.Inf, rank2_distance=np.Inf, rank3_distance=np.Inf, rank4_distance=np.Inf, rank5_distance=np.Inf)


    for i in df.index:
        distances = dUnique_df["dUnique_label"].apply(lambda x: Levenshtein.distance(x, df.at[i, df.columns[0]])).to_numpy().astype('float32')
        argsort = distances.argsort()

        
        # Top-5 smalles distances
        df.loc[i, 'rank1'] = dUnique_df['dUnique_label'][argsort[0]]  
        df.loc[i, 'rank2'] = dUnique_df['dUnique_label'][argsort[1]]
        df.loc[i, 'rank3'] = dUnique_df['dUnique_label'][argsort[2]]
        df.loc[i, 'rank4'] = dUnique_df['dUnique_label'][argsort[3]]
        df.loc[i, 'rank5'] = dUnique_df['dUnique_label'][argsort[4]]

        if report_distances:
            df.loc[i, 'rank1_distance'] = distances[argsort[0]]  
            df.loc[i, 'rank2_distance'] = distances[argsort[1]]
            df.loc[i, 'rank3_distance'] = distances[argsort[2]]
            df.loc[i, 'rank4_distance'] = distances[argsort[3]]
            df.loc[i, 'rank5_distance'] = distances[argsort[4]]

        if find_related_rank:
            df.assign(lookup_rank= np.Inf, lookup_rank_related = np.Inf)
            for i in df.index:
                # Does any of them match
                lookup_clean = clean(df.at[i , df.columns[1]])
                match_rank = df.loc[i, ['rank1', 'rank2', 'rank3', 'rank4', 'rank5']].eq(lookup_clean).to_numpy().nonzero()


                lookup_rank = np.Inf    
                if len(match_rank[0]) > 0 :
                    lookup_rank = match_rank[0][0] + 1
                df.loc[i, 'exact_rank'] = lookup_rank

                equivalent = np.setdiff1d(vocab[["latin_binomial_clean","common_name_clean"]][(vocab["latin_binomial_clean"] == lookup_clean) | (vocab["common_name_clean"] == lookup_clean)].unstack().unique(), lookup_rank)

                related_rank = np.Inf
                if len(equivalent) > 0:
                    for lookup_result in equivalent:
                        annotated_rank = df.loc[i][['rank1', 'rank2', 'rank3', 'rank4', 'rank5']].eq(lookup_result).to_numpy().nonzero()
                        new_related_rank = np.Inf
                        if len(annotated_rank[0]) > 0: 
                            new_related_rank = annotated_rank[0][0] + 1
                            related_rank = min(related_rank, new_related_rank)

                #find related mappings to lookup value in predicted values 
                df.loc[i, 'equivalent_rank'] = min(lookup_rank, related_rank)

    return df

--------------------------------------

# Fuzzy Levenshtein Match MRR Evaluation

In [None]:
Levenshtein_match = find_Levenshtein_fuzzy_lookup_and_related_rank(positive_pairs.loc[:, ('x', 'y')], True, True)

In [76]:
Levenshtein_mrr = Levenshtein_match.loc[:,['x', 'exact_rank', 'equivalent_rank']]
Levenshtein_mrr.loc[:,'exact_reciprocal_rank'] = 1/Levenshtein_mrr.loc[:, 'exact_rank']
Levenshtein_mrr.loc[:,'equivalent_reciprocal_rank'] = 1/Levenshtein_mrr.loc[:, 'equivalent_rank']
Levenshtein_mrr[['exact_reciprocal_rank', 'equivalent_reciprocal_rank']].describe()

In [51]:
#get median and stdev
Levenshtein_mrr['exact_rank'].median(), Levenshtein_mrr['exact_rank'].std()

(inf, nan)

In [52]:
#get median and stdev
Levenshtein_mrr['equivalent_rank'].median(), Levenshtein_mrr['equivalent_rank'].std()

(1.0, nan)

In [53]:
1/Levenshtein_mrr.size  * Levenshtein_mrr['exact_reciprocal_rank'].sum()

0.08169666666666667

In [54]:
1/Levenshtein_mrr.size  * Levenshtein_mrr['equivalent_reciprocal_rank'].sum()

0.13212333333333334

--------------------------------------

# Fuzzy Gesalt Match MRR Evaluation

In [None]:
fuzzy_match = find_Gesalt_fuzzy_lookup_and_related_rank(positive_pairs.loc[:, ('x', 'y')], True)

In [55]:
fuzzy_mrr = fuzzy_match.loc[:,['x', 'exact_rank', 'equivalent_rank']]
fuzzy_mrr.loc[:,'exact_reciprocal_rank'] = 1/fuzzy_mrr.loc[:, 'exact_rank']
fuzzy_mrr.loc[:,'equivalent_reciprocal_rank'] = 1/fuzzy_mrr.loc[:, 'equivalent_rank']
fuzzy_mrr[['exact_reciprocal_rank', 'equivalent_reciprocal_rank']].describe()

Unnamed: 0,x,y,rank1,rank2,rank3,rank4,rank5,exact_rank,equivalent_rank
42226,CINNOMOR,CINNAMON,CINNAMON,CINNAMOMUM VERUM,CERINTHE MAJOR,VINCA MAJOR,CINNAMOMUM CAMPHORA,1.0,1.0
52199,FLAXEEDKTRIMJROSE OIL,FLAX SEED,EVENING PRIMROSE OIL,FLAX SEED,MATRICARIA CHAMOMILLA,CYCLAMEN HEDERIFOLIUM,BACTRIS MAJOR,2.0,2.0
58919,VALERIAN DROGPS,VALERIAN,VALERIAN,ALCEA ROSEA,VALERIANA OFFICINALIS,SALIX TRIANDRA,ARALIA RACEMOSA,1.0,1.0
61029,ELERBERRY SYU,SAMBUCUS NIGRA,ELDERBERRY,BERBERIS LYCIUM,BARBERRY,CRANBERRY,BERBERIS VULGARIS,inf,inf
1583,TURMRNIC,TURMERIC,TURMERIC,TEUCRIUM CRETICUM,QUERCUS MARILANDICA,PRUNUS ARMENIACA,TAMARINDUS INDICA,1.0,1.0


In [59]:
#get median and stdev
fuzzy_mrr['exact_rank'].median(), fuzzy_mrr['exact_rank'].std()

(inf, nan)

In [60]:
#get median and stdev
fuzzy_mrr['equivalent_rank'].median(), fuzzy_mrr['equivalent_rank'].std()

(1.0, nan)

In [61]:
1/fuzzy_mrr.size  * fuzzy_mrr['exact_reciprocal_rank'].sum()

0.08859000000000002

In [62]:
1/fuzzy_mrr.size  * fuzzy_mrr['equivalent_reciprocal_rank'].sum()

0.14407999999999999

--------------------------------------

# Translation tests

In [63]:
translation = '../data/translation_test_nps_202203171038.csv'
translation_set = preprocessInput(translation, MAXLEN)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5950 entries, 0 to 5949
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   np_name  5950 non-null   object
dtypes: object(1)
memory usage: 46.6+ KB
None
Processing file: ----------------------------------------
Renaming colums:
	Current names: Index(['np_name'], dtype='object')
	New names: Index(['x'], dtype='object')
Dropping row with empty cells:
	Dropped 0 rows with empty cells.
Dropping sequences longer than the maxlen of 65:
	Dropped 1 that exceeded the maximum sequence length.
	Cleaning string sequences.
	Uppercasing string sequences.
Dropping duplicate sequences:
	Dropped 35 duplicate sequences.
Done processing: ---------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5914 entries, 0 to 5913
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   x       5914 non-null   object
dtypes: o

In [64]:
translation_set.head()

Unnamed: 0,x
0,XTNNP
1,NWLMKV
2,WZMFCEA
3,AARONSROD
4,AARONS ROD WHOLE


In [65]:
encode_pad_tag(translation_set, 1, MAXLEN)

Encoding and Padding: ----------------------------------
	Processing x
Tagging: -----------------------------------------------


Unnamed: 0,x,Processed_x,Match
0,XTNNP,"[24, 20, 14, 14, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0...",1
1,NWLMKV,"[14, 23, 12, 13, 11, 22, 0, 0, 0, 0, 0, 0, 0, ...",1
2,WZMFCEA,"[23, 26, 13, 6, 3, 5, 1, 0, 0, 0, 0, 0, 0, 0, ...",1
3,AARONSROD,"[1, 1, 18, 15, 14, 19, 18, 15, 4, 0, 0, 0, 0, ...",1
4,AARONS ROD WHOLE,"[1, 1, 18, 15, 14, 19, 27, 18, 15, 4, 27, 23, ...",1
...,...,...,...
5909,ZINGIBER ZINGIBER WHOLE,"[26, 9, 14, 7, 9, 2, 5, 18, 27, 26, 9, 14, 7, ...",1
5910,ZYGAENA ERYTHRAEA WHOLE,"[26, 25, 7, 1, 5, 14, 1, 27, 5, 18, 25, 20, 8,...",1
5911,ZYGAENA INDICA WHOLE,"[26, 25, 7, 1, 5, 14, 1, 27, 9, 14, 4, 9, 3, 1...",1
5912,ZYGAENA LEWINI WHOLE,"[26, 25, 7, 1, 5, 14, 1, 27, 12, 5, 23, 9, 14,...",1


In [66]:
novelty = find_ranks(model, translation_set.sample(n=10), False)

Using column:  x


In [67]:
novelty.head(10)

Unnamed: 0,x,Processed_x,Match,rank1,rank2,rank3,rank4,rank5
3026,HYDRASTIS TRIFOLIA WHOLE,"[8, 25, 4, 18, 1, 19, 20, 9, 19, 27, 20, 18, 9...",1,HORNY GOAT WEED,SENNA,GOLDENSEAL,HORSETAIL,CORDYCEPS
1576,CINNAMOMUM BURMANNI,"[3, 9, 14, 14, 1, 13, 15, 13, 21, 13, 27, 2, 2...",1,LEPIDIUM MEYENII,MAMMEA AMERICANA,CINNAMOMUM BURMANNI,PAULLINIA CUPANA,BIXA ORELLANA
4402,PULMONARIA VIRGINICA WHOLE,"[16, 21, 12, 13, 15, 14, 1, 18, 9, 1, 27, 22, ...",1,BLACK PEPPER,BALLOTA NIGRA,COLEUS,LICORICE,PLUCHEA INDICA
2372,EUROPEAN BLACK ELDER WHOLE,"[5, 21, 18, 15, 16, 5, 1, 14, 27, 2, 12, 1, 3,...",1,HORNY GOAT WEED,HORSETAIL,GUGGUL,COLEUS,ELDERBERRY
2472,FIELD BALM WHOLE,"[6, 9, 5, 12, 4, 27, 2, 1, 12, 13, 27, 23, 8, ...",1,HORSETAIL,HORNY GOAT WEED,COLEUS,CORDYCEPS,COPTIS
587,BEI CHAI HU,"[2, 5, 9, 27, 3, 8, 1, 9, 27, 8, 21, 0, 0, 0, ...",1,BERBERIS LYCIUM,ARCTIUM LAPPA,BLACK CUMIN,NIU BANG ZI,TANGKUEI
5169,STACHYS BALLOTA WHOLE,"[19, 20, 1, 3, 8, 25, 19, 27, 2, 1, 12, 12, 15...",1,SENNA,STACHYS BULLATA,CORDYCEPSMILITARIS,EQUISETUM ARVENSE,BERBERIS AQUIFOLIUM
5344,THEAPHYLLA ANAMENSIS WHOLE,"[20, 8, 5, 1, 16, 8, 25, 12, 12, 1, 27, 1, 14,...",1,CORDYCEPS,HORNY GOAT WEED,COPTIS,GOJI BERRY,TYPHA DOMINGENSIS
5466,TURNERA DIFFUSA WHOLE,"[20, 21, 18, 14, 5, 18, 1, 27, 4, 9, 6, 6, 21,...",1,BARBERRY,TURNERA DIFFUSA,COPTIS,CRANBERRY,VIBURNUM OPULUS
2016,CYPERUS HYDRA WHOLE,"[3, 25, 16, 5, 18, 21, 19, 27, 8, 25, 4, 18, 1...",1,BLACK PEPPER,COLEUS,HORNY GOAT WEED,CORDYCEPS,BARBERRY


In [68]:
# test["Processed_np_name"] = test.np_name.apply(clean).apply(encode).apply(padding)

In [69]:
# test = preprocessInput(unmapped, converters = {"drug_name_original":str}, skip_blank_lines=True, na_filter=True, na_values="")
# test["Processed_drug_name_original"] = test[test.columns[0]].apply(clean).apply(encode).apply(padding)