# Imports & Globals

In [1]:
import numpy as np
import pandas as pd
import pickle
from yaml import safe_load
from difflib import get_close_matches

import re
import string
from typing import Literal

#import matplotlib.pyplot as plt
#import seaborn as sns
# %matplotlib inline

In [2]:
import tensorflow as tf
#@title Versions:
print("tf.version: ", tf.version.VERSION)
print("tf.keras.version: ", tf.keras.__version__)

tf.version:  2.9.1
tf.keras.version:  2.9.0


In [3]:
# from tensorflow.python.ops.numpy_ops import np_config
# np_config.enable_numpy_behavior()
# tf.enable_eager_execution()

In [4]:
# Check that GPU is available: cf. https://colab.research.google.com/notebooks/gpu.ipynb
# assert(tf.test.gpu_device_name())

# tf.keras.backend.clear_session()
# tf.config.optimizer.set_jit(True) # Enable XLA.

2022-08-18 16:45:04.695924: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-08-18 16:45:04.750548: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-18 16:45:04.779436: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-18 16:45:04.779584: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so retur

In [5]:
tf.executing_eagerly()

True

In [6]:
with open('LSTM65.yaml', 'r') as file:
    model_config = safe_load(file)

In [7]:
#Maximum sequence length including padding
global MAXLEN
MAXLEN = model_config['MAXLEN']

In [8]:
global encode_dict 
encode_dict = {l:i for i,l in enumerate(string.ascii_uppercase + " ", 1)}

## Data Processing Functions

In [9]:
def add_noise(w: str, percent: float = 0.1) -> str:
    ''' Adds a specified proportion of noise to a string.
    
    Expects a string and a number stating the percent of noise to add to this string.
    The string is modified by editing, deleting, or adding characters in/to the string.
    The modification to perform is determined randomly by generating a random number from an uniform distribution [0,1].
    If the number is < 1/3 edit one position with new random character or space.
    If the number is < 2/3 delete one position.
    Finally, if the number is > 2/3 add one random character or space.
    
    In order to retain the length of the sequence compliant with the maximum sequence length,
    additional processing has been added such that sequences that reach the maximum sequence length
    can only be modified by removing or swapping characters.
    
    Parameters
    ----------
    w : str
        The string to add noise to.
    
    percent: float, defaults to 10% if not specified
        Percentange representing the proportion of noise to add to the string.
        
    Returns
    -------
    w : str
        Modified string with noise added.
    '''  
    positions = random.choices(range(len(w)), k=int(percent*len(w)))
#     print("Adding noise to", int(percent*len(w)), "% of the string")
    for p in positions:
        r = random.uniform(0,1)
        if len(w) < MAXLEN:
            # if <1/3 edit one position with new random character, # else if <2/3 delete one position, else add one random character 
            if r <= 0.3333: # edit
                w = w[:p] + random.choice(string.ascii_uppercase + " ") + w[p+1:]
            elif r<= 0.6667: # delete
                w = w[:p] + w[p+1:]
            else: # add
                w = w[:p] + random.choice(string.ascii_uppercase + " ") + w[p:]
        else:
            if r <= 0.5: # edit
                w = w[:p] + random.choice(string.ascii_uppercase + " ") + w[p+1:]
            else: # delete
                w = w[:p] + w[p+1:]
            
    return w

In [10]:
def clean(text: str) -> str:
    '''Removes all the non-ascii and special characters from a string and returns the string's alphabetichal characters with spaces.
    
    Expects a string to be cleaned and removes all the non-ascii and special characters. 
    This is done by applying a substitution to regex matches
    Returns the cleaned string containing uppercased versions of the characters.
    
    Parameters
    ----------
    text : str
        
    Returns
    -------
    text : str
    '''
    regex = re.compile('[^a-zA-Z ]')
    r = regex.sub('', text)
    result = re.sub(' +', ' ', r)
    result = result.strip()
    return result.upper()

In [11]:
def encode(x: string) -> [int]:
    '''Applies the encoding function to a given value.
    
    Returns these string enconded into an array containing an integer mapping to each character and space (1-66) separately.
    
    Parameters
    ----------
    x : string
        
      
    Returns
    -------
    x : [int]
        Returns the encoded string.

    '''
    return list(map(encode_dict.get, x))

In [12]:
def padding(x: [int], maxlen: int = MAXLEN) -> [int]:
    '''Applies the padding function to the encoded sequence.
    
    Returns the enconded Series padded.
    
    Parameters
    ----------
    x : [int]
       Encoded character sequence.
      
    Returns
    -------
    x : [int]
        Returns the padded encoded character sequence.
    '''
    return x + ([0]* (maxlen-len(x)))

In [13]:
def preprocessInput(filename: str, maxlen: int = MAXLEN, **kwargs) -> pd.DataFrame:
    '''Preprocess CSV file into a Pandas DataFrame.
    
    Expects the file name or path of a csv file with named columns containing strings representing product names.
    It then removes the sequences with length greater than the maximun sequence length, cleans the sequences and
    uppercases them, and it finally drops any duplicates that might have arrisen from this processing.
    Returns a Pandas Dataframe containing unique cleaned and uppercased versions of the strings on each cell.
    
    Parameters
    ----------
    filename : str
        
    Returns
    -------
    df : Pandas DataFrame
    '''  
    df = pd.read_csv(filename)
    print(df.info())
    
    print("Processing file: ----------------------------------------")
    
    original_count = len(df.index)
    print("Dropping sequences longer than the maxlen:")
    for column in df.columns:
        df.drop(df[df[column].apply(len).gt(maxlen)].index, inplace = True)
    new_count = len(df.index)
    print("\tDropped", original_count - new_count, "that exceeded the maximum sequence length.")
    
    print("\tCleaning string sequences:")
    df = df.applymap(clean)
    
    print("\tUppercasing string sequences:")
    df = df.applymap(lambda x: str.upper(x))
    
    print("Dropping duplicate sequences:")
    original_count = len(df.index)
    df.drop_duplicates(ignore_index=True, inplace=True)
    new_count = len(df.index)
    print("\tDropped", original_count - new_count, "duplicate sequences.")
    
    print("Done processing: ---------------------------------------")
    print(df.info())
    return df

In [14]:
def encode_pad_tag(df: pd.DataFrame, match: Literal[0,1], maxlen: int = MAXLEN) -> pd.DataFrame:
    '''It encodes, pads and tags the preprocessed sequences in a Pandas DataFrame.
    
    Expects a pandas dataframe with cleaned and uppercased sequences. It processes the 
    the DataFrame by creating an additional 'Processed_' + current column name for each
    of the columns in the data frame, where each of the sequences in the column get 
    transformed from a string sequence to an encoded sequence and then transformed again 
    by padding the encoded sequences up to the maximun sequence length by 0's as needed. 
    Finally, this function returns this dataframe with both the original and processed columns.
    
    Parameters
    ----------
    df : pd.DataFrame
        DataFrame containing sequences.
    
    match: Literal[0 | 1]
        Tag indicating wether sequences match 1 indicates 'yes' and 0 indicates 'no'. 
        
    maxlen: int
        Dafault to global MAXLEN value. It's used to determine the ammount of padding to add
        to sequences smaller than maxlen.
        
    Returns
    -------
    df : Pandas DataFrame
        A copy of the origininal DataFrame with the processed sequences added as new columns.
    '''  
    print("Encoding and Padding: ----------------------------------")
    for column in df.columns:
        print("\tProcessing {}".format(column))
        df["Processed_" + column] = df[column].apply(lambda string: list(map(encode_dict.get, string))).transform(lambda x: x + ([0]* (maxlen-len(x))))
    print("Tagging: -----------------------------------------------")
    df["Match"] = match
    return df

------------------------------------------------

# Data loading and preprocessing

# Pickled Datasets

In [15]:
dUnique_df = pd.read_pickle("../data/dUnique_df.pkl")
dfneg2 = pd.read_pickle("../data/dfneg2.pkl")
test = pd.read_pickle("../data/test.pkl")
validate = pd.read_pickle("../data/validate.pkl")

--------------------------------

# Build model, load weights and evaluate on test data

In [16]:
class CosineSimilarity(tf.keras.layers.Layer):
    '''Cosine similarity to be calculated as sum(x*y)/(sqrt(sum(x))*sqrt(sum(y))).
    This is achieved through Tensorflow functions to retain performance.
    
    Parameters
    ----------
    vects: tf.TensorArray
    
    Returns
    -------
    cosine_similarity: tf.TensorArray
       The result of the cosine similarity between the vectors.    
    '''
    __name__ = 'CosineSimilarity'
    def __init__(self, **kwargs):
        super(CosineSimilarity, self).__init__()
       
    @tf.function  # The decorator converts `cosine_similarity` into a tensolflow `Function`.
    def call(self, vects: tf.TensorArray) -> tf.TensorArray:
        x, y = vects
        return tf.math.divide(tf.reduce_sum(tf.multiply(x,y), axis=1, keepdims=True), tf.multiply(tf.norm(x, ord=2, axis=1, keepdims=True), tf.norm(y, ord=2, axis=1, keepdims=True)))

    def get_config(self):
        return super(CosineSimilarity, self).get_config()
    
    @classmethod
    def from_config(cls, config):
        return cls(**config)

In [17]:
class ContrastiveLoss(tf.keras.losses.Loss):
    '''Returns a value between 0 and 1 representing the average error of the y_pred vector by comparing it to the y_true.
    '''
    __name__ = 'ContrastiveLoss'
    def __init__(self, margin: tf.float32 = 1.0, **kwargs):
        super(ContrastiveLoss, self).__init__()
        self.margin = tf.constant(margin)
        
    @tf.function  # The decorator converts `loss` into a tensolflow `Function`.
    def call(self, y_true: tf.TensorArray, y_pred: tf.TensorArray) -> tf.Tensor:
        return tf.math.reduce_mean((1 - y_true) * tf.math.square(y_pred) + (y_true) * tf.math.square(tf.math.maximum(self.margin - (y_pred), 0.0)), axis = -1)
    
    def get_config(self):
        config = super(ContrastiveLoss, self).get_config()
        config.update({
            "margin": str(self.margin)
        })
        return config
    
    @classmethod
    def from_config(cls, config):
        return cls(**config)

In [18]:
model = tf.keras.models.load_model(
    './saved_models/{}'.format(model_config["Model_Name"]), 
    custom_objects = {
        'CosineSimilarity': CosineSimilarity,
        'ContrastiveLoss': ContrastiveLoss
    }, 
    compile=True, 
    options=None
)
model.summary()

2022-08-18 16:45:05.455536: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-18 16:45:05.455726: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-18 16:45:05.455835: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-18 16:45:05.456132: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-18 16:45:05.456242: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from S

2022-08-18 16:45:11.267111: W tensorflow/core/common_runtime/graph_constructor.cc:805] Node 'cond' has 5 outputs but the _output_shapes attribute specifies shapes for 46 outputs. Output shapes may be inaccurate.
2022-08-18 16:45:11.470942: W tensorflow/core/common_runtime/graph_constructor.cc:805] Node 'cond/while' has 13 outputs but the _output_shapes attribute specifies shapes for 46 outputs. Output shapes may be inaccurate.
2022-08-18 16:45:11.481161: W tensorflow/core/common_runtime/graph_constructor.cc:805] Node 'cond' has 5 outputs but the _output_shapes attribute specifies shapes for 46 outputs. Output shapes may be inaccurate.
2022-08-18 16:45:11.543066: W tensorflow/core/common_runtime/graph_constructor.cc:805] Node 'cond/while' has 13 outputs but the _output_shapes attribute specifies shapes for 46 outputs. Output shapes may be inaccurate.
2022-08-18 16:45:11.552961: W tensorflow/core/common_runtime/graph_constructor.cc:805] Node 'cond' has 5 outputs but the _output_shapes at

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 65)]         0           []                               
                                                                                                  
 input_3 (InputLayer)           [(None, 65)]         0           []                               
                                                                                                  
 model (Functional)             (None, 100)          109220      ['input_2[0][0]',                
                                                                  'input_3[0][0]']                
                                                                                                  
 lambda (Lambda)                (None, 1)            0           ['model[0][0]',            

---------------------------------------------------------------

# Evaluation

## Unique target labels

In [19]:
dUnique_df.head()

Unnamed: 0,dUnique_label,dUnique_seq_padded
0,ASHWAGANDA,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ..."
1,BUTCHERSBROOM,"[2, 21, 20, 3, 8, 5, 18, 19, 2, 18, 15, 15, 13..."
2,CATSCLAW,"[3, 1, 20, 19, 3, 12, 1, 23, 0, 0, 0, 0, 0, 0,..."
3,CINNAMON,"[3, 9, 14, 14, 1, 13, 15, 14, 0, 0, 0, 0, 0, 0..."
4,FENUGREEK,"[6, 5, 14, 21, 7, 18, 5, 5, 11, 0, 0, 0, 0, 0,..."


In [20]:
dUnique_df.size

252

## True Positives

In [21]:
fName = '../data/NP_FAERS_mapped_20220215.csv'
matches = preprocessInput(fName)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5358 entries, 0 to 5357
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   FAERS_drug_match  5358 non-null   object
 1   lookup_value      5358 non-null   object
dtypes: object(2)
memory usage: 83.8+ KB
None
Processing file: ----------------------------------------
Dropping sequences longer than the maxlen:
	Dropped 374 that exceeded the maximum sequence length.
	Cleaning string sequences:
	Uppercasing string sequences:
Dropping duplicate sequences:
	Dropped 482 duplicate sequences.
Done processing: ---------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4502 entries, 0 to 4501
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   FAERS_drug_match  4502 non-null   object
 1   lookup_value      4502 non-null   object
dtypes: object(2)
memory usage: 70.5+

In [22]:
matches["Processed_FAERS_drug_match"] = matches.FAERS_drug_match.apply(clean).apply(encode).apply(padding)
matches["Processed_lookup_value"] = matches.lookup_value.apply(clean).apply(encode).apply(padding)

In [23]:
matches.head()

Unnamed: 0,FAERS_drug_match,lookup_value,Processed_FAERS_drug_match,Processed_lookup_value
0,ASHWAGANDHA,ASHWAGANDA,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 0, 0, 0, ...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ..."
1,ASHWAGANDHA EXTRACT,ASHWAGANDA,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 27, 5, 24...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ..."
2,ASHWAGANDHA ROOT EXTRACT,ASHWAGANDA,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 27, 18, 1...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ..."
3,ASHWAGANDHA WITHANIA SOMNIFERA,ASHWAGANDA,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 27, 23, 9...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ..."
4,ASHWAGANDHA WITHANIA SOMNIFERA ROOT,ASHWAGANDA,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 27, 23, 9...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ..."


### ASHWAGANDA

In [24]:
matches.loc[1, "FAERS_drug_match"]

'ASHWAGANDHA EXTRACT'

In [25]:
predicts = model.predict([np.tile(matches.loc[1, "Processed_FAERS_drug_match"], (dUnique_df['dUnique_seq_padded'].shape[0],1)), np.stack(dUnique_df['dUnique_seq_padded'])]).astype('float32')
top5 = predicts.flatten().argsort()


If you want XLA:CPU, do one of the following:

 - set the TF_XLA_FLAGS to include "--tf_xla_cpu_global_jit", or
 - set cpu_global_jit to true on this session's OptimizerOptions, or
 - use experimental_jit_scope, or
 - use tf.function(jit_compile=True).

To confirm that XLA is active, pass --vmodule=xla_compilation_cache=1 (as a
proper command-line flag, not via TF_XLA_FLAGS).
2022-08-18 16:45:19.479699: I tensorflow/compiler/xla/service/service.cc:170] XLA service 0x55ed53c3c730 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2022-08-18 16:45:19.479721: I tensorflow/compiler/xla/service/service.cc:178]   StreamExecutor device (0): NVIDIA GeForce RTX 2070 with Max-Q Design, Compute Capability 7.5
2022-08-18 16:45:19.483385: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:263] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2022-08-18 16:45:19.544323: I tensorflow/core/platform/default/subpro



In [26]:
# Top-5 smalles distances
{
    "rank1": dUnique_df.iloc[top5[-1]]['dUnique_label'],
    "rank2": dUnique_df.iloc[top5[-2]]['dUnique_label'],
    "rank3": dUnique_df.iloc[top5[-3]]['dUnique_label'],
    "rank4": dUnique_df.iloc[top5[-4]]['dUnique_label'],
    "rank5": dUnique_df.iloc[top5[-5]]['dUnique_label'],
}

{'rank1': 'ASHWAGANDA',
 'rank2': 'CHELIDONIUM MAJUS',
 'rank3': 'LIONS TOOTH',
 'rank4': 'SWALLOWWORT',
 'rank5': 'WITHANIA SOMNIFERA'}

### Echinacea

In [27]:
Echinacea = dfneg2[dfneg2["FAERS_drug_match"].str.contains("ECHINACEA")].head(1)
Echinacea

Unnamed: 0,FAERS_drug_match,lookup_value,Processed_FAERS_drug_match,Processed_lookup_value,Match
393,ECHINACEA TEA ECHINACEA PURPUREA,GREEN TEA,"[5, 3, 8, 9, 14, 1, 3, 5, 1, 27, 20, 5, 1, 27,...","[7, 18, 5, 5, 14, 27, 20, 5, 1, 0, 0, 0, 0, 0,...",0


In [28]:
i = Echinacea.index.values[0]
clean(dfneg2["FAERS_drug_match"][i])

'ECHINACEA TEA ECHINACEA PURPUREA'

In [29]:
predicts = model.predict([np.tile(padding(encode(clean(dfneg2["FAERS_drug_match"][i]))), (dUnique_df['dUnique_seq_padded'].shape[0],1)), np.stack(dUnique_df['dUnique_seq_padded'])]).astype('float32')
top5 = predicts.flatten().argsort()



In [30]:
# Top-5 smalles distances
m = max(top5)
{
    "rank1": dUnique_df.iloc[top5[-1]]['dUnique_label'],
    "rank2": dUnique_df.iloc[top5[-2]]['dUnique_label'],
    "rank3": dUnique_df.iloc[top5[-3]]['dUnique_label'],
    "rank4": dUnique_df.iloc[top5[-4]]['dUnique_label'],
    "rank5": dUnique_df.iloc[top5[-5]]['dUnique_label'],
}

{'rank1': 'ECHINACEA PURPUREA',
 'rank2': 'ECHINACEA',
 'rank3': 'UNCARIA TOMENTOSA',
 'rank4': 'CATSCLAW',
 'rank5': 'VACCINIUM MACROCARPON'}

### Cranberry

In [31]:
cranberry = dUnique_df[dUnique_df["dUnique_label"].str.contains("CRANBERRY")].head(1)
cranberry

Unnamed: 0,dUnique_label,dUnique_seq_padded
58,CRANBERRY,"[3, 18, 1, 14, 2, 5, 18, 18, 25, 0, 0, 0, 0, 0..."


In [32]:
i = cranberry.index.values[0]
clean(dUnique_df["dUnique_label"][i])

'CRANBERRY'

In [33]:
predicts = model.predict([np.tile(padding(encode(clean(dUnique_df["dUnique_label"][i]))), (dUnique_df['dUnique_seq_padded'].shape[0],1)), np.stack(dUnique_df['dUnique_seq_padded'])]).astype('float32')
top5 = predicts.flatten().argsort()
# argsort = predicts.flatten().argsort()



In [34]:
# Top-5 smalles distances
m = max(top5)
{
    "rank1": dUnique_df.iloc[top5[-1]]['dUnique_label'],
    "rank2": dUnique_df.iloc[top5[-2]]['dUnique_label'],
    "rank3": dUnique_df.iloc[top5[-3]]['dUnique_label'],
    "rank4": dUnique_df.iloc[top5[-4]]['dUnique_label'],
    "rank5": dUnique_df.iloc[top5[-5]]['dUnique_label'],
}

{'rank1': 'CRANBERRY',
 'rank2': 'VACCINIUM MACROCARPON',
 'rank3': 'STEVIA REBAUDIANA',
 'rank4': 'ELDERBERRY',
 'rank5': 'HORSETAIL'}

## True  Negatives

In [35]:
dfneg2.head()

Unnamed: 0,FAERS_drug_match,lookup_value,Processed_FAERS_drug_match,Processed_lookup_value,Match
0,ANUSOLHC BENZYL BENZOATEBISMUTH HYDROXIDE,CINNAMON,"[1, 14, 21, 19, 15, 12, 8, 3, 27, 2, 5, 14, 26...","[3, 9, 14, 14, 1, 13, 15, 14, 0, 0, 0, 0, 0, 0...",0
1,ASCABIOL BENZYL BENZOATE,CINNAMON,"[1, 19, 3, 1, 2, 9, 15, 12, 27, 2, 5, 14, 26, ...","[3, 9, 14, 14, 1, 13, 15, 14, 0, 0, 0, 0, 0, 0...",0
2,CASSIA,CINNAMON,"[3, 1, 19, 19, 9, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[3, 9, 14, 14, 1, 13, 15, 14, 0, 0, 0, 0, 0, 0...",0
3,CASSIA ACUTIFOLIA,CINNAMON,"[3, 1, 19, 19, 9, 1, 27, 1, 3, 21, 20, 9, 6, 1...","[3, 9, 14, 14, 1, 13, 15, 14, 0, 0, 0, 0, 0, 0...",0
4,CASSIA ALATA,CINNAMON,"[3, 1, 19, 19, 9, 1, 27, 1, 12, 1, 20, 1, 0, 0...","[3, 9, 14, 14, 1, 13, 15, 14, 0, 0, 0, 0, 0, 0...",0


# Evaluating on test data - NP names only

In [36]:
vocab = pd.read_csv('../data/lb_to_common_names.csv')
vocab.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 958 entries, 0 to 957
Data columns (total 4 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   latin_binomial        958 non-null    object
 1   common_name           958 non-null    object
 2   latin_binomial_clean  958 non-null    object
 3   common_name_clean     958 non-null    object
dtypes: object(4)
memory usage: 30.1+ KB


In [37]:
test = test.assign(rank1_drug="", rank2_drug="", rank3_drug="", rank4_drug="", rank5_drug="", rank1_distance=np.Inf, rank2_distance=np.Inf, rank3_distance=np.Inf, rank4_distance=np.Inf, rank5_distance=np.Inf, lookup_rank= np.Inf, lookup_rank_related = np.Inf)
test.head()

Unnamed: 0,FAERS_drug_match,lookup_value,Processed_FAERS_drug_match,Processed_lookup_value,Match,rank1_drug,rank2_drug,rank3_drug,rank4_drug,rank5_drug,rank1_distance,rank2_distance,rank3_distance,rank4_distance,rank5_distance,lookup_rank,lookup_rank_related
30872,CINEMMN,CINNAMON,"[3, 9, 14, 5, 13, 13, 14, 0, 0, 0, 0, 0, 0, 0,...","[3, 9, 14, 14, 1, 13, 15, 14, 0, 0, 0, 0, 0, 0...",1,,,,,,inf,inf,inf,inf,inf,inf,inf
34223,CINNA ON CJINNBAMOMUM HASSIA BARK,CINNAMON,"[3, 9, 14, 14, 1, 27, 15, 14, 27, 3, 10, 9, 14...","[3, 9, 14, 14, 1, 13, 15, 14, 0, 0, 0, 0, 0, 0...",1,,,,,,inf,inf,inf,inf,inf,inf,inf
39275,TEA AS RUG,GREEN TEA,"[20, 5, 1, 27, 1, 19, 27, 18, 21, 7, 0, 0, 0, ...","[7, 18, 5, 5, 14, 27, 20, 5, 1, 0, 0, 0, 0, 0,...",1,,,,,,inf,inf,inf,inf,inf,inf,inf
20146,GREEN TEA CAPS,WHEAT GRASS,"[7, 18, 5, 5, 14, 27, 20, 5, 1, 27, 3, 1, 16, ...","[23, 8, 5, 1, 20, 27, 7, 18, 1, 19, 19, 0, 0, ...",0,,,,,,inf,inf,inf,inf,inf,inf,inf
22982,MILKTHISTLE,TANGKUEI,"[13, 9, 12, 11, 20, 8, 9, 19, 20, 12, 5, 0, 0,...","[20, 1, 14, 7, 11, 21, 5, 9, 0, 0, 0, 0, 0, 0,...",0,,,,,,inf,inf,inf,inf,inf,inf,inf


# Evaluation of drug name predictions
### Find ranks 1-n from the predicted similarities for the test data

In [38]:
np.setdiff1d(vocab[["latin_binomial_clean","common_name_clean"]][(vocab["latin_binomial_clean"] == "TURMERIC") | (vocab["common_name_clean"] == "TURMERIC")].unstack().unique(), "TURMERIC")

array(['CURCUMA LONGA'], dtype=object)

In [39]:
def find_ranks(model: tf.keras.Model, df: pd.DataFrame, find_related_rank: bool = False) -> pd.DataFrame:
    """For each row in the input dataframe, the model is used to predict the top matching Unique Product Names 'FAERS_drug_match' against the  
    entry matches any of the 'lookup_value' entries.
       This is done at the encoded sequence level for both name all unique drugnames
       
         Parameters
    ----------
    model : tf.keras.Model
        A Keras model based Siamese Network that takes three inputs. 
        Namely, two input sequeces and a third input binary target specifying wether the two sequeces match.
    y : pd.Series
        A pandas Series containing the clean encoded 'lookup_value' column.
      
    Returns
    -------
    x : pd.Series
        Returns the padded 'FAERS_drug_match' series.
    y : pd.Series 
        Returns the padded 'lookup_value' series.
    
    """
    
    print("Using column: ", df.columns[0])
    for i in df.index:
        predicts = model.predict([np.tile(df.loc[i, "Processed_"+ df.columns[0]], (dUnique_df['dUnique_seq_padded'].shape[0],1)), np.stack(dUnique_df['dUnique_seq_padded'])]).astype('float32')
        argsort = predicts.flatten().argsort()
        # Top-5 smalles distances
        df.at[i, 'rank1_drug'] = dUnique_df['dUnique_label'][argsort[-1]]  
        df.at[i, 'rank2_drug'] = dUnique_df['dUnique_label'][argsort[-2]]
        df.at[i, 'rank3_drug'] = dUnique_df['dUnique_label'][argsort[-3]]
        df.at[i, 'rank4_drug'] = dUnique_df['dUnique_label'][argsort[-4]]
        df.at[i, 'rank5_drug'] = dUnique_df['dUnique_label'][argsort[-5]]
        df.at[i, 'rank1_distance'] = predicts[argsort[-1]]  
        df.at[i, 'rank2_distance'] = predicts[argsort[-2]]
        df.at[i, 'rank3_distance'] = predicts[argsort[-3]]
        df.at[i, 'rank4_distance'] = predicts[argsort[-4]]
        df.at[i, 'rank5_distance'] = predicts[argsort[-5]]
        if find_related_rank:
            # Find the top-5 predicted matches
            lookup_clean = clean(df.at[i , 'lookup_value'])
            predicted_rank = df.loc[i, ['rank1_drug', 'rank2_drug', 'rank3_drug', 'rank4_drug', 'rank5_drug']].eq(lookup_clean).to_numpy().nonzero()
            
            # Find the top ranking correct match, if not rank is infinity so that 1/inf ~ 0, for the MRR computation.
            lookup_rank = np.Inf    
            if len(predicted_rank[0]) > 0 :
                lookup_rank = predicted_rank[0][0] + 1
            df.loc[i, 'lookup_rank'] = lookup_rank

            # Find all the equivalent common names and latin binomials relative to the look up value that would be equaly correct.
            equivalent = np.setdiff1d(vocab[["latin_binomial_clean","common_name_clean"]][(vocab["latin_binomial_clean"] == lookup_clean) | (vocab["common_name_clean"] == lookup_clean)].unstack().unique(), lookup_rank)

            # Find the top ranking correct match 
            related_rank = np.Inf
            if len(equivalent) > 0:
                for lookup_result in equivalent:
                    annotated_rank = df.loc[i][['rank1_drug', 'rank2_drug', 'rank3_drug', 'rank4_drug', 'rank5_drug']].eq(lookup_result).to_numpy().nonzero()
                    if len(annotated_rank[0]) > 0: 
                        new_related_rank = annotated_rank[0][0] + 1
                        related_rank = min(related_rank, new_related_rank)

            #find related mappings to lookup value in predicted values 
            df.loc[i, 'lookup_rank_related'] = min(lookup_rank, related_rank)
            
    return df

# Add related mappings rank to test set evaluation

In [40]:
positive_pairs = test[test["Match"] == 1]

## Assing ranks to the matching 
matches are assigned their corresponding rank
non-matches are left null

In [41]:
predicted = find_ranks(model, positive_pairs,  True)

Using column:  FAERS_drug_match




















































----------------------------------------------------

# Predicted Match MRR Evaluation

In [42]:
models_mrr = predicted[['FAERS_drug_match', 'lookup_rank', 'lookup_rank_related']]

In [43]:
models_mrr['exact_reciprocal_rank'] = 1/models_mrr.loc[:, 'lookup_rank']
models_mrr['equivalent_reciprocal_rank'] = 1/models_mrr.loc[:, 'lookup_rank_related']
models_mrr.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  models_mrr['exact_reciprocal_rank'] = 1/models_mrr.loc[:, 'lookup_rank']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  models_mrr['equivalent_reciprocal_rank'] = 1/models_mrr.loc[:, 'lookup_rank_related']


Unnamed: 0,FAERS_drug_match,lookup_rank,lookup_rank_related,exact_reciprocal_rank,equivalent_reciprocal_rank
30872,CINEMMN,2.0,1.0,0.5,1.0
34223,CINNA ON CJINNBAMOMUM HASSIA BARK,1.0,1.0,1.0,1.0
39275,TEA AS RUG,2.0,1.0,0.5,1.0
25366,WLYAX FLAX SEED OIL,2.0,1.0,0.5,1.0
28284,DANDELHION ROOT TA,3.0,3.0,0.333333,0.333333


In [44]:
models_mrr[['lookup_rank', 'lookup_rank_related', 'exact_reciprocal_rank', 'equivalent_reciprocal_rank']].describe()

Unnamed: 0,lookup_rank,lookup_rank_related,exact_reciprocal_rank,equivalent_reciprocal_rank
count,4094.0,4094.0,4094.0,4094.0
mean,inf,inf,0.70631,0.92991
std,,,0.282832,0.196632
min,1.0,1.0,0.0,0.0
25%,1.0,1.0,0.5,1.0
50%,2.0,1.0,0.5,1.0
75%,2.0,1.0,1.0,1.0
max,inf,inf,1.0,1.0


In [45]:
#get median and stdev
models_mrr['lookup_rank'].median(), models_mrr['lookup_rank'].std()

(2.0, nan)

In [46]:
#get median and stdev
models_mrr['lookup_rank_related'].median(), models_mrr['lookup_rank_related'].std()

(1.0, nan)

In [47]:
1/models_mrr.size  * models_mrr['exact_reciprocal_rank'].sum()

0.14126200944471584

In [48]:
1/models_mrr.size  * models_mrr['equivalent_reciprocal_rank'].sum()

0.1859819247679531

--------------------------------------

# Comparison with fuzzy string match

In [66]:
def find_fuzzy_lookup_and_related_rank(df: pd.DataFrame, find_related_rank: bool = False) -> pd.DataFrame:
    """For each row in the input data frame, this function utilizes the difflib implementation of fuzzy string match
       to find the top 5 unique natural product names that match the row's 'FAERS_drug_match' string value.
       
       
    Parameters
    ----------
    df: pd.DataFrame
        A pandas dataframe with the fist column containing 'FAERS_drug_match' strings to be matched against natural product names.
        Optionally containing Pandas Series with the clean encoded 'lookup_value' column.
    
    find_related_rank: bool
        A flag indicating wether to compare the top ranked results against the 'lookup_value' and it's potential equivalents or not.
      
    Returns
    -------
    df : pd.DataFrame
        Returns the padded 'FAERS_drug_match', 'lookup_value', 'rank1_drug', 'rank2_drug', 'rank3_drug', 'rank4_drug', 'rank5_drug' seriess.
        And additionally the 'lookup_rank' and 'lookup_rank_related' series if requested.
    
    """
    
    df = pd.concat(
        [
            df,
            pd.DataFrame(
                df[df.columns[0]].apply(lambda x: get_close_matches(x, dUnique_df["dUnique_label"].to_list(), n=5, cutoff=0.0)).to_list(),
                columns=['rank1_drug', 'rank2_drug', 'rank3_drug', 'rank4_drug', 'rank5_drug'],
                index = df.index
            )
        ], 
        axis=1, 
        join="inner"
    )
    df.assign(lookup_rank= np.Inf, lookup_rank_related = np.Inf)
    if find_related_rank:
        for i in df.index:
            # Does any of them match
            lookup_clean = clean(df.at[i , 'lookup_value'])
            match_rank = df.loc[i, ['rank1_drug', 'rank2_drug', 'rank3_drug', 'rank4_drug', 'rank5_drug']].eq(lookup_clean).to_numpy().nonzero()


            lookup_rank = np.Inf    
            if len(match_rank[0]) > 0 :
                lookup_rank = match_rank[0][0] + 1
            df.loc[i, 'lookup_rank'] = lookup_rank

            equivalent = np.setdiff1d(vocab[["latin_binomial_clean","common_name_clean"]][(vocab["latin_binomial_clean"] == lookup_clean) | (vocab["common_name_clean"] == lookup_clean)].unstack().unique(), lookup_rank)

            related_rank = np.Inf
            if len(equivalent) > 0:
                for lookup_result in equivalent:
                    annotated_rank = df.loc[i][['rank1_drug', 'rank2_drug', 'rank3_drug', 'rank4_drug', 'rank5_drug']].eq(lookup_result).to_numpy().nonzero()
                    new_related_rank = np.Inf
                    if len(annotated_rank[0]) > 0: 
                        new_related_rank = annotated_rank[0][0] + 1
                        related_rank = min(related_rank, new_related_rank)

            #find related mappings to lookup value in predicted values 
            df.loc[i, 'lookup_rank_related'] = min(lookup_rank, related_rank)
    return df

In [67]:
fuzzy_match = find_fuzzy_lookup_and_related_rank(positive_pairs[["FAERS_drug_match", 'lookup_value']])

In [68]:
fuzzy_match.head(10)

Unnamed: 0,FAERS_drug_match,lookup_value,rank1_drug,rank2_drug,rank3_drug,rank4_drug,rank5_drug
30872,CINEMMN,CINNAMON,CINNAMON,ECHINACEA,GARCINIA GUMMI,GINGER,VACCINIUM MACROCARPON
34223,CINNA ON CJINNBAMOMUM HASSIA BARK,CINNAMON,CINNAMOMUM VERUM,CHELIDONIUM MAJUS,VALERIANA OFFICINALIS,VACCINIUM MACROCARPON,MATRICARIA CHAMOMILLA
39275,TEA AS RUG,GREEN TEA,WHEAT GRASS,STEVIA REBAUDIANA,BETA VULGARIS,RED YEAST RICE,ACTAEA RACEMOSA
25366,WLYAX FLAX SEED OIL,FLAX SEED,FLAX SEED,BOSWELLIA SERRATA,WITHANIA SOMNIFERA,NIGELLA SATIVA,TARAXACUM OFFICINALE
28284,DANDELHION ROOT TA,TARAXACUM OFFICINALE,LIONS TOOTH,BEET ROOT,ALOE VERA,RHODIOLA ROSEA,CHELIDONIUM MAJUS
35720,CVS MILK THIVTLE,MILK THISTLE,MILK THISTLE,NIGELLA SATIVA,CHAMOMILE,STINGING NETTLE,CANNABIS SATIVA
25290,WALGREENSSBRAOD CCINNAMON CAPNSULES,CINNAMON,VALERIANA OFFICINALIS,VACCINIUM MACROCARPON,CINNAMON,ZINGIBER OFFICINALE,TARAXACUM OFFICINALE
30627,REDSHIMAX,REISHI,REISHI,RHODIOLA,ECHINACEA,STEVIA,HEDERA HELIX
475,ZINGIBER OFFICINALEZINGIBER OFFICINALE RHIZOME,GINGER,ZINGIBER OFFICINALE,VALERIANA OFFICINALIS,TARAXACUM OFFICINALE,MORINGA OLEIFERA,PAUSINYSTALIA JOHIMBE
37317,GYMNEM MULORRY COMPLEX,LEPIDIUM MEYENII,GYMNEMA SYLVESTRE,MATRICARIA CHAMOMILLA,MORINGA OLEIFERA,TRIGONELFA FOENUM,EQUISETUM HYEMALE


--------------------------------------

# Fuzzy Match MRR Evaluation

In [52]:
fuzzy_mrr = fuzzy_match[['FAERS_drug_match', 'lookup_rank', 'lookup_rank_related']]

In [53]:
fuzzy_mrr['exact_reciprocal_rank'] = 1/fuzzy_mrr.loc[:, 'lookup_rank']
fuzzy_mrr['equivalent_reciprocal_rank'] = 1/fuzzy_mrr.loc[:, 'lookup_rank_related']
fuzzy_mrr.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fuzzy_mrr['exact_reciprocal_rank'] = 1/fuzzy_mrr.loc[:, 'lookup_rank']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fuzzy_mrr['equivalent_reciprocal_rank'] = 1/fuzzy_mrr.loc[:, 'lookup_rank_related']


Unnamed: 0,FAERS_drug_match,lookup_rank,lookup_rank_related,exact_reciprocal_rank,equivalent_reciprocal_rank
30872,CINEMMN,1.0,1.0,1.0,1.0
34223,CINNA ON CJINNBAMOMUM HASSIA BARK,inf,1.0,0.0,1.0
39275,TEA AS RUG,inf,inf,0.0,0.0
25366,WLYAX FLAX SEED OIL,1.0,1.0,1.0,1.0
28284,DANDELHION ROOT TA,inf,inf,0.0,0.0


In [54]:
fuzzy_mrr[['lookup_rank', 'lookup_rank_related', 'exact_reciprocal_rank', 'equivalent_reciprocal_rank']].describe()

  diff_b_a = subtract(b, a)


Unnamed: 0,lookup_rank,lookup_rank_related,exact_reciprocal_rank,equivalent_reciprocal_rank
count,4094.0,4094.0,4094.0,4094.0
mean,inf,inf,0.44459,0.807035
std,,,0.466071,0.368357
min,1.0,1.0,0.0,0.0
25%,1.0,1.0,0.0,1.0
50%,4.0,1.0,0.25,1.0
75%,,1.0,1.0,1.0
max,inf,inf,1.0,1.0


In [55]:
#get median and stdev
fuzzy_mrr['lookup_rank'].median(), fuzzy_mrr['lookup_rank'].std()

(4.0, nan)

In [56]:
#get median and stdev
fuzzy_mrr['lookup_rank_related'].median(), fuzzy_mrr['lookup_rank_related'].std()

(1.0, nan)

In [57]:
1/fuzzy_mrr.size  * fuzzy_mrr['exact_reciprocal_rank'].sum()

0.0889179286761114

In [58]:
1/fuzzy_mrr.size  * fuzzy_mrr['equivalent_reciprocal_rank'].sum()

0.16140693698094774

--------------------------------------

# Translation tests

In [59]:
translation = '../data/translation_test_nps_202203171038.csv'
translation_set = preprocessInput(translation)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5950 entries, 0 to 5949
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   np_name  5950 non-null   object
dtypes: object(1)
memory usage: 46.6+ KB
None
Processing file: ----------------------------------------
Dropping sequences longer than the maxlen:
	Dropped 1 that exceeded the maximum sequence length.
	Cleaning string sequences:
	Uppercasing string sequences:
Dropping duplicate sequences:
	Dropped 35 duplicate sequences.
Done processing: ---------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5914 entries, 0 to 5913
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   np_name  5914 non-null   object
dtypes: object(1)
memory usage: 46.3+ KB
None


In [60]:
translation_set.head()

Unnamed: 0,np_name
0,XTNNP
1,NWLMKV
2,WZMFCEA
3,AARONSROD
4,AARONS ROD WHOLE


In [61]:
encode_pad_tag(translation_set, 1, MAXLEN)

Encoding and Padding: ----------------------------------
	Processing np_name
Tagging: -----------------------------------------------


Unnamed: 0,np_name,Processed_np_name,Match
0,XTNNP,"[24, 20, 14, 14, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0...",1
1,NWLMKV,"[14, 23, 12, 13, 11, 22, 0, 0, 0, 0, 0, 0, 0, ...",1
2,WZMFCEA,"[23, 26, 13, 6, 3, 5, 1, 0, 0, 0, 0, 0, 0, 0, ...",1
3,AARONSROD,"[1, 1, 18, 15, 14, 19, 18, 15, 4, 0, 0, 0, 0, ...",1
4,AARONS ROD WHOLE,"[1, 1, 18, 15, 14, 19, 27, 18, 15, 4, 27, 23, ...",1
...,...,...,...
5909,ZINGIBER ZINGIBER WHOLE,"[26, 9, 14, 7, 9, 2, 5, 18, 27, 26, 9, 14, 7, ...",1
5910,ZYGAENA ERYTHRAEA WHOLE,"[26, 25, 7, 1, 5, 14, 1, 27, 5, 18, 25, 20, 8,...",1
5911,ZYGAENA INDICA WHOLE,"[26, 25, 7, 1, 5, 14, 1, 27, 9, 14, 4, 9, 3, 1...",1
5912,ZYGAENA LEWINI WHOLE,"[26, 25, 7, 1, 5, 14, 1, 27, 12, 5, 23, 9, 14,...",1


In [62]:
novelty = find_ranks(model, translation_set.sample(n=100), False)

Using column:  np_name


In [63]:
novelty.head(10)

Unnamed: 0,np_name,Processed_np_name,Match,rank1_drug,rank2_drug,rank3_drug,rank4_drug,rank5_drug,rank1_distance,rank2_distance,rank3_distance,rank4_distance,rank5_distance
4210,PINUS PALUSTRIS WHOLE,"[16, 9, 14, 21, 19, 27, 16, 1, 12, 21, 19, 20,...",1,MIRACLEFRUIT,MACA,LEPIDIUM MEYENII,GYMNEMA SYLVESTRE,MILK THISTLE,0.963257,0.923218,0.875843,0.866418,0.579263
3891,NIMBA WHOLE,"[14, 9, 13, 2, 1, 27, 23, 8, 15, 12, 5, 0, 0, ...",1,MILK THISTLE,SILYBUM MARIANUM,BLACK CUMIN,UNCARIA TOMENTOSA,KARCURA,1.0,0.996491,0.643737,0.551254,0.42515
2993,HORDEUM TANAITICUM WHOLE,"[8, 15, 18, 4, 5, 21, 13, 27, 20, 1, 14, 1, 9,...",1,CHAMOMILE,HOREHOUND,MARRUBIUM VULGARE,WOOD SPIDER,HEMP EXTRACT,1.0,0.737173,0.705438,0.64901,0.623964
3141,JUGLANS ORIENTIS WHOLE,"[10, 21, 7, 12, 1, 14, 19, 27, 15, 18, 9, 5, 1...",1,LINUM USITATISSIMUM,NIGELLA SATIVA,WITHANIA SOMNIFERA,BLACK CUMIN,BOSWELLIA,0.726169,0.671061,0.661382,0.64498,0.619716
5027,SLENDER NETTLE WHOLE,"[19, 12, 5, 14, 4, 5, 18, 27, 14, 5, 20, 20, 1...",1,MILK THISTLE,SILYBUM MARIANUM,SAMBUCUS NIGRA,SERENOA REPENS,SCRUBPALMETTO,0.713317,0.648403,0.5237,0.47705,0.439212
924,BROADWHOLE PLANTAIN WHOLE,"[2, 18, 15, 1, 4, 23, 8, 15, 12, 5, 27, 16, 12...",1,SERENOA REPENS,PAUSINYSTALIA JOHIMBE,HOREHOUND,MARRUBIUM VULGARE,TANACETUM PARTHENIUM,0.541875,0.441626,0.431367,0.400456,0.383071
1153,CAPSELLA BURSAPASTORIS MEDIKUS,"[3, 1, 16, 19, 5, 12, 12, 1, 27, 2, 21, 18, 19...",1,BOSWELLIA SERRATA,BOSWELLIA,WOODLAND HAWTHORN,CHELIDONIUM MAJUS,CATSCLAW,0.860807,0.822396,0.590471,0.547766,0.477174
1678,CITRUS X AMARA WHOLE,"[3, 9, 20, 18, 21, 19, 27, 24, 27, 1, 13, 1, 1...",1,CHAMOMILE,UNCARIA TOMENTOSA,CATSCLAW,MARRUBIUM VULGARE,PIPER METHYSTICUM,0.851832,0.849611,0.677654,0.376571,0.313687
1270,CAPSELLA SEGETUM WHOLE,"[3, 1, 16, 19, 5, 12, 12, 1, 27, 19, 5, 7, 5, ...",1,UNCARIA TOMENTOSA,CATSCLAW,CHAMOMILE,BOSWELLIA,SERENOA REPENS,0.740044,0.407453,0.368543,0.33091,0.328314
3361,LIQUIRITIA OFFICINARUM WHOLE,"[12, 9, 17, 21, 9, 18, 9, 20, 9, 1, 27, 15, 6,...",1,TARAXACUM OFFICINALE,WOOD SPIDER,MIRACLEFRUIT,MATRICARIA CHAMOMILLA,ALLIUM SATIVUM,0.829126,0.81679,0.721304,0.655483,0.629518


In [64]:
# test["Processed_np_name"] = test.np_name.apply(clean).apply(encode).apply(padding)

In [65]:
# test = preprocessInput(unmapped, converters = {"drug_name_original":str}, skip_blank_lines=True, na_filter=True, na_values="")
# test["Processed_drug_name_original"] = test[test.columns[0]].apply(clean).apply(encode).apply(padding)