# Imports & Globals

In [1]:
import yaml
import random
import string
import pandas as pd
import re
import tensorflow as tf
from typing import Literal  
# tf.enable_eager_execution()
# tf.executing_eagerly()

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

In [2]:
with open('LSTM65.yaml', 'r') as file:
    model_config = yaml.safe_load(file)

In [3]:
#Maximum sequence length including padding
global MAXLEN
MAXLEN = model_config['MAXLEN']

In [4]:
global encode_dict 
encode_dict = {l:i for i,l in enumerate(string.ascii_uppercase + " ", 1)}

# Functions

In [5]:
# TEST_TXT = "Eirmod horrida ingénii pariant secundum? Cognitionem compositis conséquat dicantur exercitus, intellegitur invenire negat oportet sapientium suam. Ceteris diu erat fecerit, impéndéré intelleges máerores malorum mei re reprehendunt? Constringendos intus mentitum quale urna! Convenire cotidie dixit malé vigiliae?"

In [6]:
def clean(text: str) -> str:
    '''Removes all the non-ascii and special characters from a string and returns the string's alphabetichal characters with spaces.
    
    Expects a string to be cleaned and removes all the non-ascii and special characters. 
    This is done by applying a substitution to regex matches
    Returns the cleaned string containing uppercased versions of the characters.
    
    Parameters
    ----------
    text : str
        
    Returns
    -------
    text : str
    '''
    regex = re.compile('[^a-zA-Z ]')
    r = regex.sub('', text)
    result = re.sub(' +', ' ', r)
    result = result.strip()
    return result.upper()

In [7]:
# clean(TEST_TXT)

In [8]:
def add_noise(w: str, percent: float = 0.1) -> str:
    ''' Adds a specified proportion of noise to a string.
    
    Expects a string and a number stating the percent of noise to add to this string.
    The string is modified by editing, deleting, or adding characters in/to the string.
    The modification to perform is determined randomly by generating a random number from an uniform distribution [0,1].
    If the number is < 1/3 edit one position with new random character or space.
    If the number is < 2/3 delete one position.
    Finally, if the number is > 2/3 add one random character or space.
    
    In order to retain the length of the sequence compliant with the maximum sequence length,
    additional processing has been added such that sequences that reach the maximum sequence length
    can only be modified by removing or swapping characters.
    
    Parameters
    ----------
    w : str
        The string to add noise to.
    
    percent: float, defaults to 10% if not specified
        Percentange representing the proportion of noise to add to the string.
        
    Returns
    -------
    w : str
        Modified string with noise added.
    '''  
    positions = random.choices(range(len(w)), k=int(percent*len(w)))
#     print("Adding noise to", int(percent*len(w)), "% of the string")
    for p in positions:
        r = random.uniform(0,1)
        if len(w) < MAXLEN:
            # if <1/3 edit one position with new random character, # else if <2/3 delete one position, else add one random character 
            if r <= 0.3333: # edit
                w = w[:p] + random.choice(string.ascii_uppercase + " ") + w[p+1:]
            elif r<= 0.6667: # delete
                w = w[:p] + w[p+1:]
            else: # add
                w = w[:p] + random.choice(string.ascii_uppercase + " ") + w[p:]
        else:
            if r <= 0.5: # edit
                w = w[:p] + random.choice(string.ascii_uppercase + " ") + w[p+1:]
            else: # delete
                w = w[:p] + w[p+1:]
            
    return w

In [9]:
def preprocessInput(filename: str, maxlen: int = MAXLEN, reflexive: bool = False, **kwargs) -> pd.DataFrame:
    '''Preprocess CSV file into a Pandas DataFrame.
    
    Expects the file name or path of a csv file with named columns containing strings representing product names.
    It then removes the sequences with length greater than the maximun sequence length, cleans the sequences and
    uppercases them, and it finally drops any duplicates that might have arrisen from this processing.
    Returns a Pandas Dataframe containing unique cleaned and uppercased versions of the strings on each cell.
    
    Parameters
    ----------
    filename : str
    
    maxlen : int
    
    reflexive: bool
        For every pair (x,y) ensure (y,x) is also in the set.
    
    **kwargs:
        Keyword arguments for pandas read csv function. 
    
        
    Returns
    -------
    df : Pandas DataFrame
    '''  
    df = pd.read_csv(filename, **kwargs)
    print(df.info())
    
    
    print("Processing file: ----------------------------------------")
    
    print("Renaming colums:")
    print("\tCurrent names: {}".format(df.columns))
    df.columns = ["x", "y"]
    print("\tNew names: {}".format(df.columns))
    
    original_count = len(df.index)
    print("Dropping row with empty cells:")
    df.dropna(subset=df.columns, inplace=True)
    new_count = len(df.index)
    print("\tDropped", original_count - new_count, "rows with empty cells.")
    
    original_count = len(df.index)
    print("Dropping sequences longer than the maxlen of {}:".format(maxlen))
    for column in df.columns:
        df.drop(df[df[column].apply(len).gt(maxlen)].index, inplace = True)
    new_count = len(df.index)
    print("\tDropped", original_count - new_count, "that exceeded the maximum sequence length.")
    
    print("\tCleaning string sequences.")
    df = df.applymap(clean)
    
    print("\tUppercasing string sequences.")
    df = df.applymap(lambda x: str.upper(x))
    
    if reflexive and len(df.columns) == 2:
        df = pd.concat(
            [df,
            pd.DataFrame.from_dict({"x": df['y'].to_list(), "y": df['x'].to_list()})],
            axis = 0
        )
        
    print("Dropping duplicate sequences:")
    original_count = len(df.index)
    df.drop_duplicates(ignore_index=True, inplace=True)
    new_count = len(df.index)
    print("\tDropped", original_count - new_count, "duplicate sequences.")
    
    print("Done processing: ---------------------------------------")
    print(df.info())
    return df

In [10]:
def encode_pad_tag(df: pd.DataFrame, match: Literal[0,1], distance: Literal[0,1], maxlen: int = MAXLEN) -> pd.DataFrame:
    '''It encodes, pads and tags the preprocessed sequences in a Pandas DataFrame.
    
    Expects a pandas dataframe with cleaned and uppercased sequences. It processes the 
    the DataFrame by creating an additional 'Processed_' + current column name for each
    of the columns in the data frame, where each of the sequences in the column get 
    transformed from a string sequence to an encoded sequence and then transformed again 
    by padding the encoded sequences up to the maximun sequence length by 0's as needed. 
    The tag component 1 
    Finally, this function returns this dataframe with both the original and processed columns.
    
    Parameters
    ----------
    df : pd.DataFrame
        DataFrame containing sequences.
    
    match: Literal[0 | 1]
        Tag indicating wether sequences match 1 indicates 'yes' and 0 indicates 'no'.
        
    distance: Literal[0 | 1]
        Distance between sequences, 1 indicates 'far' and 0 indicates 'close'. 
        
    maxlen: int
        Dafault to global MAXLEN value. It's used to determine the ammount of padding to add
        to sequences smaller than maxlen.
        
    Returns
    -------
    df : Pandas DataFrame
        A copy of the origininal DataFrame with the processed sequences added as new columns.
    '''  
    print("Encoding and Padding: ----------------------------------")
    for column in df.columns:
        print("\tProcessing {}".format(column))
        df["Processed_" + column] = df[column].apply(lambda string: list(map(encode_dict.get, string))).transform(lambda x: x + ([0]* (maxlen-len(x))))
    print("Tagging: -----------------------------------------------")
    df["Match"] = match
    df["Distance"] = distance
    return df

In [11]:
def generate_noisy_positive_pairs(df: pd.DataFrame, scale: float, noise_percent: float = 0.10) -> pd.DataFrame:
    '''Creates noisy positive pairs by adding some noise to the sequences in the 'FAERS_drug_match'
    column while retaining the match to the correct 'lookup_value'.

    For each unique name in the 'FAERS_drug_match' column of the train set, get the product name
    and apply noise to a specified percentage of the sequence. Finally, return a DataFrame with the 
    generated synthethic noisy sequences encoded, padded and tagged.
        
    Parameters
    ----------
    df: pd.DataFrame
        A Pandas DataFrame containing the 'FAERS_drug_match' and the 'lookup_value' Series.
        
    scale: float
        The scale of data to be generated relative to the size of the true positives. 
        Eg. scale = 1.0 generates approximately a 1:1 DataFrame with equivalent noisy 
        sequences relative to the input DataFrame's true positives. And scale = 2.50 
        generates a 2.5:1 meaning an output Dataframe of 2 times and a half times the 
        size of the input DataFrame.
        
    noise_percent: float
        Defaults to adding noise to 10% of the character sequence. Indicates the approximate
        percentage of noise to add to each sequence. 
    

    Returns
    -------
    df : pd.DataFrame
        Returns a DataFrame containing the 'FAERS_drug_match', 'lookup_value', 'Processed_FAERS_drug_match', 'Processed_lookup_value' and 'Match' pd.Series from the synthetic data.
    '''
    noisy = pd.DataFrame(columns=['x', 'y'])
    faers_match = []
    lookup = []
    
    if scale >= 1.0:
        for i in range(int(scale)):
            faers_match.extend(df['x'].apply(lambda x: add_noise(x, noise_percent)).to_list())
            lookup.extend(df['y'].to_list())
    
    remainder = scale - int(scale)
    if remainder > 0:
        remaining_sample = df.sample(frac=remainder)
        faers_match.extend(remaining_sample['x'].apply(lambda x: add_noise(x, noise_percent)).to_list())
        lookup.extend(remaining_sample['y'].to_list())

    noisy['x'] = faers_match
    noisy['y'] = lookup
    del faers_match
    del lookup
    encode_pad_tag(noisy,  match=1, distance = 0)
    return noisy


In [12]:
def generate_synthethic_negative_pairs(df: pd.DataFrame, scale: int = 4) -> pd.DataFrame:
    '''Create negative pairs where 'FAERS_drug_match' does not match the correct 'lookup_value'.

    For each unique name in the 'FAERS_drug_match' column of the train set, get the product name
    and then pick four random different product names. For each of those 4 additional product names 
    check if it matches any of the names in the training set if its not then add it to the dataset as 
    a negative pair. The goal of this is to help further distance the embeddings in the vector space.
    Returns a DataFrame containing those negative sequences encoded, padded and tagged.

    
    Parameters
    ----------
    df: pd.DataFrame
         A Pandas DataFrame containing the 'FAERS_drug_match' and the 'lookup_value' Series.
         
    scale: int
        Defaults to 4.
        The scale of data to be generated relative to the size of the true positives. 
        Eg. scale = 4 generates approximately a 4:1 DataFrame with equivalent true 
        negative pairs relative to the input DataFrame's true positives. And scale = 2
        generates a 2:1 meaning an output Dataframe ~2 times the size of the input DataFrame.
    
    Returns
    -------
     df : pd.DataFrame
        Returns a DataFrame containing the 'FAERS_drug_match', 'lookup_value', 'Processed_FAERS_drug_match', 'Processed_lookup_value' and 'Match' pd.Series from the synthetic data.
    '''
    synthethic = pd.DataFrame(columns=['x', 'y'])
    faers_match = []
    lookup = []

    unique_targets = pd.DataFrame(df['y'].unique(), columns=['Targets'])
    for np_name in df['x']:
        np_temp = unique_targets['Targets'][unique_targets['Targets'] != np_name].sample(scale)
        np_temp = np_temp[~np_temp.isin(df['y'].loc[df['x'] == np_name])]
        faers_match.extend([np_name]* len(np_temp))
        lookup.extend(np_temp)

    synthethic['x'] = faers_match
    synthethic['y'] = lookup
    del faers_match
    del lookup
    encode_pad_tag(synthethic, match = 0, distance = 1)
    return synthethic


------------------------------------------------

# Data loading and preprocessing

In [13]:
# fName_unmapped = '../unmapped_data/upper_unmap_orig_drug_names_202201201812.csv'
unmapped = '../data/upper_unmap_orig-_drug_names_no_model_overlap_20220224.csv'

# Process Data

## Add Mapped-positive pairs
Data from NP_FAERS_mapped_20220215.csv -- the manually create references set for ~70 drugs

In [14]:
positive_set = encode_pad_tag(preprocessInput('../data/NP_FAERS_mapped_20220215.csv'), match=1, distance=0)
positive_set.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5358 entries, 0 to 5357
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   FAERS_drug_match  5358 non-null   object
 1   lookup_value      5358 non-null   object
dtypes: object(2)
memory usage: 83.8+ KB
None
Processing file: ----------------------------------------
Renaming colums:
	Current names: Index(['FAERS_drug_match', 'lookup_value'], dtype='object')
	New names: Index(['x', 'y'], dtype='object')
Dropping row with empty cells:
	Dropped 0 rows with empty cells.
Dropping sequences longer than the maxlen of 65:
	Dropped 374 that exceeded the maximum sequence length.
	Cleaning string sequences.
	Uppercasing string sequences.
Dropping duplicate sequences:
	Dropped 482 duplicate sequences.
Done processing: ---------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4502 entries, 0 to 4501
Data columns (total 2 columns):
 #   Column

Unnamed: 0,x,y,Processed_x,Processed_y,Match,Distance
0,ASHWAGANDHA,ASHWAGANDA,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 0, 0, 0, ...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ...",1,0
1,ASHWAGANDHA EXTRACT,ASHWAGANDA,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 27, 5, 24...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ...",1,0
2,ASHWAGANDHA ROOT EXTRACT,ASHWAGANDA,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 27, 18, 1...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ...",1,0
3,ASHWAGANDHA WITHANIA SOMNIFERA,ASHWAGANDA,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 27, 23, 9...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ...",1,0
4,ASHWAGANDHA WITHANIA SOMNIFERA ROOT,ASHWAGANDA,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 27, 23, 9...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ...",1,0


## Add some Unmapped gsrs positive pairs
Data from 4) positive-unmapped-pairs-gsrs-name-to-common-name.tsv -- POSITIVE unmapped pairs G-SRS to Latin binomial common name

In [15]:
common_2_latin = encode_pad_tag(preprocessInput('../data/positive-unmapped-pairs-common-name-or-latin-binomial-copies.tsv', reflexive=True, sep='\t'), match=1, distance=0)
common_2_latin.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 728 entries, 0 to 727
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   common_name     728 non-null    object
 1   latin_binomial  728 non-null    object
dtypes: object(2)
memory usage: 11.5+ KB
None
Processing file: ----------------------------------------
Renaming colums:
	Current names: Index(['common_name', 'latin_binomial'], dtype='object')
	New names: Index(['x', 'y'], dtype='object')
Dropping row with empty cells:
	Dropped 0 rows with empty cells.
Dropping sequences longer than the maxlen of 65:
	Dropped 0 that exceeded the maximum sequence length.
	Cleaning string sequences.
	Uppercasing string sequences.
Dropping duplicate sequences:
	Dropped 728 duplicate sequences.
Done processing: ---------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 728 entries, 0 to 727
Data columns (total 2 columns):
 #   Column  Non-Null Count 

Unnamed: 0,x,y,Processed_x,Processed_y,Match,Distance
0,PRUNUS SPINOSA,PRUNUS SPINOSA,"[16, 18, 21, 14, 21, 19, 27, 19, 16, 9, 14, 15...","[16, 18, 21, 14, 21, 19, 27, 19, 16, 9, 14, 15...",1,0
1,PHYTOLACCA AMERICANA,PHYTOLACCA AMERICANA,"[16, 8, 25, 20, 15, 12, 1, 3, 3, 1, 27, 1, 13,...","[16, 8, 25, 20, 15, 12, 1, 3, 3, 1, 27, 1, 13,...",1,0
2,SMILAX PSEUDOCHINA,SMILAX PSEUDOCHINA,"[19, 13, 9, 12, 1, 24, 27, 16, 19, 5, 21, 4, 1...","[19, 13, 9, 12, 1, 24, 27, 16, 19, 5, 21, 4, 1...",1,0
3,HARUNGANA MADAGASCARIENSIS,HARUNGANA MADAGASCARIENSIS,"[8, 1, 18, 21, 14, 7, 1, 14, 1, 27, 13, 1, 4, ...","[8, 1, 18, 21, 14, 7, 1, 14, 1, 27, 13, 1, 4, ...",1,0
4,QUERCUS LOBATA,QUERCUS LOBATA,"[17, 21, 5, 18, 3, 21, 19, 27, 12, 15, 2, 1, 2...","[17, 21, 5, 18, 3, 21, 19, 27, 12, 15, 2, 1, 2...",1,0


## Add some positive unmmaped common and latin-binomial pairs
Data from 5) positive-unmapped-pairs-common-name-or-latin-binomial-copies.tsv -- POSITIVE unmapped pairs common-name to common name, common name to Latin binomial and vice versa, Latin binomial to Latin binomial

In [16]:
gsrs_2_common = encode_pad_tag(preprocessInput('../data/positive-unmapped-pairs-gsrs-name-to-common-name.tsv', maxlen= MAXLEN, reflexive=True, sep='\t'),  match=1, distance=0)
gsrs_2_common.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10838 entries, 0 to 10837
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   name         10838 non-null  object
 1   common_name  199 non-null    object
dtypes: object(2)
memory usage: 169.5+ KB
None
Processing file: ----------------------------------------
Renaming colums:
	Current names: Index(['name', 'common_name'], dtype='object')
	New names: Index(['x', 'y'], dtype='object')
Dropping row with empty cells:
	Dropped 10639 rows with empty cells.
Dropping sequences longer than the maxlen of 65:
	Dropped 0 that exceeded the maximum sequence length.
	Cleaning string sequences.
	Uppercasing string sequences.
Dropping duplicate sequences:
	Dropped 0 duplicate sequences.
Done processing: ---------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  --

Unnamed: 0,x,y,Processed_x,Processed_y,Match,Distance
0,BERBERIS SERRATIFOLIA WHOLE,BARBERRY,"[2, 5, 18, 2, 5, 18, 9, 19, 27, 19, 5, 18, 18,...","[2, 1, 18, 2, 5, 18, 18, 25, 0, 0, 0, 0, 0, 0,...",1,0
1,BERBERIS SIKKIMENSIS WHOLE,BARBERRY,"[2, 5, 18, 2, 5, 18, 9, 19, 27, 19, 9, 11, 11,...","[2, 1, 18, 2, 5, 18, 18, 25, 0, 0, 0, 0, 0, 0,...",1,0
2,BERBERIS UNDULATA WHOLE,BARBERRY,"[2, 5, 18, 2, 5, 18, 9, 19, 27, 21, 14, 4, 21,...","[2, 1, 18, 2, 5, 18, 18, 25, 0, 0, 0, 0, 0, 0,...",1,0
3,BERBERIS CERATOPHYLLA WHOLE,BARBERRY,"[2, 5, 18, 2, 5, 18, 9, 19, 27, 3, 5, 18, 1, 2...","[2, 1, 18, 2, 5, 18, 18, 25, 0, 0, 0, 0, 0, 0,...",1,0
4,BERBERIS ARISTATA WHODD,BARBERRY,"[2, 5, 18, 2, 5, 18, 9, 19, 27, 1, 18, 9, 19, ...","[2, 1, 18, 2, 5, 18, 18, 25, 0, 0, 0, 0, 0, 0,...",1,0


## Add negative pairs from unmmaped
Data from 9) NP_FAERS_negative_pairs_20220222.csv -- the negative pairs created by random sampling from the NP_FAERS_mapped_20220215.csv

In [17]:
negative_set = encode_pad_tag(preprocessInput('../data/NP_FAERS_negative_pairs_20220222.csv'), match=0, distance=1)
negative_set.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9242 entries, 0 to 9241
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   FAERS_drug_match  9242 non-null   object
 1   lookup_value      9242 non-null   object
dtypes: object(2)
memory usage: 144.5+ KB
None
Processing file: ----------------------------------------
Renaming colums:
	Current names: Index(['FAERS_drug_match', 'lookup_value'], dtype='object')
	New names: Index(['x', 'y'], dtype='object')
Dropping row with empty cells:
	Dropped 0 rows with empty cells.
Dropping sequences longer than the maxlen of 65:
	Dropped 1372 that exceeded the maximum sequence length.
	Cleaning string sequences.
	Uppercasing string sequences.
Dropping duplicate sequences:
	Dropped 498 duplicate sequences.
Done processing: ---------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7372 entries, 0 to 7371
Data columns (total 2 columns):
 #   Colu

Unnamed: 0,x,y,Processed_x,Processed_y,Match,Distance
0,ANUSOLHC BENZYL BENZOATEBISMUTH HYDROXIDE,CINNAMON,"[1, 14, 21, 19, 15, 12, 8, 3, 27, 2, 5, 14, 26...","[3, 9, 14, 14, 1, 13, 15, 14, 0, 0, 0, 0, 0, 0...",0,1
1,ASCABIOL BENZYL BENZOATE,CINNAMON,"[1, 19, 3, 1, 2, 9, 15, 12, 27, 2, 5, 14, 26, ...","[3, 9, 14, 14, 1, 13, 15, 14, 0, 0, 0, 0, 0, 0...",0,1
2,CASSIA,CINNAMON,"[3, 1, 19, 19, 9, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[3, 9, 14, 14, 1, 13, 15, 14, 0, 0, 0, 0, 0, 0...",0,1
3,CASSIA ACUTIFOLIA,CINNAMON,"[3, 1, 19, 19, 9, 1, 27, 1, 3, 21, 20, 9, 6, 1...","[3, 9, 14, 14, 1, 13, 15, 14, 0, 0, 0, 0, 0, 0...",0,1
4,CASSIA ALATA,CINNAMON,"[3, 1, 19, 19, 9, 1, 27, 1, 12, 1, 20, 1, 0, 0...","[3, 9, 14, 14, 1, 13, 15, 14, 0, 0, 0, 0, 0, 0...",0,1


## Add negative pairs from unmmaped
Data from 6) negative-unmapped-pairs-all.tsv -- NEGATIVE pairs for the umpapped NP strings from GSRS

In [18]:
gsrs_2_common_or_latin_negatives = encode_pad_tag(preprocessInput('../data/negative-unmapped-pairs-all.tsv', sep="\t"), match=0, distance=1)
gsrs_2_common_or_latin_negatives.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11964 entries, 0 to 11963
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   p1      11964 non-null  object
 1   upper   11964 non-null  object
dtypes: object(2)
memory usage: 187.1+ KB
None
Processing file: ----------------------------------------
Renaming colums:
	Current names: Index(['p1', 'upper'], dtype='object')
	New names: Index(['x', 'y'], dtype='object')
Dropping row with empty cells:
	Dropped 0 rows with empty cells.
Dropping sequences longer than the maxlen of 65:
	Dropped 2 that exceeded the maximum sequence length.
	Cleaning string sequences.
	Uppercasing string sequences.
Dropping duplicate sequences:
	Dropped 0 duplicate sequences.
Done processing: ---------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11962 entries, 0 to 11961
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 

Unnamed: 0,x,y,Processed_x,Processed_y,Match,Distance
0,BURSA ORIENTALIS WHOLE,TILIA AMERICANA,"[2, 21, 18, 19, 1, 27, 15, 18, 9, 5, 14, 20, 1...","[20, 9, 12, 9, 1, 27, 1, 13, 5, 18, 9, 3, 1, 1...",0,1
1,KERNERIA LEUCANTHEMA WHOLE,SCUTELLARIA LATERIFLORA,"[11, 5, 18, 14, 5, 18, 9, 1, 27, 12, 5, 21, 3,...","[19, 3, 21, 20, 5, 12, 12, 1, 18, 9, 1, 27, 12...",0,1
2,EPIMEDIUM GRANDIFLORUM,URTICA DIOICA,"[5, 16, 9, 13, 5, 4, 9, 21, 13, 27, 7, 18, 1, ...","[21, 18, 20, 9, 3, 1, 27, 4, 9, 15, 9, 3, 1, 0...",0,1
3,BAUHINIA FURCATA WHOLE,CYPERUS ROTUNDUS,"[2, 1, 21, 8, 9, 14, 9, 1, 27, 6, 21, 18, 3, 1...","[3, 25, 16, 5, 18, 21, 19, 27, 18, 15, 20, 21,...",0,1
4,ERYTHRAEA SHUTTLEWORTHIANA WHOLE,ALOYSIA TRIPHYLLA,"[5, 18, 25, 20, 8, 18, 1, 5, 1, 27, 19, 8, 21,...","[1, 12, 15, 25, 19, 9, 1, 27, 20, 18, 9, 16, 8...",0,1


## Generate additional sythenthic negative pairs

## Add additional negative pairs 
Generated from the positive data using generate_synthethic_negative_pairs

In [19]:
synth_negatives = generate_synthethic_negative_pairs(pd.concat([positive_set, common_2_latin, gsrs_2_common], ignore_index=True), 4)
synth_negatives.head()

Encoding and Padding: ----------------------------------
	Processing x
	Processing y
Tagging: -----------------------------------------------


Unnamed: 0,x,y,Processed_x,Processed_y,Match,Distance
0,ASHWAGANDHA,PHYLLANTHUS AMARUS,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 0, 0, 0, ...","[16, 8, 25, 12, 12, 1, 14, 20, 8, 21, 19, 27, ...",0,1
1,ASHWAGANDHA,VITEX DONIANA,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 0, 0, 0, ...","[22, 9, 20, 5, 24, 27, 4, 15, 14, 9, 1, 14, 1,...",0,1
2,ASHWAGANDHA,SABAL MINOR,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 0, 0, 0, ...","[19, 1, 2, 1, 12, 27, 13, 9, 14, 15, 18, 0, 0,...",0,1
3,ASHWAGANDHA,MAGNOLIA OFFICINALIS,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 0, 0, 0, ...","[13, 1, 7, 14, 15, 12, 9, 1, 27, 15, 6, 6, 9, ...",0,1
4,ASHWAGANDHA EXTRACT,TURKISH LICORICE WHOLE,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 27, 5, 24...","[20, 21, 18, 11, 9, 19, 8, 27, 12, 9, 3, 15, 1...",0,1


## Generate additional noisy positive pairs

In [20]:
noisy_positives = generate_noisy_positive_pairs(pd.concat([positive_set, common_2_latin, gsrs_2_common], ignore_index=True), scale = 8.0, noise_percent = 0.20)
noisy_positives.head()

Encoding and Padding: ----------------------------------
	Processing x
	Processing y
Tagging: -----------------------------------------------


Unnamed: 0,x,y,Processed_x,Processed_y,Match,Distance
0,ASHEAGANTDHA,ASHWAGANDA,"[1, 19, 8, 5, 1, 7, 1, 14, 20, 4, 8, 1, 0, 0, ...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ...",1,0
1,ASHAAGAANDHA EXTRAWT,ASHWAGANDA,"[1, 19, 8, 1, 1, 7, 1, 1, 14, 4, 8, 1, 27, 5, ...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ...",1,0
2,WASHWAGANDHA ROBOT EYXTYRACT,ASHWAGANDA,"[23, 1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 27, 1...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ...",1,0
3,ASHWAGAPNDHA WTHANIAJRSOMHNIFESA,ASHWAGANDA,"[1, 19, 8, 23, 1, 7, 1, 16, 14, 4, 8, 1, 27, 2...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ...",1,0
4,ASHWGANDHAX WQTHTHAIARSOMNIFERA ROOT,ASHWAGANDA,"[1, 19, 8, 23, 7, 1, 14, 4, 8, 1, 24, 27, 23, ...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ...",1,0


# Merge all dataframes

In [21]:
positive_set.head()

Unnamed: 0,x,y,Processed_x,Processed_y,Match,Distance
0,ASHWAGANDHA,ASHWAGANDA,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 0, 0, 0, ...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ...",1,0
1,ASHWAGANDHA EXTRACT,ASHWAGANDA,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 27, 5, 24...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ...",1,0
2,ASHWAGANDHA ROOT EXTRACT,ASHWAGANDA,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 27, 18, 1...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ...",1,0
3,ASHWAGANDHA WITHANIA SOMNIFERA,ASHWAGANDA,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 27, 23, 9...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ...",1,0
4,ASHWAGANDHA WITHANIA SOMNIFERA ROOT,ASHWAGANDA,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 27, 23, 9...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ...",1,0


In [22]:
common_2_latin.head()

Unnamed: 0,x,y,Processed_x,Processed_y,Match,Distance
0,PRUNUS SPINOSA,PRUNUS SPINOSA,"[16, 18, 21, 14, 21, 19, 27, 19, 16, 9, 14, 15...","[16, 18, 21, 14, 21, 19, 27, 19, 16, 9, 14, 15...",1,0
1,PHYTOLACCA AMERICANA,PHYTOLACCA AMERICANA,"[16, 8, 25, 20, 15, 12, 1, 3, 3, 1, 27, 1, 13,...","[16, 8, 25, 20, 15, 12, 1, 3, 3, 1, 27, 1, 13,...",1,0
2,SMILAX PSEUDOCHINA,SMILAX PSEUDOCHINA,"[19, 13, 9, 12, 1, 24, 27, 16, 19, 5, 21, 4, 1...","[19, 13, 9, 12, 1, 24, 27, 16, 19, 5, 21, 4, 1...",1,0
3,HARUNGANA MADAGASCARIENSIS,HARUNGANA MADAGASCARIENSIS,"[8, 1, 18, 21, 14, 7, 1, 14, 1, 27, 13, 1, 4, ...","[8, 1, 18, 21, 14, 7, 1, 14, 1, 27, 13, 1, 4, ...",1,0
4,QUERCUS LOBATA,QUERCUS LOBATA,"[17, 21, 5, 18, 3, 21, 19, 27, 12, 15, 2, 1, 2...","[17, 21, 5, 18, 3, 21, 19, 27, 12, 15, 2, 1, 2...",1,0


In [23]:
gsrs_2_common.head()

Unnamed: 0,x,y,Processed_x,Processed_y,Match,Distance
0,BERBERIS SERRATIFOLIA WHOLE,BARBERRY,"[2, 5, 18, 2, 5, 18, 9, 19, 27, 19, 5, 18, 18,...","[2, 1, 18, 2, 5, 18, 18, 25, 0, 0, 0, 0, 0, 0,...",1,0
1,BERBERIS SIKKIMENSIS WHOLE,BARBERRY,"[2, 5, 18, 2, 5, 18, 9, 19, 27, 19, 9, 11, 11,...","[2, 1, 18, 2, 5, 18, 18, 25, 0, 0, 0, 0, 0, 0,...",1,0
2,BERBERIS UNDULATA WHOLE,BARBERRY,"[2, 5, 18, 2, 5, 18, 9, 19, 27, 21, 14, 4, 21,...","[2, 1, 18, 2, 5, 18, 18, 25, 0, 0, 0, 0, 0, 0,...",1,0
3,BERBERIS CERATOPHYLLA WHOLE,BARBERRY,"[2, 5, 18, 2, 5, 18, 9, 19, 27, 3, 5, 18, 1, 2...","[2, 1, 18, 2, 5, 18, 18, 25, 0, 0, 0, 0, 0, 0,...",1,0
4,BERBERIS ARISTATA WHODD,BARBERRY,"[2, 5, 18, 2, 5, 18, 9, 19, 27, 1, 18, 9, 19, ...","[2, 1, 18, 2, 5, 18, 18, 25, 0, 0, 0, 0, 0, 0,...",1,0


In [24]:
negative_set.head()

Unnamed: 0,x,y,Processed_x,Processed_y,Match,Distance
0,ANUSOLHC BENZYL BENZOATEBISMUTH HYDROXIDE,CINNAMON,"[1, 14, 21, 19, 15, 12, 8, 3, 27, 2, 5, 14, 26...","[3, 9, 14, 14, 1, 13, 15, 14, 0, 0, 0, 0, 0, 0...",0,1
1,ASCABIOL BENZYL BENZOATE,CINNAMON,"[1, 19, 3, 1, 2, 9, 15, 12, 27, 2, 5, 14, 26, ...","[3, 9, 14, 14, 1, 13, 15, 14, 0, 0, 0, 0, 0, 0...",0,1
2,CASSIA,CINNAMON,"[3, 1, 19, 19, 9, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[3, 9, 14, 14, 1, 13, 15, 14, 0, 0, 0, 0, 0, 0...",0,1
3,CASSIA ACUTIFOLIA,CINNAMON,"[3, 1, 19, 19, 9, 1, 27, 1, 3, 21, 20, 9, 6, 1...","[3, 9, 14, 14, 1, 13, 15, 14, 0, 0, 0, 0, 0, 0...",0,1
4,CASSIA ALATA,CINNAMON,"[3, 1, 19, 19, 9, 1, 27, 1, 12, 1, 20, 1, 0, 0...","[3, 9, 14, 14, 1, 13, 15, 14, 0, 0, 0, 0, 0, 0...",0,1


In [25]:
gsrs_2_common_or_latin_negatives.head()

Unnamed: 0,x,y,Processed_x,Processed_y,Match,Distance
0,BURSA ORIENTALIS WHOLE,TILIA AMERICANA,"[2, 21, 18, 19, 1, 27, 15, 18, 9, 5, 14, 20, 1...","[20, 9, 12, 9, 1, 27, 1, 13, 5, 18, 9, 3, 1, 1...",0,1
1,KERNERIA LEUCANTHEMA WHOLE,SCUTELLARIA LATERIFLORA,"[11, 5, 18, 14, 5, 18, 9, 1, 27, 12, 5, 21, 3,...","[19, 3, 21, 20, 5, 12, 12, 1, 18, 9, 1, 27, 12...",0,1
2,EPIMEDIUM GRANDIFLORUM,URTICA DIOICA,"[5, 16, 9, 13, 5, 4, 9, 21, 13, 27, 7, 18, 1, ...","[21, 18, 20, 9, 3, 1, 27, 4, 9, 15, 9, 3, 1, 0...",0,1
3,BAUHINIA FURCATA WHOLE,CYPERUS ROTUNDUS,"[2, 1, 21, 8, 9, 14, 9, 1, 27, 6, 21, 18, 3, 1...","[3, 25, 16, 5, 18, 21, 19, 27, 18, 15, 20, 21,...",0,1
4,ERYTHRAEA SHUTTLEWORTHIANA WHOLE,ALOYSIA TRIPHYLLA,"[5, 18, 25, 20, 8, 18, 1, 5, 1, 27, 19, 8, 21,...","[1, 12, 15, 25, 19, 9, 1, 27, 20, 18, 9, 16, 8...",0,1


In [26]:
synth_negatives.head()

Unnamed: 0,x,y,Processed_x,Processed_y,Match,Distance
0,ASHWAGANDHA,PHYLLANTHUS AMARUS,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 0, 0, 0, ...","[16, 8, 25, 12, 12, 1, 14, 20, 8, 21, 19, 27, ...",0,1
1,ASHWAGANDHA,VITEX DONIANA,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 0, 0, 0, ...","[22, 9, 20, 5, 24, 27, 4, 15, 14, 9, 1, 14, 1,...",0,1
2,ASHWAGANDHA,SABAL MINOR,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 0, 0, 0, ...","[19, 1, 2, 1, 12, 27, 13, 9, 14, 15, 18, 0, 0,...",0,1
3,ASHWAGANDHA,MAGNOLIA OFFICINALIS,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 0, 0, 0, ...","[13, 1, 7, 14, 15, 12, 9, 1, 27, 15, 6, 6, 9, ...",0,1
4,ASHWAGANDHA EXTRACT,TURKISH LICORICE WHOLE,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 27, 5, 24...","[20, 21, 18, 11, 9, 19, 8, 27, 12, 9, 3, 15, 1...",0,1


In [27]:
noisy_positives.head()

Unnamed: 0,x,y,Processed_x,Processed_y,Match,Distance
0,ASHEAGANTDHA,ASHWAGANDA,"[1, 19, 8, 5, 1, 7, 1, 14, 20, 4, 8, 1, 0, 0, ...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ...",1,0
1,ASHAAGAANDHA EXTRAWT,ASHWAGANDA,"[1, 19, 8, 1, 1, 7, 1, 1, 14, 4, 8, 1, 27, 5, ...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ...",1,0
2,WASHWAGANDHA ROBOT EYXTYRACT,ASHWAGANDA,"[23, 1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 27, 1...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ...",1,0
3,ASHWAGAPNDHA WTHANIAJRSOMHNIFESA,ASHWAGANDA,"[1, 19, 8, 23, 1, 7, 1, 16, 14, 4, 8, 1, 27, 2...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ...",1,0
4,ASHWGANDHAX WQTHTHAIARSOMNIFERA ROOT,ASHWAGANDA,"[1, 19, 8, 23, 7, 1, 14, 4, 8, 1, 24, 27, 23, ...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ...",1,0


In [28]:
complete_dataset = pd.concat([positive_set, common_2_latin, gsrs_2_common, negative_set, gsrs_2_common_or_latin_negatives, synth_negatives, noisy_positives], ignore_index=True)
complete_dataset.drop_duplicates(subset=['x', 'y'], inplace=True)

In [29]:
complete_dataset["Distance"].value_counts()

0    49926
1    41590
Name: Distance, dtype: int64

## Creating Pandas DF for simplified view of the dataset 

In [30]:
dUnique_df = pd.DataFrame(columns = ['dUnique_label', 'dUnique_seq_padded'])
dUnique_df['dUnique_label'] = complete_dataset['y'].unique()
dUnique_df['dUnique_seq_padded'] = dUnique_df['dUnique_label'].transform(lambda x: list(map(encode_dict.get,list(x)))).transform(lambda x: x + ([0]* (MAXLEN-len(x))))
dUnique_df.head()

Unnamed: 0,dUnique_label,dUnique_seq_padded
0,ASHWAGANDA,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ..."
1,BUTCHERSBROOM,"[2, 21, 20, 3, 8, 5, 18, 19, 2, 18, 15, 15, 13..."
2,CATSCLAW,"[3, 1, 20, 19, 3, 12, 1, 23, 0, 0, 0, 0, 0, 0,..."
3,CINNAMON,"[3, 9, 14, 14, 1, 13, 15, 14, 0, 0, 0, 0, 0, 0..."
4,FENUGREEK,"[6, 5, 14, 21, 7, 18, 5, 5, 11, 0, 0, 0, 0, 0,..."


In [31]:
dUnique_df.size

1974

## Create the train/test split

In [55]:
train, test = train_test_split(complete_dataset, test_size=0.025, random_state = 42)

In [56]:
# test, validate = train_test_split(split, test_size=0.01, random_state = 42)

In [57]:
print("Train:", train.shape, "Test:" , test.shape)
# print("Validate:", validate.shape)

Train: (89228, 6) Test: (2288, 6)


In [58]:
x1TrainRnnS, x1ValRnnS, x2TrainRnnS, x2ValRnnS, yTrainRnnS, yValRnnS = train_test_split(train['Processed_x'], train['Processed_y'], train['Distance'], test_size=0.20, random_state=42)

In [59]:
print("Train Tower 1 x:", len(x1TrainRnnS), ", Test Tower 1 x:", len(x1ValRnnS))
print("Train Tower 2 x:", len(x2TrainRnnS), ", Test Tower 2 x:", len(x2ValRnnS))
print("Train Target  y:", len(yTrainRnnS),  ", Test Target  y:", len(yValRnnS))

Train Tower 1 x: 71382 , Test Tower 1 x: 17846
Train Tower 2 x: 71382 , Test Tower 2 x: 17846
Train Target  y: 71382 , Test Target  y: 17846


# Save data to csv

In [60]:
save = input("Want to replace best model with this model? (y/n): ")

if save.lower() == "y": 
    pd.DataFrame(data={"x1": x1TrainRnnS, "x2": x2TrainRnnS, "y": yTrainRnnS}).to_pickle("../data/proccesed_train_set.pkl")
    pd.DataFrame(data={"x1": x1ValRnnS, "x2": x2ValRnnS, "y": yValRnnS}).to_pickle("../data/proccesed_test_set.pkl")
    dUnique_df.to_pickle("../data/dUnique_df.pkl")
    negative_set.to_pickle("../data/dfneg2.pkl")
    train.to_pickle("../data/train.pkl")
    test.to_pickle("../data/test.pkl")
#     validate.to_pickle("../data/validate.pkl")

Want to replace best model with this model? (y/n): y


-------

In [38]:
common_2_latin[common_2_latin["y"] == "KRATOM"]

Unnamed: 0,x,y,Processed_x,Processed_y,Match,Distance


In [39]:
positive_set["y"].unique().flatten()

array(['ASHWAGANDA', 'BUTCHERSBROOM', 'CATSCLAW', 'CINNAMON', 'FENUGREEK',
       'FEVERFEW', 'FLAX SEED', 'GINGER', 'GREEN TEA', 'GUARANA',
       'HEMP EXTRACT', 'HORSECHESTNUT', 'KARCURA', 'KRATOM',
       'LIONS TOOTH', 'MACA', 'MIRACLEFRUIT', 'MORINGA', 'NIU BANG ZI',
       'PANAX GINSENG', 'ECHINACEA', 'REISHI', 'RHODIOLA',
       'SCRUBPALMETTO', 'SLIPPERY ELM', 'STINGING NETTLE', 'ST JOHNSWORT',
       'SWALLOWWORT', 'TANGKUEI', 'TULSI', 'WOODLAND HAWTHORN',
       'WOOD SPIDER', 'ALOE VERA', 'BEET ROOT', 'BLACK COHOSH',
       'CHAMOMILE', 'BLACK CHERRY', 'CORDYCEPS', 'ELDERBERRY', 'FENNEL',
       'GARCINIA', 'GOJI BERRY', 'HOREHOUND', 'HORSETAIL', 'IVY LEAF',
       'KAVA', 'MILK THISTLE', 'OLIVE LEAF', 'OREGANO', 'TURMERIC',
       'VALERIAN', 'WHEAT GRASS', 'BARLEY GRASS', 'YOHIMBE',
       'APPLE CIDER VINEGAR', 'BLACK CUMIN', 'BOSWELLIA', 'CHLORELLA',
       'CRANBERRY', 'EVENING PRIMROSE OIL', 'GARLIC', 'GRAPEFRUIT',
       'RED YEAST RICE', 'STEVIA', 'WITHANIA SOMNIFE

In [40]:
positive_set[positive_set["x"] == "KRATOM"]

Unnamed: 0,x,y,Processed_x,Processed_y,Match,Distance
754,KRATOM,KRATOM,"[11, 18, 1, 20, 15, 13, 0, 0, 0, 0, 0, 0, 0, 0...","[11, 18, 1, 20, 15, 13, 0, 0, 0, 0, 0, 0, 0, 0...",1,0
3027,KRATOM,MITRAGYNA SPECIOSA,"[11, 18, 1, 20, 15, 13, 0, 0, 0, 0, 0, 0, 0, 0...","[13, 9, 20, 18, 1, 7, 25, 14, 1, 27, 19, 16, 5...",1,0


In [41]:
positive_set[positive_set["x"] == "MITRAGYNA SPECIOSA"]

Unnamed: 0,x,y,Processed_x,Processed_y,Match,Distance
815,MITRAGYNA SPECIOSA,KRATOM,"[13, 9, 20, 18, 1, 7, 25, 14, 1, 27, 19, 16, 5...","[11, 18, 1, 20, 15, 13, 0, 0, 0, 0, 0, 0, 0, 0...",1,0
3088,MITRAGYNA SPECIOSA,MITRAGYNA SPECIOSA,"[13, 9, 20, 18, 1, 7, 25, 14, 1, 27, 19, 16, 5...","[13, 9, 20, 18, 1, 7, 25, 14, 1, 27, 19, 16, 5...",1,0


In [42]:
one2many = complete_dataset[complete_dataset["Distance"] == 0].groupby(["x"])['y'].count()
one2many[one2many > 1].size

2628

In [43]:
one2many[one2many > 1].describe()

count    2628.000000
mean        2.212709
std         1.679281
min         2.000000
25%         2.000000
50%         2.000000
75%         2.000000
max        44.000000
Name: y, dtype: float64

In [44]:
one2many[one2many > 1].unique()

array([ 2,  4,  8,  3, 20, 12,  6,  7,  9, 23, 16,  5, 10, 13, 11, 15, 44,
       21, 39])

In [45]:
one2many[one2many == 44]

x
LICORICE    44
Name: y, dtype: int64

In [46]:
complete_dataset[complete_dataset["Distance"] == 0][complete_dataset["x"] == "LICORICE"]

  complete_dataset[complete_dataset["Distance"] == 0][complete_dataset["x"] == "LICORICE"]


Unnamed: 0,x,y,Processed_x,Processed_y,Match,Distance
4544,LICORICE,GLYCYRRHIZA INFLATA,"[12, 9, 3, 15, 18, 9, 3, 5, 0, 0, 0, 0, 0, 0, ...","[7, 12, 25, 3, 25, 18, 18, 8, 9, 26, 1, 27, 9,...",1,0
4740,LICORICE,GLYCYRRHIZA GLABRA,"[12, 9, 3, 15, 18, 9, 3, 5, 0, 0, 0, 0, 0, 0, ...","[7, 12, 25, 3, 25, 18, 18, 8, 9, 26, 1, 27, 7,...",1,0
4753,LICORICE,GLYCYRRHIZA URALENSIS,"[12, 9, 3, 15, 18, 9, 3, 5, 0, 0, 0, 0, 0, 0, ...","[7, 12, 25, 3, 25, 18, 18, 8, 9, 26, 1, 27, 21...",1,0
5206,LICORICE,LICORICE,"[12, 9, 3, 15, 18, 9, 3, 5, 0, 0, 0, 0, 0, 0, ...","[12, 9, 3, 15, 18, 9, 3, 5, 0, 0, 0, 0, 0, 0, ...",1,0
5551,LICORICE,GLYCYRRHIZA OFFICINALIS WHOLE,"[12, 9, 3, 15, 18, 9, 3, 5, 0, 0, 0, 0, 0, 0, ...","[7, 12, 25, 3, 25, 18, 18, 8, 9, 26, 1, 27, 15...",1,0
5552,LICORICE,GLYCYRRHIZA BRACHYCARPA WHOLE,"[12, 9, 3, 15, 18, 9, 3, 5, 0, 0, 0, 0, 0, 0, ...","[7, 12, 25, 3, 25, 18, 18, 8, 9, 26, 1, 27, 2,...",1,0
5553,LICORICE,SPANISH LIQUORICE WHOLE,"[12, 9, 3, 15, 18, 9, 3, 5, 0, 0, 0, 0, 0, 0, ...","[19, 16, 1, 14, 9, 19, 8, 27, 12, 9, 17, 21, 1...",1,0
5554,LICORICE,GLYCYRRHIZA URALENSIS FISCH EX DC,"[12, 9, 3, 15, 18, 9, 3, 5, 0, 0, 0, 0, 0, 0, ...","[7, 12, 25, 3, 25, 18, 18, 8, 9, 26, 1, 27, 21...",1,0
5555,LICORICE,GLYCYRRHIZA VULGARIS WHOLE,"[12, 9, 3, 15, 18, 9, 3, 5, 0, 0, 0, 0, 0, 0, ...","[7, 12, 25, 3, 25, 18, 18, 8, 9, 26, 1, 27, 22...",1,0
5556,LICORICE,MERISTOTROPIS PAUCIFLORA WHOLE,"[12, 9, 3, 15, 18, 9, 3, 5, 0, 0, 0, 0, 0, 0, ...","[13, 5, 18, 9, 19, 20, 15, 20, 18, 15, 16, 9, ...",1,0
