In [2]:
# import numpy as np
# import pickle
import yaml
import random
import string
import pandas as pd
import re
import tensorflow as tf
from typing import Literal  
# tf.enable_eager_execution()
# tf.executing_eagerly()

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

In [3]:
# Load the model configuration from a YAML file located in the current working
# directory. The cells visible here only read its 'MAXLEN' entry.
with open('GRU65.yaml', 'r') as file:
    model_config = yaml.safe_load(file)

In [4]:
# Maximum sequence length, including padding. Used as the default `maxlen` of
# the preprocessing helpers defined further down.
# A `global` statement has no effect at module/cell level, so none is used:
# a plain assignment already creates a notebook-wide name.
MAXLEN = model_config['MAXLEN']

In [5]:
# Character -> integer code table: 'A'..'Z' map to 1..26 and ' ' maps to 27.
# Numbering starts at 1 so that 0 is never a character code (0 is what the
# padding step appends to short sequences).
# A `global` statement has no effect at module/cell level, so none is used.
encode_dict = {char: code for code, char in enumerate(string.ascii_uppercase + " ", 1)}

# Functions

In [6]:
# TEST_TXT = "Eirmod horrida ingénii pariant secundum? Cognitionem compositis conséquat dicantur exercitus, intellegitur invenire negat oportet sapientium suam. Ceteris diu erat fecerit, impéndéré intelleges máerores malorum mei re reprehendunt? Constringendos intus mentitum quale urna! Convenire cotidie dixit malé vigiliae?"

In [7]:
def clean(text: str) -> str:
    '''Reduce a string to upper-case ASCII letters separated by single spaces.

    Every character that is not an ASCII letter (a-z, A-Z) or a space is deleted
    outright. It is *not* replaced by a space, so words separated only by
    punctuation, digits, tabs or accented letters are fused together.
    Runs of spaces are then collapsed to one space, leading and trailing spaces
    are removed, and the result is upper-cased.

    Parameters
    ----------
    text : str
        The raw string to clean.

    Returns
    -------
    str
        The cleaned, upper-cased string (possibly empty).
    '''
    # Step 1: keep only ASCII letters and spaces.
    letters_and_spaces = re.sub('[^a-zA-Z ]', '', text)
    # Step 2: squeeze consecutive spaces left behind by the removals.
    single_spaced = re.sub(' +', ' ', letters_and_spaces)
    # Step 3: trim the ends and normalise the case.
    return single_spaced.strip().upper()

In [8]:
# clean(TEST_TXT)

In [9]:
def add_noise(w: str, percent: float = 0.1, maxlen: int = None) -> str:
    '''Apply random character-level noise to a string.

    ``int(percent * len(w))`` noise operations are applied one after another.
    For each operation a position is drawn uniformly from the *current* string
    (so it is always a valid index, even after earlier insertions/deletions)
    and a number ``r`` is drawn uniformly from [0, 1] to choose the operation:

    * while the string is shorter than ``maxlen``:
      ``r <= 0.3333`` replaces the character at the position,
      ``r <= 0.6667`` deletes it, otherwise a random character is inserted
      in front of it;
    * once the string has reached ``maxlen``: ``r <= 0.5`` replaces the
      character, otherwise it is deleted (no insertion, so the string cannot
      grow past ``maxlen``).

    Replacement and inserted characters are drawn from ``A-Z`` plus the space.
    If every character has been deleted, the remaining operations are skipped.

    The function uses the global ``random`` module state; seed it with
    ``random.seed(...)`` for reproducible output.

    Parameters
    ----------
    w : str
        The string to add noise to.

    percent : float, defaults to 0.1
        Fraction (not a percentage: 0.1 means 10%) of the string length that
        determines how many noise operations are applied.

    maxlen : int, optional
        Length at which insertions are no longer allowed. When omitted, the
        notebook-level ``MAXLEN`` is used.

    Returns
    -------
    w : str
        Modified string with noise added.
    '''
    alphabet = string.ascii_uppercase + " "
    # Resolve the limit lazily so an explicit `maxlen` works without the global.
    limit = MAXLEN if maxlen is None else maxlen
    n_operations = int(percent * len(w))

    for _ in range(n_operations):
        if not w:
            # Nothing left to edit or delete.
            break
        # Draw the position from the current string: positions sampled up-front
        # from the original length become stale once the length changes, which
        # silently turns an edit into an append and a delete into a no-op.
        p = random.randrange(len(w))
        r = random.uniform(0, 1)
        if len(w) < limit:
            if r <= 0.3333:  # edit
                w = w[:p] + random.choice(alphabet) + w[p + 1:]
            elif r <= 0.6667:  # delete
                w = w[:p] + w[p + 1:]
            else:  # add
                w = w[:p] + random.choice(alphabet) + w[p:]
        elif r <= 0.5:  # edit (string is at the limit: no insertion allowed)
            w = w[:p] + random.choice(alphabet) + w[p + 1:]
        else:  # delete
            w = w[:p] + w[p + 1:]

    return w

In [10]:
def preprocessInput(filename: str, maxlen: int = MAXLEN, **kwargs) -> pd.DataFrame:
    '''Load a CSV of string pairs and return it cleaned and de-duplicated.

    Processing steps, in order:

    1. Read ``filename`` with ``pd.read_csv`` (extra keyword arguments are
       forwarded to it).
    2. Drop rows that contain a missing value in any column.
    3. Drop rows in which any column holds a *raw* string longer than
       ``maxlen`` (the check runs before cleaning).
    4. Apply ``clean`` to every cell. ``clean`` already returns upper-cased
       text, so no separate uppercasing pass is needed.
    5. Drop duplicate rows and reset the index.

    Progress and the frame summaries (``DataFrame.info``) are printed.

    Parameters
    ----------
    filename : str
        Name or path of a CSV file with named columns whose cells are strings.

    maxlen : int, defaults to the notebook-level ``MAXLEN``
        Maximum allowed raw string length; rows exceeding it are dropped.

    **kwargs
        Passed through to ``pd.read_csv`` (e.g. ``sep``, ``usecols``).

    Returns
    -------
    df : Pandas DataFrame
        Unique rows of cleaned, upper-cased strings with a fresh RangeIndex.
    '''
    df = pd.read_csv(filename, **kwargs)
    # DataFrame.info() prints its report itself and returns None, so it must
    # not be wrapped in print() (that emits a stray "None").
    df.info()

    print("Processing file: ----------------------------------------")

    # Missing cells are floats (NaN) and would break both len() and clean().
    original_count = len(df.index)
    df = df.dropna()
    missing_count = original_count - len(df.index)
    if missing_count:
        print("\tDropped", missing_count, "rows with missing values.")

    original_count = len(df.index)
    print("Dropping sequences longer than the maxlen:")
    # Build one row mask across all columns instead of dropping in place per column.
    too_long = pd.Series(False, index=df.index)
    for column in df.columns:
        too_long |= df[column].map(len).gt(maxlen)
    df = df.loc[~too_long]
    new_count = len(df.index)
    print("\tDropped", original_count - new_count, "that exceeded the maximum sequence length.")

    print("\tCleaning string sequences:")
    # Column-wise Series.map replaces DataFrame.applymap, which is deprecated
    # in recent pandas releases.
    df = df.apply(lambda column_values: column_values.map(clean))

    print("Dropping duplicate sequences:")
    original_count = len(df.index)
    df = df.drop_duplicates(ignore_index=True)
    new_count = len(df.index)
    print("\tDropped", original_count - new_count, "duplicate sequences.")

    print("Done processing: ---------------------------------------")
    df.info()
    return df

In [10]:
def encode_pad_tag(df: pd.DataFrame, match: Literal[0,1], maxlen: int = MAXLEN) -> pd.DataFrame:
    '''Add encoded, zero-padded copies of every column plus a ``Match`` tag.

    For each column present when the function is called, a new column named
    ``"Processed_" + column`` is added. Each cell of the new column is the list
    of ``encode_dict`` codes for the characters of the original string,
    right-padded with zeros up to ``maxlen`` items. Finally a ``Match`` column
    holding ``match`` in every row is added.

    The frame is modified **in place** and the same object is returned, so
    calling the function twice on one frame would also process the
    ``Processed_*`` and ``Match`` columns added by the first call.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame whose cells are strings made of the characters known to
        ``encode_dict``.

    match : Literal[0 | 1]
        Tag stored in the ``Match`` column: 1 means the pair matches,
        0 means it does not.

    maxlen : int, defaults to the notebook-level ``MAXLEN``
        Target length of every encoded sequence.

    Returns
    -------
    df : Pandas DataFrame
        The input frame, with the added columns.
    '''
    def to_codes(text):
        # Characters absent from encode_dict are encoded as None.
        return [encode_dict.get(char) for char in text]

    def right_pad(codes):
        # A non-positive repeat count yields [], so sequences already longer
        # than maxlen are returned unchanged (not truncated).
        return codes + [0] * (maxlen - len(codes))

    print("Encoding and Padding: ----------------------------------")
    for column in df.columns:
        print("\tProcessing {}".format(column))
        encoded = df[column].apply(to_codes)
        df["Processed_" + column] = encoded.transform(right_pad)

    print("Tagging: -----------------------------------------------")
    df["Match"] = match
    return df

In [11]:
def generate_noisy_positive_pairs(df: pd.DataFrame, scale: float, noise_percent: float) -> pd.DataFrame:
    '''Create extra positive pairs by adding noise to 'FAERS_drug_match'.

    The output contains ``int(scale)`` noisy copies of every row of ``df``.
    If ``scale`` has a fractional part, it additionally contains a random
    sample of that fraction of the rows (e.g. ``scale=2.5`` gives two full
    noisy copies plus a noisy copy of a random 50% of the rows). In every
    generated row 'FAERS_drug_match' is passed through ``add_noise`` while
    'lookup_value' is kept untouched, so the pair is still a match.

    The result is encoded, padded and tagged with ``Match = 1`` via
    ``encode_pad_tag``. Neither the row sample nor the noise is seeded here.

    Parameters
    ----------
    df : pd.DataFrame
        Positive pairs; must contain the 'FAERS_drug_match' and
        'lookup_value' columns.

    scale : float
        Size of the generated set relative to ``df`` (see above).

    noise_percent : float
        Noise fraction forwarded to ``add_noise`` as its ``percent`` argument.

    Returns
    -------
    noisy : pd.DataFrame
        The generated pairs with the columns added by ``encode_pad_tag``.
    '''
    whole_copies = int(scale)
    remainder = scale - whole_copies

    batches = [df] * whole_copies
    if remainder > 0:
        # Sample the fractional share of rows; a bare df.sample() would return
        # exactly one row regardless of the remainder.
        batches.append(df.sample(frac=remainder))

    faers_match = []
    lookup = []
    for batch in batches:
        faers_match.extend(add_noise(name, noise_percent) for name in batch['FAERS_drug_match'])
        lookup.extend(batch['lookup_value'].to_list())

    noisy = pd.DataFrame(
        {'FAERS_drug_match': faers_match, 'lookup_value': lookup},
        columns=['FAERS_drug_match', 'lookup_value'],
    )
    return encode_pad_tag(noisy, 1)


In [12]:
def generate_synthethic_negative_pairs(df: pd.DataFrame, scale: int = 4, labels: pd.Series = None) -> pd.DataFrame:
    '''Create negative pairs by pairing each 'FAERS_drug_match' with wrong labels.

    For every row of ``df`` (duplicated names are processed once per row), up
    to ``scale`` candidate labels different from the row's 'FAERS_drug_match'
    string are sampled at random. Candidates that are a true 'lookup_value'
    of that name anywhere in ``df`` are then discarded, so a row can
    contribute fewer than ``scale`` negatives. The surviving
    (name, label) pairs are encoded, padded and tagged with ``Match = 0`` via
    ``encode_pad_tag``. The sampling is not seeded here.

    Parameters
    ----------
    df : pd.DataFrame
        Positive pairs; must contain the 'FAERS_drug_match' and
        'lookup_value' columns.

    scale : int, defaults to 4
        Number of candidate labels sampled per row (capped at the number of
        available candidates).

    labels : pd.Series, optional
        Pool of candidate labels. When omitted, the notebook-level
        ``dUnique_df['dUnique_label']`` is used. That frame is created in a
        later cell, so it must exist before this function is called.

    Returns
    -------
    synthethic : pd.DataFrame
        The generated pairs with the columns added by ``encode_pad_tag``.
    '''
    candidates = dUnique_df['dUnique_label'] if labels is None else pd.Series(labels)

    # Collect the true labels of every name once, instead of scanning the whole
    # frame with a boolean mask for every row (quadratic in the number of rows).
    true_labels = {}
    for name, label in zip(df['FAERS_drug_match'], df['lookup_value']):
        true_labels.setdefault(name, set()).add(label)

    faers_match = []
    lookup = []
    for np_name in df['FAERS_drug_match']:
        pool = candidates[candidates != np_name]
        # Cap the sample size: Series.sample raises when asked for more items
        # than the pool holds (without replacement).
        sampled = pool.sample(min(scale, len(pool)))
        negatives = sampled[~sampled.isin(true_labels.get(np_name, set()))]
        faers_match.extend([np_name] * len(negatives))
        lookup.extend(negatives)

    synthethic = pd.DataFrame(
        {'FAERS_drug_match': faers_match, 'lookup_value': lookup},
        columns=['FAERS_drug_match', 'lookup_value'],
    )
    return encode_pad_tag(synthethic, 0)


------------------------------------------------

# Data loading and preprocessing

In [13]:
# Input CSV files (relative paths).
# NOTE(review): `positve_pairs` is misspelled ("positve"); the name is kept because a later cell reads it.
positve_pairs = '../data/NP_FAERS_mapped_20220215.csv'
negative_pairs = '../data/NP_FAERS_negative_pairs_20220222.csv'

# fName_unmapped = '../unmapped_data/upper_unmap_orig_drug_names_202201201812.csv'
# NOTE(review): `unmapped` is not read by any cell visible here, and "orig-_drug" in the
# file name looks like a possible typo; confirm against the data directory.
unmapped = '../data/upper_unmap_orig-_drug_names_no_model_overlap_20220224.csv'

# Process Data

## Add Mapped-positive pairs
Data from NP_FAERS_mapped_20220215.csv -- the manually created reference set for ~70 drugs

In [14]:
# Load, clean and de-duplicate the positive pairs, then encode/pad them and tag them Match = 1.
positive_set = encode_pad_tag(preprocessInput(positve_pairs), 1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5358 entries, 0 to 5357
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   FAERS_drug_match  5358 non-null   object
 1   lookup_value      5358 non-null   object
dtypes: object(2)
memory usage: 83.8+ KB
None
Processing file: ----------------------------------------
Dropping sequences longer than the maxlen:
	Dropped 374 that exceeded the maximum sequence length.
	Uppercasing string sequences:
	Cleaning string sequences:
Dropping duplicate sequences:
Done processing: ---------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4502 entries, 0 to 4501
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   FAERS_drug_match  4502 non-null   object
 1   lookup_value      4502 non-null   object
dtypes: object(2)
memory usage: 70.5+ KB
None
Encoding and Padding: ---

In [15]:
positive_set.head()

Unnamed: 0,FAERS_drug_match,lookup_value,Processed_FAERS_drug_match,Processed_lookup_value,Match
0,ASHWAGANDHA,ASHWAGANDA,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 0, 0, 0, ...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ...",1
1,ASHWAGANDHA EXTRACT,ASHWAGANDA,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 27, 5, 24...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ...",1
2,ASHWAGANDHA ROOT EXTRACT,ASHWAGANDA,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 27, 18, 1...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ...",1
3,ASHWAGANDHA WITHANIA SOMNIFERA,ASHWAGANDA,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 27, 23, 9...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ...",1
4,ASHWAGANDHA WITHANIA SOMNIFERA ROOT,ASHWAGANDA,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 27, 23, 9...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ...",1


## Creating Pandas DF for simplified view of the dataset 

In [16]:
# One row per distinct 'lookup_value' of the positive set, together with its
# encoded, zero-padded sequence (same encoding and padding as encode_pad_tag).
dUnique_df = pd.DataFrame(columns=['dUnique_label', 'dUnique_seq_padded'])
dUnique_df['dUnique_label'] = positive_set['lookup_value'].unique()
dUnique_df['dUnique_seq_padded'] = dUnique_df['dUnique_label'].transform(
    lambda label: [encode_dict.get(char) for char in label]
).transform(
    lambda codes: codes + [0] * (MAXLEN - len(codes))
)
dUnique_df.head()

Unnamed: 0,dUnique_label,dUnique_seq_padded
0,ASHWAGANDA,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ..."
1,BUTCHERSBROOM,"[2, 21, 20, 3, 8, 5, 18, 19, 2, 18, 15, 15, 13..."
2,CATSCLAW,"[3, 1, 20, 19, 3, 12, 1, 23, 0, 0, 0, 0, 0, 0,..."
3,CINNAMON,"[3, 9, 14, 14, 1, 13, 15, 14, 0, 0, 0, 0, 0, 0..."
4,FENUGREEK,"[6, 5, 14, 21, 7, 18, 5, 5, 11, 0, 0, 0, 0, 0,..."


## Add negative pairs from unmapped
Data from NP_FAERS_negative_pairs_20220222.csv -- the negative pairs created by random sampling from the NP_FAERS_mapped_20220215.csv

In [17]:
# Same pipeline as for the positive pairs, but tagged Match = 0.
negative_set = encode_pad_tag(preprocessInput(negative_pairs), 0)
negative_set.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9242 entries, 0 to 9241
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   FAERS_drug_match  9242 non-null   object
 1   lookup_value      9242 non-null   object
dtypes: object(2)
memory usage: 144.5+ KB
None
Processing file: ----------------------------------------
Dropping sequences longer than the maxlen:
	Dropped 1372 that exceeded the maximum sequence length.
	Uppercasing string sequences:
	Cleaning string sequences:
Dropping duplicate sequences:
Done processing: ---------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7372 entries, 0 to 7371
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   FAERS_drug_match  7372 non-null   object
 1   lookup_value      7372 non-null   object
dtypes: object(2)
memory usage: 115.3+ KB
None
Encoding and Padding: 

Unnamed: 0,FAERS_drug_match,lookup_value,Processed_FAERS_drug_match,Processed_lookup_value,Match
0,ANUSOLHC BENZYL BENZOATEBISMUTH HYDROXIDE,CINNAMON,"[1, 14, 21, 19, 15, 12, 8, 3, 27, 2, 5, 14, 26...","[3, 9, 14, 14, 1, 13, 15, 14, 0, 0, 0, 0, 0, 0...",0
1,ASCABIOL BENZYL BENZOATE,CINNAMON,"[1, 19, 3, 1, 2, 9, 15, 12, 27, 2, 5, 14, 26, ...","[3, 9, 14, 14, 1, 13, 15, 14, 0, 0, 0, 0, 0, 0...",0
2,CASSIA,CINNAMON,"[3, 1, 19, 19, 9, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[3, 9, 14, 14, 1, 13, 15, 14, 0, 0, 0, 0, 0, 0...",0
3,CASSIA ACUTIFOLIA,CINNAMON,"[3, 1, 19, 19, 9, 1, 27, 1, 3, 21, 20, 9, 6, 1...","[3, 9, 14, 14, 1, 13, 15, 14, 0, 0, 0, 0, 0, 0...",0
4,CASSIA ALATA,CINNAMON,"[3, 1, 19, 19, 9, 1, 27, 1, 12, 1, 20, 1, 0, 0...","[3, 9, 14, 14, 1, 13, 15, 14, 0, 0, 0, 0, 0, 0...",0


## Generate additional synthetic negative pairs

## Add additional negative pairs 
Generated from the positive data using generate_synthethic_negative_pairs

In [18]:
# Sample 3 candidate labels per positive row; a row yields fewer negatives when a
# sampled label is a true match for that name. Requires `dUnique_df` to exist already.
synth_negatives = generate_synthethic_negative_pairs(positive_set, 3)
synth_negatives.head()

Encoding and Padding: ----------------------------------
	Processing FAERS_drug_match
	Processing lookup_value
Tagging: -----------------------------------------------


Unnamed: 0,FAERS_drug_match,lookup_value,Processed_FAERS_drug_match,Processed_lookup_value,Match
0,ASHWAGANDHA,HORSETAIL,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 0, 0, 0, ...","[8, 15, 18, 19, 5, 20, 1, 9, 12, 0, 0, 0, 0, 0...",0
1,ASHWAGANDHA,ST JOHNSWORT,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 0, 0, 0, ...","[19, 20, 27, 10, 15, 8, 14, 19, 23, 15, 18, 20...",0
2,ASHWAGANDHA,SWALLOWWORT,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 0, 0, 0, ...","[19, 23, 1, 12, 12, 15, 23, 23, 15, 18, 20, 0,...",0
3,ASHWAGANDHA EXTRACT,CRATAEGUS LAEVIGATA,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 27, 5, 24...","[3, 18, 1, 20, 1, 5, 7, 21, 19, 27, 12, 1, 5, ...",0
4,ASHWAGANDHA EXTRACT,AESCULUS HIPPOCASTANUM,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 27, 5, 24...","[1, 5, 19, 3, 21, 12, 21, 19, 27, 8, 9, 16, 16...",0


## Generate additional noisy positive pairs

In [19]:
# scale=4.0 -> four noisy copies of every positive row; 0.15 is the noise fraction passed to add_noise.
noisy_positives = generate_noisy_positive_pairs(positive_set, 4.0, 0.15)
noisy_positives.head()

Encoding and Padding: ----------------------------------
	Processing FAERS_drug_match
	Processing lookup_value
Tagging: -----------------------------------------------


Unnamed: 0,FAERS_drug_match,lookup_value,Processed_FAERS_drug_match,Processed_lookup_value,Match
0,ASHWAGADHA,ASHWAGANDA,"[1, 19, 8, 23, 1, 7, 1, 4, 8, 1, 0, 0, 0, 0, 0...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ...",1
1,ASHWAGLANDHA EXTRALT,ASHWAGANDA,"[1, 19, 8, 23, 1, 7, 12, 1, 14, 4, 8, 1, 27, 5...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ...",1
2,ASHWAGANDHA RONYTAEXTRACT,ASHWAGANDA,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 27, 18, 1...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ...",1
3,ASHAVANDHA WITHANIAA SOMNIFERD,ASHWAGANDA,"[1, 19, 8, 1, 22, 1, 14, 4, 8, 1, 27, 23, 9, 2...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ...",1
4,ASHWAGANDH WITHANIA SOGMNIFERA ROT,ASHWAGANDA,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 27, 23, 9, 2...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ...",1


# Merge all dataframes

In [20]:
positive_set.head()

Unnamed: 0,FAERS_drug_match,lookup_value,Processed_FAERS_drug_match,Processed_lookup_value,Match
0,ASHWAGANDHA,ASHWAGANDA,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 0, 0, 0, ...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ...",1
1,ASHWAGANDHA EXTRACT,ASHWAGANDA,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 27, 5, 24...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ...",1
2,ASHWAGANDHA ROOT EXTRACT,ASHWAGANDA,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 27, 18, 1...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ...",1
3,ASHWAGANDHA WITHANIA SOMNIFERA,ASHWAGANDA,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 27, 23, 9...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ...",1
4,ASHWAGANDHA WITHANIA SOMNIFERA ROOT,ASHWAGANDA,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 27, 23, 9...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ...",1


In [21]:
negative_set.head()

Unnamed: 0,FAERS_drug_match,lookup_value,Processed_FAERS_drug_match,Processed_lookup_value,Match
0,ANUSOLHC BENZYL BENZOATEBISMUTH HYDROXIDE,CINNAMON,"[1, 14, 21, 19, 15, 12, 8, 3, 27, 2, 5, 14, 26...","[3, 9, 14, 14, 1, 13, 15, 14, 0, 0, 0, 0, 0, 0...",0
1,ASCABIOL BENZYL BENZOATE,CINNAMON,"[1, 19, 3, 1, 2, 9, 15, 12, 27, 2, 5, 14, 26, ...","[3, 9, 14, 14, 1, 13, 15, 14, 0, 0, 0, 0, 0, 0...",0
2,CASSIA,CINNAMON,"[3, 1, 19, 19, 9, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[3, 9, 14, 14, 1, 13, 15, 14, 0, 0, 0, 0, 0, 0...",0
3,CASSIA ACUTIFOLIA,CINNAMON,"[3, 1, 19, 19, 9, 1, 27, 1, 3, 21, 20, 9, 6, 1...","[3, 9, 14, 14, 1, 13, 15, 14, 0, 0, 0, 0, 0, 0...",0
4,CASSIA ALATA,CINNAMON,"[3, 1, 19, 19, 9, 1, 27, 1, 12, 1, 20, 1, 0, 0...","[3, 9, 14, 14, 1, 13, 15, 14, 0, 0, 0, 0, 0, 0...",0


In [22]:
synth_negatives.head()

Unnamed: 0,FAERS_drug_match,lookup_value,Processed_FAERS_drug_match,Processed_lookup_value,Match
0,ASHWAGANDHA,HORSETAIL,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 0, 0, 0, ...","[8, 15, 18, 19, 5, 20, 1, 9, 12, 0, 0, 0, 0, 0...",0
1,ASHWAGANDHA,ST JOHNSWORT,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 0, 0, 0, ...","[19, 20, 27, 10, 15, 8, 14, 19, 23, 15, 18, 20...",0
2,ASHWAGANDHA,SWALLOWWORT,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 0, 0, 0, ...","[19, 23, 1, 12, 12, 15, 23, 23, 15, 18, 20, 0,...",0
3,ASHWAGANDHA EXTRACT,CRATAEGUS LAEVIGATA,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 27, 5, 24...","[3, 18, 1, 20, 1, 5, 7, 21, 19, 27, 12, 1, 5, ...",0
4,ASHWAGANDHA EXTRACT,AESCULUS HIPPOCASTANUM,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 27, 5, 24...","[1, 5, 19, 3, 21, 12, 21, 19, 27, 8, 9, 16, 16...",0


In [23]:
noisy_positives.head()

Unnamed: 0,FAERS_drug_match,lookup_value,Processed_FAERS_drug_match,Processed_lookup_value,Match
0,ASHWAGADHA,ASHWAGANDA,"[1, 19, 8, 23, 1, 7, 1, 4, 8, 1, 0, 0, 0, 0, 0...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ...",1
1,ASHWAGLANDHA EXTRALT,ASHWAGANDA,"[1, 19, 8, 23, 1, 7, 12, 1, 14, 4, 8, 1, 27, 5...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ...",1
2,ASHWAGANDHA RONYTAEXTRACT,ASHWAGANDA,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 1, 27, 18, 1...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ...",1
3,ASHAVANDHA WITHANIAA SOMNIFERD,ASHWAGANDA,"[1, 19, 8, 1, 22, 1, 14, 4, 8, 1, 27, 23, 9, 2...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ...",1
4,ASHWAGANDH WITHANIA SOGMNIFERA ROT,ASHWAGANDA,"[1, 19, 8, 23, 1, 7, 1, 14, 4, 8, 27, 23, 9, 2...","[1, 19, 8, 23, 1, 7, 1, 14, 4, 1, 0, 0, 0, 0, ...",1


In [24]:
# Stack the four sources into one frame; ignore_index=True renumbers the rows 0..n-1.
complete_dataset = pd.concat([positive_set, negative_set, synth_negatives, noisy_positives], ignore_index=True)

In [25]:
complete_dataset["Match"].value_counts()

1    22510
0    20642
Name: Match, dtype: int64

## Create the train/test split

In [26]:
# Hold out 20% of the rows (`split`); the remaining 80% becomes `train`.
# NOTE(review): the split is row-wise and random, so a pair and its noisy copies can land
# on different sides of the split. Confirm this is acceptable for evaluation.
train, split = train_test_split(complete_dataset, test_size=0.20, random_state = 42)

In [27]:
# Divide the 20% hold-out 90/10: `test` gets 90% of it, `validate` gets 10%.
test, validate = train_test_split(split, test_size=0.10, random_state = 42)

In [28]:
print("Train:", train.shape, "Test:" , test.shape, "Validate:", validate.shape)

Train: (34521, 5) Test: (7767, 5) Validate: (864, 5)


In [29]:
# Split `train` again (80/20) into aligned training/validation parts for the two encoded
# inputs (x1 = Processed_FAERS_drug_match, x2 = Processed_lookup_value) and the label (y = Match).
x1TrainRnnS, x1ValRnnS, x2TrainRnnS, x2ValRnnS, yTrainRnnS, yValRnnS = train_test_split(train['Processed_FAERS_drug_match'], train['Processed_lookup_value'], train['Match'], test_size=0.20, random_state=42)

In [30]:
# Sanity check of the split sizes, one line per array family (x1, x2, y).
# NOTE(review): despite the "Tower 1"/"Tower 2" labels, the first number on each line is
# the training-part size and the second is the validation-part size.
print("Tower 1:", len(x1TrainRnnS), "Tower 2:", len(x1ValRnnS))
print("Tower 1:", len(x2TrainRnnS), "Tower 2:", len(x2ValRnnS))
print("Tower 1:", len(yTrainRnnS),  "Tower 2:", len(yValRnnS))

Tower 1: 27616 Tower 2: 6905
Tower 1: 27616 Tower 2: 6905
Tower 1: 27616 Tower 2: 6905


# Save data to pickle files

In [31]:
# Persist the encoded training part as (x1, x2, y) columns.
pd.DataFrame(data={"x1": x1TrainRnnS, "x2": x2TrainRnnS, "y": yTrainRnnS}).to_pickle("../data/proccesed_train_set.pkl")
# NOTE(review): this file is named "test_set" but holds the validation part carved out of
# `train` (x1ValRnnS / x2ValRnnS / yValRnnS), not the `test` frame.
pd.DataFrame(data={"x1": x1ValRnnS, "x2": x2ValRnnS, "y": yValRnnS}).to_pickle("../data/proccesed_test_set.pkl")

In [32]:
# Persist the label table, the loaded negative pairs and the three dataset splits as pickles.
dUnique_df.to_pickle("../data/dUnique_df.pkl")
negative_set.to_pickle("../data/dfneg2.pkl")
train.to_pickle("../data/train.pkl")
test.to_pickle("../data/test.pkl")
validate.to_pickle("../data/validate.pkl")

----------------------------------

In [33]:
pd.read_csv('../data/positive-unmapped-pairs-common-name-or-latin-binomial-copies.tsv', sep="\t").head()

Unnamed: 0,common_name,latin_binomial
0,PRUNUS SPINOSA,PRUNUS SPINOSA
1,PHYTOLACCA AMERICANA,PHYTOLACCA AMERICANA
2,SMILAX PSEUDOCHINA,SMILAX PSEUDOCHINA
3,HARUNGANA MADAGASCARIENSIS,HARUNGANA MADAGASCARIENSIS
4,QUERCUS LOBATA,QUERCUS LOBATA


In [34]:
pd.read_csv('../data/negative-unmapped-pairs-all.tsv', sep="\t").head()

Unnamed: 0,p1,upper
0,BURSA ORIENTALIS WHOLE,TILIA AMERICANA
1,KERNERIA LEUCANTHEMA WHOLE,SCUTELLARIA LATERIFLORA
2,EPIMEDIUM GRANDIFLORUM,URTICA DIOICA
3,BAUHINIA FURCATA WHOLE,CYPERUS ROTUNDUS
4,ERYTHRAEA SHUTTLEWORTHIANA WHOLE,ALOYSIA TRIPHYLLA
