In [74]:
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, clear_output
import re

import skopt
from skopt import BayesSearchCV
from skopt import gp_minimize, forest_minimize
from skopt.space import Real, Categorical, Integer
from skopt.plots import plot_convergence
from skopt.plots import plot_objective, plot_evaluations
from skopt.plots import plot_histogram, plot_objective_2D
from skopt.utils import use_named_args
from tqdm import tqdm
#!pip install scikit-optimize

In [5]:
# If starting from the preprocessed data, load the mapped drug pairs directly.
# The frame has two text columns: FAERS_drug_match and lookup_value.
df = pd.read_csv('data/NP_FAERS_mapped_20220215.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5358 entries, 0 to 5357
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   FAERS_drug_match  5358 non-null   object
 1   lookup_value      5358 non-null   object
dtypes: object(2)
memory usage: 83.8+ KB


In [6]:
# Upper-case both text columns (overwrites the columns of `df` in place).
df['FAERS_drug_match'] = df['FAERS_drug_match'].str.upper()
df['lookup_value'] = df['lookup_value'].str.upper()

In [7]:
import tensorflow as tf
import string
import random
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

In [9]:
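# How noise is added by add_noise(w, percent):
#   w       - the input string
#   percent - proportion of noise; int(percent * len(w)) operations are applied
# For each operation, r is drawn uniformly from [0, 1):
# if r <= 1/3 replace one position with a random uppercase letter, else if r <= 2/3 delete one position, else insert one random uppercase letter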
def add_noise(w, percent):
  '''Return `w` corrupted by random character-level edit / delete / add operations.

  Arguments:
      w: the string to corrupt.
      percent: proportion of noise; int(percent * len(w)) operations are
          applied, counted against the length of the ORIGINAL string.

  Returns:
      The corrupted string (`w` itself when no operation is applied).

  Each operation draws r uniformly from [0, 1) and a position p inside the
  CURRENT string:
      r <= 0.3333 -> replace the character at p with a random uppercase letter
      r <= 0.6667 -> delete the character at p
      otherwise   -> insert a random uppercase letter before p
  '''
  num_ops = int(percent * len(w))
  for _ in range(num_ops):
    r = random.random()
    if not w:
      # Nothing left to edit or delete; only an insertion can change the string.
      if r > 0.6667:
        w = random.choice(string.ascii_uppercase)
      continue
    # Sample the position against the current length: positions chosen up
    # front go stale once an earlier operation has shortened the string.
    p = random.randrange(len(w))
    if r <= 0.3333: # edit
      w = w[:p] + random.choice(string.ascii_uppercase) + w[p+1:]
    elif r <= 0.6667: # delete
      w = w[:p] + w[p+1:]
    else: # add
      w = w[:p] + random.choice(string.ascii_uppercase) + w[p:]
  return w

def clean(text):
    """Normalise a drug string to uppercase ASCII letters and single spaces.

    Every character other than a-z, A-Z and the space is dropped, runs of
    spaces are collapsed to one, leading/trailing spaces are trimmed and the
    result is upper-cased.
    """
    letters_and_spaces = re.sub('[^a-zA-Z ]', '', text)
    single_spaced = re.sub(' +', ' ', letters_and_spaces)
    return single_spaced.strip().upper()
def clean_dataset(data):
  '''Clean both text columns of a pairs frame.

  Applies clean() to every value of `data.FAERS_drug_match` and
  `data.lookup_value`, in row order.

  Returns:
      (x, y): two lists of cleaned strings, one entry per row of `data`.
  '''
  x = [clean(raw) for raw in data.FAERS_drug_match]
  y = [clean(raw) for raw in data.lookup_value]
  return x,y

def encode_dataset(x,y):
  '''Encode two lists of cleaned strings as lists of integer codes.

  'A'..'Z' map to 1..26 and the space maps to 27. Any other character
  raises KeyError, so inputs are expected to have gone through clean().

  Returns:
      (x_codes, y_codes): one list of ints per input string.
  '''
  alphabet = string.ascii_uppercase + " "
  code_of = dict(zip(alphabet, range(1, len(alphabet) + 1)))

  def to_codes(words):
    return [[code_of[ch] for ch in word] for word in words]

  return to_codes(x), to_codes(y)

def clean_encode_padding(q, maxlen):
  '''Clean, integer-encode and pad a single query string.

  Arguments:
      q: raw query string; it is normalised with clean() first.
      maxlen: target length passed to pad_sequences.

  Returns:
      The array returned by pad_sequences for a batch holding this one
      sequence (one row of `maxlen` codes, zero-padded at the end).
  '''
  q = clean(q)
  encode_dict = {l:i+1 for i,l in enumerate(string.ascii_uppercase + " ")}
  # pad_sequences expects a list of sequences, so the single encoded string is
  # wrapped in an outer list; a flat list of ints is rejected.
  return tf.keras.preprocessing.sequence.pad_sequences(
    [[encode_dict[m] for m in q]], padding="post", maxlen=maxlen)

def padding_dataset(X,Y,maxlen):
  '''Pad two lists of integer sequences to a common length.

  Both X and Y go through pad_sequences with padding="post" and the given
  `maxlen`.

  Returns:
      (padded_x, padded_y): the padded arrays for X and Y respectively.
  '''
  pad = tf.keras.preprocessing.sequence.pad_sequences
  padded_x, padded_y = (
      pad(sequences, padding="post", maxlen=maxlen) for sequences in (X, Y)
  )
  return padded_x, padded_y

def cosine_distance(vects):
    """Cosine distance (1 - cosine similarity) between two batches of vectors.

    Arguments:
        vects: a pair (x, y) of tensors; the reduction runs over axis=1
            (assumed to be (batch, features) - TODO confirm against callers).

    Returns:
        Tensor with axis 1 reduced to size 1, holding 1 - cos(x_i, y_i) per row.
    """
    x, y = vects
    # l2_normalize clamps the squared norm with a small epsilon, so an all-zero
    # vector yields a distance of 1 instead of the NaN a plain division gives.
    x_unit = tf.math.l2_normalize(x, axis=1)
    y_unit = tf.math.l2_normalize(y, axis=1)
    return 1 - tf.reduce_sum(tf.multiply(x_unit, y_unit), axis=1, keepdims=True)

def loss(margin=1):
    """Return a contrastive loss function closed over `margin`.

    The returned function averages, over all elements,
        (1 - y_true) * y_pred**2 + y_true * max(margin - y_pred, 0)**2
    so y_true == 0 penalises large predictions and y_true == 1 penalises
    predictions that fall below `margin`.
    """
    def contrastive_loss(y_true, y_pred):
        zero_label_term = (1 - y_true) * tf.math.square(y_pred)
        shortfall = tf.math.maximum(margin - (y_pred), 0)
        one_label_term = (y_true) * tf.math.square(shortfall)
        return tf.math.reduce_mean(zero_label_term + one_label_term)

    return contrastive_loss

def plt_metric(history, metric, title, has_valid=True):
    """Draw the training curve of one metric, optionally with its validation curve.

    Arguments:
        history: mapping of metric name to per-epoch values (the `history`
            attribute of the History object returned by Model.fit).
        metric: key of the metric to plot; "val_" + metric is read as well
            when `has_valid` is true.
        title: title of the plot.
        has_valid: whether validation data was passed to Model.fit.

    Returns:
        None. The figure is shown with plt.show().
    """
    curve_keys = [metric]
    if has_valid:
        curve_keys.append("val_" + metric)
    for key in curve_keys:
        plt.plot(history[key])
    if has_valid:
        plt.legend(["train", "validation"], loc="upper left")
    plt.title(title)
    plt.ylabel(metric)
    plt.xlabel("epoch")
    plt.show()


In [97]:
# Hold out 20% of the rows as the test set; random_state fixes the split.
train, test = train_test_split(df, test_size=0.20, random_state=42)

In [98]:
# (rows, columns) of the training split
train.shape

(4286, 2)

In [99]:
# (rows, columns) of the test split
test.shape

(1072, 2)

In [105]:
# Padding length: passed as `maxlen` to pad_sequences for every encoded
# sequence, and used as the model input length in build_model.
maxlen = 400

In [106]:
# Test split: clean the text, encode characters as integers, pad to maxlen.
xtest, ytest = clean_dataset(test)
Xtest, Ytest = encode_dataset(xtest,ytest)
padded_xTest, padded_yTest = padding_dataset(Xtest,Ytest,maxlen)

In [107]:
# expected: (number of test rows, maxlen)
padded_xTest.shape

(1072, 400)

In [108]:
# Training split without noise: x holds the cleaned FAERS strings and
# y the cleaned lookup values (add_noise is not applied here).
x, y = clean_dataset(train)

In [109]:
# number of cleaned training targets (one per training row)
len(y)

4286

In [110]:
# Character -> integer code: 'A'..'Z' map to 1..26 and ' ' to 27.
# Code 0 is never assigned here; it is the value seen as padding in the padded arrays.
encode_dict = {l:i+1 for i,l in enumerate(string.ascii_uppercase + " ")}

In [111]:
# Encode the cleaned training strings with the same helper used for the
# test split and the negative pairs (same character -> code mapping).
Xtrain, Ytrain = encode_dataset(x, y)

In [112]:
print(len(Xtrain))
print(len(Ytrain))
# Count the distinct encoded targets. The sequences have different lengths, so
# np.unique would first have to build a ragged array (deprecated, and an error
# on recent NumPy); hashing tuples avoids that conversion entirely.
len({tuple(seq) for seq in Ytrain})

4286
4286


  return array(a, dtype, copy=False, order=order, subok=True)


(125,)

In [113]:
# Pad the encoded training pairs to maxlen with the shared helper.
padded_x, padded_y = padding_dataset(Xtrain, Ytrain, maxlen)

print(padded_y)

[[19  5 18 ...  0  0  0]
 [15 16  8 ...  0  0  0]
 [ 3  9 14 ...  0  0  0]
 ...
 [ 1 12 12 ...  0  0  0]
 [ 1 12 12 ...  0  0  0]
 [ 8  5 13 ...  0  0  0]]


In [114]:
# 26 letters + the space character
len(encode_dict)

27

In [115]:
# One row per distinct cleaned lookup value seen in the training split.
# The two sequence columns start empty and are filled in by later cells.
dUnique_df = pd.DataFrame(columns = ['dUnique_label','dUnique_seq', 'dUnique_seq_padded'])
dUnique_df['dUnique_label'] = np.unique(y)
dUnique_df.head()

Unnamed: 0,dUnique_label,dUnique_seq,dUnique_seq_padded
0,ACTAEA RACEMOSA,,
1,AESCULUS HIPPOCASTANUM,,
2,ALLIUM SATIVUM,,
3,ALOE VERA,,
4,ANGELICA SINENSIS,,


In [116]:
# Integer-encode every unique label with encode_dict (one list of codes per label).
dUnique_seq_list = [[encode_dict[m] for m in n] for n in dUnique_df['dUnique_label'].tolist()]
len(dUnique_seq_list)

125

In [117]:
# Store each (variable-length) encoded label as an array in its row;
# .at writes one cell at a time.
for i in range(len(dUnique_df.index)):
    dUnique_df.at[i, 'dUnique_seq'] = np.array(dUnique_seq_list[i])
dUnique_df.head()

Unnamed: 0,dUnique_label,dUnique_seq,dUnique_seq_padded
0,ACTAEA RACEMOSA,"[1, 3, 20, 1, 5, 1, 27, 18, 1, 3, 5, 13, 15, 1...",
1,AESCULUS HIPPOCASTANUM,"[1, 5, 19, 3, 21, 12, 21, 19, 27, 8, 9, 16, 16...",
2,ALLIUM SATIVUM,"[1, 12, 12, 9, 21, 13, 27, 19, 1, 20, 9, 22, 2...",
3,ALOE VERA,"[1, 12, 15, 5, 27, 22, 5, 18, 1]",
4,ANGELICA SINENSIS,"[1, 14, 7, 5, 12, 9, 3, 1, 27, 19, 9, 14, 5, 1...",


In [118]:
# Set up negative pairs: load the reference file of non-matching pairs
# (same two columns as `df`: FAERS_drug_match and lookup_value).
dfneg = pd.read_csv('data/NP_FAERS_negative_pairs_20220215.csv')
dfneg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9242 entries, 0 to 9241
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   FAERS_drug_match  9242 non-null   object
 1   lookup_value      9242 non-null   object
dtypes: object(2)
memory usage: 144.5+ KB


In [119]:
# Clean, encode and pad the reference negative pairs (same pipeline as the test split).
xneg, yneg = clean_dataset(dfneg)
Xneg, Yneg = encode_dataset(xneg,yneg)
padded_xneg, padded_yneg = padding_dataset(Xneg,Yneg,maxlen)

In [120]:
# Pad the encoded unique labels to maxlen; row i corresponds to dUnique_df row i.
dUnique = tf.keras.preprocessing.sequence.pad_sequences(
    list(dUnique_df['dUnique_seq']), padding="post", maxlen=maxlen)

In [121]:
# peek at the raw (upper-cased, not yet cleaned) training pairs
train.head()

Unnamed: 0,FAERS_drug_match,lookup_value
3956,SERENOA REPENS/SERENOA REPENS EXTRACT/SERENOA ...,SERENOA REPENS
4408,CORDYCEPS,OPHIOCORDYCEPS SINENSIS
120,CINNAMON (CINNAMOUM VERUM) (CAPSULES),CINNAMON
3311,CHROMIUM PICOLINATE WITH GREEN TEA,CAMELLIA SINENSIS
1921,BARLEY.,BARLEY GRASS


In [122]:
# Plain list of the unique cleaned labels; negative targets are sampled from it below.
np_unique = dUnique_df.dUnique_label.tolist()
len(np_unique)

125

In [123]:
# Positive pairs: every padded training row paired with its own padded
# lookup value, labelled 1.
x1TrainRNN = list(padded_x)
x2TrainRNN = list(padded_y)
yTrainRNN = [1] * len(padded_x)
print(len(x1TrainRNN), len(x2TrainRNN), len(yTrainRNN))

4286 4286 4286


In [124]:
# Re-index the training split 0..n-1 so rows can be addressed by position with .at;
# the original row labels are kept in the new 'index' column.
train_res = train.reset_index()
train_res.head()

Unnamed: 0,index,FAERS_drug_match,lookup_value
0,3956,SERENOA REPENS/SERENOA REPENS EXTRACT/SERENOA ...,SERENOA REPENS
1,4408,CORDYCEPS,OPHIOCORDYCEPS SINENSIS
2,120,CINNAMON (CINNAMOUM VERUM) (CAPSULES),CINNAMON
3,3311,CHROMIUM PICOLINATE WITH GREEN TEA,CAMELLIA SINENSIS
4,1921,BARLEY.,BARLEY GRASS


In [125]:
#take negative pairs from training data
# For every FAERS string, collect the set of lookup values it is paired with
# anywhere in the training split. The values are cleaned because np_unique
# holds cleaned labels; comparing them with raw lookup values could let a
# true match through as a "negative" whenever the raw value has punctuation.
positive_lookup = {}
for faers_name, lookup_name in zip(train_res['FAERS_drug_match'], train_res['lookup_value']):
    positive_lookup.setdefault(faers_name, set()).add(clean(lookup_name))

faers_match = []
lookup = []
for np_name in train_res['FAERS_drug_match']:
    np_match = positive_lookup[np_name]
    # Draw 4 candidate labels (with replacement) per training row and keep
    # only those that are not a known match for this FAERS string.
    for np_temp in random.choices(np_unique, k=4):
        if np_temp not in np_match:
            faers_match.append(np_name)
            lookup.append(np_temp)
len(faers_match)

16917

In [126]:
# Frame of the sampled negative pairs, pushed through the same
# clean -> encode -> pad pipeline as the other pair sets.
dfneg2 = pd.DataFrame({'FAERS_drug_match': faers_match, 'lookup_value': lookup})
xneg2, yneg2 = clean_dataset(dfneg2)
Xneg2, Yneg2 = encode_dataset(xneg2, yneg2)
padded_xneg2, padded_yneg2 = padding_dataset(Xneg2, Yneg2, maxlen)

In [127]:
##add negative pairs from training data
# Appended in place with label 0; re-running this cell appends them again.
yTrainRNN.extend([0] * len(padded_xneg2))
x1TrainRNN.extend(padded_xneg2)
x2TrainRNN.extend(padded_yneg2)
print(len(x1TrainRNN), len(x2TrainRNN), len(yTrainRNN))

21203 21203 21203


In [128]:
##add negative pairs from reference set
# Appended in place with label 0; re-running this cell appends them again.
yTrainRNN.extend([0] * len(padded_xneg))
x1TrainRNN.extend(padded_xneg)
x2TrainRNN.extend(padded_yneg)
print(len(x1TrainRNN), len(x2TrainRNN), len(yTrainRNN))

30445 30445 30445


In [129]:
# Store the padded version of each unique label next to its unpadded sequence.
for i in range(len(dUnique_df.index)):
    dUnique_df.at[i, 'dUnique_seq_padded'] = dUnique[i]
dUnique_df.head()

Unnamed: 0,dUnique_label,dUnique_seq,dUnique_seq_padded
0,ACTAEA RACEMOSA,"[1, 3, 20, 1, 5, 1, 27, 18, 1, 3, 5, 13, 15, 1...","[1, 3, 20, 1, 5, 1, 27, 18, 1, 3, 5, 13, 15, 1..."
1,AESCULUS HIPPOCASTANUM,"[1, 5, 19, 3, 21, 12, 21, 19, 27, 8, 9, 16, 16...","[1, 5, 19, 3, 21, 12, 21, 19, 27, 8, 9, 16, 16..."
2,ALLIUM SATIVUM,"[1, 12, 12, 9, 21, 13, 27, 19, 1, 20, 9, 22, 2...","[1, 12, 12, 9, 21, 13, 27, 19, 1, 20, 9, 22, 2..."
3,ALOE VERA,"[1, 12, 15, 5, 27, 22, 5, 18, 1]","[1, 12, 15, 5, 27, 22, 5, 18, 1, 0, 0, 0, 0, 0..."
4,ANGELICA SINENSIS,"[1, 14, 7, 5, 12, 9, 3, 1, 27, 19, 9, 14, 5, 1...","[1, 14, 7, 5, 12, 9, 3, 1, 27, 19, 9, 14, 5, 1..."


In [130]:
# Split the pair set (left input, right input, label) 80/20 into training and
# validation parts; the three lists are split with the same shuffle.
# NOTE(review): no `stratify` is passed although positives and negatives are
# unbalanced - confirm this is intended.
x1TrainRnnS, x1ValRnnS, x2TrainRnnS, x2ValRnnS, yTrainRnnS, yValRnnS  = train_test_split(x1TrainRNN, x2TrainRNN, yTrainRNN, test_size=0.20, random_state=42)

### build model and train

In [156]:
def build_model(model_type, embedding_dim, num_rnn_node, num_dense_node, num_layer, activation_fn, learning_rate, optimizer, margin):
    """Build and compile the siamese string-matching network.

    Both inputs go through one shared embedding network (Embedding -> LSTM or
    GRU -> a stack of Dense layers). The two embeddings are compared with
    cosine_distance and the distance is mapped to the output by a single
    sigmoid unit. The model is compiled with the contrastive loss from
    loss(margin) and the "accuracy" metric.

    Arguments:
        model_type: "lstm" or "gru", the recurrent layer to use.
        embedding_dim: output dimension of the character embedding.
        num_rnn_node: number of units of the recurrent layer.
        num_dense_node: units of the first Dense layer; each following layer
            has half as many (integer division, never fewer than 1).
        num_layer: number of Dense layers after the recurrent layer.
        activation_fn: activation of the Dense layers.
        learning_rate: learning rate of the optimizer.
        optimizer: "Adam" or "RMSprop".
        margin: margin of the contrastive loss.

    Returns:
        The compiled tf.keras.Model taking [input_1, input_2].

    Raises:
        ValueError: if `model_type` or `optimizer` is not a supported value.
    """
    # Reject unsupported choices up front: previously an unknown optimizer left
    # `opt` unbound and an unknown model_type silently skipped the RNN layer.
    if model_type not in ("lstm", "gru"):
        raise ValueError("model_type must be 'lstm' or 'gru', got {!r}".format(model_type))
    if optimizer not in ("Adam", "RMSprop"):
        raise ValueError("optimizer must be 'Adam' or 'RMSprop', got {!r}".format(optimizer))

    input_x = tf.keras.layers.Input(shape=(maxlen,))
    input_1 = tf.keras.layers.Input(shape=(maxlen,))
    input_2 = tf.keras.layers.Input(shape=(maxlen,))
    # input_dim=28: codes 1..27 from the character encoding plus 0 for padding,
    # which mask_zero=True tells the downstream RNN to ignore.
    embedding = tf.keras.layers.Embedding(input_dim=28, output_dim=embedding_dim, mask_zero=True)
    x = embedding(input_x)

    if model_type == "lstm":
        x = tf.keras.layers.LSTM(num_rnn_node)(x)
    else:
        x = tf.keras.layers.GRU(num_rnn_node)(x)

    # Keep the unit count an integer while halving it layer by layer.
    units = int(num_dense_node)
    for _ in range(num_layer):
        x = tf.keras.layers.Dense(units, activation=activation_fn)(x)
        units = max(1, units // 2)

    embedding_network = tf.keras.Model(input_x, x)

    # Shared weights: the same embedding network is applied to both inputs.
    tower_1 = embedding_network(input_1)
    tower_2 = embedding_network(input_2)

    merge_layer = tf.keras.layers.Lambda(cosine_distance)([tower_1, tower_2])
    output_layer = tf.keras.layers.Dense(1, activation="sigmoid")(merge_layer)
    contr = tf.keras.Model(inputs=[input_1, input_2], outputs=output_layer)

    if optimizer == "Adam":
        opt = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    else:
        opt = tf.keras.optimizers.RMSprop(learning_rate=learning_rate)
    contr.compile(loss=loss(margin= margin), optimizer=opt, metrics=["accuracy"])
    return contr

In [157]:
# Hyper-parameter search space. Each dimension's `name` must equal the matching
# keyword argument of fitness(), because use_named_args maps values by name.
dim_model = Categorical(categories=["lstm", "gru"], name="model_type")
dim_embedding = Integer(low=64, high=512, prior="log-uniform", base=2, name="embedding_dim")
dim_rnn_node = Integer(low=128, high=1024, prior="log-uniform", base=2, name="num_rnn_node")
dim_num_dense_nodes = Integer(low=64, high=512, prior="log-uniform", base=2, name="num_dense_node")
dim_num_layer = Integer(low=1, high=5, prior="uniform", name="num_layer")
dim_activation = Categorical(categories=['tanh', 'relu'], name="activation_fn")
dim_lr = Real(low=1e-5, high=1e-1, prior="log-uniform", base=10, name="learning_rate")
dim_opt = Categorical(categories=['Adam', 'RMSprop'], name="optimizer")
dim_margin = Real(low=1e-1, high=1, prior="uniform", name="margin")
dim_batch_size = Integer(low=4, high=64, prior="log-uniform", base=2, name="batch_size")
# The order of `dims` defines the order of values in every parameter list,
# including `default_params` below.
dims = [dim_model, dim_embedding, dim_rnn_node, dim_num_dense_nodes, dim_num_layer, dim_activation, dim_lr, dim_opt, dim_margin, dim_batch_size]
# Starting point of the search, in the same order as `dims`.
default_params = ["gru", 64, 128, 64, 2, "tanh", 1e-3, "RMSprop", 1, 32]

In [160]:
def log_dir_name(a):
    """Build a log directory name from a list of hyper-parameter values.

    Each value is converted with str() and followed by "_"; the name ends in
    ".log" (so an empty list gives ".log").
    """
    return "".join(str(value) + "_" for value in a) + ".log"

In [163]:
@use_named_args(dimensions=dims)
def fitness(model_type, embedding_dim, num_rnn_node, num_dense_node, num_layer, activation_fn, learning_rate, optimizer, margin, batch_size):
    """Objective for the hyper-parameter search: train once, return validation loss.

    Builds a model with build_model(), trains it for 100 epochs on the
    training pairs with the validation pairs as validation_data, and logs to
    TensorBoard under a directory named after the hyper-parameters. When the
    result beats the module-level `lowest_loss`, the model is saved to
    "exp3_model/exp3.h5" and `lowest_loss` is updated.

    Arguments are the hyper-parameters named in `dims`; `batch_size` is used
    for fitting, all others are forwarded to build_model().

    Returns:
        The validation loss of the last epoch.
    """
    import os  # local import: needed only to create the model directory

    global lowest_loss

    print("model:", model_type)
    print("embedding_dim:", embedding_dim)
    print("num_rnn_node:", num_rnn_node)
    print("num_dense_node:", num_dense_node)
    print("num_layer:", num_layer)
    print("activation_fn:", activation_fn)
    print("learning rate: {:.1e}".format(learning_rate))
    print("optimizer:", optimizer)
    print("margin:", margin)
    print("batch_size:", batch_size)

    model = build_model(model_type, embedding_dim, num_rnn_node, num_dense_node, num_layer, activation_fn, learning_rate, optimizer, margin)

    log_dir = log_dir_name([model_type, embedding_dim, num_rnn_node, num_dense_node, num_layer, activation_fn, learning_rate, optimizer, margin, batch_size])
    # `write_grads` is not passed: it is a TF1-era option of this callback.
    callback_log = tf.keras.callbacks.TensorBoard(
        log_dir=log_dir,
        histogram_freq=0,
        write_graph=True,
        write_images=False)

    history = model.fit(x=[np.array(x1TrainRnnS), np.array(x2TrainRnnS)],
                        y=np.array(yTrainRnnS, dtype=np.float32),
                        epochs=100,
                        batch_size=batch_size,
                        validation_data=([np.array(x1ValRnnS), np.array(x2ValRnnS)], np.array(yValRnnS, dtype=np.float32)),
                        callbacks=[callback_log])

    # Named val_loss so the module-level loss() factory is not shadowed.
    val_loss = history.history['val_loss'][-1]
    print()
    print("Val loss: {0:.6%}".format(val_loss))
    print()

    # `lowest_loss` may not exist yet on the first call of a fresh kernel;
    # treat that as "no best model so far" instead of raising NameError.
    best_so_far = globals().get("lowest_loss", float("inf"))
    if val_loss < best_so_far:
        os.makedirs("exp3_model", exist_ok=True)
        model.save("exp3_model/exp3.h5")
        lowest_loss = val_loss

    # Release the graph/session state so repeated calls do not accumulate memory.
    del model
    tf.keras.backend.clear_session()

    return val_loss

In [164]:
# Load the TensorBoard notebook extension.
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [None]:
# Run the objective once with the default hyper-parameters before starting the
# search (trains for the full 100 epochs).
fitness(x=default_params)

model: gru
embedding_dim: 64
num_rnn_node: 128
num_dense_node: 64
num_layer: 2
activation_fn: tanh
learning rate: 1.0e-03
optimizer: RMSprop
margin: 1
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100

In [95]:
# record the TensorFlow version this notebook was run with
tf.__version__

'2.5.0'

In [None]:
# https://colab.research.google.com/github/Hvass-Labs/TensorFlow-Tutorials/blob/master/19_Hyper-Parameters.ipynb#scrollTo=FuSgyvM5UMhy

In [None]:
# Bayesian optimisation of fitness() over `dims`: 40 evaluations, Expected
# Improvement acquisition, starting from default_params.
# NOTE(review): no random_state is passed, so repeated runs presumably differ - confirm.
search_result = gp_minimize(func=fitness, dimensions=dims, acq_func='EI', n_calls=40, x0=default_params)

In [None]:
# convergence plot of the search result
plot_convergence(search_result)

In [None]:
# Best hyper-parameters found. The bare `search_result.x` is not displayed
# (only a cell's last expression is); the dict below labels each value by
# its dimension name.
search_result.x
space = search_result.space
space.point_to_dict(search_result.x)

In [None]:
# every evaluated point paired with its objective value, lowest value first
sorted(zip(search_result.func_vals, search_result.x_iters))

In [None]:
# Objective landscape over learning_rate x num_rnn_node.
# The result and the two dimension names are passed positionally because the
# keyword names of these parameters are not the same across skopt variants
# (the released package does not use dimension_name1 / dimension_name2).
# NOTE(review): the released skopt returns an Axes here rather than a Figure - confirm.
fig = plot_objective_2D(search_result,
                        'learning_rate',
                        'num_rnn_node',
                        levels=50)

In [None]:
# The names must match the `name=` of the dimensions in `dims`
# ('num_dense_node' and 'num_layer'; the previous spellings did not exist).
dim_names = ['learning_rate', 'num_dense_node', 'num_layer']
# NOTE(review): the `dimension_names` keyword and the (fig, ax) return value
# depend on the installed skopt variant; the released package presumably
# expects `plot_dims=` and returns only the axes - confirm before running.
fig, ax = plot_objective(result=search_result, dimension_names=dim_names)

In [None]:
# names must match the `name=` of the dimensions in `dims`
dim_names = ['learning_rate', 'num_dense_node', 'num_layer']

In [None]:
# siamese.save('models/NP_siamese_exp2_20220216.h5')
# siamese.save('models/NP_siamese_exp2_20220216')