In [None]:
#MMSEQS_FORCE_MERGE=1 foldseek/build/src/foldseek search 100k_sample 20k_random_sample_foldseek_shuffled 100k_random_sample_foldseek_shuffled_aln tmp -e 10000000 -s 9 --max-seqs 2000 --threads 128
##!/bin/bash
#INFILE="100k_random_sample_foldseek_shuffled_aln.m8"


#while read line; do
#  seq=$(echo $line| cut -d';' -f1)
#  score=$(echo $line| cut -d';' -f2)
#  if [ $(awk -F "," '{print NF-1}' <(echo $score)) -gt 10 ]
#  then
#          echo $seq" "$(./fiteval $score)
#  fi
#done < <(awk '$1!=prev{print seq";"score; score=""; seq=""}  {if(score!=""){score=score","$(NF-1)}else{score=$(NF-1)}; seq=$NF; prev=$1;}' 100k_random_sample_foldseek_shuffled_aln.m8)
#awk 'BEGIN{print "seq;lambda;mu"}{gsub("lamda=","",$2); gsub("mu=","",$3); print $1";"$2";"$3}'  100k_per_query_mu_lambda.m8 > 100k_per_query_mu_lambda.csv



In [6]:
!pip install -q -U keras-tuner

[?25l[K     |███▍                            | 10 kB 37.5 MB/s eta 0:00:01[K     |██████▊                         | 20 kB 8.5 MB/s eta 0:00:01[K     |██████████                      | 30 kB 7.4 MB/s eta 0:00:01[K     |█████████████▍                  | 40 kB 7.2 MB/s eta 0:00:01[K     |████████████████▊               | 51 kB 4.1 MB/s eta 0:00:01[K     |████████████████████            | 61 kB 4.3 MB/s eta 0:00:01[K     |███████████████████████▍        | 71 kB 4.5 MB/s eta 0:00:01[K     |██████████████████████████▊     | 81 kB 5.1 MB/s eta 0:00:01[K     |██████████████████████████████  | 92 kB 5.1 MB/s eta 0:00:01[K     |████████████████████████████████| 98 kB 3.4 MB/s 
[?25h

In [7]:
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import BatchNormalization
from keras.layers import Dense
from keras.layers import Input
from keras.layers import Dropout 
from keras.callbacks import EarlyStopping
import numpy as np
import matplotlib.pyplot as plt


# read csv file having three columns: seq as str, lambda as float, and mu as float
def prepare_input(filename):
    data = np.genfromtxt(filename, delimiter=';', skip_header=1, dtype=str)
    X = []
    y = []
    for i in range(0, len(data)):
        kmers = string_to_kmers(data[i][0], 2)
        counts1mer = add_1mer_counts(data[i][0])
        counts = np.append(counts1mer, [len(data[i][0])])
        #counts2mer = add_2mer_counts(kmers)
        #counts = np.append(counts1mer, counts2mer) 
        # append sequence length to counts
        #counts = np.append(counts, [len(data[i][0])])
        X.append(counts)
        y.append([float(data[i][1]), float(data[i][2])])
    return np.asarray(X), np.asarray(y)

# turn string into k-mers
def string_to_kmers(string, k):
    kmers = []
    for i in range(0, len(string) - k + 1, k):
        kmers.append(string[i:i+k])
    return kmers

# map of 20 amino acids (alphabetically) and X to integers
def map_to_int(amino_acid):
    alphabet = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y', 'X']
    return alphabet.index(amino_acid)

def add_1mer_counts(string):
  counts = np.zeros(21)
  for i in range(0, len(string)): 
    counts[map_to_int(string[i])] += 1
  return counts

# add 2-mer 21*21 integers
def add_2mer_counts(kmers):
    counts = np.zeros((21, 21))
    for kmer in kmers:
        counts[map_to_int(kmer[0])][map_to_int(kmer[1])] += 1
    return counts

# normalize Y values 
def normalize_y(y):
    y_norm = np.copy(y)
    mu1 = np.mean(y[:,0])
    sigma1 = np.std(y[:,0])
    mu2 = np.mean(y[:,1])
    sigma2 = np.std(y[:,1])
    for i in range(0, len(y)):
        y_norm[i][0] = (y[i][0] - mu1) / sigma1
        y_norm[i][1] = (y[i][1] - mu2) / sigma2
    return y_norm, mu1, sigma1, mu2, sigma2


def unnormalize_y(y_norm, mu1, sigma1, mu2, sigma2):
    y = np.copy(y_norm)
    for i in range(0, len(y)):
        y[i][0] = y_norm[i][0] * sigma1 + mu1
        y[i][1] = y_norm[i][1] * sigma2 + mu2
    return y


In [9]:
X,Y = prepare_input('/content/drive/MyDrive/100k_per_query_mu_lambda.csv')
Y_norm, mu1, sigma1, mu2, sigma2 = normalize_y(Y)
#Y_norm, min1, max1, min2, max2 = normalize_y_2(Y)

In [10]:
import keras_tuner as kt
# keras model with k-mer and seq len as input and two output nodes for regression
# Predict mu and lambda
def keras_model(hp):
    model = Sequential()
    hp_units = hp.Int('units', min_value=16, max_value=48, step=16)
    model.add(Dense(units=hp_units, activation="relu", input_shape=(22,)))
    layer_units = hp.Int('layer', min_value=0, max_value=2, step=1)
    for i in range(layer_units):
      model.add(Dense(units=hp_units, activation="relu"))
    model.add(Dense(units=2))
    hp_learning_rate = hp.Choice('learning_rate', values=[1e-1, 1e-2, 1e-3, 1e-4, 1e-5])  

    model.compile(loss='mean_squared_error', optimizer=keras.optimizers.Adam(learning_rate=hp_learning_rate),metrics=['accuracy'])
    return model



tuner = kt.Hyperband(keras_model,
                     objective='val_accuracy',
                     max_epochs=40,
                     factor=5,
                     directory='my_dir',
                     project_name='opt_hp_param_input22_simple3')

stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)
tuner.search(X, Y_norm, epochs=100, validation_split=0.2, callbacks=[stop_early])
# Get the optimal hyperparameters
best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]

print(f"""
The hyperparameter search is complete. 
Ynits: {best_hps.get('units')} Layer: {best_hps.get('layer')} :earning rate: {best_hps.get('learning_rate')}.
""")


#model = keras_model(X[0].shape, Y_norm[0].shape[0], 2)


Trial 48 Complete [00h 00m 52s]
val_accuracy: 0.9072861075401306

Best val_accuracy So Far: 0.9180377125740051
Total elapsed time: 00h 21m 06s
INFO:tensorflow:Oracle triggered exit

The hyperparameter search is complete. 
Ynits: 32 Layer: 2 :earning rate: 0.001.



In [11]:
# Build the model with the optimal hyperparameters and train it on the data for 50 epochs
model = tuner.hypermodel.build(best_hps)
history = model.fit(X, Y_norm, epochs=100, validation_split=0.2)

val_acc_per_epoch = history.history['val_accuracy']
best_epoch = val_acc_per_epoch.index(max(val_acc_per_epoch)) + 1
print('Best epoch: %d' % (best_epoch,))

hypermodel = tuner.hypermodel.build(best_hps)
# Retrain the model
hypermodel.fit(X, Y_norm, epochs=best_epoch, validation_split=0.2)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7fb82df2c090>

In [None]:
hypermodel.fit(X, Y_norm, epochs=100, validation_split=0.2)


In [12]:
# evaluate model
scores = hypermodel.evaluate(X, Y_norm, verbose=0)
print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))


accuracy: 91.68%


In [None]:
predictions = model.predict(X)
    #plt.scatter(Y[:,0], predictions[:,0], color='red')
  #plt.scatter(Y[:,1], predictions[:,1], color='blue')
  #plt.show()

In [None]:
pred=unnormalize_y(predictions, mu1, sigma1, mu2, sigma2)
pred[:,1]

In [None]:
Y[:,1]

In [20]:
print(mu1, sigma1, mu2, sigma2)

0.25319026504505765 0.0870925635266098 16.48226262676655 3.472243709346458


In [15]:
seq = "DFDQVPDDPLLVLLLVVLLVVDDLVVSCVVVVHHSVVSVVSVVVSCVRRVQPDSPSSNVVCVVVVPD"
feats = []
counts1mer = add_1mer_counts(seq)
feats.append(np.append(counts1mer, [len(seq)]))
pred = hypermodel.predict(np.asarray(feats))
pred_norm = unnormalize_y(pred, mu1, sigma1, mu2, sigma2)
#print(pred_norm)

from math import exp, log
# compute pvalue from score using evd with mu lambda
def compute_pvalue(score, lambda_, mu):
    h = lambda_ * (score - mu)
    if h > 10:
      return -h 
    elif h < -2.5:
      return -exp(-exp(-h)) 
    else:
      return log((1.0 - exp(-exp(-h))))

logPval = compute_pvalue(84, pred_norm[0][0], pred_norm[0][1])
#logPval = compute_pvalue(80, 0.169, 18.92)
print(exp(logPval + log(9330577)))


0.0005788370215863978


In [17]:
!wget https://raw.githubusercontent.com/moof2k/kerasify/master/kerasify.py

--2022-01-19 05:21:21--  https://raw.githubusercontent.com/moof2k/kerasify/master/kerasify.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7789 (7.6K) [text/plain]
Saving to: ‘kerasify.py’


2022-01-19 05:21:22 (102 MB/s) - ‘kerasify.py’ saved [7789/7789]



In [18]:
from kerasify import export_model
export_model(hypermodel, '/content/drive/MyDrive/100k_per_query_mu_lambda.kerasify.model')