In [1]:

# for keras 2.3.1
from tensorflow.keras.models import load_model

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve,auc

import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the trained model that every prediction cell below uses.
import os
# "-1" is intended to hide all CUDA devices so TensorFlow runs on the CPU.
os.environ["CUDA_VISIBLE_DEVICES"]="-1"    
# Other checkpoint names noted by the author (presumably alternatives; not loaded here):
#   CustomCNN.Adam.512_1211.V2.h5
#   CustomCNN.RMSprop.512_1211.V2.h5

# Checkpoint that is actually loaded.
deployed_path = "/mnt/vdb/thesis/best_model_BLSTM.V5.set_10.hdf"
import tensorflow as tf
# Variant without an explicit device scope, kept for reference:
#learner = load_model(deployed_path)
# `learner` is a notebook-level global; the helper functions read it directly.
with tf.device('/cpu:0'):
    learner = load_model(deployed_path)

In [3]:
def toPredictDF(path,to_dir,existing_result):
    appended_pkl = []
    for infile in glob.glob(path):
        #print("Read:",infile)
        file_name = os.path.basename(infile)
        result_path=to_dir+"/"+file_name.replace("pkl", "ML.pkl")
        if result_path in existing_result :
        # print("found then skip : " , result)
            continue
        else:
            df = pd.read_pickle(infile)
            ready_df =df[[ "reps"]]
            df_new = ready_df.reps.apply(pd.Series).astype(np.float64)
            df_new.columns = df_new.columns.astype(str)
            dl = learner.dls.test_dl(df_new)
            _preds,_none ,_y = learner.get_preds(dl=dl, with_decoded=True)
            df.drop(columns=['reps','length'],inplace =True)
            df['class'] = _y
            #print("Save:",result_path)
            df.to_pickle(result_path)
    print("Complete")

def mergeDF(path,to_dir,file_name):
    """Concatenate every pickled frame matching ``path``, sort by ``ID`` and save.

    Parameters
    ----------
    path : str
        Glob pattern selecting the per-file result pickles.
    to_dir : str
        Directory in which the merged pickle is written.
    file_name : str
        File name of the merged pickle inside ``to_dir``.

    Returns
    -------
    pandas.DataFrame
        The merged frame, sorted by ``ID``. Row indices of the inputs are kept,
        so index values can repeat.

    Raises
    ------
    ValueError
        If no file matches ``path``.
    """
    # sorted(): glob order is filesystem-dependent; a fixed order keeps re-runs reproducible.
    # NOTE: unpickling executes code from the file -- only use trusted inputs.
    frames = [pd.read_pickle(infile) for infile in sorted(glob.glob(path))]
    if not frames:
        raise ValueError("mergeDF: no files match %r; nothing to merge" % (path,))
    result_path = to_dir + "/" + file_name
    print("Save:",result_path)
    merged = pd.concat(frames)
    try:
        merged['ID'] = merged['ID'].astype(int)
    except (ValueError, TypeError, OverflowError):
        # Best effort: IDs that are not integers are left as they are.
        pass
    merged = merged.sort_values(by=['ID'])
    merged.to_pickle(result_path)
    return merged

def predict_CNN(path,to_dir,existing_result,threshold=0.22,model=None):
    """Run the model over every pickled frame matching ``path`` and save the labels.

    For each input ``<name>.pkl`` the ``reps`` column is stacked into an array
    of shape ``(rows, reps_length, 1)``, passed to ``model.predict``, and every
    output strictly greater than ``threshold`` is labelled 1 (otherwise 0).
    The ``reps`` column is dropped, the labels are stored in ``class`` and the
    frame is written to ``to_dir + "/" + <name>.ML.pkl``.

    Parameters
    ----------
    path : str
        Glob pattern selecting the input pickles.
    to_dir : str
        Directory that receives the ``*.ML.pkl`` result files.
    existing_result : collection of str
        Result paths that already exist; matching inputs are skipped.
    threshold : float, optional
        Cut-off applied to the model output. Defaults to 0.22, the value that
        was previously hard-coded.
    model : object with ``predict``, optional
        Model to use. Defaults to the notebook-level ``learner``.
    """
    if model is None:
        model = learner  # notebook-level model loaded in an earlier cell
    for infile in glob.glob(path):
        file_name = os.path.basename(infile)
        # Rewrite only the LAST "pkl" (the extension); a plain str.replace would
        # also mangle names that contain "pkl" elsewhere.
        result_path = to_dir + "/" + "ML.pkl".join(file_name.rsplit("pkl", 1))
        if result_path in existing_result:
            continue
        # NOTE: unpickling executes code from the file -- only use trusted inputs.
        df = pd.read_pickle(infile)
        if len(df) == 0:
            # Nothing to predict; still write a result so the input counts as processed.
            _y = np.zeros(0, dtype=int)
        else:
            # Assumes every `reps` entry has the same length, so X is 2-D.
            X = np.array(df['reps'].to_list())
            X_test = np.reshape(X, (X.shape[0], X.shape[1], 1))
            y_probas = np.asarray(model.predict(X_test))
            _y = np.where(y_probas > threshold, 1, 0)
            if _y.ndim == 2 and _y.shape[1] == 1:
                # A single output column: store it as a flat label vector.
                _y = _y[:, 0]
        df = df.drop(columns=['reps'])
        df['class'] = _y
        df.to_pickle(result_path)

    print("Complete")



In [4]:
# Print the layer-by-layer summary of the loaded model.
learner.summary()

Model: "functional_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 1900, 1)]         0         
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 1900, 64)          2624      
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 76, 64)            0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 76, 128)           57472     
_________________________________________________________________
max_pooling1d_5 (MaxPooling1 (None, 19, 128)           0         
_________________________________________________________________
dense_8 (Dense)              (None, 19, 1211)          156219    
_________________________________________________________________
bidirectional_2 (Bidirection (None, 19, 128)          

# Truth set

In [8]:
# Label the truth-set representations with the loaded model.
df = pd.read_pickle("../datasets/truthset/AMPs_truthset.reps.plk")
# Stack the `reps` entries into an array, then add a trailing axis of size 1
# (assumes all entries have the same length, so X is 2-D).
X= np.array(df['reps'].to_list())
X_test = np.reshape(X,(X.shape[0],X.shape[1],1))
y_probas = learner.predict(X_test)
threshold = 0.22
# NOTE(review): this cell compares with `>=`, whereas predict_CNN uses `>` for
# the same 0.22 cut-off -- confirm which one is intended.
_y = np.where(y_probas >= threshold, 1, 0)
df['class'] = _y

In [6]:
# Number of rows in `df` labelled class 0.
len(df[df["class"] == 0]) 

51

In [9]:
# Number of rows in `df` labelled class 0.
# NOTE(review): the identical cell just above recorded 51 while this one recorded 48
# (execution counts 6 and 9) -- one output is presumably stale; re-run on a fresh kernel.
len(df[df["class"] == 0]) 

48

In [1]:
## pssm 

In [6]:
# Same prediction steps applied to the truth-set file under the `pssm` directory.
df = pd.read_pickle("../datasets/truthset/pssm/AMPs.truthset.reps.pkl")
# Stack the `reps` entries into an array, then add a trailing axis of size 1
# (assumes all entries have the same length, so X is 2-D).
X= np.array(df['reps'].to_list())
X_test = np.reshape(X,(X.shape[0],X.shape[1],1))
y_probas = learner.predict(X_test)
# NOTE(review): cut-off is 0.5 here but 0.22 in the other prediction code -- confirm this is deliberate.
threshold = 0.5
_y = np.where(y_probas > threshold, 1, 0)
df['class'] = _y
# Number of rows labelled class 0.
len(df[df["class"] == 0]) 

62

## DECockroach

In [5]:
# Directory that receives the per-file prediction results for this data set.
path = "/mnt/vdb/DECockroach/cd100/result_5/result_BILSTM"
# exist_ok=True replaces the racy "check, then create" pair.
os.makedirs(path, exist_ok=True)

# Result pickles already present in `path`; predict_CNN skips inputs whose
# result path is listed here. Built from `path` so the two cannot drift apart.
existing_result = glob.glob(path + "/*.pkl")
print(len(existing_result))



0


In [6]:
# Run the model over every representation pickle and write one *.ML.pkl result
# per input into the result directory (inputs listed in existing_result are skipped).
predict_CNN("/mnt/vdb/DECockroach/cd100/reps/*.pkl","/mnt/vdb/DECockroach/cd100/result_5/result_BILSTM",existing_result)

Complete


In [7]:
# Merge all per-file results into one frame sorted by ID, save it, and display it.
# NOTE(review): the merged file ends in ".plk", not ".pkl"; as written this keeps it
# out of the "*.pkl" glob on re-runs -- confirm whether that is intentional.
result_df =mergeDF("/mnt/vdb/DECockroach/cd100/result_5/result_BILSTM/*.pkl","/mnt/vdb/DECockroach/cd100/result_5/result_BILSTM","DECockroach.len15.MLResult.plk")
result_df

Save: /mnt/vdb/DECockroach/cd100/result_5/result_BILSTM/DECockroach.len15.MLResult.plk


Unnamed: 0,ID,length,class
0,0,17,1
1,1,17,0
2,3,18,0
3,4,30,0
4,5,17,0
...,...,...,...
199,894298,17,0
200,894299,19,0
201,894303,17,1
202,894304,17,1


In [11]:
# Rows of `result_df` labelled class 1.
# NOTE(review): this cell and the identical one below show different rows
# (execution counts 11 and 10) -- at least one output is presumably stale.
result_df[result_df["class"]== 1]

Unnamed: 0,ID,length,class
1,1,17,1
4,5,17,1
5,6,24,1
8,9,19,1
9,10,18,1
...,...,...,...
186,894277,48,1
191,894285,33,1
193,894288,17,1
195,894293,28,1


In [10]:
# Rows of `result_df` labelled class 1.
result_df[result_df["class"]== 1]

Unnamed: 0,ID,length,class
9,10,18,1
10,12,20,1
13,16,19,1
14,17,17,1
16,19,17,1
...,...,...,...
193,894288,17,1
195,894293,28,1
199,894298,17,1
201,894303,17,1


## BAT

In [8]:
# Directory that receives the per-file prediction results for this data set.
path ="/mnt/vdb/Bat/cd100/result_5/result_BILSTM"
# exist_ok=True replaces the racy "check, then create" pair.
os.makedirs(path, exist_ok=True)

# Result pickles already present in `path`; predict_CNN skips inputs whose
# result path is listed here. Built from `path` so the two cannot drift apart.
existing_result = glob.glob(path + "/*.pkl")
print(len(existing_result))

0


In [9]:
# Run the model over every representation pickle and write one *.ML.pkl result
# per input into `path` (inputs listed in existing_result are skipped).
predict_CNN("/mnt/vdb/Bat/cd100/reps/*.pkl",path,existing_result)

Complete


In [10]:
# Merge all per-file results in `path` into one frame sorted by ID, save it, and display it.
result_df =mergeDF(path+"/*.pkl",path,"Bat.len10.MLResult.plk")
result_df

Save: /mnt/vdb/Bat/cd100/result_5/result_BILSTM/Bat.len10.MLResult.plk


Unnamed: 0,ID,length,class
0,9,275,1
1,19,47,0
2,32,151,1
3,54,199,1
4,61,152,1
...,...,...,...
74,12568817,51,1
75,12568833,45,1
76,12568849,117,0
77,12568850,174,1


In [26]:
# Rows of `result_df` labelled class 1.
# NOTE(review): the recorded output lists SOAP/Velvet IDs, while the merge cell above
# shows numeric IDs -- this output presumably comes from a different `result_df`
# (execution count 26); re-run on a fresh kernel.
result_df[result_df["class"]== 1]

Unnamed: 0,ID,length,class
1,SOAP.k25.C303708.p1,102,1
2,SOAP.k25.C304032.p1,103,1
3,SOAP.k25.C304284.p1,104,1
4,SOAP.k25.C304386.p1,104,1
5,SOAP.k25.C305552.p1,108,1
...,...,...,...
1000,Velvet.k61.NODE_783_length_328_cov_36.378048.p1,106,1
1001,Velvet.k61.NODE_8470_length_279_cov_15.172043.p1,105,1
1002,Velvet.k61.NODE_9141_length_765_cov_44.260132.p2,233,1
1003,Velvet.k61.NODE_9415_length_1250_cov_53.216000.p1,367,1


# TransPI

## BAT

In [4]:
# Directory that receives the per-file prediction results for this data set.
path ="/mnt/vdb/Bat/transpi/result_10_1/result_BILSTM"
# exist_ok=True replaces the racy "check, then create" pair.
os.makedirs(path, exist_ok=True)

# Result pickles already present in `path`; predict_CNN skips inputs whose
# result path is listed here. Built from `path` so the two cannot drift apart.
existing_result = glob.glob(path + "/*.pkl")
print(len(existing_result))

0


In [5]:
# Run the model over every representation pickle and write one *.ML.pkl result
# per input into the result directory (inputs listed in existing_result are skipped).
predict_CNN("/mnt/vdb/Bat/transpi/reps/*.pkl","/mnt/vdb/Bat/transpi/result_10_1/result_BILSTM",existing_result)

Complete


In [6]:
# Merge all per-file results into one frame sorted by ID, save it, and display it.
# The inputs' row indices are kept, so index values repeat in the merged frame.
result_df =mergeDF("/mnt/vdb/Bat/transpi/result_10_1/result_BILSTM/*.pkl","/mnt/vdb/Bat/transpi/result_10_1/result_BILSTM","Bat.len10.MLResult.plk")
result_df

Save: /mnt/vdb/Bat/transpi/result_10_1/result_BILSTM/Bat.len10.MLResult.plk


Unnamed: 0,ID,length,class
0,SOAP.k25.C303246.p1,101,1
1,SOAP.k25.C303708.p1,102,1
2,SOAP.k25.C304032.p1,103,1
3,SOAP.k25.C304284.p1,104,1
4,SOAP.k25.C304386.p1,104,1
...,...,...,...
0,Velvet.k61.NODE_783_length_328_cov_36.378048.p1,106,1
1,Velvet.k61.NODE_8470_length_279_cov_15.172043.p1,105,1
2,Velvet.k61.NODE_9141_length_765_cov_44.260132.p2,233,1
3,Velvet.k61.NODE_9415_length_1250_cov_53.216000.p1,367,1


In [14]:
# Rows of `result_df` labelled class 1.
# NOTE(review): this cell and the identical one below show different rows
# (execution counts 14 and 8) -- at least one output is presumably stale.
result_df[result_df["class"] == 1]

Unnamed: 0,ID,length,class
1,SOAP.k25.C303708.p1,102,1
2,SOAP.k25.C304032.p1,103,1
3,SOAP.k25.C304284.p1,104,1
4,SOAP.k25.C304386.p1,104,1
5,SOAP.k25.C305552.p1,108,1
...,...,...,...
999,Velvet.k61.NODE_7367_length_630_cov_23.734921.p1,193,1
1001,Velvet.k61.NODE_8470_length_279_cov_15.172043.p1,105,1
1002,Velvet.k61.NODE_9141_length_765_cov_44.260132.p2,233,1
1003,Velvet.k61.NODE_9415_length_1250_cov_53.216000.p1,367,1


In [8]:
# Rows of `result_df` labelled class 1.
result_df[result_df["class"] == 1]

Unnamed: 0,ID,length,class
0,SOAP.k25.C303246.p1,101,1
1,SOAP.k25.C303708.p1,102,1
2,SOAP.k25.C304032.p1,103,1
3,SOAP.k25.C304284.p1,104,1
4,SOAP.k25.C304386.p1,104,1
...,...,...,...
999,Velvet.k61.NODE_7367_length_630_cov_23.734921.p1,193,1
1000,Velvet.k61.NODE_783_length_328_cov_36.378048.p1,106,1
1001,Velvet.k61.NODE_8470_length_279_cov_15.172043.p1,105,1
1003,Velvet.k61.NODE_9415_length_1250_cov_53.216000.p1,367,1


## DECockroach

In [7]:
# Directory that receives the per-file prediction results for this data set.
path ="/mnt/vdb/DECockroach/transpi/result_10_1/result_BILSTM"
# exist_ok=True replaces the racy "check, then create" pair.
os.makedirs(path, exist_ok=True)

# Result pickles already present in `path`; predict_CNN skips inputs whose
# result path is listed here. Built from `path` so the two cannot drift apart.
existing_result = glob.glob(path + "/*.pkl")
print(len(existing_result))

0


In [8]:
# Run the model over every representation pickle and write one *.ML.pkl result
# per input into the result directory (inputs listed in existing_result are skipped).
predict_CNN("/mnt/vdb/DECockroach/transpi/reps/*.pkl","/mnt/vdb/DECockroach/transpi/result_10_1/result_BILSTM",existing_result)

Complete


In [9]:
# Merge all per-file results into one frame sorted by ID, save it, and display it.
# The inputs' row indices are kept, so index values repeat in the merged frame.
result_df =mergeDF("/mnt/vdb/DECockroach/transpi/result_10_1/result_BILSTM/*.pkl","/mnt/vdb/DECockroach/transpi/result_10_1/result_BILSTM","DECockroach.len15.MLResult.plk")
result_df

Save: /mnt/vdb/DECockroach/transpi/result_10_1/result_BILSTM/DECockroach.len15.MLResult.plk


Unnamed: 0,ID,length,class
0,SOAP.k25.C372231.p1,102,1
1,SOAP.k25.C373809.p1,104,1
2,SOAP.k25.C379695.p1,95,0
3,SOAP.k25.C382451.p1,116,1
4,SOAP.k25.C383053.p1,104,1
...,...,...,...
54,Velvet.k37.NODE_9552_length_1035_cov_12.333333.p1,102,1
55,Velvet.k37.NODE_9713_length_2207_cov_17.057997.p1,627,1
56,Velvet.k37.NODE_9748_length_894_cov_40.512302.p1,124,1
57,Velvet.k37.NODE_9800_length_726_cov_45.530304.p1,221,1


In [28]:
# Rows of `result_df` labelled class 0.
result_df[result_df["class"] == 0]

Unnamed: 0,ID,length,class
9,SOAP.k25.C387573.p1,124,0
19,SOAP.k25.C397059.p1,115,0
23,SOAP.k25.C397729.p1,126,0
31,SOAP.k25.C405333.p1,144,0
32,SOAP.k25.C405897.p1,129,0
...,...,...,...
13,Velvet.k37.NODE_71696_length_822_cov_50.759125.p1,258,0
19,Velvet.k37.NODE_73436_length_1641_cov_16.12309...,514,0
28,Velvet.k37.NODE_7963_length_783_cov_48.134098.p1,157,0
44,Velvet.k37.NODE_8670_length_780_cov_48.342308.p1,254,0


# PWM

## CD100 BAT

In [9]:
# Directory that receives the per-file prediction results for this data set.
path ="/mnt/vdb/Bat/pws/cd100/result/result_BILSTM"
# exist_ok=True replaces the racy "check, then create" pair.
os.makedirs(path, exist_ok=True)

# Result pickles already present in `path`; predict_CNN skips the matching inputs.
existing_result = glob.glob(path+"/*.pkl")
print(len(existing_result))
predict_CNN("/mnt/vdb/Bat/pws/cd100/reps/*.pkl",path,existing_result)
# Merge the per-file results, save the merged frame, and display it.
result_df =mergeDF(path+"/*.pkl",path,"Bat.len10.MLResult.plk")
result_df

0
Complete
Save: /mnt/vdb/Bat/pws/cd100/result/result_BILSTM/Bat.len10.MLResult.plk


Unnamed: 0,ID,Sequence,length,class
0,9,EDTGFYPSEPMLCSESEEGQVPHSLETLYQSADCSSPSDALIVCIH...,275,0
1,19,ALGPSLWDRRRSLHLLLQEAFPVAQSLAQVIHHQFQTVSKQGGPLP,47,1
2,32,VGICGSDVHYWQHGRIGDFIVKKPMVLGHEASGTVVKVGSLVKHLQ...,151,1
3,54,EKPCNSNQQPLENLVEDTLINYSQFGSPKDHEHNGCKLCQTDRYCE...,199,1
4,61,AAKFVFRHNDPDHLEKLLKKSNSETPKIVAFETVHSMDGAICPLEE...,152,1
...,...,...,...,...
17611,12568817,GAWTEVGLPSQDVSVASCNCCRRPMHFELMSEWERSYFGNMGPQYV...,51,1
17612,12568833,LLLPLKVLGFLGGQLSVVVLHQPVHIIIIQLQAMDLEIFSSFAP,45,1
17613,12568849,GGQTPRTAPNPQNPPLPPFWAWSSQNVQFSFKFLILGAEPLDRFLA...,117,1
17614,12568850,VEESCTIENNSDSTKPKMAAEVDFGDLELFEAFDHPEESLPKPVHT...,174,1


## CD100 DECockroach

In [10]:
# Directory that receives the per-file prediction results for this data set.
path ="/mnt/vdb/DECockroach/pws/cd100/result/result_BILSTM"
# exist_ok=True replaces the racy "check, then create" pair.
os.makedirs(path, exist_ok=True)

# Result pickles already present in `path`; predict_CNN skips the matching inputs.
existing_result = glob.glob(path+"/*.pkl")
print(len(existing_result))
predict_CNN("/mnt/vdb/DECockroach/pws/cd100/reps/*.pkl",path,existing_result)
# Merge the per-file results, save the merged frame, and display it.
result_df =mergeDF(path+"/*.pkl",path,"DECockroach.len15.MLResult.plk")
result_df

24
Complete
Save: /mnt/vdb/DECockroach/pws/cd100/result/result_BILSTM/DECockroach.len15.MLResult.plk


Unnamed: 0,ID,Sequence,length,class
0,0,SSRHLQWDSTLSPHFCF,17,1
1,1,RGRCIRSSCPSSHLGWR,17,0
2,3,YSKFSLCQYYFKFCPYST,18,0
3,4,AADYKPEGLGFDSPWCHWNFSIGNPSGRTM,30,1
4,5,AIPSLPHSRRLWSPHHL,17,0
...,...,...,...,...
26102,894298,IHCEEQQCSVRYVWCRR,17,0
26103,894299,WFDYFVSFHPFNLRNWLA,19,1
26104,894303,WPCTHNGDPCTHFGFSA,17,0
26105,894304,ETCGNQCNCDCADAGIN,17,1


## TransPI BAT

In [5]:
# Directory that receives the per-file prediction results for this data set.
path ="/mnt/vdb/Bat/pws/transpi/result/result_9/result_BILSTM"
# exist_ok=True replaces the racy "check, then create" pair.
os.makedirs(path, exist_ok=True)

# Result pickles already present in `path`; predict_CNN skips the matching inputs.
existing_result = glob.glob(path+"/*.pkl")
print(len(existing_result))
predict_CNN("/mnt/vdb/Bat/pws/transpi/reps/*.pkl",path,existing_result)
# Merge the per-file results, save the merged frame, and display it.
result_df =mergeDF(path+"/*.pkl",path,"Bat.len10.MLResult.plk")
result_df

0
Complete
Save: /mnt/vdb/Bat/pws/transpi/result/result_9/result_BILSTM/Bat.len10.MLResult.plk


Unnamed: 0,ID,Sequence,length,class
0,SOAP.k25.C303246.p1,QKVLQAAGPSTTTETETIAKYEIMDGAPVKGESIPIRLFLAGYDPT...,101,0
1,SOAP.k25.C303708.p1,PITWGRKWNIENGCARTHSQDDYSPGSQAQGESGTASHPRRGHLEM...,102,0
2,SOAP.k25.C304032.p1,LRNHSPLMSFGASFVSFLNAMMTFEEEKMQLACDDLRTTEKLCESE...,103,0
3,SOAP.k25.C304284.p1,ESTDQISPYGNSTVTQPSDSGWQYNETHTSLKQNTPRNTSKLYIGL...,104,0
4,SOAP.k25.C304386.p1,LGPDLSWAWEAKQPWGQETSLRRGEGSGLCKVGGVRVCAPPLLTPK...,104,0
...,...,...,...,...
419,Velvet.k61.NODE_783_length_328_cov_36.378048.p1,MSPSQAVYIVPSKGRLIGGLRDTPSYEHFQEDFSTCSLCTFRDLCA...,106,0
420,Velvet.k61.NODE_8470_length_279_cov_15.172043.p1,QDLENAATGDAAVHQRIASLPVEVQEVSLLDKITDKESGERLSKMV...,105,0
421,Velvet.k61.NODE_9141_length_765_cov_44.260132.p2,ARAALAMPVKGGTKCIKYLLLGFNFVFWLAGIAVLAIGLWLRFDSQ...,233,0
422,Velvet.k61.NODE_9415_length_1250_cov_53.216000.p1,MSCKPQCSLNHLPTPCARQSAPFRIPAEFLYLVLLLVEGAPFSNFS...,367,0


## TransPI DECockroach

In [7]:
# Directory that receives the per-file prediction results for this data set.
path ="/mnt/vdb/DECockroach/pws/transpi/result/result_9/result_BILSTM"
# exist_ok=True replaces the racy "check, then create" pair.
os.makedirs(path, exist_ok=True)

# Result pickles already present in `path`; predict_CNN skips the matching inputs.
existing_result = glob.glob(path+"/*.pkl")
print(len(existing_result))
predict_CNN("/mnt/vdb/DECockroach/pws/transpi/reps/*.pkl",path,existing_result)
# Merge the per-file results, save the merged frame, and display it.
result_df =mergeDF(path+"/*.pkl",path,"DECockroach.len15.MLResult.plk")
result_df

0
Complete
Save: /mnt/vdb/DECockroach/pws/transpi/result/result_9/result_BILSTM/DECockroach.len15.MLResult.plk


Unnamed: 0,ID,Sequence,length,class
0,SOAP.k25.C372231.p1,VYYRRDGKGDKEYWTCQKKPECKATAITIRTGDTVTILKESDHWHA...,102,0
1,SOAP.k25.C373809.p1,KTRLTVVGTKVINEKNNVKLKGVSKVVSLHVYRLAPDTTIEELTEY...,104,0
2,SOAP.k25.C379695.p1,MLRDYREIGNLVLCFDTPFTVDFKVIQDAALQKELIEFRCDRRLRE...,95,0
3,SOAP.k25.C382451.p1,PSPCGANAVCREQNGAGSCTCLPDYVGNPYEGCRPECVLNTDCPSN...,116,0
4,SOAP.k25.C383053.p1,FKMLTMPRRDICQIETLNLADPLMFLVRNRVCTSTMFHLLPFSYTS...,104,0
...,...,...,...,...
637,Velvet.k37.NODE_9552_length_1035_cov_12.333333.p1,EPKLVNEVNLTFHEKDGEEFMALDKNLKVTTTVKRVYMHLTNLFNG...,102,0
638,Velvet.k37.NODE_9713_length_2207_cov_17.057997.p1,MIRRWWKLFMFIMAMLLDVREAFYVPGVAPVEFRKGARIDVKAVKM...,627,0
639,Velvet.k37.NODE_9748_length_894_cov_40.512302.p1,MLNFSHHVSKTIRKKKSSKITGRFSRYKKMRTCSSLNEIYIVYIYI...,124,0
640,Velvet.k37.NODE_9800_length_726_cov_45.530304.p1,MASFEQAIQQNVMQVAKKVEEHLDAELEKLEKLDSDDLDKLREKRL...,221,0
