In [13]:
import os, warnings
# Silence every Python-level warning for the rest of the session.
warnings.filterwarnings('ignore')
# NVIDIA / TensorFlow runtime settings.
# Please configure these according to your own device. Both variables are
# assigned before TensorFlow is imported below.
# TF_CPP_MIN_LOG_LEVEL='3' asks TensorFlow's native logging to be as quiet as possible.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
# Expose only the CUDA device with index 1 to this process.
# NOTE(review): on a single-GPU machine this presumably hides the only GPU — confirm.
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

import tensorflow as tf
import gc

# Turn on memory growth for every GPU TensorFlow can see, so memory is
# requested as needed rather than configured up front.
gpus = tf.config.experimental.list_physical_devices(device_type='GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

import pandas as pd
import numpy as np

from joblib import load, dump

def r2_score(y_true,y_pred):
    """Coefficient of determination between observed and predicted values.

    Computes ``1 - SS_res / SS_tot`` where ``SS_res`` is the sum of squared
    residuals and ``SS_tot`` the sum of squared deviations of ``y_true`` from
    its mean. Both arguments must support element-wise arithmetic
    (e.g. 1-D NumPy arrays of equal length).
    """
    residual_ss = sum((y_true - y_pred) ** 2)
    total_ss = sum((np.mean(y_true) - y_true) ** 2)
    return 1 - residual_ss / total_ss

def PCC(y_pred,y_true):
    """Pearson correlation coefficient between predictions and targets.

    Both inputs are centred on their own mean; the result is the sum of the
    products of the centred values divided by the square root of the product
    of their sums of squares.
    """
    pred_centered = y_pred - np.mean(y_pred)
    true_centered = y_true - np.mean(y_true)
    numerator = np.sum(pred_centered * true_centered)
    denominator = np.sqrt(np.sum(pred_centered**2) * np.sum(true_centered**2))
    return numerator / denominator

def to_ic50(x, max_ic50=50000.0):
    """Convert a normalised score back to the IC50 scale.

    Returns ``max_ic50 ** (1 - x)``: a score of 1 maps to 1 and a score of 0
    maps to ``max_ic50``.
    """
    exponent = 1.0 - x
    return max_ic50 ** exponent

# Default evaluation settings.
# Written without trailing commas: `name=value,` at statement level binds a
# one-element tuple such as (500,) rather than the value itself.
sample_weight = None
threshold_nm = 500
max_ic50 = 50000

In [14]:
# Amino-acid substitution matrices used as pairwise residue features.
# The separator is a regular expression, so it is written as a raw string
# (the plain '\s' literal is an invalid escape sequence) and the python
# parsing engine, which regex separators need, is requested explicitly.
blosum62 = pd.read_csv('../blosum_pam_data/BLOSUM62.txt', sep=r'\s', engine='python')
# Discard the last four rows and the last four columns of the table.
blosum62 = blosum62.iloc[:-4,:-4]

pam250 = pd.read_csv('../blosum_pam_data/PAM250.csv',index_col=0)

In [15]:
# Names of the selected properties (the index of PR_61.csv).
PR = pd.read_csv('../PP_PR_vector/PR_61.csv', index_col='properties').index
# Full property table; rows are selected by property name below.
PP_dic = pd.read_csv('../PP_PR_vector/PP_740.csv', header=0,index_col=0)
# Keep the selected property rows and min-max scale each of them to [0, 1].
# The scaling is done on the transposed frame (one column per property) and
# the result is transposed back to the original orientation.
PR_vectors = (
    PP_dic.loc[PR]
    .astype('float')
    .T
    .pipe(lambda by_property: (by_property - by_property.min())
          / (by_property.max() - by_property.min()))
    .T
)

In [16]:
def _minmax_scale(mat):
    """Rescale ``mat`` to [0, 1] using its global minimum and maximum.

    A constant matrix (zero range) is mapped to all zeros instead of
    dividing by zero and producing NaN.
    """
    lo = mat.min()
    span = mat.max() - lo
    if span == 0:
        return np.zeros(mat.shape, dtype=float)
    return (mat - lo) / span


def get_3d_feat(seq, max_len=None):
    """Build the padded pairwise feature tensor for one peptide.

    Channels along the third axis, in order:
      0      normalised sequence distance ``|i - j| / (n - 1)``
      1      BLOSUM62 score of residues i and j, min-max scaled over the peptide
      2      PAM250 score of residues i and j, min-max scaled over the peptide
      3..    one channel per property row of ``PR_vectors``. On and above the
             diagonal the value is the product of the property at positions
             i and j; below the diagonal it is the geometric mean of the
             mirrored upper-triangle entry for that property and for the
             next property (wrapping from the last property to the first).

    Parameters
    ----------
    seq : str
        Peptide sequence; every residue must be a column of ``PR_vectors``
        and a label of ``blosum62`` / ``pam250``.
    max_len : int, optional
        Side length the n x n maps are zero-padded to. Defaults to the
        module-level ``max_seq_len``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(max_len, max_len, 3 + n_properties, 1)``.

    Raises
    ------
    ValueError
        If ``seq`` is empty or longer than ``max_len``.
    """
    n = len(seq)
    if max_len is None:
        max_len = max_seq_len  # module-level setting
    if n == 0:
        raise ValueError('cannot build features for an empty sequence')
    if n > max_len:
        raise ValueError(
            f'sequence {seq!r} has length {n}, which exceeds max_seq_len={max_len}; '
            'increase max_seq_len and regenerate the cached middle data'
        )
    residues = list(seq)

    # Physicochemical property products: (n_properties, n) -> (n, n, n_properties)
    seq_pro = PR_vectors[residues].to_numpy(dtype=float).T      # (n, n_properties)
    mt_pro = seq_pro[:, None, :] * seq_pro[None, :, :]

    # Substitution-matrix lookups for every residue pair, scaled per peptide.
    seq_blosum_nor = _minmax_scale(blosum62.loc[residues, residues].to_numpy(dtype=float))
    seq_pam_nor = _minmax_scale(pam250.loc[residues, residues].to_numpy(dtype=float))

    # Sequence distance |i - j| scaled by the largest possible distance.
    positions = np.arange(n)
    pt_dis = np.abs(positions[:, None] - positions[None, :]).astype(float)
    if n > 1:  # a single residue has only the zero distance to itself
        pt_dis = pt_dis / (n - 1)

    # Lower triangle: geometric mean of the mirrored upper-triangle value for
    # property k and property k+1. np.roll gives the "next property" view and
    # wraps the last property to the first for any number of properties.
    next_prop = np.roll(mt_pro, -1, axis=2)
    geo_mean = (mt_pro * next_prop) ** .5
    lower = np.tril_indices(n, k=-1)
    mt_pro[lower] = np.transpose(geo_mean, (1, 0, 2))[lower]

    # Stack distance, BLOSUM62 and PAM250 in front of the property channels.
    mt = np.concatenate(
        (pt_dis[:, :, None], seq_blosum_nor[:, :, None], seq_pam_nor[:, :, None], mt_pro),
        axis=2,
    )

    # Zero-pad so every peptide in the data set yields the same shape.
    x = np.pad(mt, [(0, max_len - n), (0, max_len - n), (0, 0)])

    return x[:, :, :, None]

Model Structure

In [17]:
class Inception(tf.keras.layers.Layer):
    """Inception-style 3D block: three parallel Conv3D branches, concatenated.

    The same input is passed through Conv3D layers with kernel sizes
    (1,1,1), (3,3,3) and (5,5,5); each uses ``units`` filters, 'same'
    padding, ReLU activation and an L2 kernel regulariser. The three outputs
    are joined by a ``Concatenate`` layer.

    Parameters
    ----------
    units : int
        Number of filters in each branch.
    strides : int or tuple
        Stride passed to every branch.
    l2_lambda : float, optional
        L2 regularisation strength. Defaults to the module-level ``lamda``
        at construction time.
    **kwargs
        Forwarded to ``tf.keras.layers.Layer`` (e.g. ``name``), which lets a
        config produced by ``get_config`` be fed back into the constructor.
    """

    def __init__(self, units = 8, strides = 1, l2_lambda = None, **kwargs):
        super(Inception, self).__init__(**kwargs)
        if l2_lambda is None:
            l2_lambda = lamda  # module-level hyper-parameter
        # Constructor arguments are kept so get_config can report them.
        self.units = units
        self.strides = strides
        self.l2_lambda = float(l2_lambda)
        self.conv1 = tf.keras.layers.Conv3D(units, (1,1,1), padding='same', activation = 'relu', strides = strides,kernel_regularizer=tf.keras.regularizers.l2(self.l2_lambda))
        self.conv2 = tf.keras.layers.Conv3D(units, (3,3,3), padding='same', activation = 'relu', strides = strides,kernel_regularizer=tf.keras.regularizers.l2(self.l2_lambda))
        self.conv3 = tf.keras.layers.Conv3D(units, (5,5,5), padding='same', activation = 'relu', strides = strides,kernel_regularizer=tf.keras.regularizers.l2(self.l2_lambda))
        self.concat = tf.keras.layers.Concatenate()

    def call(self, inputs):
        """Apply the three branches to ``inputs`` and concatenate the results."""
        x1 = self.conv1(inputs)
        x2 = self.conv2(inputs)
        x3 = self.conv3(inputs)
        outputs = self.concat([x1, x2, x3])
        return outputs

    def get_config(self):
        """Return the constructor arguments merged into the base layer config.

        Only plain values are reported (not the sub-layer objects), so the
        result matches the ``__init__`` signature.
        """
        config = super(Inception, self).get_config()
        config.update({
            "units": self.units,
            "strides": self.strides,
            "l2_lambda": self.l2_lambda,
        })
        return config


In [18]:
class Model_1(tf.keras.Model):
    """Conv3D -> MaxPool3D -> Inception -> MaxPool3D -> Inception ->
    GlobalMaxPooling3D -> Dense(128) -> Dense(32) -> Dense(1).

    Filter counts (``kn_1``, ``kn_2``, ``kn_3``), the first kernel size
    (``ks_fir``) and the L2 strength (``lamda``) are module-level names
    looked up when the model is constructed.
    """

    def __init__(self):
        super().__init__()
        keras_layers = tf.keras.layers

        self.Cov_1 = keras_layers.Conv3D(
            kn_1,
            ks_fir,
            activation='relu',
            padding='same',
            kernel_regularizer=tf.keras.regularizers.l2(lamda),
        )
        self.MaxPool_1 = keras_layers.MaxPool3D()
        self.Cov_2 = Inception(units=kn_2, strides=1)
        self.MaxPool_2 = keras_layers.MaxPool3D()
        self.Cov_3 = Inception(units=kn_3, strides=1)
        self.GlobalMaxPool = keras_layers.GlobalMaxPooling3D()
        self.Dense_1 = keras_layers.Dense(128, activation='relu')
        self.Dense_2 = keras_layers.Dense(32, activation='relu')
        self.Dense_3 = keras_layers.Dense(1)

    def call(self, input):
        """Run ``input`` through every stage in order and return the final output."""
        stages = (
            self.Cov_1, self.MaxPool_1,
            self.Cov_2, self.MaxPool_2,
            self.Cov_3, self.GlobalMaxPool,
            self.Dense_1, self.Dense_2, self.Dense_3,
        )
        x = input
        for stage in stages:
            x = stage(x)
        return x

In [19]:
class Model_2(tf.keras.Model):
    """Conv3D -> MaxPool3D -> Inception -> MaxPool3D -> Inception ->
    GlobalMaxPooling3D -> Dense(512) -> Dense(64) -> Dense(1).

    Filter counts (``kn_1``, ``kn_2``, ``kn_3``), the first kernel size
    (``ks_fir``) and the L2 strength (``lamda``) are module-level names
    looked up when the model is constructed.
    """

    def __init__(self):
        super().__init__()
        keras_layers = tf.keras.layers

        self.Cov_1 = keras_layers.Conv3D(
            kn_1,
            ks_fir,
            activation='relu',
            padding='same',
            kernel_regularizer=tf.keras.regularizers.l2(lamda),
        )
        self.MaxPool_1 = keras_layers.MaxPool3D()
        self.Cov_2 = Inception(units=kn_2, strides=1)
        self.MaxPool_2 = keras_layers.MaxPool3D()
        self.Cov_3 = Inception(units=kn_3, strides=1)
        self.GlobalMaxPool = keras_layers.GlobalMaxPooling3D()
        self.Dense_1 = keras_layers.Dense(512, activation='relu')
        self.Dense_2 = keras_layers.Dense(64, activation='relu')
        self.Dense_3 = keras_layers.Dense(1)

    def call(self, input):
        """Run ``input`` through every stage in order and return the final output."""
        stages = (
            self.Cov_1, self.MaxPool_1,
            self.Cov_2, self.MaxPool_2,
            self.Cov_3, self.GlobalMaxPool,
            self.Dense_1, self.Dense_2, self.Dense_3,
        )
        x = input
        for stage in stages:
            x = stage(x)
        return x

In [20]:
class Model_3(tf.keras.Model):
    """Conv3D -> MaxPool3D -> Inception -> GlobalMaxPooling3D ->
    Dense(512) -> Dense(64) -> Dense(1).

    This variant has a single Inception stage: there is no second pooling
    layer and no second Inception block. ``kn_1``, ``kn_2``, ``ks_fir`` and
    ``lamda`` are module-level names looked up when the model is constructed.
    """

    def __init__(self):
        super().__init__()
        keras_layers = tf.keras.layers

        self.Cov_1 = keras_layers.Conv3D(
            kn_1,
            ks_fir,
            activation='relu',
            padding='same',
            kernel_regularizer=tf.keras.regularizers.l2(lamda),
        )
        self.MaxPool_1 = keras_layers.MaxPool3D()
        self.Cov_2 = Inception(units=kn_2, strides=1)
        self.GlobalMaxPool = keras_layers.GlobalMaxPooling3D()
        self.Dense_1 = keras_layers.Dense(512, activation='relu')
        self.Dense_2 = keras_layers.Dense(64, activation='relu')
        self.Dense_3 = keras_layers.Dense(1)

    def call(self, input):
        """Run ``input`` through every stage in order and return the final output."""
        stages = (
            self.Cov_1, self.MaxPool_1,
            self.Cov_2, self.GlobalMaxPool,
            self.Dense_1, self.Dense_2, self.Dense_3,
        )
        x = input
        for stage in stages:
            x = stage(x)
        return x

In [21]:
class Model_5(tf.keras.Model):
    """Conv3D -> MaxPool3D -> Conv3D(5,5,5) -> MaxPool3D -> Inception ->
    MaxPool3D -> Inception -> GlobalMaxPooling3D -> Dense(128) -> Dense(32)
    -> Dense(1).

    Both plain convolutions use ``kn_1`` filters; the Inception blocks use
    ``kn_2`` and ``kn_3``. These names, ``ks_fir`` and ``lamda`` are
    module-level values looked up when the model is constructed.
    """

    def __init__(self):
        super().__init__()
        keras_layers = tf.keras.layers

        # First plain convolution stage.
        self.Cov_1 = keras_layers.Conv3D(
            kn_1,
            ks_fir,
            activation='relu',
            padding='same',
            kernel_regularizer=tf.keras.regularizers.l2(lamda),
        )
        self.MaxPool_1 = keras_layers.MaxPool3D()

        # Second plain convolution stage with a fixed 5x5x5 kernel.
        self.Cov_2 = keras_layers.Conv3D(
            kn_1,
            (5,5,5),
            activation='relu',
            padding='same',
            kernel_regularizer=tf.keras.regularizers.l2(lamda),
        )
        self.MaxPool_2 = keras_layers.MaxPool3D()

        # Two Inception stages followed by the dense head.
        self.Cov_3 = Inception(units=kn_2, strides=1)
        self.MaxPool_3 = keras_layers.MaxPool3D()
        self.Cov_4 = Inception(units=kn_3, strides=1)
        self.GlobalMaxPool = keras_layers.GlobalMaxPooling3D()
        self.Dense_1 = keras_layers.Dense(128, activation='relu')
        self.Dense_2 = keras_layers.Dense(32, activation='relu')
        self.Dense_3 = keras_layers.Dense(1)

    def call(self, input):
        """Run ``input`` through every stage in order and return the final output."""
        stages = (
            self.Cov_1, self.MaxPool_1,
            self.Cov_2, self.MaxPool_2,
            self.Cov_3, self.MaxPool_3,
            self.Cov_4, self.GlobalMaxPool,
            self.Dense_1, self.Dense_2, self.Dense_3,
        )
        x = input
        for stage in stages:
            x = stage(x)
        return x

In [22]:
class Model_6(tf.keras.Model):
    """Conv3D -> MaxPool3D -> Inception -> GlobalMaxPooling3D ->
    Dense(128) -> Dense(32) -> Dense(1).

    This variant has a single Inception stage: there is no second plain
    convolution, no second pooling layer and no second Inception block.
    ``kn_1``, ``kn_2``, ``ks_fir`` and ``lamda`` are module-level names
    looked up when the model is constructed.
    """

    def __init__(self):
        super().__init__()
        keras_layers = tf.keras.layers

        self.Cov_1 = keras_layers.Conv3D(
            kn_1,
            ks_fir,
            activation='relu',
            padding='same',
            kernel_regularizer=tf.keras.regularizers.l2(lamda),
        )
        self.MaxPool_1 = keras_layers.MaxPool3D()
        self.Cov_2 = Inception(units=kn_2, strides=1)
        self.GlobalMaxPool = keras_layers.GlobalMaxPooling3D()
        self.Dense_1 = keras_layers.Dense(128, activation='relu')
        self.Dense_2 = keras_layers.Dense(32, activation='relu')
        self.Dense_3 = keras_layers.Dense(1)

    def call(self, input):
        """Run ``input`` through every stage in order and return the final output."""
        stages = (
            self.Cov_1, self.MaxPool_1,
            self.Cov_2, self.GlobalMaxPool,
            self.Dense_1, self.Dense_2, self.Dense_3,
        )
        x = input
        for stage in stages:
            x = stage(x)
        return x

In [23]:
class Model_7(tf.keras.Model):
    """Conv3D -> MaxPool3D -> Inception -> MaxPool3D -> Inception ->
    GlobalMaxPooling3D -> Dense(64) -> Dense(16) -> Dense(1).

    Filter counts (``kn_1``, ``kn_2``, ``kn_3``), the first kernel size
    (``ks_fir``) and the L2 strength (``lamda``) are module-level names
    looked up when the model is constructed.
    """

    def __init__(self):
        super().__init__()
        keras_layers = tf.keras.layers

        self.Cov_1 = keras_layers.Conv3D(
            kn_1,
            ks_fir,
            activation='relu',
            padding='same',
            kernel_regularizer=tf.keras.regularizers.l2(lamda),
        )
        self.MaxPool_1 = keras_layers.MaxPool3D()

        self.Cov_2 = Inception(units=kn_2, strides=1)
        self.MaxPool_2 = keras_layers.MaxPool3D()
        self.Cov_3 = Inception(units=kn_3, strides=1)
        self.GlobalMaxPool = keras_layers.GlobalMaxPooling3D()
        self.Dense_1 = keras_layers.Dense(64, activation='relu')
        self.Dense_2 = keras_layers.Dense(16, activation='relu')
        self.Dense_3 = keras_layers.Dense(1)

    def call(self, input):
        """Run ``input`` through every stage in order and return the final output."""
        stages = (
            self.Cov_1, self.MaxPool_1,
            self.Cov_2, self.MaxPool_2,
            self.Cov_3, self.GlobalMaxPool,
            self.Dense_1, self.Dense_2, self.Dense_3,
        )
        x = input
        for stage in stages:
            x = stage(x)
        return x

In [32]:
import ast  # used below to parse literal hyper-parameters without eval

Test_dir = '../../processed_data/mhcflurry_test_no_mass_spec/data'

middle_data_dir = 'prediction_mhcflurry_test_no_mass_spec/middle_data'
prediction_dir = 'prediction_mhcflurry_test_no_mass_spec/prediction'

# Create both output folders up front; without the prediction folder the
# final to_csv call fails on a fresh checkout.
for out_dir in (middle_data_dir, prediction_dir):
    os.makedirs(out_dir, exist_ok=True)

# One sub-folder of tf_model per allele. An allele is skipped unless its
# model info, its weights folder and its test CSV are all present.
for Allele in os.listdir('tf_model'):

    model_info_dir = os.path.join('model_info' ,Allele)
    tf_model_dir = os.path.join('tf_model' ,Allele)
    test_csv = f'{Test_dir}/{Allele}.csv'

    if not os.path.exists(model_info_dir):
        continue
    if not os.path.exists(tf_model_dir):
        continue
    if not os.path.exists(test_csv):
        continue
    df_test = pd.read_csv(test_csv)

    print(f'*************{Allele} predict start**************')

    # Individual alleles may need a different value: if an error reports a
    # peptide longer than `max_seq_len`, raise it accordingly and delete the
    # cached middle data of that allele so it is regenerated.
    # get_3d_feat reads this module-level name.
    max_seq_len = 15

    # Generate (or reuse) the cached X_test feature tensor.
    X_test_name = os.path.join(middle_data_dir, f'{Allele}_X_test_'+'.data')
    if not os.path.exists(X_test_name) :
        X_test = np.stack([get_3d_feat(seq) for seq in df_test['peptide']])
        dump(X_test, X_test_name)
    else:
        X_test = load(X_test_name)
    X_test = X_test.astype('float32')

    # Read the hyper-parameters recorded for this allele. The model classes
    # and Inception pick up kn_1/kn_2/kn_3, ks_fir and lamda as globals.
    df_ref = pd.read_csv(f'{model_info_dir}/{Allele}_model_info.csv')
    inc = df_ref['kernel_size_incept'][0]
    bs = df_ref['batch_size'][0]
    lr = df_ref['lr'][0]
    # The kernel size and filter counts are stored as Python literals in the
    # CSV; literal_eval parses them without executing arbitrary code.
    ks_fir = ast.literal_eval(df_ref['kernel_size_1'][0])
    kn = ast.literal_eval(df_ref['kernel_number'][0])
    kn_1,kn_2,kn_3 = kn[0],kn[1],kn[2]
    lamda = df_ref['lamda'][0]
    md = df_ref['model'][0]

    # Resolve the model class by name instead of evaluating a code string.
    model_index = md.split('_')[1]
    model_name = 'Model_' + model_index
    model_cls = globals().get(model_name)
    if model_cls is None:
        raise NameError(f'{Allele}: model class {model_name!r} is not defined')
    model = model_cls()

    # Build with the shape of the features already in memory (no second
    # read of the cache file), then restore the trained weights.
    model.build(X_test.shape)
    model.load_weights(f'{tf_model_dir}/{Allele}_model_weights.h5')

    # Make predictions and write them next to the original test columns.
    Y_test_pred = model.predict(X_test)
    df_pred = pd.DataFrame(Y_test_pred.tolist()).rename(columns={0:'VRAPERNet_BAV_Normalized'})
    df_test['VRAPERNet_BAV_Normalized'] = df_pred['VRAPERNet_BAV_Normalized']
    df_test['VRAPERNet_BAV'] = df_pred['VRAPERNet_BAV_Normalized'].apply(to_ic50)
    df_test.to_csv(f'{prediction_dir}/{Allele}.csv', index=False)

    # Release this allele's model and Keras global state before the next
    # one is built, so memory does not accumulate across alleles.
    del model
    tf.keras.backend.clear_session()
    gc.collect()

    print(f'*************{Allele} predict finished**************')

gc.collect()


*************HLA-B_2705 predict start**************
*************HLA-B_2705 predict finished**************
*************HLA-B_1517 predict start**************
*************HLA-B_1517 predict finished**************
*************HLA-B_2702 predict start**************
*************HLA-B_2702 predict finished**************
*************HLA-B_0802 predict start**************
*************HLA-B_0802 predict finished**************
*************HLA-A_0212 predict start**************
*************HLA-A_0212 predict finished**************
*************HLA-A_1101 predict start**************
*************HLA-A_1101 predict finished**************
*************HLA-B_2703 predict start**************
*************HLA-B_2703 predict finished**************
*************HLA-B_2704 predict start**************
*************HLA-B_2704 predict finished**************
*************HLA-C_1203 predict start**************
*************HLA-C_1203 predict finished**************
*************HLA-A_3001 predict start

6696