In [31]:
import os, warnings
# Silence every Python warning for the remainder of the notebook.
warnings.filterwarnings('ignore')
# NVIDIA SETTINGS
# Please configure according to the situation of your own device.
# Both environment variables are set before TensorFlow is imported below.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # presumably hides TensorFlow's native log output -- TODO confirm level semantics
os.environ['CUDA_VISIBLE_DEVICES'] = '2'  # GPU selection; adjust to the devices present on your machine

import tensorflow as tf
import gc

# Turn on memory growth for every GPU that TensorFlow reports.
gpus = tf.config.experimental.list_physical_devices(device_type='GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

import pandas as pd
import numpy as np

from joblib import load, dump

def r2_score(y_true,y_pred):
    """Coefficient of determination (R^2) of predictions against observations.

    Computes ``1 - SS_res / SS_tot`` with
    ``SS_res = sum((y_true - y_pred) ** 2)`` and
    ``SS_tot = sum((mean(y_true) - y_true) ** 2)``.

    Parameters
    ----------
    y_true : array-like
        Observed values. Lists, tuples, ndarrays and pandas Series are
        accepted; values are converted to a float ndarray, so pandas inputs
        are compared positionally rather than by index label.
    y_pred : array-like
        Predicted values, broadcastable against ``y_true``.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        A scalar for 1-D inputs. For N-D inputs the sums run over the first
        axis only (the mean is still taken over all of ``y_true``), which is
        what the previous builtin-``sum`` implementation did for ndarrays.
        A constant ``y_true`` gives ``SS_tot == 0`` and therefore a
        non-finite result.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    y_mean = np.mean(y_true)
    # axis=0 reproduces builtin sum() over an ndarray (reduce the first axis)
    # while staying vectorised.
    ss_res = np.sum((y_true - y_pred) ** 2, axis=0)
    ss_tot = np.sum((y_mean - y_true) ** 2, axis=0)
    return 1 - ss_res / ss_tot

def PCC(y_pred,y_true):
    """Pearson correlation coefficient between ``y_pred`` and ``y_true``.

    Both inputs are centred on their own mean; the result is the sum of the
    products of the centred values divided by the square root of the product
    of their sums of squares. If either input has zero variance the
    denominator is zero and the result is not finite.
    """
    pred_centered = y_pred - np.mean(y_pred)
    true_centered = y_true - np.mean(y_true)
    cross_term = np.sum(pred_centered * true_centered)
    pred_sq_sum = np.sum(pred_centered ** 2)
    true_sq_sum = np.sum(true_centered ** 2)
    return cross_term / np.sqrt(pred_sq_sum * true_sq_sum)

def to_ic50(x, max_ic50=50000.0):
    """Map a normalized value ``x`` to ``max_ic50 ** (1 - x)``.

    ``x == 1`` yields 1 and ``x == 0`` yields ``max_ic50``. Works on scalars
    and on anything that supports ``1.0 - x`` (e.g. numpy arrays).
    The output is presumably an IC50 in nM -- confirm against the training
    pipeline.
    """
    exponent = 1.0 - x
    return max_ic50 ** exponent

# Module-level settings. These lines previously ended with trailing commas
# (apparently pasted from a function signature), which made `sample_weight`
# the tuple (None,) and `threshold_nm` the tuple (500,) instead of scalars.
sample_weight = None
threshold_nm = 500
max_ic50 = 50000

In [32]:
# Amino-acid substitution matrices.
# The separator is a regular expression, so it is written as a raw string
# ('\s' in a normal string literal is an invalid escape sequence) and the
# python parser engine is requested explicitly: pandas falls back to it for
# regex separators anyway, but emits a ParserWarning when doing so implicitly.
# NOTE(review): r'\s' splits on every single whitespace character; if the file
# is aligned with runs of spaces, r'\s+' may be what was intended -- confirm
# against BLOSUM62.txt before changing it.
blosum62 = pd.read_csv('../blosum_pam_data/BLOSUM62.txt', sep=r'\s', engine='python')
# Drop the last four rows and the last four columns of the parsed table.
blosum62 = blosum62.iloc[:-4,:-4]

pam250 = pd.read_csv('../blosum_pam_data/PAM250.csv',index_col=0)

In [33]:
# Names of the selected properties: only the 'properties' index of each CSV is kept.
PR_31 = pd.read_csv('../PP_PR_vector/PR_31.csv', index_col='properties').index
PR_53 = pd.read_csv('../PP_PR_vector/PR_53.csv', index_col='properties').index

# Full property table; rows are looked up by the property names above.
PP_dic = pd.read_csv('../PP_PR_vector/PP_740.csv', header=0,index_col=0)
PR_31_vectors = PP_dic.loc[PR_31].astype('float')
PR_53_vectors = PP_dic.loc[PR_53].astype('float')
PR_vector_dic = dict(PR_31_vectors=PR_31_vectors, PR_53_vectors=PR_53_vectors)

# Placeholder; the active property table is assigned later, before features are built.
PR_vectors = None

# Min-max scale every row (one property per row) to [0, 1] and store the
# scaled table back under the same key.
for PR_name in list(PR_vector_dic):
    PR = PR_vector_dic[PR_name]
    row_min = PR.min(axis=1)
    row_span = PR.max(axis=1) - row_min
    PR = PR.sub(row_min, axis=0).div(row_span, axis=0)
    PR_vector_dic[PR_name] = PR

In [34]:
def get_3d_feat(seq, pr_vectors=None, max_len=None):
    """Encode a peptide as a zero-padded pairwise-property tensor.

    For a peptide of length ``n`` and a property table with ``K`` rows, an
    ``(n, n, K)`` array is built in which

    * the diagonal and upper triangle (``i <= j``) hold
      ``(v_i[k] * v_j[k]) ** 0.25``, where ``v_i`` is the property vector of
      the residue at position ``i``;
    * the strict lower triangle (``i > j``) holds the geometric mean of the
      mirrored upper-triangle entry in channel ``k`` and in a partner channel:
      ``k + 1`` when ``k < 30``, otherwise channel ``0``.

    The array is then zero-padded to ``(max_len, max_len, K)`` and a trailing
    singleton axis is appended.

    Parameters
    ----------
    seq : str
        Peptide sequence; every character must be a column of ``pr_vectors``.
    pr_vectors : pandas.DataFrame, optional
        Property table (rows = properties, columns = amino acids). Defaults to
        the notebook-level global ``PR_vectors``.
    max_len : int, optional
        Padded sequence length. Defaults to the notebook-level global
        ``max_seq_len``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(max_len, max_len, K, 1)``.

    Raises
    ------
    ValueError
        If ``len(seq)`` exceeds ``max_len``.
    """
    # Fall back to the notebook globals so existing `get_3d_feat(seq)` calls
    # keep working unchanged.
    if pr_vectors is None:
        pr_vectors = PR_vectors
    if max_len is None:
        max_len = max_seq_len

    n = len(seq)
    if n > max_len:
        raise ValueError(
            f'peptide {seq!r} has length {n}, which exceeds max_seq_len={max_len}; '
            'increase max_seq_len and regenerate the cached middle data'
        )

    # (K, n) property matrix of the peptide, lifted to (K, n, 1).
    seq_pro = pd.DataFrame(pr_vectors[aa] for aa in seq).T
    seq_pro = seq_pro.values[:, :, None]
    # (K, n, 1) * (K, 1, n) -> (K, n, n), rearranged to (n, n, K); fourth root
    # of the pairwise products.
    x_i = np.transpose(seq_pro * np.transpose(seq_pro, [0, 2, 1]), [1, 2, 0]) ** .25

    if n > 1:
        # Partner channel for the lower-triangle mix.
        # NOTE(review): the threshold 30 is hard-coded, so it wraps to channel 0
        # only at the last channel when K == 31; for larger tables (e.g. 53
        # properties) every channel >= 30 is paired with channel 0. Kept as-is
        # because saved models were presumably trained on exactly these
        # features -- confirm before changing.
        channels = np.arange(x_i.shape[2])
        partner = np.where(channels < 30, channels + 1, 0)
        # The upper triangle is only read and the lower triangle only written,
        # so the whole fill can be done in one vectorised step.
        rows, cols = np.tril_indices(n, k=-1)
        upper = x_i[cols, rows, :]
        x_i[rows, cols, :] = (upper * upper[:, partner]) ** .5

    # Zero-pad so every peptide in a data set yields the same shape.
    x = np.pad(x_i, [(0, max_len - n), (0, max_len - n), (0, 0)])

    return x[:, :, :, None]

In [35]:
# Alleles to process; each name is used to locate a model folder and a test CSV.
allele_ls = ['HLA-B_0702', 'HLA-C_0401', 'HLA-C_0702']

In [38]:
Test_dir = '../../processed_data/mhcflurry_test_no_mass_spec/data'

middle_data_dir = 'PP_diversity_influence/middle_data'
prediction_dir = 'PP_diversity_influence/prediction'

# Padded peptide length read by get_3d_feat.
# Individual alleles may prompt the use of other numerical values;
# please adjust `max_seq_len` according to the prompt information,
# and delete the middle data of the corresponding allele to regenerate it.
max_seq_len = 15

os.makedirs(middle_data_dir, exist_ok=True)
for PR_name, PR in PR_vector_dic.items():
    # get_3d_feat reads the active property table from this global.
    PR_vectors = PR
    for Allele in allele_ls:
        tf_model_dir = os.path.join(f'PP_diversity_influence/tf_model/{PR_name}', Allele)
        test_csv = f'{Test_dir}/{Allele}.csv'
        # Skip combinations without a trained model or without test data.
        if not os.path.exists(tf_model_dir):
            continue
        if not os.path.exists(test_csv):
            continue
        df_test = pd.read_csv(test_csv)

        print(f'*************{PR_name} {Allele} predict start**************')

        # Generate X_test middle data (cached on disk per PR set and allele).
        # The per-PR cache folder is created here; previously only
        # `middle_data_dir` itself existed, so the first dump() failed.
        os.makedirs(os.path.join(middle_data_dir, PR_name), exist_ok=True)
        X_test_name = os.path.join(middle_data_dir, PR_name, f'{Allele}_X_test_'+'.data')
        if not os.path.exists(X_test_name):
            X_test = np.stack([get_3d_feat(seq) for seq in df_test['peptide']])
            dump(X_test, X_test_name)
        else:
            # joblib.load unpickles the file: only load caches you generated yourself.
            X_test = load(X_test_name)
        X_test = X_test.astype('float32')

        # A cache built from a different version of the test CSV would be
        # silently misaligned with df_test; fail loudly instead.
        if len(X_test) != len(df_test):
            raise ValueError(
                f'{X_test_name} holds {len(X_test)} samples but {test_csv} has '
                f'{len(df_test)} rows; delete the cached file to regenerate it'
            )

        model = tf.keras.models.load_model(tf_model_dir)

        # make prediction
        Y_test_pred = model.predict(X_test)
        # Keep the first output column as float64, matching the values the
        # previous `.tolist()` round trip produced.
        pred_norm = np.asarray(Y_test_pred, dtype='float64').reshape(len(df_test), -1)[:, 0]
        df_test['VRAPERNet_BAV_Normalized'] = pred_norm
        df_test['VRAPERNet_BAV'] = df_test['VRAPERNet_BAV_Normalized'].apply(to_ic50)

        # The output folder is not guaranteed to exist on a fresh checkout.
        os.makedirs(f'{prediction_dir}/{PR_name}', exist_ok=True)
        df_test.to_csv(f'{prediction_dir}/{PR_name}/{Allele}.csv', index=False)

        # Release the model and Keras' global state before loading the next one.
        del model
        tf.keras.backend.clear_session()

        print(f'*************{PR_name} {Allele} predict finished**************')

gc.collect()


*************PR_31_vectors HLA-B_0702 predict start**************
*************PR_31_vectors HLA-B_0702 predict finished**************
*************PR_31_vectors HLA-C_0401 predict start**************
*************PR_31_vectors HLA-C_0401 predict finished**************
*************PR_31_vectors HLA-C_0702 predict start**************
*************PR_31_vectors HLA-C_0702 predict finished**************
*************PR_53_vectors HLA-B_0702 predict start**************
*************PR_53_vectors HLA-B_0702 predict finished**************
*************PR_53_vectors HLA-C_0401 predict start**************
*************PR_53_vectors HLA-C_0401 predict finished**************
*************PR_53_vectors HLA-C_0702 predict start**************
*************PR_53_vectors HLA-C_0702 predict finished**************


27383