In [4]:
from __future__ import division, print_function, absolute_import
import tensorflow as tf
import numpy as np
import pandas as pd
import random as rnd
import math, os, time, sys, re, datetime
from datetime import timedelta
from sklearn.metrics import roc_auc_score, roc_curve, auc
from scipy import stats

print("Tensorflow version " + tf.__version__)
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

#testset = 'A0202'
#INPUTS: python DLMHC.py 0 A0202
#testset = sys.argv[2] # e.g. takes A0202 as input allele name
testset = "A0101"

print("Test set is ", testset)
runindx = 0 

###all the possible sequence letters
allSequences = 'ACEDGFIHKMLNQPSRTWVYZ'
# Establish a mapping from letters to integers
char2int = dict((c, i) for i, c in enumerate(allSequences))

Tensorflow version 2.7.0
Num GPUs Available:  1
Test set is  A0101


In [9]:
def getdata_onehot(datafile):   #build testing matrix
    ### READ in test dataset
    """ Reads the test data file and extracts allele subtype,
            peptide length, and measurement type. Returns these information
            along with the peptide sequence and target values.
    """
    #train数据载入
    import os
    traindata = os.path.join("./../DATA", "train_data", datafile )
    print("traindata name: ", traindata)
    df_train = pd.read_csv(traindata, header=0)
    df_train = df_train[df_train.Peptide.str.contains('X') == False]
    df_train = df_train[df_train.Peptide.str.contains('B') == False]
    df_train = df_train[df_train.Peptide.str.contains('U') == False]
    #eg.df_train = pd.read_csv('./DATA/train_data/A0202',sep="\t")
    
    #下采样
    new_df_0 = df_train.loc[df_train['BindingCategory']== 0].sample(frac = 1)
    #上采样
    df_1_list = []
    for i in range(4):
        df_1_list.append(df_train.loc[df_train['BindingCategory']== 1])
        new_df_1 = pd.concat(df_1_list)
    new_df_train = pd.concat([new_df_0,new_df_1])
    new_df_train = new_df_train.sample(frac = 1.0) #shuffle


    #X_train--补齐11mer--one_hot_matrix
    train_data=transformEL(new_df_train)
    trainMatrix = np.empty((0, 11,len(allSequences)), int)      
    for num in range(len(train_data.Peptide)):
        if num%1000 == 0:
            print(train_data.Peptide.iloc[num],num)
        trainMatrix = np.append(trainMatrix, [Pept_OneHotMap(train_data.Peptide.iloc[num])], axis=0)
    allele_name = train_data['HLA'][0]
    assert (trainMatrix.shape[0] == train_data.shape[0])

    #test数据载入
    testdata = os.path.join("./../DATA", "test_data", datafile )
    df_test = pd.read_csv(testdata, header=0)
    df_test = df_test[df_test.Peptide.str.contains('X') == False]
    df_test = df_test[df_test.Peptide.str.contains('B') == False]
    df_test = df_test[df_test.Peptide.str.contains('U') == False]
    #eg.df_test = pd.read_csv('./DATA/test_data/A0202',sep="\t")

    #X_test--补齐11mer--one_hot_matrix
    test_data=transformEL(df_test)
    testMatrix = np.empty((0, 11,len(allSequences)), int)      
    for num in range(len(test_data.Peptide)):
        if num%1000 == 0:
            print(test_data.Peptide.iloc[num],num)
        testMatrix = np.append(testMatrix, [Pept_OneHotMap(test_data.Peptide.iloc[num])], axis=0)
    assert (testMatrix.shape[0] == test_data.shape[0])

    Y_train = train_data.BindingCategory
    Y_test = test_data.BindingCategory 
    #
    Y_train = Y_train.reset_index(drop=True)
    Y_test = Y_test.reset_index(drop=True)
    #
    trainlen = len(trainMatrix)
    ss1 = list(range(trainlen))
    rnd.shuffle(ss1)    #
    
    # combine training and test datasets
    datasets={}
    datasets['X_train'] = trainMatrix
    datasets['Y_train'] = Y_train.values #traindata.BindingCategory.as_matrix()
    datasets['X_test'] = testMatrix
    datasets['Y_test'] = Y_test.values    

    return datasets

def Pept_OneHotMap(peptideSeq):
    """ maps amino acid into its numerical index
    USAGE
    Pept_OneHotMap('A')
    array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
    """
    # integer encode input data
    integer_encoded=[char2int[char] for char in peptideSeq]
    # one hot encode
    onehot_encoded = list()
    for value in integer_encoded:
    	letter = [0 for _ in range(len(allSequences))]
    	letter[value] = 1
    	onehot_encoded.append(letter)
    return np.asarray(onehot_encoded)

def transformEL(dataset):
    dataset = dataset.reset_index(drop=True)
    peptide=dataset.Peptide
    peptide2list=peptide.tolist()
    for i in range(len(peptide)):
        if len(peptide2list[i]) < 11:
            n1 = len(peptide2list[i]) // 2
            n2 = 11 - len(peptide2list[i])
            peptide2list[i] = peptide2list[i][:n1] + 'Z'*n2 + peptide2list[i][n1:]     #将小于11个氨基酸的peptide在中间插空补齐
        else:
            peptide2list[i] = peptide2list[i][:11]

    del dataset['Peptide']
    peptides = pd.DataFrame(peptide2list,columns=['Peptide'])
    dataset.insert(0,'Peptide',peptides.pop('Peptide'))
    return dataset

In [10]:
data=getdata_onehot(datafile=testset)

traindata name:  ./../DATA/train_data/A0101
ASDLZZSQPDL 0
KQTWZZETCYA 1000
KPAPZZSGLPS 2000
AVSIDZVTQGD 3000
RDASZZRLEPS 4000
LVDDSZEDPGA 5000
WPTMZZENLLQ 6000
IANAZZISVVS 7000
PPIGDVEYLTA 8000
MLHEZZLDGLI 9000
IADSNZYNWFY 10000
FCFGSSKQRAF 11000
LRDQZZAAALA 12000
ILDEZZRHDNY 13000
VSVGIZKCDAR 14000
YTTGZZHWDNY 15000
IADSNZYNWFY 16000
GLYFZZERRRP 17000
LQQPWZDPQMP 18000
SPPQPZPNDHS 19000
DIPRELIGKPL 20000
MPRAQSYPDNH 21000
RWELEZRLEEE 22000
KSDGZZSFIGY 23000
GLDLSQAAART 24000
YLGGKZQYKCD 25000
ISDAAQLPHDY 26000
VGPVZZPPKPK 27000
PNPSRZSPCLP 28000
SVDHRZGTWNG 29000
TIKVEZKPTMQ 30000
YFGGEARCDAE 31000
STNGNYDGVLY 32000
ETNLVZGSDKY 33000
ILLQZZHEATP 34000
ETEESZNLNMY 35000
SLVPPAAGSKQ 36000
GSGLYZDADSE 37000
ATEQAZPLWAY 38000
VFKVAELSGNR 39000
ISKIZZESEAF 40000
PGKPGZYGSPG 41000
LWLVSPLLEVQ 42000
QYGEZZVANLL 43000
IGLNMZLFGPK 44000
SEFPFVSLKEP 45000
PVPGPZNGTIL 46000
DFGNZZSPLHR 47000
PDKTVIEYEYD 48000
FIDRDGPLFRY 49000
NRGDRPPPPVL 50000
KVSNVZEGILA 51000
IYRCFASNKLG 52000
VKMGZZERVRI 530

NameError: name 'rnd' is not defined

In [21]:
def binary2onehot(yy):
    yy2= np.zeros((len(yy),2), dtype=int) #yy2.shape #(10547, 2)
    for num in range(len(yy)):
        if yy[num]==1:
            yy2[num,0]=1
        else:
            yy2[num,1]=1
    return yy2

NameError: name 'data' is not defined

In [None]:
shuffle_ = np.arange(len(data['Y_train']))
np.random.shuffle(shuffle_)
data['Y_train']=data['Y_train'][shuffle_]
data['X_train']=data['X_train'][shuffle_]

print("X_Train size ", data['X_train'].shape)
print("Y_Train size ", data['Y_train'].shape)
print("Train data value=1 ", np.sum(data['Y_train']==1))
print("X_Test size " , data['X_test'].shape)
print("Y_Test size " , data['Y_test'].shape)
print("Test data value=1 ", np.sum(data['Y_test']==1))


Y_train_labels = data_process.binary2onehot(data['Y_train']) # binary output converted into two classes
Y_test_labels = data_process.binary2onehot(data['Y_test'])
X_train_data = data['X_train']  #already one hot encoded
X_test_data = data['X_test']