In [18]:
from __future__ import division, print_function, absolute_import
import tensorflow as tf
import numpy as np
import math, os, time, sys, re, datetime
from datetime import timedelta
from sklearn.metrics import roc_auc_score, roc_curve, auc
from scipy import stats

print("Tensorflow version " + tf.__version__)
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

#testset = 'A0202'
#INPUTS: python DLMHC.py 0 A0202
#testset = sys.argv[2] # e.g. takes A0202 as input allele name
testset = "A0101"

print("Test set is ", testset)
runindx = 0 

###all the possible sequence letters
allSequences = 'ACEDGFIHKMLNQPSRTWVYZ'
# Establish a mapping from letters to integers
char2int = dict((c, i) for i, c in enumerate(allSequences))

Tensorflow version 2.7.0
Num GPUs Available:  1
Test set is  A0101


In [19]:
def getdata_onehot(datafile):   #build testing matrix
    ### READ in test dataset
    """ Reads the test data file and extracts allele subtype,
            peptide length, and measurement type. Returns these information
            along with the peptide sequence and target values.
    """
    #train数据载入
    import os
    traindata = os.path.join("./../DATA", "train_data", datafile )
    print("traindata name: ", traindata)
    df_train = pd.read_csv(traindata, header=0)
    df_train = df_train[df_train.Peptide.str.contains('X') == False]
    df_train = df_train[df_train.Peptide.str.contains('B') == False]
    df_train = df_train[df_train.Peptide.str.contains('U') == False]
    #eg.df_train = pd.read_csv('./DATA/train_data/A0202',sep="\t")
    
    #下采样
    new_df_0 = df_train.loc[df_train['BindingCategory']== 0].sample(frac = 1)
    #上采样
    df_1_list = []
    for i in range(4):
        df_1_list.append(df_train.loc[df_train['BindingCategory']== 1])
        new_df_1 = pd.concat(df_1_list)
    new_df_train = pd.concat([new_df_0,new_df_1])
    new_df_train = new_df_train.sample(frac = 1.0) #shuffle


    #X_train--补齐11mer--one_hot_matrix
    train_data=transformEL(new_df_train)
    trainMatrix = np.empty((0, 11,len(allSequences)), int)      
    for num in range(len(train_data.Peptide)):
        if num%1000 == 0:
            print(train_data.Peptide.iloc[num],num)
        trainMatrix = np.append(trainMatrix, [Pept_OneHotMap(train_data.Peptide.iloc[num])], axis=0)
    allele_name = train_data['HLA'][0]
    assert (trainMatrix.shape[0] == train_data.shape[0])

    #test数据载入
    testdata = os.path.join("./../DATA", "test_data", datafile )
    df_test = pd.read_csv(testdata, header=0)
    df_test = df_test[df_test.Peptide.str.contains('X') == False]
    df_test = df_test[df_test.Peptide.str.contains('B') == False]
    df_test = df_test[df_test.Peptide.str.contains('U') == False]
    #eg.df_test = pd.read_csv('./DATA/test_data/A0202',sep="\t")

    #X_test--补齐11mer--one_hot_matrix
    test_data=transformEL(df_test)
    testMatrix = np.empty((0, 11,len(allSequences)), int)      
    for num in range(len(test_data.Peptide)):
        if num%1000 == 0:
            print(test_data.Peptide.iloc[num],num)
        testMatrix = np.append(testMatrix, [Pept_OneHotMap(test_data.Peptide.iloc[num])], axis=0)
    assert (testMatrix.shape[0] == test_data.shape[0])

    Y_train = train_data.BindingCategory
    Y_test = test_data.BindingCategory 
    #
    Y_train = Y_train.reset_index(drop=True)
    Y_test = Y_test.reset_index(drop=True)
    #
    trainlen = len(trainMatrix)
    ss1 = list(range(trainlen))
    rnd.shuffle(ss1)
    #
    valsize= int(1000) #Validation set size is 100 for validations dataset
    X_val1 = trainMatrix[ss1[0:valsize]]
    Y_val1 = Y_train.iloc[ss1[0:valsize]]
    X_val2 = trainMatrix[ss1[valsize:(2*valsize)]]
    Y_val2 = Y_train.iloc[ss1[valsize:(2*valsize)]]
    X_val3 = trainMatrix[ss1[(2*valsize):(3*valsize)]]
    Y_val3 = Y_train.iloc[ss1[(2*valsize):(3*valsize)]]    

    trainMatrix = np.delete(trainMatrix,ss1[0:(3*valsize)], axis=0)
    Y_train = Y_train.drop(Y_train.index[ss1[0:(3*valsize)]])
    # combine training and test datasets
    datasets={}
    datasets['X_train'] = trainMatrix
    datasets['Y_train'] = Y_train.values #traindata.BindingCategory.as_matrix()
    datasets['X_test'] = testMatrix
    datasets['Y_test'] = Y_test.values
    datasets['X_val1'] = X_val1
    datasets['Y_val1'] = Y_val1.values
    datasets['X_val2'] = X_val2
    datasets['Y_val2'] = Y_val2.values
    datasets['X_val3'] = X_val3
    datasets['Y_val3'] = Y_val3.values

    return datasets

def Pept_OneHotMap(peptideSeq):
    """ maps amino acid into its numerical index
    USAGE
    Pept_OneHotMap('A')
    array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
    """
    # integer encode input data
    integer_encoded=[char2int[char] for char in peptideSeq]
    # one hot encode
    onehot_encoded = list()
    for value in integer_encoded:
    	letter = [0 for _ in range(len(allSequences))]
    	letter[value] = 1
    	onehot_encoded.append(letter)
    return np.asarray(onehot_encoded)

def transformEL(dataset):
    dataset = dataset.reset_index(drop=True)
    peptide=dataset.Peptide
    peptide2list=peptide.tolist()
    for i in range(len(peptide)):
        if len(peptide2list[i]) < 11:
            n1 = len(peptide2list[i]) // 2
            n2 = 11 - len(peptide2list[i])
            peptide2list[i] = peptide2list[i][:n1] + 'Z'*n2 + peptide2list[i][n1:]     #将小于11个氨基酸的peptide在中间插空补齐
        else:
            peptide2list[i] = peptide2list[i][:11]

    del dataset['Peptide']
    peptides = pd.DataFrame(peptide2list,columns=['Peptide'])
    dataset.insert(0,'Peptide',peptides.pop('Peptide'))
    return dataset

In [20]:
data=getdata_onehot(datafile=testset)

traindata name:  ./../DATA/train_data/A0101
RENATLTELRV 0
FLDAZZFRTFY 1000
LGSMZZTQKSD 2000
PGDPZZRGGRE 3000
EEQHZZEVSVN 4000
QTDSKMLLGMY 5000
DSSGLZLSCSE 6000
LCRKZZELRQI 7000
LLDTZZTEKYL 8000
KHSMILLENQI 9000
PLLGDILHQGT 10000
ALGDGALFYFG 11000
ASEAZZFVQKY 12000
ACTFIZELQDS 13000
RKTLQZKELKI 14000
NGEEZZKSPPN 15000
DYILSWYGNLS 16000
SLFCZZVLSGL 17000
PLAFRZDATSA 18000
ESDSSLNSNVY 19000
LQKQLZDEANR 20000
LSQAZZDFLVC 21000
EDIREEVDTFM 22000
IPAFZZRGYDQ 23000
RSITEZEVSEY 24000
YTETZZVNHHY 25000
KVENZZTRWDY 26000
IGDTZZPIDTF 27000
SATEMIWAVLA 28000
FLDKSZGLQGY 29000
AADTVZLATLY 30000
FIIPNZVVKYS 31000
SRNMIYITCHL 32000
NTEKZZAVECY 33000
WLSDIZPLVQP 34000
KISKZZADIGA 35000
GEARZZVRLKG 36000
TVQTZZTLESL 37000
PAAEPVPHLQT 38000
LPGRDDNSYMY 39000
VEDPQZGKNTV 40000
ARSVPSIAAAT 41000
QGRYENGCAYF 42000
QSNMZZMAATL 43000
QSDMGZREAAY 44000
TSDTGZEGTVV 45000
LTDDZZLHKQY 46000
DLPDPZQWKSS 47000
DLDFZZARQYY 48000
FSQKZZSTLVY 49000
ENTNZZIGSLF 50000
LLDPKZYGMFR 51000
KGYNSKFNLDM 52000
ESKQKZFVLKT 530

AttributeError: 'DataFrame' object has no attribute 'Peptide'

In [21]:
print(data['X_train'])

NameError: name 'data' is not defined