In [1]:
import numpy as np
import pandas as pd
import random as rnd
from sklearn.model_selection import train_test_split
import os

In [2]:
# Insert "Z" padding so that all peptide sequences end up with the same length
def transformEL(dataset, max_len=11, pad_char='Z'):
    """Return a copy of ``dataset`` in which every peptide has the same length.

    Peptides shorter than ``max_len`` are padded in the middle: the padding
    is inserted after the first ``len(peptide) // 2`` residues. Peptides with
    ``max_len`` or more residues are cut down to their first ``max_len``
    residues.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table with a ``Peptide`` column holding the sequences as strings.
    max_len : int, optional
        Length every peptide has after the transformation (default 11).
    pad_char : str, optional
        Single character used as padding (default ``'Z'``).

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh ``0..n-1`` index in which the resized
        ``Peptide`` column is the first column. The frame passed in is left
        unmodified.

    Raises
    ------
    ValueError
        If ``max_len`` is negative.
    """
    if max_len < 0:
        raise ValueError("max_len must be non-negative")

    # reset_index returns a new frame, so the column surgery below never
    # touches the caller's object.
    dataset = dataset.reset_index(drop=True)

    def _fit(peptide):
        # Resize a single peptide to exactly max_len characters.
        missing = max_len - len(peptide)
        if missing <= 0:
            return peptide[:max_len]
        # Peptides that are too short are filled up in the middle.
        middle = len(peptide) // 2
        return peptide[:middle] + pad_char * missing + peptide[middle:]

    resized = pd.DataFrame(
        [_fit(peptide) for peptide in dataset.Peptide.tolist()],
        columns=['Peptide'],
    )

    # Swap the original column for the resized one, placed first.
    del dataset['Peptide']
    dataset.insert(0, 'Peptide', resized['Peptide'])
    return dataset

def Pept_OneHotMap(peptideSeq, alphabet=None):
    """One-hot encode a peptide sequence.

    Every residue becomes one row that is all zeros except for a single 1 in
    the column given by the residue's position in the alphabet.

    Parameters
    ----------
    peptideSeq : str
        Sequence to encode, one letter per residue.
    alphabet : str, optional
        Letters that may occur in ``peptideSeq``; a letter's position is its
        column. When omitted, the notebook-level ``allSequences`` string and
        its ``char2int`` lookup table are used.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(len(peptideSeq), len(alphabet))``. An empty
        sequence yields an array with zero rows.

    Raises
    ------
    KeyError
        If ``peptideSeq`` contains a letter that is not in the alphabet.

    USAGE
    Pept_OneHotMap('A')
    array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
    """
    if alphabet is None:
        # Fall back to the globals defined at notebook level.
        width = len(allSequences)
        lookup = char2int
    else:
        width = len(alphabet)
        lookup = {letter: code for code, letter in enumerate(alphabet)}

    # Start from all zeros and switch on one column per residue.
    onehot_encoded = np.zeros((len(peptideSeq), width), dtype=int)
    for row, letter in enumerate(peptideSeq):
        onehot_encoded[row, lookup[letter]] = 1
    return onehot_encoded

In [4]:
##################################################################
# Alphabet of sequence letters accepted by the encoder; a letter's position
# in this string is its integer code.
allSequences = 'ACEDGFIHKMLNQPSRTWVYZ'
# Lookup table: letter -> integer code
char2int = {letter: code for code, letter in enumerate(allSequences)}
##################################################################



# Load the training table (CSV whose first row is the header).
traindata = os.path.join("./../DATA", "train_data", "A0101_")
print("traindata name: ", traindata)
df_train = pd.read_csv(traindata, header=0)

# Drop every peptide containing X, B or U (none of them is part of
# allSequences, so they cannot be one-hot encoded). A single regex pass
# replaces three separate filters; na=True marks missing peptides as matches
# so they are dropped as well, exactly as the previous `== False` test did.
has_unsupported_letter = df_train.Peptide.str.contains(r'[XBU]', na=True)
df_train = df_train[~has_unsupported_letter]
print("train data original:\n", df_train, "\n")



# OVERSAMPLING: repeat every sequence with BindingCategory (class) = 1
# OVERSAMPLE_FACTOR times to compensate for the imbalanced dataset.
#######################################################################################################
OVERSAMPLE_FACTOR = 4   # number of copies kept of each class-1 row
SHUFFLE_SEED = 42       # fixed seed so that re-running the cell gives the same order

# class 0: kept once, shuffled
new_df_0 = df_train.loc[df_train['BindingCategory'] == 0].sample(frac=1, random_state=SHUFFLE_SEED)

# class 1: concatenated once, outside any loop, so the frame is not rebuilt
# on every repetition
df_1_list = [df_train.loc[df_train['BindingCategory'] == 1]] * OVERSAMPLE_FACTOR
new_df_1 = pd.concat(df_1_list)

new_df_train = pd.concat([new_df_0, new_df_1])
new_df_train = new_df_train.sample(frac=1.0, random_state=SHUFFLE_SEED)  # shuffle
print("train data oversampling:\n", new_df_train, "\n")

#######################################################################################################


# onehot matrix
PEPTIDE_LEN = 11  # length of every peptide returned by transformEL

train_data = transformEL(new_df_train)

print("train_data after transformEL:\n", train_data, "\n")

# Allocate the whole (n_peptides, PEPTIDE_LEN, alphabet size) array up front
# and fill it row by row; growing it with np.append would copy the full
# array on every iteration.
peptides = train_data.Peptide.tolist()
trainMatrix = np.zeros((len(peptides), PEPTIDE_LEN, len(allSequences)), int)
for num, peptide in enumerate(peptides):
    if num % 1000 == 0:
        print(peptide, num)  # progress marker
    trainMatrix[num] = Pept_OneHotMap(peptide)

# HLA value of the first row, taken by position.
# NOTE(review): the rows were shuffled before this point, so if the file
# holds more than one allele this value is arbitrary -- confirm the intent.
allele_name = train_data['HLA'].iloc[0]

assert (trainMatrix.shape[0] == train_data.shape[0])
# every residue position must have exactly one active column
assert (trainMatrix.sum(axis=2) == 1).all()

print("train_data after onehot:\n", trainMatrix, "\n")


traindata name:  ./../DATA/train_data/A0101_
train data original:
        Peptide      HLA  BindingCategory
0    KVDAGKLHY  A*01:01                1
1    ETELDGLRY  A*01:01                1
2   QTDRANRFEY  A*01:01                0
3    MADENKNEY  A*01:01                0
4    RSHQNASAI  A*02:01                0
5  TALMFPNYVSL  A*02:01                0
6  GDSSSGLIQTV  A*02:01                0
7    GAQLPMDPG  A*02:01                0
8    SAALGVPSL  A*02:01                0
9  ALSNNKKDDTK  A*02:01                0 

train data oversampling:
        Peptide      HLA  BindingCategory
5  TALMFPNYVSL  A*02:01                0
1    ETELDGLRY  A*01:01                1
1    ETELDGLRY  A*01:01                1
7    GAQLPMDPG  A*02:01                0
0    KVDAGKLHY  A*01:01                1
0    KVDAGKLHY  A*01:01                1
8    SAALGVPSL  A*02:01                0
1    ETELDGLRY  A*01:01                1
0    KVDAGKLHY  A*01:01                1
0    KVDAGKLHY  A*01:01                1
3  

In [5]:
# Shapes of the padded table and of its one-hot encoding, one per line,
# followed by the encoding of the first peptide as the cell's displayed value.
print(train_data.shape, trainMatrix.shape, sep="\n")
trainMatrix[0]

(16, 3)
(16, 11, 21)


array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])