In [1]:
import pandas as pd
import numpy as np
import os
import math

def from_ic50(ic50, max_ic50=50000.0):
    """
    Map IC50 values onto regression targets in the range [0.0, 1.0].

    The target is ``1 - log(ic50) / log(max_ic50)`` clamped to the unit
    interval, so an IC50 of 1 (or less) gives 1.0 and an IC50 of
    ``max_ic50`` (or more) gives 0.0.

    Parameters
    ----------
    ic50 : float or numpy.array of float
        IC50 value(s). Anything below 1e-12 is raised to 1e-12 before the
        logarithm is taken.
    max_ic50 : float
        IC50 value that corresponds to a target of 0.0.

    Returns
    -------
    numpy.array of float
    """
    # Floor the input so the logarithm never sees zero or a negative number.
    floored = np.maximum(ic50, 1e-12)
    log_ratio = np.log(floored) / np.log(max_ic50)
    return np.clip(1.0 - log_ratio, 0.0, 1.0)

In [2]:
# Raw inputs: the test table and the curated training table.
test_csv = '../original_data/mhcflurry_test_no_mass_spec/test.csv'
curated_csv = '../original_data/curated_training_data.no_mass_spec.csv'

df_test = pd.read_csv(test_csv)
df_curated = pd.read_csv(curated_csv)

In [3]:
#"Training and Model Selection" dataset = C
#"Model selection" dataset (difference) = T
#Training set train = C-(C intersection T)

## C intersection T
# Both tables are compared with their trailing columns stripped (the last
# column of the curated data, the last two of the test data). With no explicit
# keys, pd.merge joins on every column name the two trimmed tables share.
curated_trimmed = df_curated.iloc[:, :-1]
test_trimmed = df_test.iloc[:, :-2]
df_merge = pd.merge(curated_trimmed, test_trimmed, how="inner")

## train = C-(C intersection T)
# Anti-join: keep the curated rows that have no match in the test table.
# Stacking the overlap onto the curated data and calling
# drop_duplicates(keep=False) would also throw away every curated row that
# merely has an exact duplicate inside the curated data itself, even when that
# row never occurs in the test table.
shared_cols = [col for col in curated_trimmed.columns if col in test_trimmed.columns]
# De-duplicate the right-hand side so the left merge cannot multiply rows.
overlap_keys = test_trimmed[shared_cols].drop_duplicates()
flagged = curated_trimmed.merge(overlap_keys, how="left", indicator=True)
# A left merge keeps the left row order, so the original row labels still apply.
flagged.index = curated_trimmed.index
df_train_val = flagged.loc[flagged['_merge'] == 'left_only', curated_trimmed.columns].copy()

In [4]:
# Drop the last two columns of the test table and record each peptide's length.
# NOTE: this rebinds df_test, so running the cell a second time strips two
# more columns. .copy() detaches the column slice so that adding 'pep_len'
# does not write into a view of the original frame (SettingWithCopyWarning).
df_test = df_test.iloc[:, :-2].copy()
df_test['pep_len'] = df_test['peptide'].apply(len)

# Row filter, applied in a single pass:
#   * remove entries with measurement values greater than 50,000
#   * remove entries whose "measurement inequality" is ">" or "<"
# Rows with any other inequality value (including a missing one) are kept.
within_range = ~(df_test['measurement_value'] > 50000)
not_censored = ~df_test['measurement_inequality'].isin(['>', '<'])
df_test_processed = df_test.loc[within_range & not_censored].copy()

#Data normalization
df_test_processed["Normalized_QM"] = df_test_processed["measurement_value"].apply(from_ic50)


In [6]:
#Split the dataset by allele
test_dir = '../processed_data/mhcflurry_test_no_mass_spec/data'
# to_csv does not create missing folders, so make sure the target exists.
os.makedirs(test_dir, exist_ok=True)

for allele, allele_rows in df_test_processed.groupby('allele'):
    # Only alleles whose name contains 'HLA' are exported.
    if 'HLA' not in allele:
        continue
    # File name derived from the allele: '/' -> '&', '*' -> '_', ':' removed.
    file_stem = allele.replace('/', '&').replace('*', '_').replace(':', '')
    allele_rows.to_csv(os.path.join(test_dir, file_stem + '.csv'), index=False)

In [9]:
#Count the length of polypeptides in each row and record them in the "pep_len" column
df_train_val['pep_len'] = df_train_val['peptide'].apply(len)

#Keep only the rows with measurement_value <= 50000 whose
#measurement_inequality is "=". Both conditions are applied in one pass and
#the result is copied, so the column added below goes onto an independent
#frame instead of a slice of df_train_val (SettingWithCopyWarning).
in_range = df_train_val['measurement_value'] <= 50000
is_exact = df_train_val['measurement_inequality'] == '='
df_train_val_processed = df_train_val.loc[in_range & is_exact].copy()

#Data normalization
df_train_val_processed["Normalized_QM"] = df_train_val_processed["measurement_value"].apply(from_ic50)

In [10]:
#Split the dataset by allele
trainval_dir = '../processed_data/mhcflurry_training_no_mass_spec/data'
# out_file is the same folder: the de-duplication step overwrites these files in place.
out_file = '../processed_data/mhcflurry_training_no_mass_spec/data'
# to_csv does not create missing folders, so make sure the target exists.
os.makedirs(trainval_dir, exist_ok=True)

for allele, allele_rows in df_train_val_processed.groupby('allele'):
    # Only alleles whose name contains 'HLA' are exported.
    if 'HLA' not in allele:
        continue
    # File name derived from the allele: '/' -> '&', '*' -> '_', ':' removed.
    file_stem = allele.replace('/', '&').replace('*', '_').replace(':', '')
    # The row index is deliberately written as the first CSV column here; the
    # de-duplication step that reads these files expects and discards it.
    allele_rows.to_csv(os.path.join(trainval_dir, file_stem + '.csv'))

In [11]:
#Remove redundancy from each allele data set
#The rule is: when a peptide occurs more than once, its measurement values are
#sorted in ascending order and the one at position n // 2 is kept. That is the
#larger value for 2 duplicates, the median for an odd count, and the upper of
#the two middle values for any larger even count.
def _collapse_duplicate_peptides(allele_df):
    """
    Reduce a per-allele table to one row per peptide.

    Parameters
    ----------
    allele_df : pandas.DataFrame
        Must contain 'peptide' and 'measurement_value' columns.

    Returns
    -------
    pandas.DataFrame
        Rows sorted by peptide, one per peptide. Each row is the first row of
        its peptide after sorting, with 'measurement_value' replaced by the
        value selected by the rule above. If a 'Normalized_QM' column is
        present it is recomputed from the selected value.
    """
    ordered = allele_df.sort_values(by='peptide', ascending=True).reset_index(drop=True)

    # Representative measurement per peptide: element n // 2 of its sorted values.
    picked = ordered.groupby('peptide', sort=False)['measurement_value'].agg(
        lambda values: values.sort_values(ascending=True).iloc[len(values) // 2])

    deduped = ordered.drop_duplicates(subset=['peptide'], keep='first').reset_index(drop=True)
    deduped['measurement_value'] = deduped['peptide'].map(picked)

    # The surviving row may carry the normalized value of a different duplicate
    # than the one whose measurement was selected; recompute it so the two
    # columns stay consistent.
    if 'Normalized_QM' in deduped.columns:
        deduped['Normalized_QM'] = deduped['measurement_value'].apply(from_ic50)
    return deduped


for file in os.listdir(trainval_dir):
    # Ignore anything in the folder that is not a per-allele CSV.
    if not file.endswith('.csv'):
        continue

    df = pd.read_csv(os.path.join(trainval_dir, file))
    # Files written with their row index have a header-less first column,
    # which read_csv names 'Unnamed: 0'. Dropping it by name (rather than
    # reading with index_col=0) keeps this cell safe to re-run on files it has
    # already rewritten without an index column.
    df = df.drop(columns=['Unnamed: 0'], errors='ignore')

    df = _collapse_duplicate_peptides(df)
    df.to_csv(os.path.join(out_file, file), index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [None]:
test_dir = '../processed_data/mhcflurry_test_no_mass_spec/data'
trainval_dir = '../processed_data/mhcflurry_training_no_mass_spec/data'
# Same folder as trainval_dir: the training files are overwritten in place.
trainval_dir_new = '../processed_data/mhcflurry_training_no_mass_spec/data'

for file in os.listdir(test_dir):
    train_path = os.path.join(trainval_dir, file)
    # A test allele without a training file has nothing to remove.
    if not os.path.isfile(train_path):
        continue

    df_tv = pd.read_csv(train_path)
    df_t = pd.read_csv(os.path.join(test_dir, file))

    # C intersection T is defined on every column the two files share.
    shared_cols = [col for col in df_tv.columns if col in df_t.columns]
    # De-duplicate the test keys so the left merge cannot multiply training rows.
    test_keys = df_t[shared_cols].drop_duplicates()

    #train = C-(C intersection T)
    # Anti-join: flag each training row by whether it matches a test row and
    # keep the unmatched ones, restricted to the training file's own columns.
    flagged = df_tv.merge(test_keys, how="left", indicator=True)
    df_tv = flagged.loc[flagged['_merge'] == 'left_only', df_tv.columns]

    df_tv.to_csv(os.path.join(trainval_dir_new, file), index=False)

In [12]:
#Per-allele summary of the training set after duplicate removal:
#number of peptides, shortest/longest peptide, smallest/largest measurement value.
Allele_dir = '../processed_data/mhcflurry_training_no_mass_spec/data'

files = os.listdir(Allele_dir)
statistic = []
for file in files:
    # The allele name is the part of the file name before the first '.'.
    Allele = file.split('.')[0]
    df = pd.read_csv(os.path.join(Allele_dir, file))

    # pandas reductions return NaN for a file with no rows (the builtin
    # min/max raise on an empty sequence) and skip missing values instead of
    # giving an order-dependent answer.
    pep_lengths = df['peptide'].str.len()
    statistic.append([
        Allele,
        len(df),
        pep_lengths.min(),
        pep_lengths.max(),
        df['measurement_value'].min(),
        df['measurement_value'].max(),
    ])


df_stt = pd.DataFrame(statistic,columns=['allele','pep_num','pep_len_min','pep_len_max','QM_min','QM_max'])
df_stt = df_stt.sort_values(by='allele', ascending=True).reset_index(drop=True)
df_stt.to_csv('../processed_data/mhcflurry_training_no_mass_spec/statistics.csv')


In [13]:
#Per-allele summary of the test set:
#number of peptides, shortest/longest peptide, smallest/largest measurement value.
Allele_dir = '../processed_data/mhcflurry_test_no_mass_spec/data'

files = os.listdir(Allele_dir)
statistic = []
for file in files:
    # The allele name is the part of the file name before the first '.'.
    Allele = file.split('.')[0]
    df = pd.read_csv(os.path.join(Allele_dir, file))

    # pandas reductions return NaN for a file with no rows (the builtin
    # min/max raise on an empty sequence) and skip missing values instead of
    # giving an order-dependent answer.
    pep_lengths = df['peptide'].str.len()
    statistic.append([
        Allele,
        len(df),
        pep_lengths.min(),
        pep_lengths.max(),
        df['measurement_value'].min(),
        df['measurement_value'].max(),
    ])


df_stt = pd.DataFrame(statistic,columns=['allele','pep_num','pep_len_min','pep_len_max','QM_min','QM_max'])
df_stt = df_stt.sort_values(by='allele', ascending=True).reset_index(drop=True)
df_stt.to_csv('../processed_data/mhcflurry_test_no_mass_spec/statistics.csv')
