In [2]:
import pandas as pd
import numpy as np
import os

select needed columns from original IEDB data all columns

In [3]:
Epitope_header = "Epitope"
Assay_header = "Assay"
Reference_header = "Reference"
Host_header = "Host"
MHC_Restriction_header = "MHC Restriction"

columns_ls = [
    (Epitope_header, "Name"),
    (Assay_header, "Qualitative Measurement"),
    (Assay_header, "Measurement Inequality"),
    (Assay_header, "Quantitative measurement"),
    (Assay_header, "Units"),
    (MHC_Restriction_header, "Name"),
    (MHC_Restriction_header, "Class"),
    (Reference_header, "Type"),
    (Reference_header, "Date"),
    (Reference_header, "Title"),
    (Assay_header, "Method"),
    (Assay_header, "Response measured"),
    (Host_header, "Name")
]

new_columns_ls = [
    'Description',
    'Qualitative Measure',
    'Measurement Inequality',
    'Quantitative measurement',
    'Units',
    'Allele Name',
    'MHC allele class',
    'Ref Type',
    'Ref Date',
    'Ref Title',
    'Assay Method',
    'Assay Group',
    'Host Name'
]

 # source file directory
original_data = '../original_data/mhc_ligand_full.csv'
column_filter = '../processed_data/mhc_ligand_full_column_filter.csv'

In [4]:
# There are 2 headers in the original_data, header=[0, 1]
df = pd.read_csv(original_data, header=[0, 1])

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
# select needed columns, save the new csv file to processed_data directory
df_need = df[columns_ls]
df_need.columns = [new_columns_ls]
df_need.to_csv(column_filter, index=False)

filter available data from processed_data

In [6]:
# read the selected columns csv file
df_all = pd.read_csv('../processed_data/mhc_ligand_full_column_filter.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [7]:
desired_host = ['Homo sapiens (human)', 'human (Homo sapiens)']
assay_groups_to_remove = [
    '3D structure', '50% dissociation temperature', 'half life',
    'off rate', 'on rate', 'qualitative binding'
]
measurement_inequalities_to_remove = ['>', '<', '>=', '<=']

# select host --Homo sapiens (human) or human(Homo sapiens)
is_human_host = df_all['Host Name'].isin(desired_host)

# select data with 'Quantitative measurement'
has_quantitative_measurement = df_all['Quantitative measurement'].notnull()

# select standard peptide sequence without 'B X J Z', delete data with '-' or ' '
is_valid_peptide_sequence = ~df_all['Description'].str.contains('-| |B|X|J|Z', regex=True, na=False)

# select accurate allele name with 'HLA' and ':'
is_precise_allele_name = df_all['Allele Name'].str.contains('HLA') & df_all['Allele Name'].str.contains(":")

# select “KD”,“IC50” and “EC50” related BAV 'Assay Group' , delete others like '3D structure' in assay_groups_to_remove
is_relevant_assay_group = ~df_all['Assay Group'].isin(assay_groups_to_remove)

# select 'Units'='nM' data and 'Measurement Inequality' != '>', '<', '>=' and '<=' data
is_valid_measurement = (df_all['Units'] == 'nM') & ~df_all['Measurement Inequality'].isin(measurement_inequalities_to_remove)

# select 'Quantitative measurement' <= 50000 data
is_low_measurement = df_all['Quantitative measurement'] <= 50000

#Apply all conditions to filter data frame
df_desired = df_all[
    is_human_host &
    has_quantitative_measurement &
    is_valid_peptide_sequence &
    is_precise_allele_name &
    is_relevant_assay_group &
    is_valid_measurement &
    is_low_measurement
]


In [8]:
#reset index
df_desired = df_desired.reset_index(drop = True)
#add column Description_Length
df_desired['Description_Length'] = df_desired['Description'].str.len()
#Data normalization [1-log10(IC50)/log10(50000 nM)], IC50 unit is nM, same as MHCflurry paper
df_desired["Normalized_QM"] = 1 - (np.log(df_desired["Quantitative measurement"]) / np.log(50000))
df_desired.to_csv('../processed_data/mhc_ligand_desired_column_filter.csv', index=False) 

In [11]:
df_desired = pd.read_csv('../processed_data/mhc_ligand_desired_column_filter.csv')

#Filter data for MHC class I
filtered_MHC_I_df = df_desired[df_desired['MHC allele class'] == 'I']

#The earliest year in the test set
after_year = 2022

#iedb web tools only support MHC class I with peptide length during 8-14
filtered_df = df_desired[(df_desired['Ref Date'] >= after_year) & (df_desired['MHC allele class'] == 'I') & (df_desired['Description_Length'] <= 14)]

output_file = f'../processed_data/mhc_ligand_{after_year}_MHC_I_desired_column_filter.csv'
filtered_df.to_csv(output_file, index=False)

get IEDB_new_released_dataset

In [6]:
#The smallest size in the test set
allele_size_cutoff = 0
#The earliest year in the test set
after_year = 2022

df_desired = pd.read_csv(f'../processed_data/mhc_ligand_{after_year}_MHC_I_desired_column_filter.csv')
out_dir = f'../processed_data/{after_year}_MHC_I/data'
os.makedirs(out_dir, exist_ok=True)
for name, group in df_desired.groupby('Allele Name') :
    # change the allele name to a valid file name
    name = name.replace('/', '&')
    name = name.replace('*','_')
    name = name.replace(':','')
    
    # skip alleles with less than 10 data points and HLA-A_3303
    # HLA-A_3303 is only in 2022 data, not in the training data(before 2017), so most tools do not support it.
    if (len(group) < allele_size_cutoff) | (name == 'HLA-A_3303') :
        continue
    group.to_csv(f'{out_dir}/{name}.csv', index=False)

In [7]:
# statistics of IEDB_new_released_dataset
files_dir = f'../processed_data/{after_year}_MHC_I/data'
files = os.listdir(files_dir)
statistic = []
for file in files:
    df = pd.read_csv(os.path.join(files_dir,file))
    file_name = file.split('.')[0]
    # number of the peptides
    pep_num = len(df)

    # peptide length range
    pep_len_min = min(map(len,df['Description']))
    pep_len_max = max(map(len,df['Description']))

    # Quantitative measurement range
    QM_min = min(df['Quantitative measurement'])
    QM_max = max(df['Quantitative measurement'])
    
    # logarithm of Quantitative measurement range
    Normalized_QM_min = min(df['Normalized_QM'])
    Normalized_QM_max = max(df['Normalized_QM'])

    statistic.append([file_name,pep_num,pep_len_min,pep_len_max,QM_min,QM_max,Normalized_QM_min,Normalized_QM_max])
    
df_stt = pd.DataFrame(statistic,columns=['HLA-allele','pep_num','pep_len_min','pep_len_max','QM_min','QM_max','Normalized_QM_min','Normalized_QM_max'])
df_stt.sort_values(by='pep_num',axis=0,ascending=False, inplace=True)
df_stt = df_stt.reset_index(drop = True)
df_stt.to_csv(f'../processed_data/{after_year}_MHC_I/statistics.csv', index=False)
