In [1]:
import pandas as pd
import numpy as np
import os

获取所需要的标签

In [2]:
# Top-level labels of the two-row header used by the raw file.
Epitope_header = "Epitope"
Assay_header = "Assay"
Reference_header = "Reference"
Host_header = "Host"
MHC_Restriction_header = "MHC Restriction"

# Each entry pairs a (top-level, second-level) source column with the flat
# name it is given after selection. Keeping both in one table guarantees the
# two derived lists below always line up position by position.
_column_pairs = [
    ((Epitope_header, "Name"), 'Description'),
    ((Assay_header, "Qualitative Measurement"), 'Qualitative Measure'),
    ((Assay_header, "Measurement Inequality"), 'Measurement Inequality'),
    ((Assay_header, "Quantitative measurement"), 'Quantitative measurement'),
    ((Assay_header, "Units"), 'Units'),
    ((MHC_Restriction_header, "Name"), 'Allele Name'),
    ((MHC_Restriction_header, "Class"), 'MHC allele class'),
    ((Reference_header, "Type"), 'Ref Type'),
    ((Reference_header, "Date"), 'Ref Date'),
    ((Reference_header, "Title"), 'Ref Title'),
    ((Assay_header, "Method"), 'Assay Method'),
    ((Assay_header, "Response measured"), 'Assay Group'),
    ((Host_header, "Name"), 'Host Name'),
]

# Source columns to select (tuples address the two-level header).
columns_ls = [source for source, _ in _column_pairs]

# Alternative column names for a single-header file (kept for reference):
# columns_ls = ['Epitope - Name',
#  'Assay - Qualitative Measurement',
#  'Assay - Measurement Inequality',
#  'Assay - Quantitative measurement',
#  'Assay - Units',
#  'MHC Restriction - Name',
#  'MHC Restriction - Class',
# #  'Type',
#  'Reference - Date',
# #  'Title',
# #  'Method/Technique',
#  'Assay - Response measured',
#  'Host - Name']

# Flat names assigned to the selected columns, in the same order as columns_ls.
new_columns_ls = [renamed for _, renamed in _column_pairs]

# Input and output paths.
original_data = '/mnt/zt/Dopaap/original_data/mhc_ligand_full.csv'
column_filter = '/mnt/zt/Dopaap/processed_data/mhc_ligand_full_column_filter.csv'

In [3]:
# Use header=0 for a single-header file and header=[0, 1] for a two-row header.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk; the warning output captured under this cell is
# presumably the mixed-type DtypeWarning that chunked inference produces.
df = pd.read_csv(original_data, header=[0, 1], low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Select the required columns from the raw table, give them flat names, and
# save the result to the `column_filter` path.
# .copy() gives an independent frame so renaming never touches `df`.
df_need = df[columns_ls].copy()
# Assign the list itself: wrapping it in another list (`[new_columns_ls]`)
# would create a one-level MultiIndex whose labels are 1-tuples.
df_need.columns = new_columns_ls
df_need.to_csv(column_filter, index=False)

删除不需要的数据

In [5]:
# Read back the column-filtered table written to `column_filter`.
# low_memory=False infers each column's dtype from the whole file; the warning
# output captured under this cell is presumably the mixed-type DtypeWarning
# that chunked inference produces.
df_all = pd.read_csv(column_filter, low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
# Host labels accepted as human (both spellings are matched).
desired_host = ['Homo sapiens (human)', 'human (Homo sapiens)']
# Assay groups to drop: anything not related to KD / IC50 / EC50.
assay_groups_to_remove = [
    '3D structure', '50% dissociation temperature', 'half life',
    'off rate', 'on rate', 'qualitative binding'
]
# Inequality markers that flag a measurement as a bound rather than an exact value.
measurement_inequalities_to_remove = ['>', '<', '>=', '<=']

# Keep entries whose host is human.
is_human_host = df_all['Host Name'].isin(desired_host)

# Keep entries that carry a measured value ('Quantitative measurement' not null).
has_quantitative_measurement = df_all['Quantitative measurement'].notnull()

# Keep canonical peptide sequences only: no '-' or ' ' and none of 'B X J Z'.
# na=True marks a missing sequence as "matches", so after negation a missing
# Description is rejected instead of being treated as a valid peptide.
is_valid_peptide_sequence = ~df_all['Description'].str.contains('-| |B|X|J|Z', regex=True, na=True)

# Keep precisely specified allele names (contain both 'HLA' and ':').
# na=False keeps the mask strictly boolean when an allele name is missing,
# matching how missing values are handled explicitly in the peptide filter.
is_precise_allele_name = (
    df_all['Allele Name'].str.contains('HLA', na=False)
    & df_all['Allele Name'].str.contains(":", na=False)
)

# Drop the assay groups listed in assay_groups_to_remove ('3D structure', ...).
is_relevant_assay_group = ~df_all['Assay Group'].isin(assay_groups_to_remove)

# Keep rows reported in 'nM' whose 'Measurement Inequality' is not one of
# '>', '<', '>=', '<='.
is_valid_measurement = (df_all['Units'] == 'nM') & ~df_all['Measurement Inequality'].isin(measurement_inequalities_to_remove)

# Keep rows whose 'Quantitative measurement' is at most 50000.
is_low_measurement = df_all['Quantitative measurement'] <= 50000

# Finally, drop rows from the reference titled
# 'Quantitating T cell cross-reactivity for unrelated peptide antigens.'
is_not_specific_ref_title = df_all['Ref Title'] != 'Quantitating T cell cross-reactivity for unrelated peptide antigens.'

# Apply all conditions together.
df_desired = df_all[
    is_human_host &
    has_quantitative_measurement &
    is_valid_peptide_sequence &
    is_precise_allele_name &
    is_relevant_assay_group &
    is_valid_measurement &
    is_low_measurement &
    is_not_specific_ref_title
]


In [7]:
# Renumber the rows from 0, then append two derived columns:
#   Description_Length - number of characters in 'Description'
#   Normalized_QM      - 1 - log10(IC50) / log10(50000 nM), with IC50 in nM.
# np.log is the natural log; a ratio of two logarithms does not depend on the
# base, so the value equals the log10 form of the formula.
df_desired = df_desired.reset_index(drop=True).assign(
    Description_Length=lambda frame: frame['Description'].str.len(),
    Normalized_QM=lambda frame: 1 - (np.log(frame["Quantitative measurement"]) / np.log(50000)),
)
df_desired.to_csv('/mnt/zt/Dopaap/processed_data/mhc_ligand_desired_column_filter.csv', index=False)

In [8]:
# For comparison with the IEDB tools: the IEDB web tool only supports peptides
# up to length 14, so only peptides of length <= 14 are exported (in practice
# this removed a single entry, so the handling is kept).
df_desired = pd.read_csv('/mnt/zt/Dopaap/processed_data/mhc_ligand_desired_column_filter.csv')

# First and last cutoff year (both inclusive).
start_year = 2017
end_year = 2024

# Per-class views of the table, split on 'MHC allele class'.
filtered_MHC_I_df = df_desired[df_desired['MHC allele class'] == 'I']
filtered_MHC_II_df = df_desired[df_desired['MHC allele class'] == 'II']


def _select_from_year(frame, cutoff_year):
    """Return rows of `frame` with 'Ref Date' >= cutoff_year and 'Description_Length' <= 14."""
    keep = (frame['Ref Date'] >= cutoff_year) & (frame['Description_Length'] <= 14)
    return frame[keep]


# One export per cutoff year. The filter is "this year or later", so each
# file also contains the rows of every later year.
for year in range(start_year, end_year + 1):
    # All classes together.
    _select_from_year(df_desired, year).to_csv(
        f'/mnt/zt/Dopaap/processed_data/mhc_ligand_{year}_desired_column_filter.csv',
        index=False,
    )

    # One file per MHC allele class.
    for allele_class, class_df in (('I', filtered_MHC_I_df), ('II', filtered_MHC_II_df)):
        _select_from_year(class_df, year).to_csv(
            f'/mnt/zt/Dopaap/processed_data/mhc_ligand_{year}_MHC_{allele_class}_desired_column_filter.csv',
            index=False,
        )


获得new released测试集

In [9]:
# First and last cutoff year (both inclusive).
start_year = 2017
end_year = 2024

# Characters in allele names rewritten for file names:
# '/' -> '&', '*' -> '_', ':' removed.
_filename_table = str.maketrans({'/': '&', '*': '_', ':': ''})

# For every cutoff year, split the MHC class I export into one CSV per allele.
for year in range(start_year, end_year + 1):
    df_desired = pd.read_csv(f'/mnt/zt/Dopaap/processed_data/mhc_ligand_{year}_MHC_I_desired_column_filter.csv')
    out_dir = f'/mnt/zt/Dopaap/processed_data/{year}_MHC_I/data'
    os.makedirs(out_dir, exist_ok=True)
    for allele, allele_rows in df_desired.groupby('Allele Name'):
        safe_name = allele.translate(_filename_table)
        allele_rows.to_csv(f'{out_dir}/{safe_name}.csv', index=False)

In [10]:
# Summary statistics.
start_year = 2017
end_year = 2024

# For each per-allele dataset record: number of entries, min/max peptide
# length, min/max measured value, and min/max normalized value.
for year in range(start_year, end_year + 1):
    files_dir = f'/mnt/zt/Dopaap/processed_data/{year}_MHC_I/data'
    # Only CSV files are datasets; skipping everything else keeps stray
    # entries (e.g. a .ipynb_checkpoints folder) from breaking read_csv.
    # Sorting makes the processing order independent of the filesystem.
    files = sorted(entry for entry in os.listdir(files_dir) if entry.endswith('.csv'))
    statistic = []
    for file in files:
        # Use a dedicated name so the raw table `df` loaded earlier in the
        # notebook is not overwritten.
        allele_df = pd.read_csv(os.path.join(files_dir, file))
        # splitext removes only the extension, so a dot inside the name survives.
        file_name = os.path.splitext(file)[0]

        # Number of entries.
        pep_num = len(allele_df)

        # Peptide length range.
        pep_lengths = allele_df['Description'].str.len()
        pep_len_min = pep_lengths.min()
        pep_len_max = pep_lengths.max()

        # Measured value range.
        QM_min = allele_df['Quantitative measurement'].min()
        QM_max = allele_df['Quantitative measurement'].max()

        # Normalized value range.
        Normalized_QM_min = allele_df['Normalized_QM'].min()
        Normalized_QM_max = allele_df['Normalized_QM'].max()

        statistic.append([file_name,pep_num,pep_len_min,pep_len_max,QM_min,QM_max,Normalized_QM_min,Normalized_QM_max])
    df_stt = pd.DataFrame(statistic,columns=['HLA-allele','pep_num','pep_len_min','pep_len_max','QM_min','QM_max','Normalized_QM_min','Normalized_QM_max'])
    # Largest datasets first; the stable sort keeps ties in file-name order.
    df_stt = df_stt.sort_values(by='pep_num', ascending=False, kind='mergesort').reset_index(drop=True)
    df_stt.to_csv(f'/mnt/zt/Dopaap/processed_data/{year}_MHC_I/statistics.csv', index=False)
