In [1]:
import os
import pandas as pd

Only polypeptides of one length can be predicted per run by the IEDB web tools, so the code below combines, for each allele, the prediction results for polypeptides of the various lengths.

In [3]:
# Root folders: raw IEDB web-tool outputs in, merged per-subdirectory tables out.
original_directory = '../STDbenchmark_iedb_tools_original_predict_data'
output_directory = '../STDbenchmark_iedb_tools_predict_data_processed'

# Every subdirectory of original_directory is merged on its own.
# os.listdir returns entries in arbitrary order, so sort for a reproducible run.
subdirectories = sorted(
    subdir for subdir in os.listdir(original_directory)
    if os.path.isdir(os.path.join(original_directory, subdir))
)

for subdir in subdirectories:
    subdir_path = os.path.join(original_directory, subdir)

    # Sorted so the row order of the merged file does not depend on the
    # filesystem's listing order.
    csv_files = sorted(file for file in os.listdir(subdir_path) if file.endswith('.csv'))

    # Read every CSV first and concatenate once, instead of growing a DataFrame
    # inside the loop (which re-copies all accumulated rows on each iteration).
    frames = [pd.read_csv(os.path.join(subdir_path, file)) for file in csv_files]

    # pd.concat raises on an empty list; fall back to an empty frame when the
    # subdirectory contains no CSV files.
    merged_data = pd.concat(frames) if frames else pd.DataFrame()

    # Remove exact duplicate rows and renumber the index from 0.
    merged_data = merged_data.drop_duplicates().reset_index(drop=True)

    # Save the merged table to <output_directory>/<subdir>/merged_data.csv,
    # creating the output subdirectory if it does not exist yet.
    out_sub_dir = os.path.join(output_directory, subdir)
    os.makedirs(out_sub_dir, exist_ok=True)
    out_file_path = os.path.join(out_sub_dir, 'merged_data.csv')
    merged_data.to_csv(out_file_path, index=False)


Combine the `Quantitative measurement` and other information downloaded from the IEDB database with the predicted results from the IEDB web tools.

In [4]:
import numpy as np

In [7]:
# Inputs: one '<allele>.csv' per allele in processed_iedb_data_directory, and one
# '<allele>/merged_data.csv' per allele in processed_tools_prediction_data_directory.
processed_iedb_data_directory = '../../../processed_data/STDbenchmark_allele'
processed_tools_prediction_data_directory = '../STDbenchmark_iedb_tools_predict_data_processed'

# Maps column names in the tool-prediction table to the names used in the
# combined table ('peptide' becomes the merge key 'Description').
tools_prediction_columns_modify_dict = {
    'peptide': 'Description',
    'netmhcpan_ba IC50': 'netmhcpan_ba_BAV',
    'ann IC50': 'ann_BAV',
    'smmpmbec IC50': 'smmpmbec_BAV',
    'smm IC50': 'smm_BAV',
    'mhcflurry IC50': 'mhcflurry_BAV'
}

# Renamed prediction columns that are candidates for normalisation.
tools_prediction_columns = ['netmhcpan_ba_BAV', 'ann_BAV', 'smmpmbec_BAV', 'smm_BAV', 'mhcflurry_BAV']

tools_prediction_need_columns = list(tools_prediction_columns_modify_dict.keys())

# Logarithm base of the normalisation below: 1 - log(x) / log(IC50_LOG_BASE).
IC50_LOG_BASE = 50000

# One subdirectory per allele; sorted because os.listdir order is arbitrary.
allele_list = sorted(
    allele for allele in os.listdir(processed_tools_prediction_data_directory)
    if os.path.isdir(os.path.join(processed_tools_prediction_data_directory, allele))
)

for allele in allele_list:
    df_iedb_allele_QM = pd.read_csv(os.path.join(processed_iedb_data_directory, f'{allele}.csv'))
    df_tools_prediction = pd.read_csv(
        os.path.join(processed_tools_prediction_data_directory, allele, 'merged_data.csv')
    )

    # Keep only the needed columns, in a fixed order; any needed column absent
    # from the file is created and filled with missing values. Building a new
    # frame here (instead of renaming a column slice in place) avoids mutating
    # a view of the frame that was read from disk.
    df_tools_prediction = (
        df_tools_prediction
        .reindex(columns=tools_prediction_need_columns)
        .rename(columns=tools_prediction_columns_modify_dict)
    )

    # Outer join keeps rows that appear in only one of the two tables.
    merged_df = pd.merge(
        df_iedb_allele_QM, df_tools_prediction, on="Description", how="outer"
    ).drop_duplicates()

    for tools_prediction in tools_prediction_columns:
        # A normalised column is added only when the prediction column has no
        # missing value at all.
        # NOTE(review): a single missing entry (e.g. a row present in only one
        # table after the outer merge) skips normalisation for the whole
        # column - confirm this all-or-nothing behaviour is intended.
        if not merged_df[tools_prediction].isna().any():
            merged_df[f'{tools_prediction}_Normalized'] = (
                1 - (np.log(merged_df[tools_prediction]) / np.log(IC50_LOG_BASE))
            )

    # Restore the original name of the key column before saving.
    merged_df = merged_df.rename(columns={'Description': 'peptide'})

    # Save merged results to <processed dir>/<allele>/tools_processed.csv
    merged_df.to_csv(
        os.path.join(processed_tools_prediction_data_directory, allele, 'tools_processed.csv'),
        index=False
    )
    