# Binding Prediction via API

http://tools.iedb.org/main/tools-api/

## Functions to Read Data

In [26]:
import pandas as pd
import os


def get_csv_filenames(dirname):
    '''
    Lists the entries of dirname whose names end with ".csv".

    Each returned string is dirname, a forward slash, and the
    entry name; entries keep the order os.listdir() yields them.
    '''
    return [dirname + '/' + entry
            for entry in os.listdir(dirname)
            if entry.endswith('.csv')]


def build_conserved_dataframe(filelist,minlength=0):
    '''
    Reads in all data from the files in the given filelist and
    returns one DataFrame in which all data is appended together.

    Parameters:
        filelist  - iterable of CSV file paths; every file must have
                    exactly 4 columns, one of which is 'Length'.
        minlength - rows whose 'Length' is below this value are
                    dropped (default 0 keeps every row).

    Returns:
        A single DataFrame holding the remaining rows of all files,
        re-indexed from 0.

    Raises:
        ValueError if filelist is empty or if a file does not have
        exactly 4 columns.
    '''
    frames = []
    for f in filelist:

        # read in dataframe for each file
        df = pd.read_csv(f)

        # validation for dimensions
        if df.shape[1] != 4:
            # build one message string; passing the filename as a second
            # argument would make the error render as a tuple
            raise ValueError(
                'File does not contain the required number of columns: {}'.format(f))

        # keep only peptides which meet the minimum length
        frames.append(df[df['Length'] >= minlength])

    # pd.concat fails with an unhelpful message on an empty list,
    # so report the actual problem instead
    if not frames:
        raise ValueError('No files to read: filelist is empty')

    return pd.concat(frames, axis=0, ignore_index=True)

In [2]:
# Smoke test for get_csv_filenames(): list the top-level data directory,
# then the directory that holds the conserved-region files.
print('test get_csv_filenames()')

# only names ending in ".csv" directly inside 'data' are listed
print(get_csv_filenames('data'))

# testlist is reused by the build_conserved_dataframe() test cell
testlist = get_csv_filenames('data/ConservedRegion_Ryan')
testlist

test get_csv_filenames()
[]


['data/ConservedRegion_Ryan/SARS_Conserved2796.csv']

In [3]:
# Smoke test for build_conserved_dataframe() on the files found above.
print('test build_conserved_dataframe()')

# keep only the first two rows; df_test is reused by the
# get_prediction_results() test cell further down
df_test = build_conserved_dataframe(testlist).head(2)
df_test

test build_conserved_dataframe()


Unnamed: 0,Index,Sequence,Length,Virus Name
0,"(936, 944)",YRFNGIGVT,9,SARS
1,"(1008, 1023)",VLNDILSRLDKVEAEV,16,SARS


## Functions to Define Alleles-Length combinations to test
The API allows us to define multiple alleles to predict binding against, as well as multiple lengths for each of those alleles.

The parameters must be listed as 
```allele='Allele1,Allele2'```
and
```length='Length1,Length2'```
where the i-th length is paired with the i-th allele, so an allele is repeated once for every length to be tested.

In [4]:
def build_alleles_and_length_strings(alleles,lengths):
    '''
    Builds the two comma-delimited strings that pair every allele
    with every length, and returns them as the tuple
    (alleles_string, lengths_string).

    The full list of alleles is repeated once per length, and each
    length is repeated once per allele, so that the entries at the
    same position in the two strings form one allele/length pair.

    For example, given lengths=[1,2] and alleles=['A','B']:

    alleles_string='A,B,A,B'
    lengths_string='1,1,2,2'
    '''
    allele_parts = []
    length_parts = []

    for length in lengths:
        # one full pass over the alleles for this length ...
        allele_parts.extend(alleles)
        # ... matched by the same number of copies of the length
        length_parts.extend([str(length)] * len(alleles))

    return (",".join(allele_parts), ",".join(length_parts))

In [5]:
# Smoke test for build_alleles_and_length_strings().
print('test build_alleles_and_length_strings()')

# two alleles and two lengths, i.e. four allele/length pairs
hla_ref_set_test = ['HLA-A*01:01','HLA-A*02:01']
lengths_list_test = [9,10]
build_alleles_and_length_strings(hla_ref_set_test,lengths_list_test)

test build_alleles_and_length_strings()


('HLA-A*01:01,HLA-A*02:01,HLA-A*01:01,HLA-A*02:01', '9,9,10,10')

## Functions to Perform Binding Prediction via API

In [6]:
import requests
  
def predict_binding(a_set,l_set,sequence,site,method='recommended',timeout=None):
    '''
    Sends one binding-prediction request to the API at the given
    site and returns the response.

    Parameters:
        a_set    - comma-delimited allele string.
        l_set    - comma-delimited length string, one length per
                   entry of a_set.
        sequence - the sequence text to predict binding for.
        site     - URL of the API endpoint to post to.
        method   - prediction method name sent to the API
                   (default 'recommended').
        timeout  - passed to requests.post; the default None waits
                   indefinitely for the server to answer.

    Returns:
        The response object returned by requests.post. The HTTP
        status is not checked here.

    For MHC I, the default value for species is human.
    For MHC II, the default value for species is not possible to specify.
    '''

    # form fields of the POST request
    data = {'method':method,
            'allele':a_set,
            'length':l_set,
            'sequence_text':sequence}

    # perform prediction
    return requests.post(site, data=data, timeout=timeout)

In [7]:
# Smoke test for predict_binding(): one allele, one length, one sequence.
print('test predict_binding()')

mhc1_site = 'http://tools-cluster-interface.iedb.org/tools_api/mhci/'
a_set_test = 'HLA-A*01:01'
l_set_test = '9'
sequence = 'SLYNTVATLYCVHQRIDV'

# keep the raw response; the next cell prints its text
response_test = predict_binding(a_set_test,l_set_test,sequence,mhc1_site)

test predict_binding()


In [8]:
# print the response line by line, showing each tab as '|' so the
# columns are visible
for eachline in response_test.text.split('\n'):
    print(eachline.replace('\t','|'))

allele|seq_num|start|end|length|peptide|core|icore|score|percentile_rank
HLA-A*01:01|1|2|10|9|LYNTVATLY|LYNTVATLY|LYNTVATLY|0.0909|1.1
HLA-A*01:01|1|7|15|9|ATLYCVHQR|ATLYCVHQR|ATLYCVHQR|0.00262|8.2
HLA-A*01:01|1|1|9|9|SLYNTVATL|SLYNTVATL|SLYNTVATL|0.00218|9.1
HLA-A*01:01|1|4|12|9|NTVATLYCV|NTVATLYCV|NTVATLYCV|0.00194|9.8
HLA-A*01:01|1|5|13|9|TVATLYCVH|TVATLYCVH|TVATLYCVH|0.00126|13
HLA-A*01:01|1|8|16|9|TLYCVHQRI|TLYCVHQRI|TLYCVHQRI|0.000714|18
HLA-A*01:01|1|10|18|9|YCVHQRIDV|YCVHQRIDV|YCVHQRIDV|0.000293|30
HLA-A*01:01|1|3|11|9|YNTVATLYC|YNTVATLYC|YNTVATLYC|0.000256|32
HLA-A*01:01|1|6|14|9|VATLYCVHQ|VATLYCVHQ|VATLYCVHQ|0.000111|47
HLA-A*01:01|1|9|17|9|LYCVHQRID|LYCVHQRID|LYCVHQRID|3e-06|98



## Functions to Perform Predictions over entire DataFrame

In [9]:
def get_prediction_results(df,alleles_str,lengths_str,site,mhc_class):
    '''
    Performs binding prediction for each sequence in the given
    DataFrame and returns the results of all predictions as a
    list of tab-delimited strings.

    Parameters:
        df          - DataFrame with 'Sequence' and 'Virus Name'
                      columns; one API request is made per row.
        alleles_str - comma-delimited allele string for the API.
        lengths_str - comma-delimited length string for the API.
        site        - URL of the API endpoint.
        mhc_class   - label appended to every result row.

    Returns:
        A list of strings. The first string is the column header of
        the prediction results, extended with the columns
        'original sequence', 'virus' and 'class'; every following
        string is one result row extended with the source sequence,
        its virus name and mhc_class.

    The header is taken from the first response whose first line is
    tab-delimited. Lines without a tab (empty lines, plain-text
    messages) are skipped, so a response that carries no result
    table contributes nothing to the list.
    '''

    # to store results
    predictions = []
    header_seen = False

    # perform binding prediction for each conserved sequence
    for seq, virus in zip(df['Sequence'], df['Virus Name']):

        print('.',end='') # visualize progress

        response = predict_binding(alleles_str, lengths_str, seq, site)

        for j,line in enumerate(response.text.split('\n')):

            if '\t' not in line:
                continue # skip empty lines and lines which do not contain data

            if j == 0:
                # first line of a response is its header; keep one copy only
                if not header_seen:
                    predictions.append(line + '\toriginal sequence\tvirus\tclass')
                    header_seen = True
                continue

            # append the extra columns to the result row
            predictions.append(line + '\t' + seq + '\t' + virus + '\t' + mhc_class)

    print() # newline to end visualization
    return predictions

In [10]:
# Smoke test for get_prediction_results() on the two-row test frame.
print('test get_prediction_results()')

mhc1_site_test = 'http://tools-cluster-interface.iedb.org/tools_api/mhci/'
a_set_test = 'HLA-A*01:01'
l_set_test = '9'
mhc_class_test = 'MHC I'

predictions_mhc1_test = get_prediction_results(df_test.head(2),a_set_test,l_set_test,mhc1_site_test, mhc_class_test)
# show the first three entries of the result list
predictions_mhc1_test[:3]

test get_prediction_results()
..


['allele\tseq_num\tstart\tend\tlength\tpeptide\tcore\ticore\tscore\tpercentile_rank\toriginal sequence\tvirus\tclass',
 'HLA-A*01:01\t1\t1\t9\t9\tYRFNGIGVT\tYRFNGIGVT\tYRFNGIGVT\t6.9e-05\t56\tYRFNGIGVT\tSARS\tMHC I',
 'HLA-A*01:01\t1\t8\t16\t9\tRLDKVEAEV\tRLDKVEAEV\tRLDKVEAEV\t0.0218\t2.5\tVLNDILSRLDKVEAEV\tSARS\tMHC I']

## Functions to Write Predictions to File

In [11]:
import csv


def save_predictions(predictions, filename):
    '''
    Writes the given list of predictions to file in csv
    format with a comma delimiter.

    Parameters:
        predictions - iterable of tab-delimited strings; each string
                      becomes one row and each tab-separated piece
                      one field.
        filename    - path of the file to write; an existing file is
                      overwritten.
    '''
    # newline='' leaves line endings to the csv writer; without it the
    # writer's '\r\n' is translated again on Windows and every row is
    # followed by a blank line
    with open(filename, mode='w', newline='') as write_file:
        file_writer = csv.writer(write_file, delimiter=',')
        file_writer.writerows(row.split('\t') for row in predictions)

In [12]:
# Round-trip test for save_predictions(): write a small result list,
# read it back with pandas, then remove the temporary file.
print('test save_predictions()')

# header line plus two result rows in the tab-delimited form that
# get_prediction_results() returns
predictions_test = ['allele\tseq_num\tstart\tend\tlength\tpeptide\tcore\ticore\tscore\tpercentile_rank\toriginal sequence\tvirus\tclass',
 'HLA-A*01:01\t1\t36\t44\t9\tITYQGLFPY\tITYQGLFPY\tITYQGLFPY\t0.339\t0.32\tDIQQTFFDKTWPRPIDVSKADGIIYPQGRTYSNITITYQGLFPYQGDHGD\tMERS\tMHC I',
 'HLA-A*01:01\t1\t17\t25\t9\tVSKADGIIY\tVSKADGIIY\tVSKADGIIY\t0.264\t0.43\tDIQQTFFDKTWPRPIDVSKADGIIYPQGRTYSNITITYQGLFPYQGDHGD\tMERS\tMHC I']
writefile_test = 'data/Binding_Prediction/writetest.csv'

save_predictions(predictions_test, writefile_test)

# read in file
write_df_test = pd.read_csv(writefile_test)

# delete file
if os.path.exists(writefile_test):
    os.remove(writefile_test)

# display what was read back
write_df_test

test save_predictions()


Unnamed: 0,allele,seq_num,start,end,length,peptide,core,icore,score,percentile_rank,original sequence,virus,class
0,HLA-A*01:01,1,36,44,9,ITYQGLFPY,ITYQGLFPY,ITYQGLFPY,0.339,0.32,DIQQTFFDKTWPRPIDVSKADGIIYPQGRTYSNITITYQGLFPYQG...,MERS,MHC I
1,HLA-A*01:01,1,17,25,9,VSKADGIIY,VSKADGIIY,VSKADGIIY,0.264,0.43,DIQQTFFDKTWPRPIDVSKADGIIYPQGRTYSNITITYQGLFPYQG...,MERS,MHC I


## MHC I

In [15]:
# HLA allele reference set used for the MHC I run (27 entries)
hla_ref_set_mhc1 = [
    'HLA-A*01:01', 'HLA-A*02:01', 'HLA-A*02:03', 'HLA-A*02:06',
    'HLA-A*03:01', 'HLA-A*11:01', 'HLA-A*23:01', 'HLA-A*24:02',
    'HLA-A*26:01', 'HLA-A*30:01', 'HLA-A*30:02', 'HLA-A*31:01',
    'HLA-A*32:01', 'HLA-A*33:01', 'HLA-A*68:01', 'HLA-A*68:02',
    'HLA-B*07:02', 'HLA-B*08:01', 'HLA-B*15:01', 'HLA-B*35:01',
    'HLA-B*40:01', 'HLA-B*44:02', 'HLA-B*44:03', 'HLA-B*51:01',
    'HLA-B*53:01', 'HLA-B*57:01', 'HLA-B*58:01',
]

# settings read by the MHC I run cell that follows
allele_list = hla_ref_set_mhc1
lengths_list = [9,10]                      # peptide lengths tested per allele
read_dir = 'data/ConservedRegion_Ryan'     # directory holding the input csv files
write_file = 'data/Binding_Prediction/mhc1_test3/mhc1_test3_api_result_sars.csv'
site = 'http://tools-cluster-interface.iedb.org/tools_api/mhci/'
mhc_class = 'MHC I'                        # label written into every result row


In [16]:
%%time

# read data
files = get_csv_filenames(read_dir)
df = build_conserved_dataframe(files)

# build query string components
a_str, l_str = build_alleles_and_length_strings(allele_list,lengths_list)

# make sure the output directory exists before the slow API calls, so
# the results are not lost to a missing-directory error when saving
out_dir = os.path.dirname(write_file)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)

# perform predictions (one API request per row of df)
predictions = get_prediction_results(df,a_str,l_str,site,mhc_class)

# save results to file
save_predictions(predictions,write_file)


....
CPU times: user 42.5 ms, sys: 9.23 ms, total: 51.7 ms
Wall time: 39.6 s


## MHC II

In [17]:
# HLA allele reference set used for the MHC II run
hla_ref_set_mhc2 = [
    'HLA-DRB1*01:01', 'HLA-DRB1*03:01', 'HLA-DRB1*04:01',
    'HLA-DRB1*04:05', 'HLA-DRB1*07:01', 'HLA-DRB1*08:02',
    'HLA-DRB1*09:01', 'HLA-DRB1*11:01', 'HLA-DRB1*12:01',
    'HLA-DRB1*13:02', 'HLA-DRB1*15:01',
    'HLA-DRB3*01:01', 'HLA-DRB3*02:02',
    'HLA-DRB4*01:01', 'HLA-DRB5*01:01',
    'HLA-DQA1*05:01/DQB1*02:01', 'HLA-DQA1*05:01/DQB1*03:01',
    'HLA-DQA1*03:01/DQB1*03:02', 'HLA-DQA1*04:01/DQB1*04:02',
    'HLA-DQA1*01:01/DQB1*05:01', 'HLA-DQA1*01:02/DQB1*06:02',
    'HLA-DPA1*02:01/DPB1*01:01', 'HLA-DPA1*01:03/DPB1*02:01',
    'HLA-DPA1*01:03/DPB1*04:01', 'HLA-DPA1*03:01/DPB1*04:02',
    'HLA-DPA1*02:01/DPB1*05:01', 'HLA-DPA1*02:01/DPB1*14:01',
]

# settings read by the MHC II run cell that follows
allele_list = hla_ref_set_mhc2
lengths_list = [15,16]                     # peptide lengths tested per allele
read_dir = 'data/ConservedRegion_Ryan'     # directory holding the input csv files
write_file = 'data/Binding_Prediction/mhc2_test3/mhc2_test3_api_result_sars.csv'
site = 'http://tools-cluster-interface.iedb.org/tools_api/mhcii/'
mhc_class = 'MHC II'                       # label written into every result row


In [28]:
%%time

# read data, keeping only sequences at least 15 long
files = get_csv_filenames(read_dir)
df = build_conserved_dataframe(files,minlength=15)

# build query string components
a_str, l_str = build_alleles_and_length_strings(allele_list,lengths_list)

# make sure the output directory exists before the slow API calls, so
# the results are not lost to a missing-directory error when saving
out_dir = os.path.dirname(write_file)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)

# perform predictions (one API request per row of df)
predictions = get_prediction_results(df,a_str,l_str,site,mhc_class)

# save results to file
save_predictions(predictions,write_file)


..
CPU times: user 30.8 ms, sys: 5.8 ms, total: 36.6 ms
Wall time: 9.31 s
