# Binding Prediction via API

http://tools.iedb.org/main/tools-api/

## Functions to Read in Conserved Regions

In [40]:
import pandas as pd
import os

def get_csv_filenames(dirname):
    '''
    Lists the CSV files found directly inside dirname.

    Every directory entry whose name ends with ".csv" is returned as
    dirname + '/' + entry, in the order os.listdir() yields the entries.
    Sub-directories are not searched.
    '''
    return [dirname + '/' + entry
            for entry in os.listdir(dirname)
            if entry.endswith('.csv')]


def build_conserved_dataframe(filelist, expected_columns=4):
    '''
    Reads in all data from the files in the given filelist and
    returns one DataFrame in which all data is appended together.

    filelist: iterable of CSV file paths, each read with pd.read_csv.
    expected_columns: number of columns every file must contain
        (defaults to 4).

    Returns a single DataFrame with the rows of every file stacked
    in filelist order and a fresh 0..n-1 index.

    Raises ValueError if a file does not have expected_columns
    columns, or if filelist contains no files at all.
    '''
    frames = []
    for path in filelist:

        # read in dataframe for each file
        frame = pd.read_csv(path)

        # validation for dimensions
        found = frame.shape[1]
        if found != expected_columns:
            raise ValueError(
                'File does not contain the required number of columns '
                '(expected {}, found {}): {}'.format(expected_columns, found, path))

        # add dataframe to list
        frames.append(frame)

    # pd.concat rejects an empty list with an unhelpful message, so
    # report the real problem (nothing to read) explicitly
    if not frames:
        raise ValueError('No files were given to build the conserved DataFrame from.')

    return pd.concat(frames, axis=0, ignore_index=True)

In [45]:
# Manual check of get_csv_filenames() against two directories.
print('test get_csv_filenames()')

# 'data' itself: prints the list of .csv files directly inside it
print(get_csv_filenames('data'))

# last expression in the cell, so the notebook displays the returned list
get_csv_filenames('data/ConservedRegion_Ryan')

test get_csv_filenames()
[]


['data/ConservedRegion_Ryan/MERS_Conserved copy 2.csv',
 'data/ConservedRegion_Ryan/MERS_Conserved.csv',
 'data/ConservedRegion_Ryan/MERS_Conserved copy.csv']

In [None]:
# Manual check of build_conserved_dataframe().
print('test build_conserved_dataframe()')

# The function requires a list of CSV paths; calling it with no
# arguments raises TypeError before any file is read.
build_conserved_dataframe(get_csv_filenames('data/ConservedRegion_Ryan'))

## Functions to Define Alleles-Length combinations to test
The API allows us to define multiple alleles to predict binding against, as well as multiple lengths for each of those alleles. 

The parameters must be listed as 
```allele='Allele1,Allele2'```
and
```length='Length1,Length2'```
where the length at each position is paired with the allele at the same position.

In [36]:
def build_alleles_and_length_strings(alleles,lengths):
    '''
    Builds the paired allele / length request strings for a list of
    alleles and a list of lengths, and returns them as the tuple
    (alleles_string, lengths_string).

    Every allele is paired with every length. The pairs are ordered
    length by length: the full allele list is repeated once per
    length, and each length is repeated once per allele. Both
    strings are comma-delimited and have the same number of items.

    For example, given lengths=[1,2] and alleles=['A','B']:

    alleles_string='A,B,A,B'
    lengths_string='1,1,2,2'
    '''

    # one (allele, length-as-text) pair per combination, lengths outermost
    pairs = [(allele, str(length))
             for length in lengths
             for allele in alleles]

    alleles_string = ",".join(allele for allele, _ in pairs)
    lengths_string = ",".join(length for _, length in pairs)

    return (alleles_string, lengths_string)

In [37]:
# Manual check of build_alleles_and_length_strings() on a small input.
print('test build_alleles_and_length_strings()')

# two alleles x two lengths -> four allele/length pairs expected
hla_ref_set_test = ['HLA-A*01:01','HLA-A*02:01']
lengths_list_test = [9,10]
# last expression in the cell, so the notebook displays the returned tuple
build_alleles_and_length_strings(hla_ref_set_test,lengths_list_test)

test build_alleles_and_length_strings()


('HLA-A*01:01,HLA-A*02:01,HLA-A*01:01,HLA-A*02:01', '9,9,10,10')

In [90]:
import requests

    
def predict_mhc1(a_set,l_set,sequence,method='recommended',timeout=None):
    '''
    Posts an MHC class I binding prediction request to the IEDB
    tools API and returns the response.

    a_set: comma-delimited allele string, sent as the 'allele' field.
    l_set: comma-delimited lengths string, sent as the 'length' field.
    sequence: sequence text, sent as the 'sequence_text' field.
    method: value sent as the 'method' field (defaults to
        'recommended', the value previously hard-coded).
    timeout: passed straight to requests.post; the default None
        keeps the previous behaviour of waiting without a limit.

    Returns the object returned by requests.post. The HTTP status is
    not checked here, so callers should inspect the response.

    Note that the default value for species is human.
    '''

    # define post
    data = {'method':method,
            'allele':a_set,
            'length':l_set,
            'sequence_text':sequence}

    site = 'http://tools-cluster-interface.iedb.org/tools_api/mhci/'

    # perform prediction
    return requests.post(site, data=data, timeout=timeout)

import csv

def save_result(response, filename):
    '''
    Writes a tab-separated API response to filename as a
    comma-separated file.

    response: object whose .text attribute holds the result, one
        record per line with tab-separated fields.
    filename: path of the file to write (opened in 'w' mode, so an
        existing file is overwritten).

    Empty lines in response.text, including the one implied by a
    trailing newline, are skipped so no blank rows are written.
    '''

    # newline='' hands line-ending control to the csv writer, as the
    # csv module requires; otherwise some platforms emit blank rows
    with open(filename, mode='w', newline='') as out_file:
        file_writer = csv.writer(out_file, delimiter=',')

        for each_row in response.text.splitlines():
            if each_row:
                file_writer.writerow(each_row.split('\t'))

In [33]:
# Collect the conserved-region CSV paths and combine them into one DataFrame.
files = get_csv_filenames('data/ConservedRegion_Ryan')

# raises ValueError if any file fails the column-count validation
build_conserved_dataframe(files)

ValueError: ('Data in the following file does not contain the required number of columns:', 'data/ConservedRegion_Ryan/MERS_Conserved copy 2.csv')

## MHC I

In [89]:
# HLA allele reference set for MHC class I predictions (27 alleles:
# 16 HLA-A followed by 11 HLA-B)
hla_ref_set_mhc1 = [
    'HLA-A*01:01',
    'HLA-A*02:01',
    'HLA-A*02:03',
    'HLA-A*02:06',
    'HLA-A*03:01',
    'HLA-A*11:01',
    'HLA-A*23:01',
    'HLA-A*24:02',
    'HLA-A*26:01',
    'HLA-A*30:01',
    'HLA-A*30:02',
    'HLA-A*31:01',
    'HLA-A*32:01',
    'HLA-A*33:01',
    'HLA-A*68:01',
    'HLA-A*68:02',
    'HLA-B*07:02',
    'HLA-B*08:01',
    'HLA-B*15:01',
    'HLA-B*35:01',
    'HLA-B*40:01',
    'HLA-B*44:02',
    'HLA-B*44:03',
    'HLA-B*51:01',
    'HLA-B*53:01',
    'HLA-B*57:01',
    'HLA-B*58:01',
]



In [91]:
# define parameters for this prediction
read_file = 'data/Binding_Prediction/mhc1_test1/mhc1_test1_api_seq.txt'
write_file = 'data/Binding_Prediction/mhc1_test1/mhc1_test1_api_result.csv'
allele_list = hla_ref_set_mhc1
lengths_list = [9,10]

# perform prediction for this sequence
# NOTE(review): read_data() is not defined in the visible part of this
# file - confirm it is defined or imported before this cell is run.
seq = read_data(read_file)
# expand alleles x lengths into the paired comma-delimited strings
a_str,l_str = build_alleles_and_length_strings(allele_list, lengths_list)
r1 = predict_mhc1(a_str,l_str,seq)
# convert the tab-separated response text to a CSV file
save_result(r1, write_file)

In [50]:
# combine results into single csv?

## MHC II

In [95]:
# HLA allele reference set for MHC class II predictions (27 entries:
# 15 DRB alleles, 6 DQA1/DQB1 pairs, 6 DPA1/DPB1 pairs)
hla_ref_set_mhc2 = [
    'HLA-DRB1*01:01',
    'HLA-DRB1*03:01',
    'HLA-DRB1*04:01',
    'HLA-DRB1*04:05',
    'HLA-DRB1*07:01',
    'HLA-DRB1*08:02',
    'HLA-DRB1*09:01',
    'HLA-DRB1*11:01',
    'HLA-DRB1*12:01',
    'HLA-DRB1*13:02',
    'HLA-DRB1*15:01',
    'HLA-DRB3*01:01',
    'HLA-DRB3*02:02',
    'HLA-DRB4*01:01',
    'HLA-DRB5*01:01',
    'HLA-DQA1*05:01/DQB1*02:01',
    'HLA-DQA1*05:01/DQB1*03:01',
    'HLA-DQA1*03:01/DQB1*03:02',
    'HLA-DQA1*04:01/DQB1*04:02',
    'HLA-DQA1*01:01/DQB1*05:01',
    'HLA-DQA1*01:02/DQB1*06:02',
    'HLA-DPA1*02:01/DPB1*01:01',
    'HLA-DPA1*01:03/DPB1*02:01',
    'HLA-DPA1*01:03/DPB1*04:01',
    'HLA-DPA1*03:01/DPB1*04:02',
    'HLA-DPA1*02:01/DPB1*05:01',
    'HLA-DPA1*02:01/DPB1*14:01',
]


In [96]:
def predict_mhc2(a_set,l_set,sequence,method='recommended',timeout=None):
    '''
    Posts an MHC class II binding prediction request to the IEDB
    tools API and returns the response.

    a_set: comma-delimited allele string, sent as the 'allele' field.
    l_set: comma-delimited lengths string, sent as the 'length' field.
    sequence: sequence text, sent as the 'sequence_text' field.
    method: value sent as the 'method' field (defaults to
        'recommended', the value previously hard-coded).
    timeout: passed straight to requests.post; the default None
        keeps the previous behaviour of waiting without a limit.

    Returns the object returned by requests.post. The HTTP status is
    not checked here, so callers should inspect the response.

    API doesn't appear to allow us to set species/locus.
    '''

    # define post
    data = {'method':method,
            'allele':a_set,
            'length':l_set,
            'sequence_text':sequence}

    site = 'http://tools-cluster-interface.iedb.org/tools_api/mhcii/'

    # perform prediction
    return requests.post(site, data=data, timeout=timeout)

In [99]:
# define parameters for this prediction
read_file = 'data/Binding_Prediction/mhc2_test2/mhc2_test2_api_seq.txt'
write_file = 'data/Binding_Prediction/mhc2_test2/mhc2_test2_api_result.csv'
allele_list = hla_ref_set_mhc2
lengths_list = [15,16]

# perform prediction for this sequence
# NOTE(review): read_data() is not defined in the visible part of this
# file - confirm it is defined or imported before this cell is run.
seq = read_data(read_file)
# expand alleles x lengths into the paired comma-delimited strings
a_str,l_str = build_alleles_and_length_strings(allele_list, lengths_list)
r2 = predict_mhc2(a_str,l_str,seq)
# convert the tab-separated response text to a CSV file
save_result(r2, write_file)