### Subfunctions

In [3]:
# rearranges a nested dictionary with structure d = {A:{B:value}} to d = {B:{A:value}}
def dict_transpose_AB_BA(inputdict):
    """Swap the two outermost key levels of a nested dictionary.

    The dictionary is loaded into a pandas DataFrame (outer keys become the
    columns, inner keys the index) and the transposed frame is exported
    again with ``to_dict``.

    inputdict -- nested dict of the form {A: {B: value}}
    returns   -- nested dict of the form {B: {A: value}}

    NOTE: because the round trip goes through a DataFrame, (A, B)
    combinations that are absent from the input show up as NaN entries.
    """
    return pd.DataFrame(inputdict).T.to_dict()

In [4]:
# rearranges a nested dictionary with structure d = {A:{B:{C:value}}} to d = {C:{A:{B:value}}}
def dict_transpose_ABC_CAB(inputdict):
    """Move the innermost key level of a three-level dict to the front.

    inputdict -- nested dict of the form {A: {B: {C: value}}}
    returns   -- nested dict of the form {C: {A: {B: value}}}
    """
    # step 1: below every A key, swap B and C -> {A: {C: {B: value}}}
    swapped_below_A = {
        key_A: dict_transpose_AB_BA(sub_BC)
        for key_A, sub_BC in inputdict.items()
    }
    # step 2: swap A and C -> {C: {A: {B: value}}}
    return dict_transpose_AB_BA(swapped_below_A)

In [5]:
# rearranges a nested dictionary with structure d = {A:{B:{C:{D:value}}}} to d = {D:{A:{B:{C:value}}}}
def dict_transpose_ABCD_DABC(inputdict):
    """Move the innermost key level of a four-level dict to the front.

    inputdict -- nested dict of the form {A: {B: {C: {D: value}}}}
    returns   -- nested dict of the form {D: {A: {B: {C: value}}}}
    """
    # step 1: below every A key, pull D to the front -> {A: {D: {B: {C: value}}}}
    pulled_below_A = {
        key_A: dict_transpose_ABC_CAB(sub_BCD)
        for key_A, sub_BCD in inputdict.items()
    }
    # step 2: swap A and D -> {D: {A: {B: {C: value}}}}
    return dict_transpose_AB_BA(pulled_below_A)

#### Calculate the absolute number of binding epitopes per HLA allele

In [9]:
def group_for_dict_abs(path_template='/Users/pcevaal/Outputs/HIV12_%s_output9.xls'):
    """Read the netMHCpan output of every protein and group it into 4 protein groups.

    env, gag and pol each keep their own, unmodified DataFrame. The six
    accessory proteins (tat, rev, nef, vpr, vpu, vif) are combined into one
    DataFrame stored under the key 'acc'; the first row of each accessory
    file is dropped before combining (presumably the label row -- confirm).

    path_template -- path of the tab-separated netMHCpan output files, with
                     one '%s' placeholder that receives the protein name.
                     Defaults to the location used originally.
    returns       -- dict {'env': df, 'gag': df, 'pol': df, 'acc': df}
    """
    import sys  # function-scope import, only needed for the progress output

    accessory = ('tat', 'rev', 'nef', 'vpr', 'vpu', 'vif')
    proteins_grouped = {}  # final dict containing the 4 protein groups
    acc_parts = []  # one df per accessory protein, concatenated once at the end
    for protein in ('env', 'gag', 'pol') + accessory:
        # progress indicator on a single line; sys.stdout.write behaves the
        # same on Python 2 and 3, unlike the print statement
        sys.stdout.write('%s ' % protein)
        output = pd.read_csv(path_template % protein, sep='\t', engine='python')  # open netMHCpan output
        if protein in accessory:
            acc_parts.append(output[1:])
        else:
            proteins_grouped[protein] = output
    # a single concat replaces the repeated DataFrame.append calls, which
    # re-copied the growing frame on every iteration and no longer exist in
    # pandas >= 2.0
    proteins_grouped['acc'] = pd.concat(acc_parts) if acc_parts else pd.DataFrame()

    return proteins_grouped

In [None]:
def group_for_dict_abs_coev(path_template='/Users/pcevaal/Outputs/HIV1BC_%s_output9.xls'):
    """Read the HIV1BC netMHCpan output of every protein and group it into 4 protein groups.

    Same grouping as group_for_dict_abs, only a different set of netMHCpan
    files is imported. env, gag and pol each keep their own, unmodified
    DataFrame. The six accessory proteins (tat, rev, nef, vpr, vpu, vif) are
    combined into one DataFrame stored under the key 'acc'; the first row of
    each accessory file is dropped before combining (presumably the label
    row -- confirm).

    path_template -- path of the tab-separated netMHCpan output files, with
                     one '%s' placeholder that receives the protein name.
                     Defaults to the location used originally.
    returns       -- dict {'env': df, 'gag': df, 'pol': df, 'acc': df}
    """
    import sys  # function-scope import, only needed for the progress output

    accessory = ('tat', 'rev', 'nef', 'vpr', 'vpu', 'vif')
    proteins_grouped = {}
    acc_parts = []  # one df per accessory protein, concatenated once at the end
    for protein in ('env', 'gag', 'pol') + accessory:
        # progress indicator on a single line; sys.stdout.write behaves the
        # same on Python 2 and 3, unlike the print statement
        sys.stdout.write('%s ' % protein)
        output = pd.read_csv(path_template % protein, sep='\t', engine='python')
        if protein in accessory:
            acc_parts.append(output[1:])
        else:
            proteins_grouped[protein] = output
    # a single concat replaces the repeated DataFrame.append calls, which
    # re-copied the growing frame on every iteration and no longer exist in
    # pandas >= 2.0
    proteins_grouped['acc'] = pd.concat(acc_parts) if acc_parts else pd.DataFrame()

    return proteins_grouped

In [1]:
#subfunction within main functions to_dict_abs_ungrouped and to_dict_abs_grouped
def count_per_strain(output, cutoff):
    """Count, per strain and per allele, the epitopes with rank <= cutoff.

    output -- netMHCpan output as DataFrame. Column 'Unnamed: 0' holds the
              epitope sequences, column 'Unnamed: 1' the strain IDs, and
              every third column starting at position 4 holds the ranks of
              one allele (the columns in between hold 1-log50k and nM
              values). The first row is skipped when collecting strain IDs.
    cutoff -- rank value in the netMHCpan output; ranks <= cutoff count as
              binders.
    returns -- dict {strain ID: {rank column name: number of binders}}
    """
    id_column = output["Unnamed: 1"]
    # unique() keeps the order of first appearance, so the result no longer
    # depends on arbitrary set ordering; missing IDs are ignored
    strain_ids = id_column[1:].dropna().unique()
    strains = {}
    for strain_id in strain_ids:
        strain = output[id_column == strain_id]  # all rows of this strain
        xclusion = strain[~strain['Unnamed: 0'].str.contains('X')]  # exclude all epitopes containing IUPAC aa 'X'
        # Keep only the rank columns and make them numeric, otherwise the
        # ranks are strings and are not compared properly. All rank columns
        # are converted (no fixed 60-allele window), and float64 is kept so
        # the comparison with cutoff is not affected by a float32 downcast.
        ranks = xclusion.iloc[:, 4::3].apply(pd.to_numeric, errors='coerce')
        # Only the numeric rank columns are compared with cutoff; comparing
        # the string columns as well raises TypeError on Python 3.
        binders = (ranks <= cutoff).sum()  # number of binders per allele
        # str() gives native string keys on both Python 2 and 3 (.encode()
        # would give bytes keys on Python 3). The strain ID is used directly
        # instead of reading it back from the second row, which failed for
        # strains with a single row.
        strains[str(strain_id)] = {
            str(allele): n_binders for allele, n_binders in binders.to_dict().items()
        }
    return strains

In [8]:
def to_dict_abs_grouped(proteins_grouped, cutoff):
    """Count binding epitopes per protein group, strain and allele.

    proteins_grouped -- output of group_for_dict_abs(_coev): dict mapping
                        each protein group (env, pol, gag, acc) to its
                        netMHCpan output
    cutoff           -- rank cutoff passed on to count_per_strain
    returns          -- dict {protein group: result of count_per_strain}
    """
    import sys  # function-scope import, only needed for the progress output

    total = {}
    for protein, output in proteins_grouped.items():
        # progress indicator on a single line; sys.stdout.write behaves the
        # same on Python 2 and 3, unlike the print statement
        sys.stdout.write('%s ' % protein)
        # count no of binding epitopes (rank <= cutoff) per allele per strain
        total[protein] = count_per_strain(output, cutoff)
    return total

# Relative preference

#### Calculate, per HLA allele, the share of each protein among the top 100 strongest-binding epitopes

In [10]:
def group_for_dict_rel(path_template='/Users/pcevaal/Outputs/HIV12_%s_output9.xls'):
    """Read the netMHCpan output of every protein into one annotated DataFrame.

    Every file gets an extra column 'protein' holding 'env', 'gag' or 'pol'
    for those proteins and 'acc' for the six accessory proteins. The first
    row of every file (labels) is dropped before the files are combined.

    path_template -- path of the tab-separated netMHCpan output files, with
                     one '%s' placeholder that receives the protein name.
                     Defaults to the location used originally.
    returns       -- one DataFrame with the rows of all proteins; the
                     original row index of each file is kept, so index
                     labels repeat across proteins
    """
    import sys  # function-scope import, only needed for the progress output

    main_proteins = ('env', 'gag', 'pol')
    parts = []  # one annotated df per protein, concatenated once at the end
    for protein in ['env', 'gag', 'pol', 'tat', 'rev', 'nef', 'vpr', 'vpu', 'vif']:
        # progress indicator on a single line; sys.stdout.write behaves the
        # same on Python 2 and 3, unlike the print statement
        sys.stdout.write('%s ' % protein)
        output = pd.read_csv(path_template % protein, sep='\t', engine='python')  # open netMHCpan output
        # annotate with the protein group: own name for env/gag/pol, 'acc' otherwise
        output['protein'] = protein if protein in main_proteins else 'acc'
        parts.append(output[1:])  # without labels in 1st row
    # a single concat replaces the repeated DataFrame.append calls, which
    # re-copied the growing frame on every iteration and no longer exist in
    # pandas >= 2.0
    return pd.concat(parts) if parts else pd.DataFrame()

In [9]:
def to_dict_rel_grouped(proteins_grouped_rel, HLAknown=None):
    """Count, per allele and strain, how the top 100 binders are spread over the protein groups.

    proteins_grouped_rel -- DataFrame from group_for_dict_rel: column
                            'Unnamed: 0' holds epitope sequences,
                            'Unnamed: 1' strain IDs, 'protein' the protein
                            group, plus one rank column per HLA allele
    HLAknown             -- list of all HLA alleles with known protection
                            score (names of the rank columns). When omitted
                            it is restored from the IPython store, exactly
                            as the former `%store -r HLAknown` line did.
    returns              -- dict with hierarchy [protein][allele][strain] = count
    """
    if HLAknown is None:
        # plain-Python spelling of the IPython magic `%store -r HLAknown`;
        # only works inside an IPython/Jupyter session
        shell = get_ipython()
        shell.run_line_magic('store', '-r HLAknown')
        HLAknown = shell.user_ns['HLAknown']

    id_column = proteins_grouped_rel["Unnamed: 1"]
    relative_preference = {}
    # unique strain names in order of first appearance (first row skipped, missing IDs ignored)
    for strain_id in id_column[1:].dropna().unique():
        strain = proteins_grouped_rel[id_column == strain_id]
        xclusion = strain[~strain['Unnamed: 0'].str.contains('X')]  # exclude all epitopes containing IUPAC aa 'X'
        HLAs = {}
        for HLA in HLAknown:
            if HLA == 'HLA-B18:01':  # ambiguous literature on this allele, make counts 0 to generate empty bar in graph
                counts = {'env': 0, 'gag': 0, 'pol': 0, 'acc': 0}
            else:
                # Rank on the X-filtered rows (previously the unfiltered
                # strain was sorted, so the exclusion had no effect) and on
                # numeric values: ranks read as strings sort
                # lexicographically ('10.0' before '2.0').
                ranks = pd.to_numeric(xclusion[HLA], errors='coerce')
                # positions of the 100 lowest ranks; stable sort, NaN last
                best = ranks.values.argsort(kind='mergesort')[:100]
                # count occurrences of each protein group in the top 100 peptides
                counts = xclusion['protein'].iloc[best].value_counts().to_dict()
            HLAs[HLA] = counts
        # The strain ID is used directly as key. The former label lookup
        # strain['Unnamed: 1'][1] depends on index label 1, which is missing
        # or duplicated after the per-protein frames were concatenated.
        relative_preference[strain_id] = HLAs
    # hierarchy of relative_preference: [strain][allele][protein]

    # perform double transposing to reorganise dictionary to new hierarchy
    # [protein][allele][strain] = count
    return dict_transpose_ABC_CAB(dict_transpose_AB_BA(relative_preference))

# Conservation

#### Dictionary generation for conservation studies, requires storage of epitope sequences rather than merely counting their number

In [None]:
def to_dict_peptideseqs_grouped(proteins_grouped, cutoff, HLAknown=None):
    """Collect, per protein group, strain and allele, the sequences of all binding epitopes.

    proteins_grouped -- output of group_for_dict_abs: dict mapping each
                        protein group to its netMHCpan output (column
                        'Unnamed: 0' holds epitope sequences, 'Unnamed: 1'
                        strain IDs, plus one rank column per allele)
    cutoff           -- rank cutoff; epitopes with rank <= cutoff are binders
    HLAknown         -- list of HLA alleles (names of the rank columns) to
                        evaluate. When omitted it is restored from the
                        IPython store, exactly as the former
                        `%store -r HLAknown` line did.
    returns          -- dict [protein][strain][allele] = list of epitope sequences
    """
    import sys  # function-scope import, only needed for the progress output

    if HLAknown is None:
        # plain-Python spelling of the IPython magic `%store -r HLAknown`;
        # only works inside an IPython/Jupyter session
        shell = get_ipython()
        shell.run_line_magic('store', '-r HLAknown')
        HLAknown = shell.user_ns['HLAknown']

    total = {}
    for protein in proteins_grouped:
        # progress indicator on a single line; sys.stdout.write behaves the
        # same on Python 2 and 3, unlike the print statement
        sys.stdout.write('%s ' % protein)
        output = proteins_grouped[protein]  # netMHCpan output for one of 4 protein(groups)
        id_column = output["Unnamed: 1"]
        strains = {}
        # sorted set of all strain IDs (first row skipped, missing IDs ignored)
        for strain_id in sorted(set(id_column[1:].dropna())):
            strain = output[id_column == strain_id]
            xclusion = strain[~strain['Unnamed: 0'].str.contains('X')]  # exclude all epitopes containing IUPAC aa 'X'
            alleles = {}
            for allele in HLAknown:
                # Make the ranks numeric, otherwise they are strings and not
                # compared properly. Converting per requested allele removes
                # the fixed 60-allele column window.
                ranks = pd.to_numeric(xclusion[allele], errors='coerce')
                is_binder = (ranks <= cutoff).values
                alleles[allele] = list(xclusion['Unnamed: 0'][is_binder])
            # The strain ID is used directly as key instead of reading it
            # back from the second row, which failed for single-row strains.
            strains[strain_id] = alleles
        total[protein] = strains
    return total

In [None]:
def add_clades(inputdict, Reflist=None):
    """Select the reference strains, group them into clades and reorder the hierarchy.

    inputdict -- nested dict with hierarchy [protein][strain][allele]
    Reflist   -- names of the reference strains to keep (strains from
                 HIV1A-C, HIV2A-B). When omitted it is restored from the
                 IPython store, exactly as the former `%store -r Reflist`
                 line did.
    returns   -- nested dict with hierarchy [allele][protein][clade][strain],
                 holding whatever the input held per allele (count or list
                 of binders)

    NOTE: clade membership is assigned purely by row position in the
    strain-sorted table (4 + 5 + 3 + 4 + 4 = 20 reference strains), so
    Reflist must contain exactly the strains these slices were designed
    for -- confirm against the reference list in use.
    """
    if Reflist is None:
        # plain-Python spelling of the IPython magic `%store -r Reflist`;
        # only works inside an IPython/Jupyter session
        shell = get_ipython()
        shell.run_line_magic('store', '-r Reflist')
        Reflist = shell.user_ns['Reflist']

    #inputdict hierarchy: [protein][strain][allele]
    inputdict_df = pd.DataFrame(inputdict).transpose()
    # inputdict_df hierarchy: [strain][protein][allele]
    inputdict_ABC_temp = {strain: inputdict_df[strain] for strain in Reflist}  # strains from HIV1A-C, HIV2A-B
    # Each row is one strain. The positional slices below rely on the rows
    # being sorted by strain name; older pandas sorted dict keys implicitly,
    # newer versions keep insertion order, so the sort is made explicit.
    tempdf = pd.DataFrame(inputdict_ABC_temp).transpose().sort_index()
    clade_rows = (
        ('HIV2-A', 0, 4),    # first 4 rows are reference strains of clade HIV2-A
        ('HIV2-B', 4, 9),    # next 5 are reference strains of HIV2-B
        ('HIV1-A1', 9, 12),  # etc
        ('HIV1-B', 12, 16),
        ('HIV1-C', 16, 20),
    )
    Clades = {}
    for clade, first, last in clade_rows:
        Clades[clade] = tempdf.iloc[first:last].to_dict()
    # Clades: [clades][protein][strain][allele]
    # to be: [allele][protein][clade][strain]
    fo_grouped_Clade_ABC = dict_transpose_ABCD_DABC(dict_transpose_AB_BA(Clades))

    #%store -r HLAobvious
    #fo_grouped_Clade_ABC_obv = {HLA: fo_grouped_Clade_ABC[HLA] for HLA in HLAobvious[0]}
    # [allele][protein][clade][strain] - count or list of binders
    return fo_grouped_Clade_ABC

In [None]:
def conserved_epitopes_within(inputdict, cutoff):
    """Score how conserved each epitope is among the strains of its own clade.

    For every epitope of a strain, the conservation score is the number of
    OTHER strains in the same allele/protein/clade whose epitope list also
    contains it (0.0 = found in this strain only). Only epitopes with a
    score >= cutoff are kept.

    inputdict -- peptideseqs_clades: nested dict with hierarchy
                 [allele][protein][clade][strain] = list of epitope sequences
    cutoff    -- minimal number of other strains of the clade in which the
                 epitope has to be present to be classified as conserved
                 (1 for minimal conservation)
    returns   -- nested dict
                 [allele][protein][clade][strain] = {epitope: score (float)}
    """
    from collections import Counter  # function-scope import, used only here

    conserved_dict = {}
    for allele, per_protein in inputdict.items():
        proteins = {}
        for protein, per_clade in per_protein.items():
            clades = {}
            for clade, per_strain in per_clade.items():
                # Number of strains of this clade that present each epitope.
                # Counting once up front replaces the repeated scan over
                # every strain's list for every single epitope. set() makes
                # an epitope listed twice for one strain count only once.
                carriers = Counter()
                for binders in per_strain.values():
                    carriers.update(set(binders))
                strains = {}
                for strain, binders in per_strain.items():
                    epitopes = {}
                    for epitope in binders:
                        # minus 1: the strain itself always carries its own epitope
                        conservation = float(carriers[epitope] - 1)
                        if conservation >= cutoff:
                            epitopes[epitope] = conservation
                    strains[strain] = epitopes  # {strain: {epitope: conservation score}}
                clades[clade] = strains
            proteins[protein] = clades
        conserved_dict[allele] = proteins
    return conserved_dict

In [None]:
def conserved_epitopes_all(inputdict, cutoff):
    """Score how conserved each epitope is among all strains of a protein.

    For every epitope of a strain, the conservation score is the number of
    OTHER strains under the same allele/protein whose epitope list also
    contains it (0.0 = found in this strain only). Only epitopes with a
    score >= cutoff are kept.

    inputdict -- peptideseqs: nested dict with hierarchy
                 [allele][protein][strain] = list of epitope sequences
    cutoff    -- minimal number of other strains in which the epitope has
                 to be present to be classified as conserved
                 (1 for minimal conservation)
    returns   -- nested dict
                 [allele][protein][strain] = {epitope: score (float)}
    """
    from collections import Counter  # function-scope import, used only here

    conserved_dict = {}
    for allele, per_protein in inputdict.items():
        proteins = {}
        for protein, per_strain in per_protein.items():
            # Number of strains that present each epitope. Counting once up
            # front replaces the repeated scan over every strain's list for
            # every single epitope. set() makes an epitope listed twice for
            # one strain count only once.
            carriers = Counter()
            for binders in per_strain.values():
                carriers.update(set(binders))
            strains = {}
            for strain, binders in per_strain.items():
                epitopes = {}
                for epitope in binders:
                    # minus 1: the strain itself always carries its own epitope
                    conservation = float(carriers[epitope] - 1)
                    if conservation >= cutoff:
                        epitopes[epitope] = conservation
                strains[strain] = epitopes
            proteins[protein] = strains
        conserved_dict[allele] = proteins
    return conserved_dict