In [None]:
# reorganise to-be plotted dictionary into dataframe that can be inputted into scatter_dot_plot function below
def scatterplot_matrixprep(inputdict, valuecolumnname): # inputdictionary should have structure dict[allele] = value
    %store -r protection_score
    df = pd.DataFrame(columns = ['allele', 'protection', valuecolumnname]) # initiate df
    for allele in inputdict: 
        if protection_score[allele] != 'unknown' and protection_score[allele] != 0: # select HLA alleles with known score
            if protection_score[allele] < 0: # group these alleles in two groups, P/D
                protection = 'Detrimental'
            else:
                protection = 'Protective'
            value = inputdict[allele] # obtain to be plotted y value
            row = [allele, protection, value] # make list containing allele name, protection group (P/D) and y value
            df = df.append(pd.DataFrame([row], columns = ['allele', 'protection', valuecolumnname])) # append row for each allele to df
    return df

In [None]:
def scatter_dot_plot_mannwhitney(df, valuecolumn, ylabel, ymin, ymax): # takes output of scatterplot_matrixprep, name for y values, y axis label name and min/max of y axis
    """Print a one-sided Mann-Whitney U test and show a dot plot of the two protection groups.

    Args:
        df: dataframe with a 'protection' column holding 'Detrimental' /
            'Protective' and a column named `valuecolumn`.
        valuecolumn: name of the column holding the y values.
        ylabel: label for the y axis.
        ymin, ymax: limits of the y axis.

    The test result is printed, nothing is returned. The figure is drawn on
    matplotlib figure number 1 and shown with plt.show().
    """
    group_names = ['Detrimental', 'Protective']
    # split input df into the two groups; the order fixes both the test direction and the x positions
    data = [list(df[df['protection'] == name][valuecolumn]) for name in group_names]
    detrimental, protective = data

    # MannWhitney U test, one-sided: alternative='less' puts Detrimental as the first sample
    # print(...) with a single argument gives the same output on Python 2 and Python 3
    print(stats.mannwhitneyu(detrimental, protective, alternative='less'))

    # Plot graph
    fig = plt.figure(1, figsize=(6, 4))
    ax = fig.add_subplot(111)
    bp = ax.boxplot(data, whis=6) # boxplot is only used for its median line; whiskers at 6 x IQR (arbitrary high number) to prevent showing outliers
    for i, y in enumerate(data): # add individual values for both D and P
        x = np.random.normal(i + 1, 0, len(y)) # standard deviation 0: every dot sits exactly on x = 1 (D) or x = 2 (P)
        plt.plot(x, y, mec='none', mfc='r', ms=4, marker="o", linestyle="None", alpha=0.8)

    # colour everything but the median white so that only the median (black) is visible
    for part in ('whiskers', 'caps', 'boxes'):
        plt.setp(bp[part], color='w')
    plt.setp(bp['medians'], color='k')
    plt.xticks([1, 2], group_names)
    plt.ylim([ymin, ymax])
    plt.ylabel(ylabel)
    plt.show()

In [None]:
def rel_bar_chart(relative_preference_final): # bar chart for relative binding preference of HLA alleles for HIV proteins
    """Show a stacked bar chart (gag, pol, acc, env from bottom to top) with one bar per allele in HLAknown.

    Args:
        relative_preference_final: indexed as [protein][HLA] with protein in
            'env', 'gag', 'pol', 'acc'; every entry is a dict whose values are
            averaged for the bar height and whose standard deviation is the
            error bar (presumably one value per strain -- TODO confirm).

    Uses the module-level HLAknown list, sumzip and add_legend_4.
    """
    ind = list(range(len(HLAknown))) # x positions, one per allele
    width = 0.8 # width of bars

    def _mean_and_std(protein):
        # list(...) makes the values a plain list on both Python 2 and Python 3 before handing them to numpy
        per_allele = [list(relative_preference_final[protein][HLA].values()) for HLA in HLAknown]
        return [np.mean(v) for v in per_allele], [np.std(v) for v in per_allele]

    # for each protein: mean and std. dev over the values of each allele
    env, enverr = _mean_and_std('env')
    gag, gagerr = _mean_and_std('gag')
    pol, polerr = _mean_and_std('pol')
    acc, accerr = _mean_and_std('acc')

    # HLA-B18:05 has ambiguous protection, so its x-label is blanked
    # NOTE(review): this assumes that allele sits at position 16 of HLAknown, and only the label is
    # removed -- a bar is still drawn there from whatever values the input holds; confirm against the data
    xlabels = HLAknown[:16] + [" "] + HLAknown[17:]

    # 4 stacked bar series in the same figure; bottom is the summed height of the series below
    plt.bar(ind, gag, width, color='skyblue', yerr=gagerr)
    plt.bar(ind, pol, width, bottom=sumzip(gag), color='yellowgreen', yerr=polerr)
    plt.bar(ind, acc, width, bottom=sumzip(gag, pol), color='gold', yerr=accerr)
    plt.bar(ind, env, width, bottom=sumzip(gag, pol, acc), color='lightcoral', yerr=enverr)
    plt.xticks(ind, (xlabels), rotation=90) # define position of x ticks, labels, and vertical rotation
    plt.ylim([0,100])
    add_legend_4()
    plt.ylabel("no. of binders")
    plt.title("Relative binding preference per HLA allele")
    plt.show()

In [None]:
# below graph will separate the no. of presented peptides for the various clades, therefore, we make a separate graph for each allele
def bar_chart_clades(allele, fo_grouped_Clade_ABC_obv): # bar chart per HIV clade of no. of presented peptides per protein
    """Show a stacked bar chart (gag, pol, acc, env from bottom to top) with one bar per clade for one allele.

    Args:
        allele: key into `fo_grouped_Clade_ABC_obv`; also used as the plot title.
        fo_grouped_Clade_ABC_obv: indexed as [allele][protein][clade] with
            protein in 'env', 'gag', 'pol', 'acc'; every entry is a dict whose
            values are averaged for the bar height and whose standard
            deviation is the error bar (presumably one value per strain --
            TODO confirm).

    The clades are taken from the 'env' entry and must also exist for the
    other three proteins. Uses the module-level sumzip and add_legend_4.
    """
    per_protein = fo_grouped_Clade_ABC_obv[allele]
    xlabels = sorted(per_protein['env']) # clade names in sorted order; sorted() works on Python 2 and 3
    ind = list(range(len(xlabels))) # x positions, one per clade
    width = 0.5 # width of bars

    def _mean_and_std(protein):
        # list(...) makes the values a plain list on both Python 2 and Python 3 before handing them to numpy
        per_clade = [list(per_protein[protein][clade].values()) for clade in xlabels]
        return [np.mean(v) for v in per_clade], [np.std(v) for v in per_clade]

    # for each protein: one mean and one std. dev per clade, in the order of xlabels
    env, enverr = _mean_and_std('env')
    gag, gagerr = _mean_and_std('gag')
    pol, polerr = _mean_and_std('pol')
    acc, accerr = _mean_and_std('acc')

    # plot one bar series per protein with its error bars; the bottom of each series is the
    # summed height of the series below it (sumzip), which yields the stacked bar chart
    plt.bar(ind, gag, width, color='skyblue', yerr=gagerr)
    plt.bar(ind, pol, width, bottom=sumzip(gag), color='yellowgreen', yerr=polerr)
    plt.bar(ind, acc, width, bottom=sumzip(gag, pol), color='gold', yerr=accerr)
    plt.bar(ind, env, width, bottom=sumzip(gag, pol, acc), color='lightcoral', yerr=enverr)
    plt.xticks(ind, (xlabels), rotation=90) # define position of x ticks, labels, and vertical rotation
    plt.title(allele)

    add_legend_4() # add legend with color code of the 4 proteins
    plt.ylabel('no. of binders')
    plt.ylim([0,100])
    plt.show()

In [None]:
def sumzip(*items): # any number of equally long sequences: the y-values of all stacks below the next one
    """Return the element-wise sum of the given sequences as a list.

    Used to find the bottom y-value of the next section of a stacked bar
    chart. Like zip, the result is as long as the shortest input; with no
    input at all the result is an empty list.
    """
    columns = zip(*items) # regroup: one tuple per x position
    return list(map(sum, columns))

In [None]:
def add_legend_4(): # legend for the 4 protein groups, colours matching the stacked bar charts
    """Attach a four-entry legend (Env, Acc, Pol, Gag) to the current matplotlib axes.

    The entries are listed top-to-bottom in the same order as the stacks of
    the bar charts, and the legend is anchored at axes position (1.04, 0.71).
    """
    import matplotlib.patches as mpatches
    # (label, colour) in the order the entries appear in the legend
    legend_entries = [
        ('Env', 'lightcoral'),
        ('Acc', 'gold'),
        ('Pol', 'yellowgreen'),
        ('Gag', 'skyblue'),
    ]
    handles = [mpatches.Patch(color=colour, label=label) for label, colour in legend_entries]
    plt.legend(handles=handles, loc=(1.04,0.71))

In [None]:
def bar_chart_coev(inputdict, allele): # takes inputdictionary indexed as dict[population][clade], and allele (HLA-A/ HLA-B) if inputdictionary is a subselection
    """Show a grouped bar chart with, per population, one bar for HIV1-B and one for HIV1-C.

    Args:
        inputdict: indexed as [population][clade] with clade 'HIV1-B' and
            'HIV1-C'; every entry is a dict whose values are averaged for the
            bar height and whose standard deviation is the error bar
            (presumably numbers of presented peptides -- TODO confirm what the
            inner keys are).
        allele: text appended to the title 'Coevolution '.
    """
    pops = list(inputdict) # list of populations; a real list on both Python 2 and 3 so it can serve as tick labels
    index = np.arange(len(pops)) # indicate position of x ticks
    bar_width = 0.35

    def _mean_and_std(clade):
        # list(...) makes the values a plain list on both Python 2 and Python 3 before handing them to numpy
        per_pop = [list(inputdict[pop][clade].values()) for pop in pops]
        return [np.mean(v) for v in per_pop], [np.std(v) for v in per_pop]

    # means and std. devs per population, in the order of pops
    means_HIV1B, std_HIV1B = _mean_and_std('HIV1-B')
    means_HIV1C, std_HIV1C = _mean_and_std('HIV1-C')

    # plot two bars next to one another, i.e. x-position of second plot needs to be adjusted (index + bar_width)
    plt.bar(index, means_HIV1B, bar_width, color='teal', yerr=std_HIV1B, label='HIV1-B')
    plt.bar(index + bar_width, means_HIV1C, bar_width, color='mediumturquoise', yerr=std_HIV1C, label='HIV1-C')

    plt.xlabel('Population')
    plt.ylabel('No. of binders (std)')
    plt.title('Coevolution ' + allele) # if inputdictionary was subselection, this will be mentioned in title
    plt.xticks(index + bar_width / 2, pops) # position x-labels in middle of two bars
    plt.legend(loc=(1.04,0.85))

    plt.show()