# Mutational Distance and Diversity

This script calculates and plots (1) the mutational distance of each experimentally evolved PSE1 and AAC6 sequence set from its wild-type (query) sequence and (2) the pairwise sequence differences between random subsets of sequences within each sequence set.

Requires `mutational_distance_and_diversity_data.zip` from https://evcouplings.org/3Dseq

Requires Biopython, pandas, scipy, numpy, matplotlib and seaborn


In [0]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
%config InlineBackend.figure_format='png'

In [0]:
import os, sys, copy, time
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import scipy

from Bio import SeqIO
from Bio import pairwise2

# Global Variables

In [0]:
# Base directory; the data folders and the figure folder are resolved from it.
PROJECT_ROOT = '.'

# Labels used to tag the sequence sets.
ROUNDS = 'ROUNDS'
QUERY = 'QUERY'
NATURAL_FULL = 'Natural Full'
RND2 = 'Round 2'
RND4 = 'Round 4'
RND8 = 'Round 8'
RND10 = 'Round 10'
RND20 = 'Round 20'

ALIGNMENT_DIR_PSE1 = '{0}/data.PSE1'.format(PROJECT_ROOT)
ALIGNMENT_DIR_AAC6 = '{0}/data.AAC6'.format(PROJECT_ROOT)
FIGURE_OUTPUT_DIR = '{0}/figures'.format(PROJECT_ROOT)

# Settings for the PSE1 dataset: input files per label and the plot colour
# of each label (colours taken from the "tab10" matplotlib palette).
PSE1_ATTRIBUTES = dict(
    name='PSE1',
    alignment_directory=ALIGNMENT_DIR_PSE1,
    full_natural_alignment_filename=(
        '{0}/7fa1c5691376beab198788a726917d48_b0.4.a2m'.format(ALIGNMENT_DIR_PSE1)
    ),
    label_filenames={
        QUERY: '{0}/PSE1.fas'.format(ALIGNMENT_DIR_PSE1),
        RND20: '{0}/Rnd20_init.fas'.format(ALIGNMENT_DIR_PSE1),
        RND10: '{0}/Rnd10.fas'.format(ALIGNMENT_DIR_PSE1),
    },
    label_color={
        NATURAL_FULL: 'black',
        RND10: '#3a76af',
        RND20: '#ef8536',
        QUERY: 'yellow',
    },
)

# Settings for the AAC6 dataset, same layout as PSE1_ATTRIBUTES.
AAC6_ATTRIBUTES = dict(
    name='AAC6',
    alignment_directory=ALIGNMENT_DIR_AAC6,
    full_natural_alignment_filename=(
        '{0}/44883374318b63406a7415d2f4d4cfc1_b0.4.a2m'.format(ALIGNMENT_DIR_AAC6)
    ),
    label_filenames={
        QUERY: '{0}/WT_PSEAB.fas'.format(ALIGNMENT_DIR_AAC6),
        RND2: '{0}/G2_PSEAB_0_St115_50ksub.fas'.format(ALIGNMENT_DIR_AAC6),
        RND4: '{0}/G4_PSEAB_0_St115_50ksub.fas'.format(ALIGNMENT_DIR_AAC6),
        RND8: '{0}/G8_all_PSEAB_0_St115_50ksub.fas'.format(ALIGNMENT_DIR_AAC6),
    },
    label_color={
        NATURAL_FULL: 'black',
        RND2: '#c53932',
        RND4: '#8d6bb8',
        RND8: '#85584e',
        QUERY: 'yellow',
    },
)

# The 20 standard amino acids and the two lookup tables between a residue
# letter and its column in the one-hot encoding.
ALPHABET = 'ACDEFGHIKLMNPQRSTVWY'
AA_IDX_DICT = dict(zip(ALPHABET, range(len(ALPHABET))))
IDX_AA_DICT = dict(enumerate(ALPHABET))

# Setup Data Munging Functions

In [0]:
def getMutationCountFromQuery(query_in_binary, sequences_in_binary):
    '''
    Counts, for every sequence, how many positions differ from the query.

    Parameters:
        query_in_binary:     flattened one-hot encoding of the query
                             (len(ALPHABET) entries per position)
        sequences_in_binary: iterable of flattened one-hot encodings, one
                             per sequence to compare against the query

    Returns:
        a list with one entry per sequence: the number of query positions
        minus the number of positions at which query and sequence carry the
        same one-hot entry. A position that is all zeros in either encoding
        can never match and is therefore counted as a difference.
    '''
    return [
        (len(query_in_binary)/len(ALPHABET)) - np.sum(
            np.logical_and(query_in_binary, candidate_in_binary)
        )
        for candidate_in_binary in sequences_in_binary
    ]

def encodeSequencesAsBinary(sequences, flatten_each_sequence = False):
    '''
    One-hot encodes a set of polypeptide sequences.

    Parameters:
        sequences:             iterable of sequence strings or of Biopython
                               SeqRecord objects (the first element decides
                               which of the two is assumed)
        flatten_each_sequence: if True, each sequence is returned as one
                               flat vector instead of a
                               (position, amino acid) matrix

    Returns:
        a float array of shape
        (num sequences, length of first sequence, len(ALPHABET)), or
        (num sequences, length of first sequence * len(ALPHABET)) when
        flattened. Characters that are not in ALPHABET (e.g. gaps) are
        encoded as all zeros. An empty input yields an empty array.
    '''
    #materialise first so that positional access works for any iterable,
    #including a pandas Series whose index does not contain the label 0
    sequences = list(sequences)
    if not sequences:
        if flatten_each_sequence:
            return np.zeros((0, 0))
        return np.zeros((0, 0, len(ALPHABET)))

    if isinstance(sequences[0], SeqIO.SeqRecord):
        sequences = [str(record.seq) for record in sequences]

    seq_arr = np.zeros((len(sequences), len(sequences[0]), len(ALPHABET)))
    for i, seq in enumerate(sequences):
        for j, aa in enumerate(seq):
            aa_idx = AA_IDX_DICT.get(aa)
            if aa_idx is not None: #non-normal letters just get all zeros
                seq_arr[i, j, aa_idx] = 1.

    if flatten_each_sequence:
        return seq_arr.reshape((len(sequences), -1))
    return seq_arr

def loadSequenceFile(filename, force_upper=True):
    '''
    Reads all records of a fasta file into a list.

    Parameters:
        filename:    path of the fasta file
        force_upper: if True, the sequence of every record is replaced by
                     its upper-case version

    Returns:
        the list of records produced by SeqIO.parse, in file order
    '''
    print('Loading "{0}"'.format(filename))
    records = list(SeqIO.parse(filename, 'fasta'))
    if force_upper:
        for record in records:
            record.seq = record.seq.upper()
    return records


def loadFastafileIntoDF(attributes, 
                        filename, 
                        label):
    '''
    Load a single fastafile into a dataframe.

    Parameters:
        attributes: dataset attribute dictionary; the query file is read
                    from attributes['label_filenames'][QUERY] and the
                    colour from attributes['label_color'][label]
        filename:   fasta file with the sequences to load
        label:      label stored with every row

    Returns:
        a dataframe with one row per sequence and the columns
        seq_record, label, color, seq_name, seq_str, num_aminoacids,
        seq_binary, seq_binary_flat and mutcount_from_query.
    '''
    query_sequence = loadSequenceFile(
        attributes['label_filenames'][QUERY]
    )[0]

    toreturn = pd.DataFrame()
    seq_records = loadSequenceFile(filename)

    toreturn['seq_record'] = seq_records
    toreturn['label'] = label
    toreturn['color'] = attributes['label_color'][label]
    toreturn['seq_name'] = [seq.name for seq in toreturn['seq_record']]
    toreturn['seq_str'] = [str(seq.seq) for seq in toreturn['seq_record']]
    #length without the gap characters '.' and '-'
    toreturn['num_aminoacids'] = [
        len(seq_str.replace('.','').replace('-','')) for seq_str in toreturn['seq_str']
    ]

    #encode the sequences once; the flat form is only a reshape of the same
    #array, so the slow per-residue encoding loop is not run a second time.
    #NOTE: the rows of seq_binary_flat are views that share memory with the
    #matrices in seq_binary.
    encoded = encodeSequencesAsBinary(toreturn['seq_record'])
    toreturn['seq_binary'] = list(encoded)
    toreturn['seq_binary_flat'] = list(encoded.reshape(
        (encoded.shape[0], encoded.shape[1] * encoded.shape[2])
    ))

    #mut count from the query sequence
    toreturn['mutcount_from_query'] = getMutationCountFromQuery(
        encodeSequencesAsBinary([query_sequence], True)[0],
        toreturn.seq_binary_flat
    )

    return toreturn


def loadDataframe(attributes, 
                  labels_list=None):
    '''
    Load the dataframe for a set of sequences.
    
    Parameters:
        attributes:       a dictionary of attributes for the requested
                          dataset to load
        labels_list:      a list of labels to load. If None, then the
                          full list will be used that is specified in: 
                             attributes['label_filenames'].keys()

    Returns:
        one dataframe holding the rows of all requested labels, in the
        order of labels_list and with a fresh 0..n-1 index. An empty
        dataframe is returned when no label is requested.
    '''
    if labels_list is None:
        labels_list = attributes['label_filenames'].keys()

    label_frames = []
    for lbl in labels_list:
        #the natural alignment has its own entry in the attributes
        if lbl == NATURAL_FULL:
            filename = attributes['full_natural_alignment_filename']
        else:
            filename = attributes['label_filenames'][lbl]

        label_frames.append(loadFastafileIntoDF(attributes, filename, lbl))

    #pd.concat raises on an empty list
    if not label_frames:
        return pd.DataFrame()

    #DataFrame.append was removed in pandas 2.0; a single concat is also
    #cheaper than growing the dataframe one label at a time
    return pd.concat(label_frames, ignore_index=True)
    

# Setup Plotting Functions

In [0]:
def calculatePairwiseSeqIdStats(dataframe, 
                                pairwise_samplesize, 
                                only_labels=None, test=False):
    '''
    Calculate pairwise sequence statistics for all labels in the
    given dataframe. Two random, non-overlapping subsamples of sequences 
    of size 'pairwise_samplesize' are compared to each other.

    Parameters:
        dataframe:           dataframe with the columns 'label' and
                             'seq_binary_flat' (flattened 0/1 encodings of
                             equal length within a label)
        pairwise_samplesize: number of sequences in each of the two
                             subsamples; pandas raises a ValueError if a
                             label has too few sequences
        only_labels:         labels to process. If None, all labels of the
                             dataframe are used. The QUERY label is always
                             skipped.
        test:                unused; kept for backward compatibility

    Returns:
        a dictionary with the keys 'pairwise_sequence_ids',
        'pairwise_sequence_diffs' and 'num_matching_residues'. Each maps
        label -> list with one entry per (left, right) pair, ordered with
        the left sequence as the outer and the right as the inner index.
        Identities are percentages relative to the larger number of encoded
        residues of the two sequences; differences are 100 - identity.
    '''
    lbl_data = {
        'pairwise_sequence_ids': {},
        'pairwise_sequence_diffs': {},
        'num_matching_residues': {},
    }

    if only_labels is not None:
        labels = only_labels
    else:
        labels = sorted(pd.unique(dataframe.label))

    for lbl in labels:
        if lbl == QUERY: continue

        label_df = dataframe[dataframe.label == lbl]

        #non-overlapping subsample sequences in with this label
        subsample_left = label_df.sample(pairwise_samplesize)
        subsample_right = label_df.drop(subsample_left.index).sample(pairwise_samplesize)

        if len(subsample_left) == 0 or len(subsample_right) == 0:
            #no pairs to compare
            lbl_data['pairwise_sequence_ids'][lbl] = []
            lbl_data['pairwise_sequence_diffs'][lbl] = []
            lbl_data['num_matching_residues'][lbl] = []
            continue

        #one row per sequence; any non-zero entry counts as set
        left = (np.stack(list(subsample_left.seq_binary_flat)) != 0).astype(np.float64)
        right = (np.stack(list(subsample_right.seq_binary_flat)) != 0).astype(np.float64)

        #for 0/1 vectors the dot product is the number of entries set in
        #both, so one matrix product yields the matching residue count of
        #every (left, right) pair without looping over the pairs
        num_matching = np.dot(left, right.T)

        #number of encoded residues per sequence; every pair is normalised
        #by the larger of its two
        largest_seq_length = np.maximum(
            left.sum(axis=1)[:, np.newaxis],
            right.sum(axis=1)[np.newaxis, :]
        )
        sequence_ids = 100.0 * num_matching / largest_seq_length

        lbl_data['pairwise_sequence_ids'][lbl] = sequence_ids.ravel().tolist()
        lbl_data['pairwise_sequence_diffs'][lbl] = (100 - sequence_ids).ravel().tolist()
        lbl_data['num_matching_residues'][lbl] = num_matching.astype(int).ravel().tolist()
    return lbl_data



#
#
# plotting functions
# 
#
        
def plotMutationCountVsPairwiseSequenceID(dataframe, attributes,
                                          pairwise_samplesize = 100,
                                          figure_size=10,
                                          save_as_filename=None):
    '''
    Generates a plot of the median of the mutation count from wt vs the
    median of pairwise sequence id (randomly subsampled), with the standard
    deviations as error bars. Also includes histograms next to the axes.

    Two figures are drawn: one with all labels and one without the natural
    sequences. The QUERY label is never drawn.

    Parameters:
        dataframe:           dataframe as returned by loadDataframe; must
                             contain a row with the QUERY label
        attributes:          unused; kept for backward compatibility
        pairwise_samplesize: size of each of the two random subsamples
                             whose sequences are compared pairwise
        figure_size:         height passed to seaborn's JointGrid
        save_as_filename:    if given, the figures are saved as
                             <save_as_filename>_allseqs.pdf and
                             <save_as_filename>_labonly.pdf
    '''
    marker = 'o'

    #calculate sequenceids between random subsample pairs first so the
    #same subsampled pairs are used in both plots
    pairwise_stats = calculatePairwiseSeqIdStats(
        dataframe, pairwise_samplesize
    )

    #the query name is part of every series label and does not change
    query_name = list(dataframe[dataframe.label == QUERY].seq_name)[0]

    #plots
    for include_natural in [True, False]:
        g = None
        for lbl in sorted(pd.unique(dataframe.label)):
            if lbl == QUERY: continue
            if lbl == NATURAL_FULL and not include_natural: continue

            label_df = dataframe[dataframe.label == lbl]
            mutation_counts = label_df.mutcount_from_query
            pairwise_sequenceid = pairwise_stats['pairwise_sequence_ids'][lbl]

            median_mutation_count = np.median(mutation_counts)
            stdev_mutation_count = np.std(mutation_counts)

            median_pairwise_sequenceid = np.median(pairwise_sequenceid)
            stdev_pairwise_sequenceid = np.std(pairwise_sequenceid)

            color = list(label_df.color)[0]
            chartlbl = '{0} ({1})'.format(lbl, query_name)

            if g is None:
                #NOTE(review): creating an empty JointGrid and assigning
                #g.x / g.y afterwards depends on seaborn internals --
                #confirm against the installed seaborn version
                g = sns.JointGrid(x=[], y=[], data=None, height=figure_size)

            #plot error bars
            g.x = median_pairwise_sequenceid
            g.y = median_mutation_count
            g.plot_joint(
                plt.errorbar, 
                xerr=stdev_pairwise_sequenceid,
                yerr=stdev_mutation_count, 
                color=color,
                alpha=.4,
                elinewidth=10,
            )

            #plot main points
            g.plot_joint(
                plt.scatter, 
                color=color,
                marker=marker,
                s=100,
                label=chartlbl
            )

            if lbl == NATURAL_FULL:
                #choose whole numbers for bins -- is smooth as the natural
                #sequences have variable insertions/deletions and so we end
                #up with quite random fractions for each pairwise sequenceid.
                #np.arange excludes its stop value, so go one past the
                #highest edge to keep the largest identities in the plot.
                lowest_edge = int(np.floor(np.min(pairwise_sequenceid)))
                highest_edge = max(
                    int(np.ceil(np.max(pairwise_sequenceid))),
                    lowest_edge + 1
                )
                g.ax_marg_x.hist(
                    pairwise_sequenceid,
                    color=color,
                    alpha = .75,
                    bins=np.arange(lowest_edge, highest_edge + 1, 1),
                    density=True #normalize the histogram
                )
            else:
                #for lab sequences, calculate bins from the raw number
                #of shared amino acids for each sequence and then plot
                #as a percentage -- auto binning is very ugly
                #NOTE: only possible as sequence length is always the
                #      same for the lab sequences
                num_matching_residues_list = pairwise_stats['num_matching_residues'][lbl]
                #one bin per integer count k, spanning k-0.5 .. k+0.5, up
                #to and including the largest count
                densities, edges = np.histogram(
                    num_matching_residues_list, 
                    bins=np.arange(
                        int(np.min(num_matching_residues_list)) - 0.5, 
                        int(np.max(num_matching_residues_list)) + 1.5,
                        1
                    ),
                    density=True
                )
                #bar centers are the integer counts, expressed as a
                #percentage of the sequence length
                seq_length = len(list(label_df.seq_str)[0])
                bar_centers = (edges[:-1] + 0.5) * 100.0 / seq_length
                g.ax_marg_x.bar(
                    bar_centers, densities,
                    color=color, alpha=0.75, 
                    width=100.0 / seq_length #one residue; also valid for a single bar
                )

            #one bin per integer mutation count, centered on the count and
            #including the largest count
            g.ax_marg_y.hist(
                mutation_counts,
                orientation = 'horizontal',
                color=color,
                alpha = .75,
                density=True,  #normalize the histogram
                bins=np.arange(
                    int(np.min(mutation_counts)) - 0.5, 
                    int(np.max(mutation_counts)) + 1.5,
                    1
                )
            )

        #nothing was drawn for this variant, so there is no grid to label
        if g is None:
            continue

        g.ax_joint.xaxis.get_label().set_text(
            'median(pairwise sequence id)\n{0:,d} random pairs'.format(
                pairwise_samplesize*pairwise_samplesize)
        )
        g.ax_joint.yaxis.get_label().set_text(
            'median(mutation count from wild type)'
        )
        g.ax_joint.xaxis.get_label().set_fontsize(32)
        g.ax_joint.yaxis.get_label().set_fontsize(32)
        for ticklbl in g.ax_joint.xaxis.get_ticklabels():
            ticklbl.set_fontsize(20)
        for ticklbl in g.ax_joint.yaxis.get_ticklabels():
            ticklbl.set_fontsize(20)

        if save_as_filename and include_natural:
            g.savefig(save_as_filename+'_allseqs.pdf')
        elif save_as_filename:
            g.savefig(save_as_filename+'_labonly.pdf')
            
def plotMutcountHistograms(dataframe, attributes,
                           only_labels=None,
                           add_mean_indicator=True,
                           add_mean_indicator_label=True,
                           include_legend=True,
                           figure_size = (8,8),
                           save_as_filename=None,
                           save_as_dpi=300,
                           font_size=24,
                           show_axis_labels=True,
                           show_axis_tick_labels=True,
                           override_num_y_bins = None,
                           override_num_x_bins = None,
                           override_y_bins=None,
                           override_x_bins=None,
                           linewidth=3,
                           show_border=True):
    '''
    Plots, for every label, the normalized distribution of the number of
    mutations relative to the query as a line.

    Parameters:
        dataframe:                dataframe as returned by loadDataframe
        attributes:               dataset attributes; attributes['name'] is
                                  used in the x axis label
        only_labels:              labels to plot. If None or empty, all
                                  labels of the dataframe are plotted. The
                                  QUERY label is never plotted.
        add_mean_indicator:       draw a dashed vertical line at the mean
        add_mean_indicator_label: write the mean next to the curve
        include_legend:           show the legend
        figure_size:              figure size passed to plt.subplots
        save_as_filename:         if given, the figure is saved there
        save_as_dpi:              resolution used when saving
        font_size:                global matplotlib font size
        show_axis_labels:         set the x and y axis labels
        show_axis_tick_labels:    show the tick labels
        override_num_y_bins,
        override_num_x_bins:      number of ticks on the axis; 0 removes
                                  the ticks
        override_y_bins,
        override_x_bins:          explicit tick positions
        linewidth:                line width of curves and mean indicators
        show_border:              draw the frame around the axes
    '''
    plt.rcParams.update({'font.size': font_size})

    labels = sorted(pd.unique(dataframe.label))
    if only_labels: labels = only_labels

    fig, ax = plt.subplots(figsize=figure_size)

    ax.patch.set_alpha(1)
    #iterate the resolved labels, so that the default only_labels=None works
    for lbl in labels:
        if lbl == QUERY: continue

        label_df = dataframe[dataframe.label == lbl]
        color = list(label_df.color)[0]
        mutation_counts = label_df.mutcount_from_query
        mean = np.mean(mutation_counts)

        #one bin per integer count k, spanning k-0.5 .. k+0.5. np.arange
        #excludes its stop value, so the stop is max+1.5 to give the largest
        #count a bin of its own instead of dropping it.
        densities, edges = np.histogram(
            mutation_counts,
            bins=np.arange(-0.5, np.max(mutation_counts)+1.5, 1),
            density=True
        )

        #x and y are passed by keyword: seaborn >= 0.12 no longer accepts
        #them positionally
        sns.lineplot(
            x=edges[1:]-0.5, #upper edge - 0.5 is the integer count of the bin
            y=densities,
            color=color,
            label=lbl,
            linewidth=linewidth,
            ax=ax
        )

        print('{0} has mean mutation count of {1:.1f}'.format(lbl, mean))
        if add_mean_indicator:
            #add mean indicator
            ax.axvline(
                mean,
                color=color, 
                linestyle='dashed', 
                linewidth=linewidth, 
                alpha=0.35
            )
        if add_mean_indicator_label:
            minx, maxx = ax.get_xlim()
            ax.text(
                mean + (maxx-minx)/100, 
                np.max(densities),
                'mean: {0:.1f}'.format(mean),
            )

        #The axis set-up stays inside the loop: the position of the mean
        #label above is derived from the axis limits, which the tick
        #overrides below may change for the following labels.
        if show_axis_labels:
            ax.set_xlabel('# amino acid differences from {0}\n '.format(attributes['name']))
            ax.set_ylabel('density')
        if include_legend:
            ax.legend()
        else:
            legend = ax.get_legend()
            if legend is not None: legend.remove()

        if override_num_y_bins is not None:
            if override_num_y_bins != 0:
                plt.locator_params(axis='y', nbins=override_num_y_bins)
            else: plt.yticks([], [])

        if override_num_x_bins is not None:
            if override_num_x_bins != 0:
                plt.locator_params(axis='x', nbins=override_num_x_bins)
            else: plt.xticks([], [])

        if override_x_bins is not None:
            plt.xticks(override_x_bins)
        if override_y_bins is not None:
            plt.yticks(override_y_bins)

        if not show_axis_tick_labels:
            ax.tick_params(labelbottom=False, labelleft=False)

    if not show_border: plt.box(False)
    plt.tight_layout()

    if save_as_filename:
        plt.savefig(save_as_filename, dpi=save_as_dpi)
        
        
def plotSequenceIdHistograms(dataframe, attributes,
                             only_labels=None,
                             pairwise_samplesize=100,
                             add_mean_indicator=True,
                             add_mean_indicator_label=True,
                             include_legend=True,
                             save_as_filename=None,
                             figure_size = (8,8),
                             save_as_dpi=300,
                             font_size=24,
                             show_axis_labels=True,
                             show_axis_tick_labels=True,
                             override_num_y_bins = None,
                             override_num_x_bins = None,
                             override_y_bins=None,
                             override_x_bins=None,
                             linewidth=3,
                             show_border = True):
    '''
    Plots, for every label, the normalized distribution of the pairwise
    sequence difference (in percent) between two random, non-overlapping
    subsamples of the sequences with that label.

    The mean is only printed and indicated for labels other than
    NATURAL_FULL.

    Parameters:
        dataframe:                dataframe as returned by loadDataframe
        attributes:               unused; kept for backward compatibility
        only_labels:              labels to plot. If None or empty, all
                                  labels of the dataframe are plotted. The
                                  QUERY label is never plotted.
        pairwise_samplesize:      size of each of the two subsamples
        add_mean_indicator:       draw a dashed vertical line at the mean
        add_mean_indicator_label: write the mean next to the curve
        include_legend:           show the legend
        save_as_filename:         if given, the figure is saved there
        figure_size:              figure size passed to plt.subplots
        save_as_dpi:              resolution used when saving
        font_size:                global matplotlib font size
        show_axis_labels:         set the x and y axis labels
        show_axis_tick_labels:    show the tick labels
        override_num_y_bins,
        override_num_x_bins:      number of ticks on the axis; 0 removes
                                  the ticks
        override_y_bins,
        override_x_bins:          explicit tick positions
        linewidth:                line width of curves and mean indicators
        show_border:              draw the frame around the axes
    '''
    plt.rcParams.update({'font.size': font_size})

    labels = sorted(pd.unique(dataframe.label))
    if only_labels: labels = only_labels

    #calculate the statistics for exactly the labels that are plotted
    pairwise_stats = calculatePairwiseSeqIdStats(
        dataframe, pairwise_samplesize, only_labels=labels
    )

    fig, ax = plt.subplots(figsize=figure_size)
    ax.patch.set_alpha(1)
    #iterate the resolved labels, so that the default only_labels=None works
    for lbl in labels:
        if lbl == QUERY: continue

        label_df = dataframe[dataframe.label == lbl]
        color = list(label_df.color)[0]
        pairwise_sequencediffs = pairwise_stats['pairwise_sequence_diffs'][lbl]

        if lbl == NATURAL_FULL:
            #choose whole numbers for bins -- is smooth as the natural
            #sequences have variable insertions/deletions and so we end
            #up with quite random fractions for each pairwise sequenceid.
            #np.arange excludes its stop value, so the stop is +1.5 to keep
            #the largest differences in the histogram.
            densities, edges = np.histogram(
                pairwise_sequencediffs,
                bins=np.arange(
                    int(np.floor(np.min(pairwise_sequencediffs))) - 0.5,
                    int(np.ceil(np.max(pairwise_sequencediffs))) + 1.5,
                    1
                ),
                density=True
            )

            #x and y are passed by keyword: seaborn >= 0.12 no longer
            #accepts them positionally
            sns.lineplot(
                x=edges[1:]-0.5, #upper edge - 0.5 is the whole percent of the bin
                y=densities,
                color=color,
                label=lbl,
                linewidth=linewidth,
                ax=ax
            )
        else:
            #for lab sequences, calculate bins from the raw number
            #of shared amino acids for each sequence and then plot
            #as a percentage -- auto binning is very ugly
            #NOTE: only possible as sequence length is always the
            #      same, which is the case for the lab sequences
            num_matching_residues = pairwise_stats['num_matching_residues'][lbl]
            #one bin per integer count k, spanning k-0.5 .. k+0.5, up to
            #and including the largest count
            densities, edges = np.histogram(
                num_matching_residues, 
                bins=np.arange(
                    int(np.min(num_matching_residues)) - 0.5, 
                    int(np.max(num_matching_residues)) + 1.5,
                    1
                ),
                density=True
            )

            #upper edge - 0.5 is the integer count of the bin; convert the
            #number of shared residues into a percent difference
            seq_length = len(list(label_df.seq_str)[0])
            xoffsets = 100 - ((edges[1:]-0.5) * 100.0 / seq_length)

            sns.lineplot(
                x=xoffsets,
                y=densities,
                color=color,
                label=lbl,
                linewidth=linewidth,
                ax=ax
            )

            mean = np.mean(pairwise_sequencediffs)
            print('{0} has mean pairwise sequence difference of {1:.1f}'.format(
                lbl, mean
            ))

            if add_mean_indicator:
                ax.axvline(
                    mean,
                    color=color, 
                    linestyle='dashed', 
                    linewidth=linewidth, 
                    alpha=0.35
                )
            if add_mean_indicator_label:
                minx, maxx = ax.get_xlim()
                ax.text(
                    mean + (maxx-minx)/100, 
                    np.max(densities),
                    'mean: {0:.1f}%'.format(mean),
                    horizontalalignment='right',
                    verticalalignment='center',
                    fontsize=18
                )

        #The axis set-up stays inside the loop: the position of the mean
        #label above is derived from the axis limits, which the tick
        #overrides below may change for the following labels.
        if show_axis_labels:
            ax.set_xlabel('pairwise sequence difference (%)')
            ax.set_ylabel('density')
        ax.set_ylim(ymin=0) 
        if include_legend:
            ax.legend()
        else:
            legend = ax.get_legend()
            if legend is not None: legend.remove()

        if override_num_y_bins is not None:
            if override_num_y_bins != 0:
                plt.locator_params(axis='y', nbins=override_num_y_bins)
            else: plt.yticks([], [])

        if override_num_x_bins is not None:
            if override_num_x_bins != 0:
                plt.locator_params(axis='x', nbins=override_num_x_bins)
            else: plt.xticks([], [])

        if override_x_bins is not None:
            plt.xticks(override_x_bins)
        if override_y_bins is not None:
            plt.yticks(override_y_bins)

        if not show_axis_tick_labels:
            ax.tick_params(labelbottom=False, labelleft=False)

    if not show_border: plt.box(False)
    plt.tight_layout()  
    if save_as_filename:
        plt.savefig(save_as_filename, dpi=save_as_dpi)

# Load Data

In [0]:
#PSE1: query, natural alignment and the two sequenced rounds
pse1_df = loadDataframe(
    attributes=PSE1_ATTRIBUTES,
    labels_list=[QUERY, NATURAL_FULL, RND10, RND20],
)

In [0]:
#AAC6: query, natural alignment and the three sequenced rounds
aac6_df = loadDataframe(
    attributes=AAC6_ATTRIBUTES,
    labels_list=[QUERY, NATURAL_FULL, RND2, RND4, RND8],
)

# Analysis 1: Mutation Count to Query

In [0]:
#every figure is drawn twice: fully annotated (show_full_data) and without
#annotations
for show_full_data in [True, False]:
    #Saving is switched off. The name is bound on every pass so that the
    #call below does not depend on the order of the loop values.
    #To save the unannotated PSE1 figure, use
    #    FIGURE_OUTPUT_DIR+'/PSE1_mutcount_lab_hist.pdf'
    #here when show_full_data is False.
    save_as_filename = None
    
    plotMutcountHistograms(
        pse1_df, PSE1_ATTRIBUTES,
        only_labels=[RND10, RND20],
        figure_size=(9,8),
        add_mean_indicator=True,
        add_mean_indicator_label=show_full_data,
        include_legend=show_full_data,
        show_axis_labels=show_full_data,
        override_y_bins=[0,.05,.1],
        override_x_bins=[0,20, 40, 60],
        show_axis_tick_labels=show_full_data,
        save_as_filename=save_as_filename
    )

    #The AAC6 figure is not saved either. To save the unannotated figure,
    #pass
    #    save_as_filename=FIGURE_OUTPUT_DIR+'/AAC6_mutcount_lab_hist.pdf'
    #when show_full_data is False.
    plotMutcountHistograms(
        aac6_df, AAC6_ATTRIBUTES,
        only_labels=[RND2, RND4, RND8],
        figure_size=(9,8),
        add_mean_indicator=True,
        add_mean_indicator_label=show_full_data,
        include_legend=show_full_data,
        show_axis_labels=show_full_data,
        override_y_bins=[0,.15,.3],
        override_x_bins=[0, 10, 20],
        show_axis_tick_labels=show_full_data,
    )

# Analysis 2: Pairwise Sequence Differences

In [0]:
#Every figure is drawn twice: once fully annotated and once without any
#labels. The unlabelled version is for the final figure, as the labels are
#added in illustrator.
for show_full_data in [True, False]:
    #the annotated pass uses fewer pairs -- fewer calculations for
    #debugging speed
    pairwise_size = 500 if show_full_data else 5000

    #PSE1, natural sequences
    #(save_as_filename is prepared for every figure but not passed on, so
    #nothing is written to disk; add save_as_filename=save_as_filename to
    #a call to save that figure)
    save_as_filename = (
        None if show_full_data
        else FIGURE_OUTPUT_DIR+'/PSE1_sequenceid_naturallab_hist.pdf'
    )
    plotSequenceIdHistograms(
        pse1_df, PSE1_ATTRIBUTES,
        only_labels=[NATURAL_FULL],
        pairwise_samplesize=pairwise_size,
        figure_size=(8,4),
        font_size=44,
        linewidth=3,
        add_mean_indicator=False,
        add_mean_indicator_label=False,
        include_legend=False,
        show_axis_labels=False,
        show_axis_tick_labels=show_full_data,
        show_border=False,
        override_num_y_bins=0,
        override_x_bins=[0,50,100],
    )

    #PSE1, lab sequences
    save_as_filename = (
        None if show_full_data
        else FIGURE_OUTPUT_DIR+'/PSE1_sequenceid_lab_hist.pdf'
    )
    plotSequenceIdHistograms(
        pse1_df, PSE1_ATTRIBUTES,
        only_labels=[RND10, RND20],
        pairwise_samplesize=pairwise_size,
        figure_size=(9,8),
        add_mean_indicator=True,
        add_mean_indicator_label=show_full_data,
        include_legend=show_full_data,
        show_axis_labels=show_full_data,
        show_axis_tick_labels=show_full_data,
        override_y_bins=[0,.05,.1],
        override_x_bins=[0, 15, 30],
    )

    #AAC6, natural sequences
    save_as_filename = (
        None if show_full_data
        else FIGURE_OUTPUT_DIR+'/AAC6_sequenceid_naturallab_hist.pdf'
    )
    plotSequenceIdHistograms(
        aac6_df, AAC6_ATTRIBUTES,
        only_labels=[NATURAL_FULL],
        pairwise_samplesize=pairwise_size,
        figure_size=(8,4),
        font_size=44,
        linewidth=3,
        add_mean_indicator=False,
        add_mean_indicator_label=False,
        include_legend=False,
        show_axis_labels=False,
        show_axis_tick_labels=show_full_data,
        show_border=False,
        override_num_y_bins=0,
        override_x_bins=[0,50,100],
    )

    #AAC6, lab sequences
    save_as_filename = (
        None if show_full_data
        else FIGURE_OUTPUT_DIR+'/AAC6_sequenceid_lab_hist.pdf'
    )
    plotSequenceIdHistograms(
        aac6_df, AAC6_ATTRIBUTES,
        only_labels=[RND2, RND4, RND8],
        pairwise_samplesize=pairwise_size,
        figure_size=(9,8),
        add_mean_indicator=True,
        add_mean_indicator_label=show_full_data,
        include_legend=show_full_data,
        show_axis_labels=show_full_data,
        show_axis_tick_labels=show_full_data,
        override_y_bins=[0,.1,.2],
        override_x_bins=[0, 10, 20],
    )
    