## Notebook Outline

1. Import statements
2. Merging the metadata and the omics data
3. Making figures and running statistical tests (Mann-Whitney U, Welch's t-test, Fisher's exact test)

In [1]:
### Step 1 - import a bunch of libraries
from pandas import DataFrame, read_table
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from scipy import stats

# Global seaborn styling for every figure drawn below (white-grid background).
sns.set(style="whitegrid", color_codes=True)


In [2]:
#### Step 2 - merging the metadata and the omics data
# These figures look at the global abundance of *groups* of proteins and
# whether those pools change dramatically by condition. We therefore use the
# iBAQ mol-percent measurements from MaxQuant: the question is not how a single
# protein changes, but what percent of the cell is made up of a bulk group.

# Build the path with os.path.join instead of a hard-coded backslash. The old
# literal "R_pal_aux_files\R_pal_iBAQ_MolPercent.txt" only resolved on Windows,
# and "\R" is an invalid string escape that newer Python versions warn about.
R_pal_omics_file = os.path.join("R_pal_aux_files", "R_pal_iBAQ_MolPercent.txt")
df_omics = pd.read_table(R_pal_omics_file, sep="\t")
R_pal_metadata_file = "R_pal_metadata_df.txt"
df_metadata = pd.read_table(R_pal_metadata_file, sep='\t')

# For Figure 1A we plot function using COG, specifically COG_meta, because it
# is a convenient higher-level functional grouping. Slice out only the columns
# we need and *remove duplicates* before merging.
# ALWAYS REMEMBER - the metadata df potentially has duplicate rows once columns
# are removed.
to_keep_meta = ['RefSeq', 'locus', 'COG_meta','Essential_aerobic', 'Essential_phototrophic', 'Essential_longevity']
df_temp = df_metadata[to_keep_meta]
df_metadata_slim1 = df_temp.drop_duplicates()


# The omics table also carries many columns that are not needed here (all the
# replicates); only the per-condition averages are used.
to_keep_omics = ['aerobic', 'nitro_anaerobic', 'nonnitro_anaerobic', 'Protein IDs'] # don't like these names, but they are what they are
df_temp = df_omics[to_keep_omics]
# Rename the two anaerobic columns to the labels used in the figures.
df_omics_slim = df_temp.rename(index=str, columns={'nonnitro_anaerobic':'phototrophic', 'nitro_anaerobic':'N_fixing' })


# how='right' keeps every row of the omics table, including proteins that have
# no metadata row (their metadata columns come out as NaN).
df_1A = df_metadata_slim1.merge(df_omics_slim, left_on='RefSeq', right_on='Protein IDs', how='right')
# 'RefSeq' duplicates 'Protein IDs' after the merge, so drop it.
df_1A = df_1A.drop('RefSeq', axis=1)


In [3]:
# Figure 1A - simple comparison of essential versus non-essential proteins.
# seaborn plots from "long" data, so the three per-experiment measurement
# columns are collapsed into a single 'expression' column, with an
# 'experiments' column recording which measurement each row came from.
# The result is projected into a new frame, df_1A_melt.
#
# Essentiality is available for three conditions; this first graphic only
# carries the 'aerobic' one (Essential_aerobic) through the melt.
experiments = ['aerobic', 'N_fixing', 'phototrophic']
df_1A_melt = df_1A.melt(
    id_vars=['Essential_aerobic', 'Protein IDs'],
    value_vars=experiments,
    var_name='experiments',
    value_name='expression',
)
df_1A_melt.head()

Unnamed: 0,Essential_aerobic,Protein IDs,experiments,expression
0,True,WP_011155572.1,aerobic,0.002384
1,True,WP_011155573.1,aerobic,0.066656
2,True,WP_011155575.1,aerobic,0.010625
3,False,WP_011155576.1,aerobic,0.044177
4,False,WP_011155578.1,aerobic,0.001923


In [21]:
# Plotting for Figure 1A
# One pair of boxes (essential vs. non-essential, by 'Essential_aerobic') for
# each experiment, drawn from the melted frame df_1A_melt.

plt.close('all') # close figures left over from previous runs of this cell
plt.figure(figsize=(20,10))
y_top = 0.2
y_bottom = 0
plt.ylim(y_bottom, y_top)
# Pin the x-axis order explicitly: the annotation loop below writes the p-value
# for experiments[k] at x == k, so the boxes must be drawn in that same order.
sns.boxplot(x='experiments', y='expression', hue='Essential_aerobic', data=df_1A_melt,
            order=experiments, showfliers=False)

# Statistical tests: essential vs. non-essential within each experiment.
essential = df_1A_melt[df_1A_melt['Essential_aerobic'] == True]
nonessential = df_1A_melt[df_1A_melt['Essential_aerobic'] == False]
for x, exp in enumerate(experiments):
    e = essential[essential['experiments'] == exp]['expression']
    n = nonessential[nonessential['experiments'] == exp]['expression']
    # mannwhitneyu returns the U statistic (not a t statistic) and the p-value.
    (u_stat, p_value) = stats.mannwhitneyu(n, e, alternative='two-sided')
    print ("exp: %s \tpval: %s \t num_ess: %s \t num_non: %s"%(exp, round(p_value, 3), len(e), len(n)))
    # Draw a bracket ("U" shape) over this experiment's pair of boxes and write
    # the p-value on top of it.
    y = y_top - 0.025 # height of the bracket legs, a fixed offset below the top of the axis
    h = 0.002 # height of the bracket
    black = 'k' # the color black
    plt.plot([x-0.25, x-0.25, x+0.25, x+0.25], [y, y+h, y+h, y], lw = 1.5, c=black)
    if p_value < 0.01:
        plotstring = "* p ~ %s"%round(p_value, 3)
    else:
        plotstring = "p ~ %s"%round(p_value, 3)
    plt.text(x, y+h, plotstring, ha='center', va='bottom', color=black)


filename = 'boxplot_A.png'
os.makedirs("R_pal_images", exist_ok=True) # plt.savefig does not create a missing folder
filepath = os.path.join("R_pal_images", filename)
plt.savefig(filepath, dpi=300)

#plt.show()

exp: aerobic 	pval: 0.0 	 num_ess: 453 	 num_non: 2090
exp: N_fixing 	pval: 0.0 	 num_ess: 453 	 num_non: 2090
exp: phototrophic 	pval: 0.0 	 num_ess: 453 	 num_non: 2090


In [5]:
## Figure 1B - split out the translation machinery (COG category J)
# This time the COG category has to be kept, and a boolean column is needed
# that says whether a protein is in category J or not.
# That column is produced by merging in a one-row lookup frame that maps
# COG_category 'J' to translation == True; every other category comes out null
# after the merge and is back-filled with False.
Translation = []
Translation.append({'translation': True, 'COG_category':'J'})
df_temp = pd.DataFrame(data=Translation)
# Go back to the original df_metadata and drop the columns that are not needed
# (COG_category is kept); dropping columns can leave duplicate rows, so
# de-duplicate afterwards.
to_drop = ['COG', 'COG_meta', 'Pfam_ID', 'Pfam_name','gene_product']
df_metadata_slim2 = df_metadata.drop(to_drop, axis=1).drop_duplicates()

#step 1 - merge this new 'translation' column into the metadata
df_metadata_slim3 = df_metadata_slim2.merge(df_temp, on='COG_category', how='left')
# Assign the filled column back instead of calling fillna(inplace=True) on
# df[...]: the chained in-place form acts on an intermediate object and does
# not update the frame under pandas copy-on-write. astype(bool) makes the
# resulting dtype explicit rather than relying on implicit downcasting.
df_metadata_slim3['translation'] = df_metadata_slim3['translation'].fillna(False).astype(bool)
#step 2 - now merge in the quantitative values (how='right' keeps every omics row)
df_1B = df_metadata_slim3.merge(df_omics_slim, left_on='RefSeq', right_on='Protein IDs', how='right')
# 'RefSeq' duplicates 'Protein IDs' after the merge, so drop it.
df_1B = df_1B.drop('RefSeq', axis=1)
df_1B.head()

Unnamed: 0,locus,COG_category,Essential_aerobic,Essential_phototrophic,Essential_longevity,translation,aerobic,N_fixing,phototrophic,Protein IDs
0,RPA0001,L,True,False,False,False,0.002384,0.000647,0.000246,WP_011155572.1
1,RPA0002,L,True,False,False,False,0.066656,0.066429,0.065157,WP_011155573.1
2,RPA0004,L,True,False,False,False,0.010625,0.005479,0.011583,WP_011155575.1
3,RPA0005,E,False,False,False,False,0.044177,5.9e-05,0.006953,WP_011155576.1
4,RPA0007,T,False,False,False,False,0.001923,0.003011,0.002104,WP_011155578.1


In [6]:
#### Making Figure 1B
# One figure per experiment. Within each figure, proteins are split on the
# x axis by whether they ARE or ARE NOT translation machinery
# (df_1B['translation']) and coloured by 'Essential_aerobic'.

# x == 0 is non-translation and x == 1 is translation. The same list is passed
# to seaborn as `order=` so the boxes are guaranteed to line up with the
# p-value annotations, whatever order the values first appear in the data.
translation_order = [False, True]
# The essential / non-essential split does not depend on the experiment.
essential = df_1B[df_1B['Essential_aerobic'] == True]
nonessential = df_1B[df_1B['Essential_aerobic'] == False]
os.makedirs("R_pal_images", exist_ok=True) # plt.savefig does not create a missing folder

for data_col in experiments:
    #1. close up previous figures and set some variables
    plt.close('all') # close figures from earlier iterations / previous runs
    plt.figure(figsize=(20,10))
    y_top = 0.75
    y_bottom = 0
    plt.ylim(y_bottom, y_top)
    #2. plot data
    sns.boxplot(x='translation', y=data_col, hue='Essential_aerobic', data=df_1B,
                order=translation_order, showfliers=False)
    #3. now the statistics: Welch's t-test (equal_var=False), essential vs. non-essential
    for x, is_translation in enumerate(translation_order):
        e = essential[essential['translation'] == is_translation][data_col]
        n = nonessential[nonessential['translation'] == is_translation][data_col]
        (t_stat, p_value) = stats.ttest_ind(n, e, equal_var=False)
        #print ("translation: %s \tpval: %s \t num_ess: %s \t num_non: %s"%(is_translation, round(p_value, 3), len(e), len(n)))
        y = y_top - 0.05 # height of the bracket legs, a fixed offset below the top of the axis
        h = 0.01 # height of the bracket
        black = 'k' # the color black
        plt.plot([x-0.25, x-0.25, x+0.25, x+0.25], [y, y+h, y+h, y], lw = 1.5, c=black)
        if p_value < 0.01:
            plotstring = "* p ~ %s"%round(p_value, 3)
        else:
            plotstring = "p ~ %s"%round(p_value, 3)
        plt.text(x, y+h, plotstring, ha='center', va='bottom', color=black)

    filename = 'boxplot_B_%s.png'%(data_col)
    filepath = os.path.join("R_pal_images", filename)
    plt.savefig(filepath, dpi=300)
    #plt.show()

In [7]:
### Fisher's exact test for enrichment of translation proteins in the essential list
# The counts are taken directly from df_1B. Previously this cell reused the
# `essential`, `nonessential` and `data_col` variables left behind by the
# Figure 1B cell; later cells reassign those names, so the answer depended on
# which cell happened to run last.

#### contingency table
#  Essential
#  \   y     n
# _____________
# J  |e_t | n_t |
#
# !J |e_n | n_n |

# Rows where either flag is NaN compare False on both sides and are therefore
# left out of all four cells of the table.
is_essential = df_1B['Essential_aerobic'] == True
is_nonessential = df_1B['Essential_aerobic'] == False
is_translation = df_1B['translation'] == True
is_not_translation = df_1B['translation'] == False

e_t = int((is_essential & is_translation).sum())         # Essential, translation
n_t = int((is_nonessential & is_translation).sum())      # Non-essential, translation
e_n = int((is_essential & is_not_translation).sum())     # Essential, non-translation
n_n = int((is_nonessential & is_not_translation).sum())  # Non-essential, non-translation

oddsratio, pvalue = stats.fisher_exact([[e_t, n_t], [e_n, n_n]])
print (e_t, n_t)
print (e_n, n_n)
print ("fishers is ", pvalue)         

81 68
372 2022
fishers is  1.39824276355e-25


In [27]:
#### Figure 1C - split out metabolic and other
#### Built from the original data frame df_1A

# A significant number of proteins have no COG_meta annotation; label those 'n'.
df_1C = df_1A.copy()
# Assign the filled column back instead of calling fillna(inplace=True) on
# df[...]: the chained in-place form acts on an intermediate object and does
# not update the frame under pandas copy-on-write.
df_1C['COG_meta'] = df_1C['COG_meta'].fillna('n')
COG_meta_labels = ['M', 'T', 'O','n'] # hard-coded because .unique() was returning 'nan'
COG_meta_labels.sort() # sorted so the annotation loop walks the boxes left to right
essential = df_1C[df_1C['Essential_aerobic'] == True]
nonessential = df_1C[df_1C['Essential_aerobic'] == False]
# The sorted frame is the same for every experiment, so build it once.
df_1C_sorted = df_1C.sort_values(by="COG_meta")
os.makedirs("R_pal_images", exist_ok=True) # plt.savefig does not create a missing folder

for data_col in experiments:
    # one figure per experiment column
    plt.close('all') # close figures from earlier iterations / previous runs
    plt.figure(figsize=(20,10))
    y_top = 0.3
    y_bottom = 0
    plt.ylim(y_bottom, y_top)
    # The p-value for COG_meta_labels[k] is written at x == k below, so the
    # boxes must be drawn in exactly that order; order= states it explicitly
    # instead of relying only on the row order of the data.
    sns.boxplot(x='COG_meta', y=data_col, hue='Essential_aerobic', data=df_1C_sorted,
                order=COG_meta_labels, showfliers=False)

    print ("experiment: %s" %data_col)
    for x, COG_m in enumerate(COG_meta_labels):
        # Values for one COG meta-category (e.g. M) in one experiment
        # (e.g. aerobic), split into essential and non-essential.
        e = essential[essential['COG_meta'] == COG_m][data_col]
        n = nonessential[nonessential['COG_meta'] == COG_m][data_col]
        # mannwhitneyu returns the U statistic (not a t statistic) and the p-value.
        (u_stat, p_value) = stats.mannwhitneyu(n, e, alternative='two-sided')
        print ("COG_m: %s \tpval: %s \t num_ess: %s \t num_non: %s"%(COG_m, round(p_value, 3), len(e), len(n)))
        # bracket over this category's pair of boxes, with the p-value on top
        y = y_top - 0.025 # height of the bracket legs, a fixed offset below the top of the axis
        h = 0.002 # height of the bracket
        black = 'k' # the color black
        plt.plot([x-0.25, x-0.25, x+0.25, x+0.25], [y, y+h, y+h, y], lw = 1.5, c=black)
        if p_value < 0.01:
            plotstring = "* %s"%round(p_value, 3)
        else:
            plotstring = "p ~ %s"%round(p_value, 3)
        plt.text(x, y+h, plotstring, ha='center', va='bottom', color=black)
    filename = 'boxplot_C_%s.png'%(data_col)
    filepath = os.path.join("R_pal_images", filename)
    plt.savefig(filepath, dpi=300)


experiment: aerobic
COG_m: M 	pval: 0.0 	 num_ess: 180 	 num_non: 805
COG_m: O 	pval: 0.014 	 num_ess: 87 	 num_non: 429
COG_m: T 	pval: 0.0 	 num_ess: 113 	 num_non: 180
COG_m: n 	pval: 0.0 	 num_ess: 73 	 num_non: 676
experiment: N_fixing
COG_m: M 	pval: 0.0 	 num_ess: 180 	 num_non: 805
COG_m: O 	pval: 0.937 	 num_ess: 87 	 num_non: 429
COG_m: T 	pval: 0.0 	 num_ess: 113 	 num_non: 180
COG_m: n 	pval: 0.001 	 num_ess: 73 	 num_non: 676
experiment: phototrophic
COG_m: M 	pval: 0.0 	 num_ess: 180 	 num_non: 805
COG_m: O 	pval: 0.339 	 num_ess: 87 	 num_non: 429
COG_m: T 	pval: 0.0 	 num_ess: 113 	 num_non: 180
COG_m: n 	pval: 0.0 	 num_ess: 73 	 num_non: 676


In [None]:
# To inspect the data plotted in Figure 1A, uncomment the line below to export it as a tab-separated file.
#df_1A_melt.to_csv('boxplot_A_data.txt', sep="\t", index=False)

In [24]:
# Second pass: set aside every protein flagged Essential_aerobic and see
# whether the picture changes. The df_2* frames use Essential_phototrophic as
# the essentiality flag.
aerobic_essential_rows = df_1A.Essential_aerobic == True
df_2A = df_1A.loc[~aerobic_essential_rows]
df_2A_melt = df_2A.melt(
    id_vars=['Essential_phototrophic', 'Protein IDs'],
    value_vars=experiments,
    var_name='experiments',
    value_name='expression',
)

df_2A_melt.head()

Unnamed: 0,Essential_phototrophic,Protein IDs,experiments,expression
0,False,WP_011155576.1,aerobic,0.044177
1,False,WP_011155578.1,aerobic,0.001923
2,False,WP_011155579.1,aerobic,0.014462
3,False,WP_011155580.1,aerobic,0.006772
4,False,WP_011155581.1,aerobic,0.007074


In [20]:
# Plotting for Figure 2A
# Same layout as 1A, but essentiality comes from 'Essential_phototrophic' and
# the data is the melted frame df_2A_melt.

plt.close('all') # close figures left over from previous runs of this cell
plt.figure(figsize=(20,10))
y_top = 0.2
y_bottom = 0
plt.ylim(y_bottom, y_top)
# Pin the x-axis order explicitly: the annotation loop below writes the p-value
# for experiments[k] at x == k, so the boxes must be drawn in that same order.
sns.boxplot(x='experiments', y='expression', hue='Essential_phototrophic', data=df_2A_melt,
            order=experiments, showfliers=False)

# Statistical tests: essential vs. non-essential within each experiment.
essential = df_2A_melt[df_2A_melt['Essential_phototrophic'] == True]
nonessential = df_2A_melt[df_2A_melt['Essential_phototrophic'] == False]
for x, exp in enumerate(experiments):
    e = essential[essential['experiments'] == exp]['expression']
    n = nonessential[nonessential['experiments'] == exp]['expression']
    # mannwhitneyu returns the U statistic (not a t statistic) and the p-value.
    (u_stat, p_value) = stats.mannwhitneyu(n, e, alternative='two-sided')
    print ("exp: %s \tpval: %s \t num_ess: %s \t num_non: %s"%(exp, round(p_value, 3), len(e), len(n)))
    # Draw a bracket ("U" shape) over this experiment's pair of boxes and write
    # the p-value on top of it.
    y = y_top - 0.025 # height of the bracket legs, a fixed offset below the top of the axis
    h = 0.002 # height of the bracket
    black = 'k' # the color black
    plt.plot([x-0.25, x-0.25, x+0.25, x+0.25], [y, y+h, y+h, y], lw = 1.5, c=black)
    if p_value < 0.01:
        plotstring = "* p ~ %s"%round(p_value, 3)
    else:
        plotstring = "p ~ %s"%round(p_value, 3)
    plt.text(x, y+h, plotstring, ha='center', va='bottom', color=black)


filename = 'boxplot_D.png'
os.makedirs("R_pal_images", exist_ok=True) # plt.savefig does not create a missing folder
filepath = os.path.join("R_pal_images", filename)
plt.savefig(filepath, dpi=300)


exp: aerobic 	pval: 0.0 	 num_ess: 135 	 num_non: 1955
exp: N_fixing 	pval: 0.0 	 num_ess: 135 	 num_non: 1955
exp: phototrophic 	pval: 0.0 	 num_ess: 135 	 num_non: 1955


In [22]:

#### Making Figure 2B
# Same as 1B, but with essentiality coming from 'Essential_phototrophic' and
# with the Essential_aerobic proteins removed first.

df_2B = df_1B.drop(df_1B[df_1B.Essential_aerobic == True].index)
# x == 0 is non-translation and x == 1 is translation. The same list is passed
# to seaborn as `order=` so the boxes are guaranteed to line up with the
# p-value annotations, whatever order the values first appear in the data.
translation_order = [False, True]
# The essential / non-essential split does not depend on the experiment.
essential = df_2B[df_2B['Essential_phototrophic'] == True]
nonessential = df_2B[df_2B['Essential_phototrophic'] == False]
os.makedirs("R_pal_images", exist_ok=True) # plt.savefig does not create a missing folder

for data_col in experiments:
    #1. close up previous figures and set some variables
    plt.close('all') # close figures from earlier iterations / previous runs
    plt.figure(figsize=(20,10))
    y_top = 0.75
    y_bottom = 0
    plt.ylim(y_bottom, y_top)
    #2. plot data
    sns.boxplot(x='translation', y=data_col, hue='Essential_phototrophic', data=df_2B,
                order=translation_order, showfliers=False)
    #3. now the statistics: Welch's t-test (equal_var=False), essential vs. non-essential
    for x, is_translation in enumerate(translation_order):
        e = essential[essential['translation'] == is_translation][data_col]
        n = nonessential[nonessential['translation'] == is_translation][data_col]
        (t_stat, p_value) = stats.ttest_ind(n, e, equal_var=False)
        #print ("translation: %s \tpval: %s \t num_ess: %s \t num_non: %s"%(is_translation, round(p_value, 3), len(e), len(n)))
        y = y_top - 0.05 # height of the bracket legs, a fixed offset below the top of the axis
        h = 0.01 # height of the bracket
        black = 'k' # the color black
        plt.plot([x-0.25, x-0.25, x+0.25, x+0.25], [y, y+h, y+h, y], lw = 1.5, c=black)
        if p_value < 0.01:
            plotstring = "* p ~ %s"%round(p_value, 3)
        else:
            plotstring = "p ~ %s"%round(p_value, 3)
        plt.text(x, y+h, plotstring, ha='center', va='bottom', color=black)

    filename = 'boxplot_E_%s.png'%(data_col)
    filepath = os.path.join("R_pal_images", filename)
    plt.savefig(filepath, dpi=300)
    #plt.show()

In [28]:
#### Figure 2C - split out metabolic and other
#### Built from df_2A, with essentiality from 'Essential_phototrophic'

# Proteins without a COG_meta annotation are labelled 'n'.
df_2C = df_2A.copy()
# Assign the filled column back instead of calling fillna(inplace=True) on
# df[...]: the chained in-place form acts on an intermediate object and does
# not update the frame under pandas copy-on-write.
df_2C['COG_meta'] = df_2C['COG_meta'].fillna('n')
COG_meta_labels = ['M', 'T', 'O', 'n'] # hard-coded because .unique() was returning 'nan'
COG_meta_labels.sort() # sorted so the annotation loop walks the boxes left to right
essential = df_2C[df_2C['Essential_phototrophic'] == True]
nonessential = df_2C[df_2C['Essential_phototrophic'] == False]
# The sorted frame is the same for every experiment, so build it once.
df_2C_sorted = df_2C.sort_values(by="COG_meta")
os.makedirs("R_pal_images", exist_ok=True) # plt.savefig does not create a missing folder

for data_col in experiments:
    # one figure per experiment column
    plt.close('all') # close figures from earlier iterations / previous runs
    plt.figure(figsize=(20,10))
    y_top = 0.3
    y_bottom = 0
    plt.ylim(y_bottom, y_top)
    # The p-value for COG_meta_labels[k] is written at x == k below, so the
    # boxes must be drawn in exactly that order; order= states it explicitly
    # instead of relying only on the row order of the data.
    sns.boxplot(x='COG_meta', y=data_col, hue='Essential_phototrophic', data=df_2C_sorted,
                order=COG_meta_labels, showfliers=False)

    print ("experiment: %s" %data_col)
    for x, COG_m in enumerate(COG_meta_labels):
        # Values for one COG meta-category (e.g. M) in one experiment
        # (e.g. aerobic), split into essential and non-essential.
        e = essential[essential['COG_meta'] == COG_m][data_col]
        n = nonessential[nonessential['COG_meta'] == COG_m][data_col]
        # mannwhitneyu returns the U statistic (not a t statistic) and the p-value.
        (u_stat, p_value) = stats.mannwhitneyu(n, e, alternative='two-sided')
        print ("COG_m: %s \tpval: %s \t num_ess: %s \t num_non: %s"%(COG_m, round(p_value, 3), len(e), len(n)))
        # bracket over this category's pair of boxes, with the p-value on top
        y = y_top - 0.025 # height of the bracket legs, a fixed offset below the top of the axis
        h = 0.002 # height of the bracket
        black = 'k' # the color black
        plt.plot([x-0.25, x-0.25, x+0.25, x+0.25], [y, y+h, y+h, y], lw = 1.5, c=black)
        if p_value < 0.01:
            plotstring = "* %s"%round(p_value, 3)
        else:
            plotstring = "p ~ %s"%round(p_value, 3)
        plt.text(x, y+h, plotstring, ha='center', va='bottom', color=black)
    filename = 'boxplot_F_%s.png'%(data_col)
    filepath = os.path.join("R_pal_images", filename)
    plt.savefig(filepath, dpi=300)


experiment: aerobic
COG_m: M 	pval: 0.0 	 num_ess: 86 	 num_non: 719
COG_m: O 	pval: 0.525 	 num_ess: 18 	 num_non: 411
COG_m: T 	pval: 0.001 	 num_ess: 15 	 num_non: 165
COG_m: n 	pval: 0.298 	 num_ess: 16 	 num_non: 660
experiment: N_fixing
COG_m: M 	pval: 0.0 	 num_ess: 86 	 num_non: 719
COG_m: O 	pval: 0.196 	 num_ess: 18 	 num_non: 411
COG_m: T 	pval: 0.001 	 num_ess: 15 	 num_non: 165
COG_m: n 	pval: 0.231 	 num_ess: 16 	 num_non: 660
experiment: phototrophic
COG_m: M 	pval: 0.0 	 num_ess: 86 	 num_non: 719
COG_m: O 	pval: 0.023 	 num_ess: 18 	 num_non: 411
COG_m: T 	pval: 0.0 	 num_ess: 15 	 num_non: 165
COG_m: n 	pval: 0.06 	 num_ess: 16 	 num_non: 660


In [29]:
# Third pass: starting from df_2A (aerobic-essential proteins already removed),
# also set aside every protein flagged Essential_phototrophic and see whether
# the picture changes. The df_3* frames use Essential_longevity as the
# essentiality flag.
phototrophic_essential_rows = df_2A.Essential_phototrophic == True
df_3A = df_2A.loc[~phototrophic_essential_rows]
df_3A_melt = df_3A.melt(
    id_vars=['Essential_longevity', 'Protein IDs'],
    value_vars=experiments,
    var_name='experiments',
    value_name='expression',
)

df_3A_melt.head()

Unnamed: 0,Essential_longevity,Protein IDs,experiments,expression
0,False,WP_011155576.1,aerobic,0.044177
1,False,WP_011155578.1,aerobic,0.001923
2,False,WP_011155579.1,aerobic,0.014462
3,False,WP_011155580.1,aerobic,0.006772
4,False,WP_011155581.1,aerobic,0.007074


In [37]:
# Plotting for Figure 3A
# Same layout as 1A, but essentiality comes from 'Essential_longevity' and the
# data is the melted frame df_3A_melt.

plt.close('all') # close figures left over from previous runs of this cell
plt.figure(figsize=(20,10))
y_top = 0.1
y_bottom = 0
plt.ylim(y_bottom, y_top)
# Pin the x-axis order explicitly: the annotation loop below writes the p-value
# for experiments[k] at x == k, so the boxes must be drawn in that same order.
sns.boxplot(x='experiments', y='expression', hue='Essential_longevity', data=df_3A_melt,
            order=experiments, showfliers=False)

# Statistical tests: essential vs. non-essential within each experiment.
essential = df_3A_melt[df_3A_melt['Essential_longevity'] == True]
nonessential = df_3A_melt[df_3A_melt['Essential_longevity'] == False]
for x, exp in enumerate(experiments):
    e = essential[essential['experiments'] == exp]['expression']
    n = nonessential[nonessential['experiments'] == exp]['expression']
    # mannwhitneyu returns the U statistic (not a t statistic) and the p-value.
    (u_stat, p_value) = stats.mannwhitneyu(n, e, alternative='two-sided')
    print ("exp: %s \tpval: %s \t num_ess: %s \t num_non: %s"%(exp, round(p_value, 3), len(e), len(n)))
    # Draw a bracket ("U" shape) over this experiment's pair of boxes and write
    # the p-value on top of it.
    y = y_top - 0.025 # height of the bracket legs, a fixed offset below the top of the axis
    h = 0.002 # height of the bracket
    black = 'k' # the color black
    plt.plot([x-0.25, x-0.25, x+0.25, x+0.25], [y, y+h, y+h, y], lw = 1.5, c=black)
    if p_value < 0.01:
        plotstring = "* p ~ %s"%round(p_value, 3)
    else:
        plotstring = "p ~ %s"%round(p_value, 3)
    plt.text(x, y+h, plotstring, ha='center', va='bottom', color=black)


filename = 'boxplot_G.png'
os.makedirs("R_pal_images", exist_ok=True) # plt.savefig does not create a missing folder
filepath = os.path.join("R_pal_images", filename)
plt.savefig(filepath, dpi=300)


exp: aerobic 	pval: 0.913 	 num_ess: 71 	 num_non: 1884
exp: N_fixing 	pval: 0.201 	 num_ess: 71 	 num_non: 1884
exp: phototrophic 	pval: 0.394 	 num_ess: 71 	 num_non: 1884


In [36]:
#### Making Figure 3B
# Same as 1B, but with essentiality coming from 'Essential_longevity' and with
# the Essential_phototrophic proteins removed from df_2B first.

df_3B = df_2B.drop(df_2B[df_2B.Essential_phototrophic == True].index)
# x == 0 is non-translation and x == 1 is translation. The same list is passed
# to seaborn as `order=` so the boxes are guaranteed to line up with the
# p-value annotations, whatever order the values first appear in the data.
translation_order = [False, True]
# The essential / non-essential split does not depend on the experiment.
essential = df_3B[df_3B['Essential_longevity'] == True]
nonessential = df_3B[df_3B['Essential_longevity'] == False]
os.makedirs("R_pal_images", exist_ok=True) # plt.savefig does not create a missing folder

for data_col in experiments:
    #1. close up previous figures and set some variables
    plt.close('all') # close figures from earlier iterations / previous runs
    plt.figure(figsize=(20,10))
    y_top = 0.10
    y_bottom = 0
    plt.ylim(y_bottom, y_top)
    #2. plot data
    sns.boxplot(x='translation', y=data_col, hue='Essential_longevity', data=df_3B,
                order=translation_order, showfliers=False)
    #3. now the statistics: Welch's t-test (equal_var=False), essential vs. non-essential
    for x, is_translation in enumerate(translation_order):
        e = essential[essential['translation'] == is_translation][data_col]
        n = nonessential[nonessential['translation'] == is_translation][data_col]
        (t_stat, p_value) = stats.ttest_ind(n, e, equal_var=False)
        #print ("translation: %s \tpval: %s \t num_ess: %s \t num_non: %s"%(is_translation, round(p_value, 3), len(e), len(n)))
        # NOTE(review): with y_top = 0.10 this puts the bracket at half the axis
        # height, where it may overlap the boxes; the 0.05 / 0.01 offsets match
        # the ones used for Figure 1B (y_top = 0.75) - confirm intended placement.
        y = y_top - 0.05 # height of the bracket legs, a fixed offset below the top of the axis
        h = 0.01 # height of the bracket
        black = 'k' # the color black
        plt.plot([x-0.25, x-0.25, x+0.25, x+0.25], [y, y+h, y+h, y], lw = 1.5, c=black)
        if p_value < 0.01:
            plotstring = "* p ~ %s"%round(p_value, 3)
        else:
            plotstring = "p ~ %s"%round(p_value, 3)
        plt.text(x, y+h, plotstring, ha='center', va='bottom', color=black)

    filename = 'boxplot_H_%s.png'%(data_col)
    filepath = os.path.join("R_pal_images", filename)
    plt.savefig(filepath, dpi=300)
    #plt.show()

In [35]:
#### Figure 3C - split out metabolic and other
#### Built from df_3A, with essentiality from 'Essential_longevity'

# Proteins without a COG_meta annotation are labelled 'n'.
df_3C = df_3A.copy()
# Assign the filled column back instead of calling fillna(inplace=True) on
# df[...]: the chained in-place form acts on an intermediate object and does
# not update the frame under pandas copy-on-write.
df_3C['COG_meta'] = df_3C['COG_meta'].fillna('n')
COG_meta_labels = ['M', 'T', 'O', 'n'] # hard-coded because .unique() was returning 'nan'
COG_meta_labels.sort() # sorted so the annotation loop walks the boxes left to right
essential = df_3C[df_3C['Essential_longevity'] == True]
nonessential = df_3C[df_3C['Essential_longevity'] == False]
# The sorted frame is the same for every experiment, so build it once.
df_3C_sorted = df_3C.sort_values(by="COG_meta")
os.makedirs("R_pal_images", exist_ok=True) # plt.savefig does not create a missing folder

for data_col in experiments:
    # one figure per experiment column
    plt.close('all') # close figures from earlier iterations / previous runs
    plt.figure(figsize=(20,10))
    y_top = 0.10
    y_bottom = 0
    plt.ylim(y_bottom, y_top)
    # The p-value for COG_meta_labels[k] is written at x == k below, so the
    # boxes must be drawn in exactly that order; order= states it explicitly
    # instead of relying only on the row order of the data.
    sns.boxplot(x='COG_meta', y=data_col, hue='Essential_longevity', data=df_3C_sorted,
                order=COG_meta_labels, showfliers=False)

    print ("experiment: %s" %data_col)
    for x, COG_m in enumerate(COG_meta_labels):
        # Values for one COG meta-category (e.g. M) in one experiment
        # (e.g. aerobic), split into essential and non-essential.
        e = essential[essential['COG_meta'] == COG_m][data_col]
        n = nonessential[nonessential['COG_meta'] == COG_m][data_col]
        # mannwhitneyu returns the U statistic (not a t statistic) and the p-value.
        (u_stat, p_value) = stats.mannwhitneyu(n, e, alternative='two-sided')
        print ("COG_m: %s \tpval: %s \t num_ess: %s \t num_non: %s"%(COG_m, round(p_value, 3), len(e), len(n)))
        # bracket over this category's pair of boxes, with the p-value on top
        y = y_top - 0.025 # height of the bracket legs, a fixed offset below the top of the axis
        h = 0.002 # height of the bracket
        black = 'k' # the color black
        plt.plot([x-0.25, x-0.25, x+0.25, x+0.25], [y, y+h, y+h, y], lw = 1.5, c=black)
        if p_value < 0.01:
            plotstring = "* %s"%round(p_value, 3)
        else:
            plotstring = "p ~ %s"%round(p_value, 3)
        plt.text(x, y+h, plotstring, ha='center', va='bottom', color=black)
    filename = 'boxplot_I_%s.png'%(data_col)
    filepath = os.path.join("R_pal_images", filename)
    plt.savefig(filepath, dpi=300)


experiment: aerobic
COG_m: M 	pval: 0.428 	 num_ess: 23 	 num_non: 696
COG_m: O 	pval: 0.148 	 num_ess: 15 	 num_non: 396
COG_m: T 	pval: 0.805 	 num_ess: 7 	 num_non: 158
COG_m: n 	pval: 0.931 	 num_ess: 26 	 num_non: 634
experiment: N_fixing
COG_m: M 	pval: 0.161 	 num_ess: 23 	 num_non: 696
COG_m: O 	pval: 0.495 	 num_ess: 15 	 num_non: 396
COG_m: T 	pval: 0.469 	 num_ess: 7 	 num_non: 158
COG_m: n 	pval: 0.424 	 num_ess: 26 	 num_non: 634
experiment: phototrophic
COG_m: M 	pval: 0.879 	 num_ess: 23 	 num_non: 696
COG_m: O 	pval: 0.566 	 num_ess: 15 	 num_non: 396
COG_m: T 	pval: 0.952 	 num_ess: 7 	 num_non: 158
COG_m: n 	pval: 0.124 	 num_ess: 26 	 num_non: 634
