## Notebook Outline

1. Import statements
2. Merge metadata and omics data
3. Clustering and images

In [1]:
### Step 1 - import a bunch of libraries
from pandas import DataFrame, read_table
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import math
from scipy import stats

# Global seaborn styling for every figure in this notebook (white grid background).
# NOTE(review): per the seaborn docs, color_codes=True remaps matplotlib shorthand
# colors such as 'b' to the seaborn palette; the heatmap row colors below use 'b'/'w',
# so they are presumably affected by this setting - confirm if exact colors matter.
sns.set(style="whitegrid", color_codes=True)


In [2]:
#### Step 2 - merging the metadata and the omics data
# In these images we are looking at individual protein abundance
# and whether these change dramatically by condition. Therefore
# we are using the LFQ measurements from MaxQuant.

# Build the path with os.path.join instead of a hard-coded backslash: the old
# literal contained "\R", which is an invalid escape sequence in Python and
# only resolved to a real file on Windows.
R_pal_omics_file = os.path.join("R_pal_aux_files", "R_pal_LFQ_Foldchange.txt")
df_omics = pd.read_table(R_pal_omics_file, sep="\t")
df_omics.drop_duplicates(inplace=True)#added security measure
# Rename the anaerobic sample columns to the condition names used in the figures.
new_names = {'nonnitro_anaerobic_1':'phototrophic_1', 'nitro_anaerobic_1':'N_fixing_1',
            'nonnitro_anaerobic_2':'phototrophic_2', 'nitro_anaerobic_2':'N_fixing_2',
            'nonnitro_anaerobic_3':'phototrophic_3', 'nitro_anaerobic_3':'N_fixing_3',}
df_omics.rename(index=str, columns=new_names, inplace=True)
R_pal_metadata_file = "R_pal_metadata_df.txt"
df_metadata = pd.read_table(R_pal_metadata_file, sep='\t')

#the figure we are trying to develop shows that essential genes have variable expression.
#so the only real meta-data that I want to keep is the essentiality stuff
to_keep_meta = ['RefSeq', 'Essential_aerobic', 'Essential_phototrophic', 'Essential_longevity']
df_temp = df_metadata[to_keep_meta]
df_metadata_slim = df_temp.drop_duplicates()


#df_2 for figure 2
# how='right' keeps every omics row, even proteins that have no metadata entry.
df_2 = df_metadata_slim.merge(df_omics, left_on='RefSeq',right_on='Protein IDs', how='right')
df_2.drop('RefSeq', axis=1, inplace=True)
### Have to fill in NaN because it really messes up plot engines
# Assign the filled columns back explicitly. The previous
# df_2[col].fillna(False, inplace=True) form is a chained in-place update that
# pandas no longer guarantees will modify df_2 (copy-on-write).
essentiality_columns = ['Essential_aerobic', 'Essential_phototrophic', 'Essential_longevity']
df_2[essentiality_columns] = df_2[essentiality_columns].fillna(False)

##### we are leaving df_2 alone. If you use it for later plotting,
##### copy it out to df_2A or something helpful like that


In [4]:
#Step 3 - clustering
# Seaborn's clustermap wants a purely numeric table, so everything that is
# not a molecular measurement gets stripped from a fresh copy of df_2.
# The identifier column goes first (drop returns a new frame, df_2 is untouched) ...
df_2A = df_2.drop('Protein IDs', axis=1)
# ... then the three essentiality flags are pulled out and kept, because they
# are needed later to colour the heatmap rows.
essential_aerobic, essential_photo, essential_long = (
    df_2A.pop(flag_column)
    for flag_column in ('Essential_aerobic', 'Essential_phototrophic', 'Essential_longevity')
)
df_2A.head()

Unnamed: 0,aerobic_1,aerobic_2,aerobic_3,N_fixing_1,N_fixing_2,N_fixing_3,phototrophic_1,phototrophic_2,phototrophic_3
0,3.780718,4.085548,4.042992,3.00375,-2.982602,-2.982602,-2.982602,-2.982602,-2.982602
1,-0.094884,-0.25553,-0.128221,0.264246,0.309489,0.189894,-0.259114,0.001318,-0.027198
2,0.001257,0.174495,0.057483,-0.373786,-0.280439,-0.239156,0.303335,0.289388,0.067423
3,4.147371,4.070155,4.118913,-5.979446,-5.979446,-5.979446,0.818288,2.346585,2.437026
4,-0.611919,-0.450059,-0.393306,0.444857,0.562133,0.638144,0.033024,-0.069529,-0.153346


In [5]:
##### Step 4 making the actual figure
# creating a coloring of rows, whether they are essential or not.  blue is essential.
# The mapping is spelled out explicitly. Building it from
# essential_aerobic.unique() made the colors depend on which value happened to
# appear first (False first => blue would mean NON-essential) and left values
# unmapped when only one of True/False was present.
lut = {True: 'b', False: 'w'}
c1 = list(essential_aerobic.map(lut))
c2 = list(essential_photo.map(lut))
c3 = list(essential_long.map(lut))
many_labels = (c1, c2, c3) # embedded lists so that we can show several color bars

# open question: how to set the palette via sns.color_palette("coolwarm", 7)?
# (that did not work)
#cmap = color palette, using the blue to red transition through white
#robust = True, meaning please ignore outliers in the color scaling
g = sns.clustermap(df_2A, figsize=(9,25), row_colors=many_labels, robust=True, cmap="RdBu_r") #need to pass in the size arguments
filename = "heatmap_A.png"
# Make sure the output folder exists so savefig does not fail on a fresh checkout.
os.makedirs("R_pal_images", exist_ok=True)
filepath = os.path.join("R_pal_images", filename)
plt.savefig(filepath, dpi=300)
#plt.show()

In [5]:
##### Afterwards ###########
# I have not yet figured out how to remove the index legend from the right side
# of the figure, so there is some slight editing that will happen in Photoshop
# after this. The figure data is the same.

In [6]:
##### Step 5 - making a figure with only differential proteins
## calculate p-values
df_2B = df_2.copy()

# Pick the replicate columns by NAME rather than by position (the old row[4:]
# slice silently depended on the column order produced by the merge).
aerobic_cols = ['aerobic_1', 'aerobic_2', 'aerobic_3']
nitro_cols = ['N_fixing_1', 'N_fixing_2', 'N_fixing_3']
photo_cols = ['phototrophic_1', 'phototrophic_2', 'phototrophic_3']

# One (n_proteins x 3) float array per condition.
aerobic = df_2B[aerobic_cols].astype(float).values
nitro = df_2B[nitro_cols].astype(float).values
photo = df_2B[photo_cols].astype(float).values

#comparisons
# Welch's t-test (equal_var=False), evaluated for all proteins at once: axis=1
# runs the test row by row, which gives the same p-values as looping over
# df_2B.iterrows() but without the Python-level loop.
comparisons = [
    ('p_aerobic_nitro', aerobic, nitro),
    ('p_aerobic_photo', aerobic, photo),
    ('p_photo_nitro', photo, nitro),
]
p_columns = {}
for p_name, group_a, group_b in comparisons:
    (t_stat, p) = stats.ttest_ind(group_a, group_b, axis=1, equal_var=False)
    p_columns[p_name] = p

df_pvalue = pd.DataFrame(p_columns, index=df_2B.index,
                         columns=['p_aerobic_nitro', 'p_aerobic_photo', 'p_photo_nitro'])
### for some proteins, we get arrays that are identical, which messes up the t-test.
### this happens when all values are imputed (to the same thing). Stupid imputation.
### there will be a warning for that. so 'pink text' below!
# Those comparisons come back as NaN; set them to 1, i.e. some terrible pvalue.
df_pvalue = df_pvalue.fillna(1)
df_pvalue['p_min'] = df_pvalue.min(axis=1)
# Keep the index aligned with df_2B and named 'index', as before, so the
# index-based merge in the next cell still lines up.
df_pvalue = df_pvalue.rename_axis('index')

  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


In [7]:
# Attach the p-values to the measurement table; both frames share the same
# row index, so the join is done index-to-index.
df_2B_with_p = pd.merge(df_2B, df_pvalue, left_index=True, right_index=True)
# Keep only proteins whose best pairwise p-value beats the cutoff.
cutoff = 0.01
is_significant = df_2B_with_p['p_min'] < cutoff
df_2B_sig = df_2B_with_p[is_significant]
# Strip the p-value and identifier columns again, leaving the essentiality
# flags and the measurements for plotting.
columns_to_remove = ["p_aerobic_nitro", 'p_aerobic_photo', 'p_photo_nitro', 'p_min', 'Protein IDs']
df_2B_for_plot = df_2B_sig.drop(columns_to_remove, axis=1)
df_2B_for_plot.head()


Unnamed: 0,Essential_aerobic,Essential_phototrophic,Essential_longevity,aerobic_1,aerobic_2,aerobic_3,N_fixing_1,N_fixing_2,N_fixing_3,phototrophic_1,phototrophic_2,phototrophic_3
0,True,False,False,3.780718,4.085548,4.042992,3.00375,-2.982602,-2.982602,-2.982602,-2.982602,-2.982602
1,True,False,False,-0.094884,-0.25553,-0.128221,0.264246,0.309489,0.189894,-0.259114,0.001318,-0.027198
2,True,False,False,0.001257,0.174495,0.057483,-0.373786,-0.280439,-0.239156,0.303335,0.289388,0.067423
3,False,False,False,4.147371,4.070155,4.118913,-5.979446,-5.979446,-5.979446,0.818288,2.346585,2.437026
4,False,False,False,-0.611919,-0.450059,-0.393306,0.444857,0.562133,0.638144,0.033024,-0.069529,-0.153346


In [8]:
# Pull the essentiality flags out of the plotting table; what remains in
# df_2B_for_plot are the numeric measurements only.
essential_aerobic = df_2B_for_plot.pop('Essential_aerobic')
essential_photo = df_2B_for_plot.pop('Essential_phototrophic')
essential_long = df_2B_for_plot.pop('Essential_longevity')

# creating a coloring of rows, whether they are essential or not.  blue is essential.
# The mapping is spelled out explicitly. Building it from
# essential_aerobic.unique() made the colors depend on which value happened to
# appear first (False first => blue would mean NON-essential) and left values
# unmapped when only one of True/False was present.
lut = {True: 'b', False: 'w'}
c1 = list(essential_aerobic.map(lut))
c2 = list(essential_photo.map(lut))
c3 = list(essential_long.map(lut))
many_labels = (c1, c2, c3) # embedded lists so that we can show several color bars

# open question: how to set the palette via sns.color_palette("coolwarm", 7)?
# (that did not work)
#cmap = color palette, using the blue to red transition through white
#robust = True, meaning please ignore outliers in the color scaling
g = sns.clustermap(df_2B_for_plot, figsize=(9,25), row_colors=many_labels, robust=True, cmap="RdBu_r") #need to pass in the size arguments
filename = "heatmap_B.png"
# Make sure the output folder exists so savefig does not fail on a fresh checkout.
os.makedirs("R_pal_images", exist_ok=True)
filepath = os.path.join("R_pal_images", filename)
plt.savefig(filepath, dpi=300)
#plt.show()

In [9]:
# How many proteins survive the significance filter, compared with the full table?
nrows = df_2B_for_plot.shape[0] # after filtering for significant difference
nrows_original, ncols = df_2.shape # original data frame

print(nrows, nrows_original)

1191 1560


In [29]:
###### now only plot those that are essential (and significantly changing)
# Start from an independent deep copy so that df_2 itself is never modified.
df_2C = df_2.copy(deep=True)

#step 1 - create a new column of 'essential somewhere'
def ever_essential(row):
    """Return a label for the first essentiality flag that is set on `row`.

    The flags are checked in a fixed order (aerobic, phototrophic, longevity)
    and the first one equal to True wins; if none is set the row is labelled
    'non-essential'.
    """
    flag_to_label = (
        ('Essential_aerobic', 'aerobic'),
        ('Essential_phototrophic', 'phototrophic'),
        ('Essential_longevity', 'longevity'),
    )
    for flag_name, label in flag_to_label:
        # explicit comparison with True, exactly as the flags are stored
        if getattr(row, flag_name) == True:
            return label
    return 'non-essential'

# Label every row via ever_essential (axis=1 hands each row to the function) ...
df_2C['essentiality'] = df_2C.apply(ever_essential, axis=1)
# ... and throw away, in place, the rows that are not essential anywhere.
non_essential_rows = df_2C.index[df_2C['essentiality'] == 'non-essential']
df_2C.drop(non_essential_rows, inplace=True)



In [30]:
#significantly changing
def changing(row, cutoff=0.01):
    """Tell whether a protein changes significantly between any two conditions.

    Runs Welch's t-test (equal_var=False) on the three replicates of each
    pair of conditions (aerobic vs N-fixing, aerobic vs phototrophic,
    phototrophic vs N-fixing) and compares the smallest p-value with `cutoff`.

    Parameters
    ----------
    row : mapping / pandas Series
        Must provide 'aerobic_1..3', 'N_fixing_1..3' and 'phototrophic_1..3'.
    cutoff : float, optional
        Significance threshold for the smallest p-value (default 0.01, the
        value that used to be hard-coded).

    Returns
    -------
    bool
        True if the smallest of the three p-values is below `cutoff`.
    """
    aerobic = [row['aerobic_1'], row['aerobic_2'], row['aerobic_3']]
    nitro = [row['N_fixing_1'], row['N_fixing_2'], row['N_fixing_3']]
    photo = [row['phototrophic_1'], row['phototrophic_2'], row['phototrophic_3']]

    #comparisons
    p_values = []
    for group_a, group_b in ((aerobic, nitro), (aerobic, photo), (photo, nitro)):
        (t_stat, p) = stats.ttest_ind(group_a, group_b, equal_var=False)
        ### for some proteins, we get arrays that are identical, which messes up the t-test.
        ### this happens when all values are imputed (to the same thing).
        ### scipy warns about that and returns NaN.
        if math.isnan(p):
            p = 1 # just set it to some terrible pvalue
        p_values.append(p)

    return bool(min(p_values) < cutoff)

# Flag each remaining row as significantly changing or not ...
df_2C['Sig_diff'] = df_2C.apply(changing, axis=1)
# ... and discard, in place, the rows that do not change.
unchanged_rows = df_2C.index[df_2C['Sig_diff'].eq(False)]
df_2C.drop(unchanged_rows, inplace=True)
# Finally remove the helper and identifier columns that cannot be plotted.
not_plottable = ['Sig_diff', 'essentiality', 'Protein IDs']
df_2C_for_plot = df_2C.drop(not_plottable, axis=1)

  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


In [31]:
# Pull the essentiality flags out of the plotting table; what remains in
# df_2C_for_plot are the numeric measurements only.
essential_aerobic = df_2C_for_plot.pop('Essential_aerobic')
essential_photo = df_2C_for_plot.pop('Essential_phototrophic')
essential_long = df_2C_for_plot.pop('Essential_longevity')

# creating a coloring of rows, whether they are essential or not.  blue is essential.
# The mapping is spelled out explicitly. Building it from
# essential_aerobic.unique() made the colors depend on which value happened to
# appear first (False first => blue would mean NON-essential), and in this
# essential-only subset the aerobic flag may hold a single value, which left
# the other value unmapped for the phototrophic/longevity color bars.
lut = {True: 'b', False: 'w'}
c1 = list(essential_aerobic.map(lut))
c2 = list(essential_photo.map(lut))
c3 = list(essential_long.map(lut))
many_labels = (c1, c2, c3) # embedded lists so that we can show several color bars

# open question: how to set the palette via sns.color_palette("coolwarm", 7)?
# (that did not work)
#cmap = color palette, using the blue to red transition through white
#robust = True, meaning please ignore outliers in the color scaling
g = sns.clustermap(df_2C_for_plot, figsize=(9,25), row_colors=many_labels, robust=True, cmap="RdBu_r") #need to pass in the size arguments
filename = "heatmap_C.png"
# Make sure the output folder exists so savefig does not fail on a fresh checkout.
os.makedirs("R_pal_images", exist_ok=True)
filepath = os.path.join("R_pal_images", filename)
plt.savefig(filepath, dpi=300)