In [19]:
from pandas import DataFrame, read_table
import pandas as pd


In [24]:
# Read the tab-delimited metadata table.
R_pal_file = "R_pal_metadata_df.txt"
df_input = pd.read_table(R_pal_file, sep="\t")

# After reading the metadata table, restrict it to the columns used by the
# rest of this notebook, then drop any rows that are exact duplicates of one
# another within that column subset.
to_keep_meta = [
    'RefSeq',
    'Essential_aerobic',
    'Essential_phototrophic',
    'Essential_longevity',
    'gene_product',
    'Pfam_name',
    'locus',
]
df_temp = df_input.loc[:, to_keep_meta]
df = df_temp.drop_duplicates()

In [27]:
##### Identify Pfams of unknown function
# A locus is treated as functionally UNcharacterized when any of its rows
# matches one of two text heuristics:
#   1. the Pfam name starts with 'DUF', or
#   2. the gene product description contains 'hypothetical'.
#
# A protein with several Pfams occupies several rows, and those rows may give
# different answers. The decision is therefore made per locus: first collect
# every locus that has at least one matching row, then flag all rows of those
# loci together.

# Missing values are replaced by '' before the string tests so that rows
# without a Pfam name or without a gene product simply do not match
# (evaluating "'hypothetical' in NaN" would raise TypeError).
pfam_names = df['Pfam_name'].fillna('').astype(str)
gene_products = df['gene_product'].fillna('').astype(str)

is_duf = pfam_names.str.startswith('DUF')
is_hypothetical = gene_products.str.contains('hypothetical', regex=False)

# One entry per locus, even when both heuristics (or several rows) hit it.
unknown_loci = df.loc[is_duf | is_hypothetical, 'locus'].drop_duplicates()

# True means "yes, this locus has a known function". The column is assigned
# directly instead of being merged in and patched with an in-place fillna:
# the chained `df[col].fillna(..., inplace=True)` form is deprecated in pandas
# and does not update `df` under copy-on-write. Direct assignment also copes
# with the case where no locus is flagged, and the cell can be re-run without
# producing duplicated `_x` / `_y` columns.
df = df.assign(
    characterized_function=~df['locus'].isin(unknown_loci)
).reset_index(drop=True)  # fresh 0..n-1 index, as the previous merge produced

In [28]:
# Display the table to inspect the newly added 'characterized_function' column.
df

Unnamed: 0,RefSeq,Essential_aerobic,Essential_phototrophic,Essential_longevity,gene_product,Pfam_name,locus,characterized_function
0,WP_011155572.1,True,False,False,chromosomal replication initiator protein DnaA,,RPA0001,True
1,WP_011155573.1,True,False,False,DNA polymerase III subunit beta,,RPA0002,True
2,WP_042440644.1,False,False,False,DNA replication and repair protein RecF,,RPA0003,True
3,WP_011155575.1,True,False,False,DNA topoisomerase (ATP-hydrolyzing) subunit B,,RPA0004,True
4,WP_011155576.1,False,False,False,4-hydroxyphenylpyruvate dioxygenase,,RPA0005,True
5,WP_011155577.1,False,False,False,Lrp/AsnC family transcriptional regulator,AsnC_trans_reg,RPA0006,True
6,WP_011155578.1,False,False,False,histidine kinase,HWE_HK,RPA0007,True
7,WP_011155578.1,False,False,False,histidine kinase,PAS_4,RPA0007,True
8,WP_011155578.1,False,False,False,histidine kinase,PAS_8,RPA0007,True
9,WP_011155579.1,False,False,False,KaiB 1,,RPA0008,True


In [44]:
#now narrow down essential and unknown
def EssentialAndUnknown (row):
    """Tell whether a row describes an essential gene of unknown function.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row passed by ``df.apply(..., axis=1)``)
        Must provide the keys 'Essential_aerobic', 'Essential_phototrophic',
        'Essential_longevity' and 'characterized_function'.

    Returns
    -------
    bool
        True when at least one of the three essentiality flags is truthy and
        'characterized_function' is falsy; False otherwise. A real bool is
        always returned (never an implicit None), so the resulting column is
        a clean boolean column.
    """
    essential_flags = ('Essential_aerobic', 'Essential_phototrophic', 'Essential_longevity')
    essential = any(row[flag] for flag in essential_flags)
    return bool(essential and not row['characterized_function'])
    
# Evaluate the predicate once per row and keep the answer as a new column.
df['Essential_unknown'] = df.apply(EssentialAndUnknown, axis=1)

# Debugging aid (disabled): inspect a single locus.
#df[df.locus == 'RPA4831']

# Rows whose flag equals True form the high-priority set. The two helper
# columns are only needed for the selection, so they are left out of the
# resulting table.
high_priority = (
    df[df['Essential_unknown'] == True]
    .drop(columns=['Essential_unknown', 'characterized_function'])
)
# Debugging aid (disabled): dump the intermediate table.
#high_priority.to_csv("delme.txt", sep="\t")

In [46]:
#now merge in proteomics data
# The path uses a forward slash: the former backslash formed the invalid
# string escape "\R" (a SyntaxWarning on recent Python versions) and only
# resolved to a sub-directory on Windows. Forward slashes work on every
# platform, Windows included.
R_pal_file = "R_pal_aux_files/R_pal_LFQ_Foldchange.txt"
df_omics = pd.read_table(R_pal_file, sep="\t")

# Left join: every high-priority row is kept, and rows whose 'RefSeq' has no
# matching 'Protein IDs' entry get NaN in the proteomics columns. The join key
# from the right-hand table duplicates 'RefSeq', so it is dropped afterwards.
high_priority = (
    high_priority
    .merge(df_omics, left_on='RefSeq', right_on='Protein IDs', how='left')
    .drop(columns='Protein IDs')
)

In [48]:

# Write the high-priority table to disk as tab-separated text
# (the row index is written as the first column).
high_priority.to_csv(path_or_buf="delme2.txt", sep="\t")