# Staphylococcus aureus Modulome

In [48]:
from pymodulon.core import IcaData
from pymodulon.io import *
from pymodulon.imodulondb import *
from pymodulon.util import *
import pandas as pd
from os import path

In [2]:
# Load the model from the gzipped JSON file under ../data.
# NOTE: the final cell of this notebook saves back to this same path,
# so the input file is overwritten by a full run.
model_file = path.join('..', 'data', 'saureus_imodulondb.json.gz')
ica_data = load_json_model(model_file)

In [None]:
# Sample-table columns to remove before export.
# NOTE(review): the list name suggests these duplicate other columns
# in the table -- confirm against the sample table itself.
drop_duplicates = ['Project', 'Notes', 'Treatment']
ica_data.sample_table = ica_data.sample_table.drop(columns=drop_duplicates)

In [27]:
# Do not truncate long cell contents when the reports are displayed.
pd.set_option('display.max_colwidth', None)

# The compatibility check returns four reports; keep each under its own
# name so later cells can inspect them.
compatibility = imodulondb_compatibility(ica_data)
table_issues, tf_issues, missing_gene_links, missing_dois = compatibility

# Show every report under its heading; the last two are shown as arrays.
for heading, report in (
    ('--Table Issues--', table_issues),
    ('--TF Issues--', tf_issues),
    ('--Missing Gene Links--', missing_gene_links.values),
    ('--Missing DOIs--', missing_dois.values),
):
    print(heading)
    display(report)

--Table Issues--


Unnamed: 0,Table,Missing Column,Solution
0,Sample,n_replicates,This column will be generated for you.
1,Sample,doi,Clicking on activity plot bars will not link to relevant papers for the samples.
2,iModulon,name,imodulon_table.index will be used.
3,iModulon,n_genes,This column will be computed for you.
4,iModulon,exp_var,This column will be left blank.


--TF Issues--


Unnamed: 0,in_trn,has_link,has_gene
FruR,True,False,False
Genomic Island 9,True,False,False
Purine,True,False,False
TPP,True,False,False
PurR,True,False,True
VraS2,True,False,False
CcpA,True,False,True
CggR,True,False,False
Rex,True,False,False
FMN,True,False,False


--Missing Gene Links--


array(['USA300HOU_RS14890', 'USA300HOU_RS14895', 'USA300HOU_RS14900', ...,
       'USA300HOU_RS14875', 'USA300HOU_RS14880', 'USA300HOU_RS14885'],
      dtype=object)

--Missing DOIs--


array(['SRR21699866', 'SRR21699859', 'SRR21699858', 'SRR21699857',
       'SRR21699856', 'SRR21699855', 'SRR21699854', 'SRR21699853',
       'SRR21699852', 'SRR21699851', 'SRR21699864', 'SRR21699863',
       'SRR21699862', 'SRR21699861', 'SRR21699860', 'SRR21699865', 'SG_1',
       'SG_2', 'SG_3', 'SG_4', 'SG_5', 'SG_6', 'SG_7', 'SG_8',
       'SRX2722028', 'SRX2722029', 'SRX2722030', 'SRX2722021',
       'SRX2722022', 'SRX2722020', 'ERX2284917', 'ERX2284918',
       'ERX2284921', 'ERX2284922', 'ERX2284919', 'ERX2284920',
       'ERX2284915', 'ERX2284916', 'SRX9960955', 'SRX9960956',
       'SRX9960957', 'SRX9960958', 'SRX9960961', 'SRX9960962',
       'SRX9960963', 'SRX9960964', 'SRX9960951', 'SRX9960952',
       'SRX9960959', 'SRX9960960', 'SRX9960953', 'SRX9960954',
       'SRX9960965', 'SRX9960966', 'ERX5228512', 'ERX5228513',
       'ERX5228514', 'ERX5228516', 'ERX5228517', 'SRX4910578',
       'SRX4910579', 'SRX4910580', 'SRX4910575', 'SRX4910576',
       'SRX4910577', 'SRX687799

## Add the iModulonDB Table

In [28]:
# Dataset-level metadata for the iModulonDB export.
# 'publication_name' and 'gene_link_db' are intentionally left as empty
# strings here, exactly as before.
ica_data.imodulondb_table = dict(
    organism='Staphylococcus aureus',
    dataset='Modulome',
    strain='USA300',
    publication_name='',
    publication_link='https://journals.asm.org/doi/full/10.1128/msystems.00480-22',
    gene_link_db='',
    organism_folder='Staphylococcus aureus',
    dataset_folder='Coordination of CcpA and CodY Regulators in Staphylococcus aureus USA300 Strains',
)

## Add the Explained Variance

In [29]:
# Fill the 'exp_var' column with one explained_variance() result per
# iModulon, computed in index order.
ica_data.imodulon_table['exp_var'] = [
    explained_variance(ica_data, imodulons=imodulon)
    for imodulon in ica_data.imodulon_table.index
]

## Links

## Sample Table

In [52]:
# Number the samples 1..n within each 'full_name' group, in row order.
for _, members in ica_data.sample_table.groupby(['full_name']):
    replicate_ids = range(1, len(members) + 1)
    ica_data.sample_table.loc[members.index, 'replicate'] = replicate_ids

# 'full_name' becomes 'full_condition_name'; the per-sample name is
# "<full_condition_name>:<replicate number>".
ica_data.sample_table = ica_data.sample_table.rename(
    columns={'full_name': 'full_condition_name'})
condition_label = ica_data.sample_table['full_condition_name']
replicate_label = ica_data.sample_table['replicate'].astype(int).astype(str)
ica_data.sample_table['name'] = condition_label + ':' + replicate_label

# Final column names: the existing 'Sample' column is kept as
# 'SRS_Sample', and the replicate-numbered 'name' column built above
# becomes 'sample'.
ica_data.sample_table = ica_data.sample_table.rename(
    columns={'Sample': 'SRS_Sample', 'name': 'sample'})

## Double-check Compatibility

In [65]:
# Re-run the compatibility check on the updated model and keep the four
# reports under their usual names.
compatibility = imodulondb_compatibility(ica_data)
table_issues, tf_issues, missing_gene_links, missing_dois = compatibility

# Show every report under its heading; the last two are shown as arrays.
for heading, report in (
    ('--Table Issues--', table_issues),
    ('--TF Issues--', tf_issues),
    ('--Missing Gene Links--', missing_gene_links.values),
    ('--Missing DOIs--', missing_dois.values),
):
    print(heading)
    display(report)

--Table Issues--


Unnamed: 0,Table,Missing Column,Solution
0,Sample,n_replicates,This column will be generated for you.
1,Sample,doi,Clicking on activity plot bars will not link to relevant papers for the samples.
2,iModulon,name,imodulon_table.index will be used.
3,iModulon,n_genes,This column will be computed for you.


--TF Issues--


Unnamed: 0,in_trn,has_link,has_gene
FruR,True,False,False
Genomic Island 9,True,False,False
Purine,True,False,False
TPP,True,False,False
PurR,True,False,True
VraS2,True,False,False
CcpA,True,False,True
CggR,True,False,False
Rex,True,False,False
FMN,True,False,False


--Missing Gene Links--


array(['USA300HOU_RS14890', 'USA300HOU_RS14895', 'USA300HOU_RS14900', ...,
       'USA300HOU_RS14875', 'USA300HOU_RS14880', 'USA300HOU_RS14885'],
      dtype=object)

--Missing DOIs--


array(['SRR21699866', 'SRR21699859', 'SRR21699858', 'SRR21699857',
       'SRR21699856', 'SRR21699855', 'SRR21699854', 'SRR21699853',
       'SRR21699852', 'SRR21699851', 'SRR21699864', 'SRR21699863',
       'SRR21699862', 'SRR21699861', 'SRR21699860', 'SRR21699865', 'SG_1',
       'SG_2', 'SG_3', 'SG_4', 'SG_5', 'SG_6', 'SG_7', 'SG_8',
       'SRX2722028', 'SRX2722029', 'SRX2722030', 'SRX2722021',
       'SRX2722022', 'SRX2722020', 'ERX2284917', 'ERX2284918',
       'ERX2284921', 'ERX2284922', 'ERX2284919', 'ERX2284920',
       'ERX2284915', 'ERX2284916', 'SRX9960955', 'SRX9960956',
       'SRX9960957', 'SRX9960958', 'SRX9960961', 'SRX9960962',
       'SRX9960963', 'SRX9960964', 'SRX9960951', 'SRX9960952',
       'SRX9960959', 'SRX9960960', 'SRX9960953', 'SRX9960954',
       'SRX9960965', 'SRX9960966', 'ERX5228512', 'ERX5228513',
       'ERX5228514', 'ERX5228516', 'ERX5228517', 'SRX4910578',
       'SRX4910579', 'SRX4910580', 'SRX4910575', 'SRX4910576',
       'SRX4910577', 'SRX687799

## Save & Export

In [86]:
# Normalise the 'Single gene' label to 'Single Gene', copy the result
# into 'category', and then drop the 'function' column.
imodulon_table = ica_data.imodulon_table
imodulon_table['function'] = imodulon_table['function'].replace(
    'Single gene', 'Single Gene')
imodulon_table['category'] = imodulon_table['function']
ica_data.imodulon_table = imodulon_table.drop(columns='function')

In [87]:
# Category order handed to imodulondb_export() as `cat_order`.
categories = [
    "Carbon Metabolism",
    "AA/Nucleotide Metabolism",
    "Misc. Metabolism",
    "Homeostasis",
    "Lifestyles",
    "Cellular Processes",
    "Stress Response",
    "Prophages",
    "Other",
    "Single Gene",
    "Uncharacterized",
    "Virulence",
]

In [88]:
# Turn off pandas' chained-assignment warning before exporting.
# This is a global pandas option and stays off for the rest of the session.
pd.options.mode.chained_assignment = None

# Write the iModulonDB files to ../iModulonDB using the category order
# defined in `categories`.
imodulondb_export(ica_data, '../iModulonDB', cat_order=categories)

Writing main site files...
Done writing main site files. Writing plot files...
Two progress bars will appear below. The second will take significantly longer than the first.
Writing iModulon page files (1/2)


  0%|          | 0/76 [00:00<?, ?it/s]

Writing Gene page files (2/2)


  0%|          | 0/2858 [00:00<?, ?it/s]

Complete! (Organism = Staphylococcus aureus; Dataset = Coordination of CcpA and CodY Regulators in Staphylococcus aureus USA300 Strains)


In [90]:
# Save the updated model. This is the same path the notebook loads the
# model from, so the input file is overwritten.
output_file = path.join('..', 'data', 'saureus_imodulondb.json.gz')
save_to_json(ica_data, output_file)