Skip to content

Commit

Permalink
RFCT Output runlog in Yaml format
Browse files Browse the repository at this point in the history
This also allows for a complete refactoring of the code that generated
the log and eliminates a lot of repetition.
  • Loading branch information
luispedro committed Jun 12, 2020
1 parent 797caa1 commit 5fba323
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 108 deletions.
13 changes: 11 additions & 2 deletions docs/output.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,16 @@ Note that while not all GMGC unigenes are contained in a
genome bin, some are contained in many. Thus, the total counts will not (except
by coincidence) correspond to the number of genes queried.

## Summary (`summary.txt`)
## Summary (`summary.txt` and `runlog.yaml`)

The file `summary.txt` provides a human-readable summary of the results, while
`runlog.yaml` is a summary of the run (as a YAML file, it is both machine- and
human-readable).


The file `summary.txt` should be reproducible: running GMGC-finder twice on
the same input should produce the same results. By design, though,
`runlog.yaml` includes information on timing and, thus, is not reproducible.


Human-readable summary of the results.

137 changes: 31 additions & 106 deletions gmgc_finder/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from pkg_resources import resource_string
import datetime
import time
import hashlib
import yaml
from .alignment import identity_coverage
from .gmgc_finder_version import __version__

Expand Down Expand Up @@ -207,13 +207,24 @@ def query_genome_bin(hit_table):
genome_bin = genome_bin.reset_index().rename(columns={'index':'genome_bin'})
return genome_bin

def CalSha256(filname):
def sha256sum(filname, chunk_size=1 << 20):
    """Return the hexadecimal SHA-256 digest of a file's contents.

    The file is hashed incrementally in blocks of `chunk_size` bytes so that
    large inputs are never loaded into memory in one piece.

    Parameters
    ----------
    filname : str
        Path of the file to hash (opened in binary mode).
    chunk_size : int, optional
        Number of bytes read per iteration (default: 1 MiB).

    Returns
    -------
    str
        Hex-encoded SHA-256 digest of the file's bytes.
    """
    # Imported locally: this function is the only user of hashlib here.
    import hashlib
    sha256obj = hashlib.sha256()
    with open(filname, "rb") as f:
        # iter() with a sentinel stops at the first empty read (end of file).
        for chunk in iter(lambda: f.read(chunk_size), b""):
            sha256obj.update(chunk)
    return sha256obj.hexdigest()

def input_metadata(fpath):
    """Collect descriptive metadata about the input file at `fpath`.

    Returns a dict holding the path as given, its absolute form, the
    modification time as a `time.ctime` string, the size in bytes and the
    SHA-256 digest of the file's contents.
    """
    metadata = {}
    metadata['input-path'] = fpath
    metadata['full_path'] = os.path.abspath(fpath)
    # time.ctime already returns a str, so no further conversion is needed
    metadata['mtime'] = time.ctime(os.path.getmtime(fpath))
    metadata['file-size'] = os.path.getsize(fpath)
    metadata['sha256'] = sha256sum(fpath)
    return metadata


def convert_command(command):
command_line = 'gmgc-finder '
for parameter in command:
Expand Down Expand Up @@ -331,113 +342,27 @@ def main(args=None):

end = datetime.datetime.now()

output_log = []
output_dict = {}
output_log.append('Command_line: '+command_line)
output_log.append('GMGC-Finder: '+ __version__)

output_log.append('Start time: '+str(start))
output_log.append('End time: '+str(end))
output_log.append('Run time: '+str((end-start).seconds))

output_dict['Command_line'] = command_line
output_dict['GMGC-Finder'] = __version__
output_dict['Start time'] = str(start)
output_dict['End time'] = str(end)
output_dict['Run time'] = str((end-start).seconds)
run_metadata = {}

output_log.append('\n# Inputs')
run_metadata['Command_line'] = command_line
run_metadata['GMGC-Finder'] = __version__
run_metadata['Start time'] = str(start)
run_metadata['End time'] = str(end)
# timedelta.seconds is only the sub-day remainder (0..86399) and silently
# drops whole days; total_seconds() reports the complete elapsed time.
# Truncated to int so the logged value keeps its previous type.
run_metadata['Run time'] = int((end - start).total_seconds())
run_metadata['Inputs'] = []

if args.genome_fasta is not None:
input_name = os.path.basename(args.genome_fasta)
full_path = os.path.abspath(args.genome_fasta)
mtime = time.ctime(os.path.getmtime(args.genome_fasta))
file_size = str(os.path.getsize(args.genome_fasta))
sha256 = CalSha256(args.genome_fasta)
output_log.append('-input_name: '+input_name)
output_log.append('-full_path: '+full_path)
output_log.append('-mtime: '+mtime)
output_log.append('-file size: '+ file_size)
output_log.append('-sha256 '+ sha256)
output_dict['#Inputs'] = {}
output_dict['#Inputs']['input_name'] = input_name
output_dict['#Inputs']['full_path'] = full_path
output_dict['#Inputs']['mtime'] = str(mtime)
output_dict['#Inputs']['file size'] = file_size
output_dict['#Inputs']['sha256'] = sha256
else:
if args.nt_input is not None:
input_name_nt = os.path.basename(args.nt_input)
full_path_nt = os.path.abspath(args.nt_input)
mtime_nt = time.ctime(os.path.getmtime(args.nt_input))
file_size_nt = str(os.path.getsize(args.nt_input))
sha256_nt = CalSha256(args.nt_input)

input_name_aa = os.path.basename(args.aa_input)
full_path_aa = os.path.abspath(args.aa_input)
mtime_aa = time.ctime(os.path.getmtime(args.aa_input))
file_size_aa = str(os.path.getsize(args.aa_input))
sha256_aa = CalSha256(args.aa_input)

output_log.append('-nt_input_name: '+input_name_nt)
output_log.append('-nt_full_path_nt: '+full_path_nt)
output_log.append('-nt_mtime: '+mtime_nt)
output_log.append('-nt_file size: '+ file_size_nt)
output_log.append('-nt_sha256 '+ sha256_nt+'\n')

output_log.append('-aa_input_name: '+input_name_aa)
output_log.append('-aa_full_path: '+full_path_aa)
output_log.append('-aa_mtime: '+mtime_aa)
output_log.append('-aa_file size: '+ file_size_aa)
output_log.append('-aa_sha256 '+ sha256_aa)

output_dict['#Inputs'] = {}

output_dict['#Inputs']['nt_input_name'] = input_name_nt
output_dict['#Inputs']['nt_full_path'] = full_path_nt
output_dict['#Inputs']['nt_mtime'] = str(mtime_nt)
output_dict['#Inputs']['nt_file size'] = file_size_nt
output_dict['#Inputs']['nt_sha256'] = sha256_nt

output_dict['#Inputs']['aa_input_name'] = input_name_aa
output_dict['#Inputs']['aa_full_path'] = full_path_aa
output_dict['#Inputs']['aa_mtime'] = str(mtime_aa)
output_dict['#Inputs']['aa_file size'] = file_size_aa
output_dict['#Inputs']['aa_sha256'] = sha256_aa

else:
input_name_aa = os.path.basename(args.aa_input)
full_path_aa = os.path.abspath(args.aa_input)
mtime_aa = time.ctime(os.path.getmtime(args.aa_input))
file_size_aa = str(os.path.getsize(args.aa_input))
sha256_aa = CalSha256(args.aa_input)
output_log.append('-aa_input_name: '+input_name_aa)
output_log.append('-aa_full_path: '+full_path_aa)
output_log.append('-aa_mtime: '+mtime_aa)
output_log.append('-aa_file size: '+ file_size_aa)
output_log.append('-aa_sha256 '+ sha256_aa)
output_dict['#Inputs'] = {}
output_dict['#Inputs']['aa_input_name'] = input_name_aa
output_dict['#Inputs']['aa_full_path'] = full_path_aa
output_dict['#Inputs']['aa_mtime'] = str(mtime_aa)
output_dict['#Inputs']['aa_file size'] = file_size_aa
output_dict['#Inputs']['aa_sha256'] = sha256_aa



with safeout(out+'/runlog.txt', 'wt') as ofile:
for s in output_log:
ofile.write(s+'\n')

with safeout(out+'/runlog.json','wt') as ofile:
output_json = json.dumps(output_dict)
ofile.write(output_json)






run_metadata['Inputs'].append(
{'genome_input': input_metadata(args.genome_fasta) })
if args.nt_input is not None:
run_metadata['Inputs'].append(
{'nt_input': input_metadata(args.nt_input)})
if args.aa_input is not None:
run_metadata['Inputs'].append(
{'aa_input': input_metadata(args.aa_input)})

with safeout(out+'/runlog.yaml', 'wt') as ofile:
yaml.dump(run_metadata, ofile, default_flow_style=False)


if __name__ == '__main__':
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
'scikit-bio',
'safeout',
'tqdm',
'pyyaml',  # PyPI distribution name; it provides the importable `yaml` module
],
package_data={
'gmgc_finder': ['*.md']},
Expand Down

0 comments on commit 5fba323

Please sign in to comment.