Skip to content

Commit

Permalink
ENH Better error messages
Browse files Browse the repository at this point in the history
- Validate arguments early
- Catch exceptions and print them nicely
- Rename the arguments to kebab-case
- Avoid using `assert` for input checks
  • Loading branch information
luispedro committed Jun 14, 2020
1 parent 06ca148 commit 997118d
Show file tree
Hide file tree
Showing 3 changed files with 125 additions and 105 deletions.
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,9 @@ available for genome mode.

* `-o/--output`: Output directory (will be created if non-existent).

* `--nt_input`: path to the input DNA gene file(.fasta/.gz/.bz2).
* `--nt-genes`: path to the input DNA gene file(.fasta/.gz/.bz2).

* `--aa_input`: path to the input Protein gene file(.fasta/.gz/.bz2).
* `--aa-genes`: path to the input Protein gene file(.fasta/.gz/.bz2).

## Examples

Expand All @@ -38,14 +38,14 @@ gmgc-finder -i input.fasta -o output
2. Input is DNA/protein gene sequences

```bash
gmgc-finder --nt_input genes.fna --aa_input genes.faa -o output
gmgc-finder --nt-genes genes.fna --aa-genes genes.faa -o output
```

The nucleotide input is optional (but should be used if available so that the
quality of the hits can be refined):

```bash
gmgc-finder --aa_input genes.faa -o output
gmgc-finder --aa-genes genes.faa -o output
```

If yout input is a metagenome, you can use
Expand Down
8 changes: 4 additions & 4 deletions docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@

* `-o/--output` : Output directory (will be created if non-existent).

* `--nt_input` : path to the input DNA gene file(.fasta/.gz/.bz2).
* `--nt-genes` : path to the input DNA gene file(.fasta/.gz/.bz2).

* `--aa_input` : path to the input Protein gene file(.fasta/.gz/.bz2).
* `--aa-genes` : path to the input Protein gene file(.fasta/.gz/.bz2).

The input must contain a genome file or DNA and Protein gene file or just Protein gene file.

Expand All @@ -26,10 +26,10 @@ GMGC-finder will call `prodigal` to predict genes and then process each gene.
respectfully).

```bash
gmgc-finder --nt_input genes.fna --aa_input genes.faa -o output
gmgc-finder --nt-genes genes.fna --aa-genes genes.faa -o output
```
```bash
gmgc-finder --aa_input genes.faa -o output
gmgc-finder --aa-genes genes.faa -o output
```
# Processing metagenomes using NGLess

Expand Down
214 changes: 117 additions & 97 deletions gmgc_finder/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,28 +37,40 @@ def parse_args(args):
help='Output directory (will be created if non-existent)',
dest='output',
default = None)
parser.add_argument('--nt_genes',
parser.add_argument('--nt-genes', '--nt_genes',
required=False,
help='Path to the input DNA gene file (FASTA format)',
dest='nt_input',
default=None)
parser.add_argument('--aa_genes',
parser.add_argument('--aa-genes', '--aa_genes',
required=False,
help='Path to the input amino acid gene file (FASTA format)',
dest='aa_input',
default=None)
return parser.parse_args()

def gene_prediction(fasta_input,output):
def validate_args(args):
def expect_file(f):
if f is not None:
if not os.path.exists(f):
sys.stderr.write(f"GMGC-finder Error: Expected file '{f}' does not exist\n")
sys.exit(1)
expect_file(args.genome_fasta)
expect_file(args.aa_input)
expect_file(args.nt_input)
if args.genome_fasta is None and args.aa_input is None:
sys.stderr.write("GMGC-finder Error: At least one of --input or --aa-genes is necessary\n")
sys.stderr.exit(1)

def gene_prediction(fasta_input, output, tmpdirname):

print('Start gene prediction...')


if os.path.splitext(fasta_input)[1] == '.bz2':
output_file = bz2.BZ2File(fasta_input)
open(output + '/input.fasta', "wb+").write(output_file.read())
output_file.close()
fasta_input = output + '/input.fasta'
with bz2.BZ2File(fasta_input) as ifile:
open(tmpdirname + '/input.fasta', "wb+").write(output_file.read())
fasta_input = tmpdirname + '/input.fasta'


subprocess.check_call(
Expand Down Expand Up @@ -251,6 +263,7 @@ def main(args=None):
if args is None:
args = sys.argv
args = parse_args(args)
validate_args(args)

command_args = vars(args)
command_line = convert_command(command_args)
Expand All @@ -259,20 +272,22 @@ def main(args=None):
os.makedirs(out)

with tempfile.TemporaryDirectory() as tmpdirname:
if args.genome_fasta is not None:
gene_prediction(args.genome_fasta,out)

split_file(out + '/prodigal_out.faa',
output_dir=tmpdirname + '/split_file',is_dna=False)
num_split = split_file(out + '/prodigal_out.fna',
output_dir=tmpdirname + '/split_file',is_dna=True)
else:
if args.aa_input is None:
raise Exception("Need to input protein gene file or a genome file!")
try:
if args.genome_fasta is not None:
gene_prediction(args.genome_fasta, out, tmdirdname)

split_file(out + '/prodigal_out.faa',
output_dir=tmpdirname + '/split_file',is_dna=False)
num_split = split_file(out + '/prodigal_out.fna',
output_dir=tmpdirname + '/split_file',is_dna=True)
else:
if args.nt_input is not None:
assert gene_num(args.nt_input) == gene_num(args.aa_input), "Input dna and protein gene file must have the same sequence number."
n_nt = gene_num(args.nt_input)
n_aa = gene_num(args.aa_input)
if n_nt != n_aa:
sys.stderr.write("Input DNA and amino acide gene files must have the same sequence number!\n")
sys.stderr.write(f"DNA file has {n_nt} while amino acid file has {n_aa}!")
sys.exit(1)
split_file(args.aa_input,
output_dir=tmpdirname + '/split_file',is_dna=False)
num_split = split_file(args.nt_input,
Expand All @@ -281,103 +296,108 @@ def main(args=None):
else:
num_split = split_file(args.aa_input,
output_dir=tmpdirname + '/split_file',is_dna=False)
hit_table = []
print('Starting GMGC queries (total: {} batches to process)'.format(num_split))
for index in tqdm(range(num_split)):
besthit = query_gmgc(tmpdirname+'/split_file/split_{}.faa'.format(index+1))
if besthit is not None:
besthit = json.loads(bytes.decode(besthit.content))['results']
if args.nt_input is not None:
hit_table_index = realignment(tmpdirname+'/split_file/split_{}.fna'.format(index+1),
tmpdirname+'/split_file/split_{}.faa'.format(index+1),besthit)
else:
hit_table_index = realignment(None,tmpdirname+'/split_file/split_{}.faa'.format(index+1),besthit)
hit_table.extend(hit_table_index)
hit_table = pd.DataFrame(hit_table)
hit_table.columns = ['query_name','gene_id','align_category','gene_dna','gene_protein']
num_gene = hit_table.shape[0]


summary = []
summary.append('*'*30+'GMGC-Finder results summary table'+'*'*30)
summary.append('- Processed {} genes'.format(num_gene))
match_result = hit_table['align_category'].value_counts().to_dict()
if 'EXACT' in match_result:
summary.append(' -{0} ({1:.1%}) were found in the GMGC at above 95% nucleotide identity with at least 95% coverage'
.format(match_result['EXACT'], match_result['EXACT']/num_gene))
else:
summary.append(' -No genes were found in the GMGC at above 95% nucleotide identity with at least 95% coverage')
hit_table = []
print('Starting GMGC queries (total: {} batches to process)'.format(num_split))
for index in tqdm(range(num_split)):
besthit = query_gmgc(tmpdirname+'/split_file/split_{}.faa'.format(index+1))
if besthit is not None:
besthit = json.loads(bytes.decode(besthit.content))['results']
if args.nt_input is not None:
hit_table_index = realignment(tmpdirname+'/split_file/split_{}.fna'.format(index+1),
tmpdirname+'/split_file/split_{}.faa'.format(index+1),besthit)
else:
hit_table_index = realignment(None,tmpdirname+'/split_file/split_{}.faa'.format(index+1),besthit)
hit_table.extend(hit_table_index)
hit_table = pd.DataFrame(hit_table)
hit_table.columns = ['query_name','gene_id','align_category','gene_dna','gene_protein']
num_gene = hit_table.shape[0]


summary = []
summary.append('*'*30+'GMGC-Finder results summary table'+'*'*30)
summary.append('- Processed {} genes'.format(num_gene))
match_result = hit_table['align_category'].value_counts().to_dict()
if 'EXACT' in match_result:
summary.append(' -{0} ({1:.1%}) were found in the GMGC at above 95% nucleotide identity with at least 95% coverage'
.format(match_result['EXACT'], match_result['EXACT']/num_gene))
else:
summary.append(' -No genes were found in the GMGC at above 95% nucleotide identity with at least 95% coverage')

if 'SIMILAR' in match_result:
summary.append(' -{0} ({1:.1%}) were found in the GMGC at above 80% nucleotide identity with at least 80% coverage'
.format(match_result['SIMILAR'], match_result['SIMILAR']/num_gene))
else:
summary.append(' -No genes were found in the GMGC at above 80% nucleotide identity with at least 80% coverage')
if 'SIMILAR' in match_result:
summary.append(' -{0} ({1:.1%}) were found in the GMGC at above 80% nucleotide identity with at least 80% coverage'
.format(match_result['SIMILAR'], match_result['SIMILAR']/num_gene))
else:
summary.append(' -No genes were found in the GMGC at above 80% nucleotide identity with at least 80% coverage')

if 'MATCH' in match_result:
summary.append(' -{0} ({1:.1%}) were found in the GMGC at above 50% nucleotide identity with at least 50% coverage'
.format(match_result['MATCH'], match_result['MATCH']/num_gene))
else:
summary.append(' -No genes were found in the GMGC at above 50% nucleotide identity with at least 50% coverage')
if 'MATCH' in match_result:
summary.append(' -{0} ({1:.1%}) were found in the GMGC at above 50% nucleotide identity with at least 50% coverage'
.format(match_result['MATCH'], match_result['MATCH']/num_gene))
else:
summary.append(' -No genes were found in the GMGC at above 50% nucleotide identity with at least 50% coverage')

no_match = match_result.get('NO MATCH', 0.0) + match_result.get('NO HIT', 0.0)
if no_match:
summary.append(' -{0} ({1:.1%}) had no match in the GMGC'
.format(no_match, no_match/num_gene))
no_match = match_result.get('NO MATCH', 0.0) + match_result.get('NO HIT', 0.0)
if no_match:
summary.append(' -{0} ({1:.1%}) had no match in the GMGC'
.format(no_match, no_match/num_gene))


genome_bin = query_genome_bin(hit_table)
genome_bin = genome_bin.sort_values('times_gene_hit',ascending=False)
summary.append('\n\n'+'*' * 30 + 'GMGC-Finder results genome_bin summary' + '*' * 30+'\n')
genome_bin = query_genome_bin(hit_table)
genome_bin = genome_bin.sort_values('times_gene_hit',ascending=False)
summary.append('\n\n'+'*' * 30 + 'GMGC-Finder results genome_bin summary' + '*' * 30+'\n')

num_hitting = genome_bin['times_gene_hit'].values
summary.append('{} bins were reported for >50% of genes'.format(np.sum(num_hitting > num_gene*0.5)))
summary.append('{} bins were reported for >25% of genes'.format(np.sum(num_hitting > num_gene*0.25)))
summary.append('{} bins were reported for >10% of genes'.format(np.sum(num_hitting > num_gene*0.1)))
num_hitting = genome_bin['times_gene_hit'].values
summary.append('{} bins were reported for >50% of genes'.format(np.sum(num_hitting > num_gene*0.5)))
summary.append('{} bins were reported for >25% of genes'.format(np.sum(num_hitting > num_gene*0.25)))
summary.append('{} bins were reported for >10% of genes'.format(np.sum(num_hitting > num_gene*0.1)))



with atomic_write(out+'/genome_bin.tsv', overwrite=True) as ofile:
ofile.write('# Genome_bin from GMGC-Finder v{}\n'.format(__version__))
genome_bin.to_csv(ofile, sep='\t', index=False)
with atomic_write(out+'/genome_bin.tsv', overwrite=True) as ofile:
ofile.write('# Genome_bin from GMGC-Finder v{}\n'.format(__version__))
genome_bin.to_csv(ofile, sep='\t', index=False)

with atomic_write(out+'/hit_table.tsv', overwrite=True) as ofile:
ofile.write('# Results from GMGC-Finder v{}\n'.format(__version__))
hit_table.to_csv(ofile, sep='\t', index=False)
with atomic_write(out+'/hit_table.tsv', overwrite=True) as ofile:
ofile.write('# Results from GMGC-Finder v{}\n'.format(__version__))
hit_table.to_csv(ofile, sep='\t', index=False)

with atomic_write(out+'/summary.txt', overwrite=True) as ofile:
for s in summary:
print(s)
ofile.write(s+'\n')
with atomic_write(out+'/summary.txt', overwrite=True) as ofile:
for s in summary:
print(s)
ofile.write(s+'\n')


output_content = resource_string(__name__, 'output.md')
with atomic_write(out+'/README.md', overwrite=True) as ofile:
ofile.write(bytes.decode(output_content))
output_content = resource_string(__name__, 'output.md')
with atomic_write(out+'/README.md', overwrite=True) as ofile:
ofile.write(bytes.decode(output_content))

end = datetime.datetime.now()
end = datetime.datetime.now()

run_metadata = {}
run_metadata = {}

run_metadata['Command_line'] = command_line
run_metadata['GMGC-Finder'] = __version__
run_metadata['Start time'] = str(start)
run_metadata['End time'] = str(end)
run_metadata['Run time'] = (end-start).seconds
run_metadata['Inputs'] = []
run_metadata['Command_line'] = command_line
run_metadata['GMGC-Finder'] = __version__
run_metadata['Start time'] = str(start)
run_metadata['End time'] = str(end)
run_metadata['Run time'] = (end-start).seconds
run_metadata['Inputs'] = []

if args.genome_fasta is not None:
run_metadata['Inputs'].append(
{'genome_input': input_metadata(args.genome_fasta) })
if args.nt_input is not None:
run_metadata['Inputs'].append(
{'nt_input': input_metadata(args.nt_input)})
if args.aa_input is not None:
run_metadata['Inputs'].append(
{'aa_input': input_metadata(args.aa_input)})
if args.genome_fasta is not None:
run_metadata['Inputs'].append(
{'genome_input': input_metadata(args.genome_fasta) })
if args.nt_input is not None:
run_metadata['Inputs'].append(
{'nt_input': input_metadata(args.nt_input)})
if args.aa_input is not None:
run_metadata['Inputs'].append(
{'aa_input': input_metadata(args.aa_input)})

with atomic_write(out+'/runlog.yaml',overwrite=True) as ofile:
yaml.dump(run_metadata, ofile, default_flow_style=False)
with atomic_write(out+'/runlog.yaml',overwrite=True) as ofile:
yaml.dump(run_metadata, ofile, default_flow_style=False)
except Exception as e:
sys.stderr.write('GMGC-finder Error: ')
sys.stderr.write(str(e))
sys.stderr.write('\n')
sys.exit(1)



Expand Down

0 comments on commit 997118d

Please sign in to comment.