Skip to content

Commit

Permalink
update some details
Browse files Browse the repository at this point in the history
  • Loading branch information
psj1997 committed Jun 11, 2020
1 parent fa1c8e6 commit ab7c570
Show file tree
Hide file tree
Showing 6 changed files with 207 additions and 49 deletions.
5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,14 @@ Input is genome sequence.
gmgc-finder -i input.fasta -o output
```

Input is DNA/protein gene sequence.
Input is DNA/protein gene sequence(You can just input the protein gene sequences).

```bash
gmgc-finder -nt_input genes.fna -aa_input genes.faa -o output
```
```bash
gmgc-finder -aa_input genes.faa -o output
```

If yout input is a metagenome, you can use
[NGLess](https://github.com/ngless-toolkit/ngless) for assembly and gene
Expand Down
2 changes: 1 addition & 1 deletion docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,5 @@ categories and genome bins.

* `-aa_input` : path to the input Protein gene file(.fasta/.gz/.bz2).

The input must contain a genome file or both DNA and Protein gene file.
The input must contain a genome file or DNA and Protein gene file or just Protein gene file.

6 changes: 4 additions & 2 deletions docs/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

* `-aa_input` : path to the input Protein gene file(.fasta/.gz/.bz2).

The input must contain a genome file or both DNA and Protein gene file.
The input must contain a genome file or DNA and Protein gene file or just Protein gene file.

## Examples

Expand All @@ -28,7 +28,9 @@ GMGC-finder will call `prodigal` to predict genes and then process each gene.
```bash
gmgc-finder -nt_input genes.fna -aa_input genes.faa -o output
```

```bash
gmgc-finder -aa_input genes.faa -o output
```
# Processing metagenomes using NGLess

If your input is metagenome, you can use
Expand Down
19 changes: 11 additions & 8 deletions gmgc_finder/alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,15 +26,16 @@ def category(query, dna_seq, protein_seq):
if identity_coverage(query, protein_seq) >= (0.5, 0.5): return "MATCH"
return "NO MATCH"
"""
try:
sw_dna = skbio.alignment.local_pairwise_align_ssw(DNA(dna_query),DNA(dna_target))
except:
sw_dna = skbio.alignment.local_pairwise_align_nucleotide(DNA(dna_query), DNA(dna_target))
dna_identity,align_length = extract_sw(sw_dna)
dna_coverage = align_length / min(len(dna_query),len(dna_target))
if dna_identity >= 0.95 and dna_coverage >= 0.95:
if dna_query != '':
try:
sw_dna = skbio.alignment.local_pairwise_align_ssw(DNA(dna_query),DNA(dna_target))
except:
sw_dna = skbio.alignment.local_pairwise_align_nucleotide(DNA(dna_query), DNA(dna_target))
dna_identity,align_length = extract_sw(sw_dna)
dna_coverage = align_length / min(len(dna_query),len(dna_target))
if dna_identity >= 0.95 and dna_coverage >= 0.95:

return 'EXACT'
return 'EXACT'

else:
try:
Expand All @@ -50,3 +51,5 @@ def category(query, dna_seq, protein_seq):
return 'MATCH'

return 'NO MATCH'


219 changes: 184 additions & 35 deletions gmgc_finder/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,17 @@
import requests
import pandas as pd
import json
import time
from safeout import safeout
from tqdm import tqdm
import gzip
import bz2
import subprocess
from os import path
import tempfile

from pkg_resources import resource_string
import datetime
import time
import hashlib
from .alignment import identity_coverage
from .gmgc_finder_version import __version__

Expand All @@ -30,9 +32,9 @@ def parse_args(args):
help='Output directory (will be created if non-existent)',
dest='output',
default = None)
parser.add_argument('-nt_input',required=False,help = 'Path to the input DNA gene file.',dest='nt_input',
parser.add_argument('-nt_genes',required=False,help = 'Path to the input DNA gene file.',dest='nt_input',
default = None)
parser.add_argument('-aa_input',required=False,help = 'Path to the input Protein gene file.',dest='aa_input',
parser.add_argument('-aa_genes',required=False,help = 'Path to the input Protein gene file.',dest='aa_input',
default = None)
return parser.parse_args()

Expand Down Expand Up @@ -141,27 +143,37 @@ def query_gmgc(fasta_file,max_try = 10):
return None




def realignment(dna_path,aa_path,besthit):
def alignment(record_dna,record_aa,hit_index,dna = False):
hit_result = []
if dna == True:
query_dna = str(record_dna.seq)
else:
query_dna = ''
query_aa = str(record_aa.seq)
query_name = hit_index['query_name']
if hit_index['hits'] != []:
hit_gene_id = hit_index['hits'][0]['gene_id']
target_dna = hit_index['hits'][0]['dna_sequence']
target_aa = hit_index['hits'][0]['protein_sequence']
if target_aa and target_dna is not None:
realign_result = identity_coverage(query_dna,query_aa,target_dna,target_aa)
hit_result.extend([query_name,hit_gene_id,realign_result,target_dna,target_aa])
else:
hit_result.extend([query_name, None, 'NO HIT', None, None])
return hit_result

results = []
for record_dna, record_aa, hit_index in zip(SeqIO.parse(dna_path,'fasta'),
SeqIO.parse(aa_path,'fasta'),
besthit):
hit_result = []
query_dna = str(record_dna.seq)
query_aa = str(record_aa.seq)
query_name = hit_index['query_name']
if hit_index['hits'] != []:
hit_gene_id = hit_index['hits'][0]['gene_id']
target_dna = hit_index['hits'][0]['dna_sequence']
target_aa = hit_index['hits'][0]['protein_sequence']
if target_aa and target_dna is not None:
realign_result = identity_coverage(query_dna,query_aa,target_dna,target_aa)
hit_result.extend([query_name,hit_gene_id,realign_result,target_dna,target_aa])
results.append(hit_result)
else:
hit_result.extend([query_name, None, 'NO HIT', None, None])
if dna_path is not None:
for record_dna, record_aa, hit_index in zip(SeqIO.parse(dna_path,'fasta'),
SeqIO.parse(aa_path,'fasta'),
besthit):
hit_result = alignment(record_dna,record_aa,hit_index,dna = True)
results.append(hit_result)
else:
for record_aa, hit_index in zip(SeqIO.parse(aa_path,'fasta'),
besthit):
hit_result = alignment('',record_aa,hit_index,dna=False)
results.append(hit_result)
return results

Expand Down Expand Up @@ -195,39 +207,69 @@ def query_genome_bin(hit_table):
genome_bin = genome_bin.reset_index().rename(columns={'index':'genome_bin'})
return genome_bin

def CalSha256(filname):
with open(filname, "rb") as f:
sha256obj = hashlib.sha256()
sha256obj.update(f.read())
hash_value = sha256obj.hexdigest()
return hash_value

def convert_command(command):
command_line = 'gmgc-finder '
for parameter in command:
if command[parameter] is not None:
command_line = command_line + '-'+ parameter + ' '
command_line = command_line + command[parameter] + ' '
return command_line

def main(args=None):

start = datetime.datetime.now()

if args is None:
args = sys.argv

args = parse_args(args)

command_args = vars(args)
command_line = convert_command(command_args)
out = args.output
if not os.path.exists(out):
os.makedirs(out)

with tempfile.TemporaryDirectory() as tmpdirname:
if args.nt_input is None or args.aa_input is None:
if args.genome_fasta is None:
raise Exception("Need to input both dna and protein gene file or a genome file!")
if args.genome_fasta is not None:
gene_prediction(args.genome_fasta,out)

split_file(out + '/prodigal_out.faa',
output_dir=tmpdirname + '/split_file',is_dna=False)
num_split = split_file(out + '/prodigal_out.fna',
output_dir=tmpdirname + '/split_file',is_dna=True)
else:
assert gene_num(args.nt_input) == gene_num(args.aa_input), "Input dna and protein gene file must have the same sequence number."
split_file(args.aa_input,
output_dir=tmpdirname + '/split_file',is_dna=False)
num_split = split_file(args.nt_input,
output_dir=tmpdirname + '/split_file',is_dna=True)
if args.aa_input is None:
raise Exception("Need to input protein gene file or a genome file!")

else:
if args.nt_input is not None:
assert gene_num(args.nt_input) == gene_num(args.aa_input), "Input dna and protein gene file must have the same sequence number."
split_file(args.aa_input,
output_dir=tmpdirname + '/split_file',is_dna=False)
num_split = split_file(args.nt_input,
output_dir=tmpdirname + '/split_file',is_dna=True)

else:
num_split = split_file(args.aa_input,
output_dir=tmpdirname + '/split_file',is_dna=False)
hit_table = []
print('Starting GMGC queries (total: {} batches to process)'.format(num_split))
for index in tqdm(range(num_split)):
besthit = query_gmgc(tmpdirname+'/split_file/split_{}.faa'.format(index+1))
if besthit is not None:
besthit = json.loads(bytes.decode(besthit.content))['results']
hit_table_index = realignment(tmpdirname+'/split_file/split_{}.fna'.format(index+1),
tmpdirname+'/split_file/split_{}.faa'.format(index+1),besthit)
if args.nt_input is not None:
hit_table_index = realignment(tmpdirname+'/split_file/split_{}.fna'.format(index+1),
tmpdirname+'/split_file/split_{}.faa'.format(index+1),besthit)
else:
hit_table_index = realignment(None,tmpdirname+'/split_file/split_{}.faa'.format(index+1),besthit)
hit_table.extend(hit_table_index)
hit_table = pd.DataFrame(hit_table)
hit_table.columns = ['query_name','gene_id','align_category','gene_dna','gene_protein']
Expand Down Expand Up @@ -282,7 +324,114 @@ def main(args=None):
print(s)
ofile.write(s+'\n')

subprocess.call('cp docs/output.md {}'.format(out), shell=True)

output_content = resource_string('gmgc_finder', 'output.md')
with safeout(out+'/README.md', 'wt') as ofile:
ofile.write(bytes.decode(output_content))

end = datetime.datetime.now()

output_log = []
output_dict = {}
output_log.append('Command_line: '+command_line)
output_log.append('GMGC-Finder: '+ __version__)

output_log.append('Start time: '+str(start))
output_log.append('End time: '+str(end))
output_log.append('Run time: '+str((end-start).seconds))

output_dict['Command_line'] = command_line
output_dict['GMGC-Finder'] = __version__
output_dict['Start time'] = str(start)
output_dict['End time'] = str(end)
output_dict['Run time'] = str((end-start).seconds)

output_log.append('\n# Inputs')

if args.genome_fasta is not None:
input_name = os.path.basename(args.genome_fasta)
full_path = os.path.abspath(args.genome_fasta)
mtime = time.ctime(os.path.getmtime(args.genome_fasta))
file_size = str(os.path.getsize(args.genome_fasta))
sha256 = CalSha256(args.genome_fasta)
output_log.append('-input_name: '+input_name)
output_log.append('-full_path: '+full_path)
output_log.append('-mtime: '+mtime)
output_log.append('-file size: '+ file_size)
output_log.append('-sha256 '+ sha256)
output_dict['#Inputs'] = {}
output_dict['#Inputs']['input_name'] = input_name
output_dict['#Inputs']['full_path'] = full_path
output_dict['#Inputs']['mtime'] = str(mtime)
output_dict['#Inputs']['file size'] = file_size
output_dict['#Inputs']['sha256'] = sha256
else:
if args.nt_input is not None:
input_name_nt = os.path.basename(args.nt_input)
full_path_nt = os.path.abspath(args.nt_input)
mtime_nt = time.ctime(os.path.getmtime(args.nt_input))
file_size_nt = str(os.path.getsize(args.nt_input))
sha256_nt = CalSha256(args.nt_input)

input_name_aa = os.path.basename(args.aa_input)
full_path_aa = os.path.abspath(args.aa_input)
mtime_aa = time.ctime(os.path.getmtime(args.aa_input))
file_size_aa = str(os.path.getsize(args.aa_input))
sha256_aa = CalSha256(args.aa_input)

output_log.append('-nt_input_name: '+input_name_nt)
output_log.append('-nt_full_path_nt: '+full_path_nt)
output_log.append('-nt_mtime: '+mtime_nt)
output_log.append('-nt_file size: '+ file_size_nt)
output_log.append('-nt_sha256 '+ sha256_nt+'\n')

output_log.append('-aa_input_name: '+input_name_aa)
output_log.append('-aa_full_path: '+full_path_aa)
output_log.append('-aa_mtime: '+mtime_aa)
output_log.append('-aa_file size: '+ file_size_aa)
output_log.append('-aa_sha256 '+ sha256_aa)

output_dict['#Inputs'] = {}

output_dict['#Inputs']['nt_input_name'] = input_name_nt
output_dict['#Inputs']['nt_full_path'] = full_path_nt
output_dict['#Inputs']['nt_mtime'] = str(mtime_nt)
output_dict['#Inputs']['nt_file size'] = file_size_nt
output_dict['#Inputs']['nt_sha256'] = sha256_nt

output_dict['#Inputs']['aa_input_name'] = input_name_aa
output_dict['#Inputs']['aa_full_path'] = full_path_aa
output_dict['#Inputs']['aa_mtime'] = str(mtime_aa)
output_dict['#Inputs']['aa_file size'] = file_size_aa
output_dict['#Inputs']['aa_sha256'] = sha256_aa

else:
input_name_aa = os.path.basename(args.aa_input)
full_path_aa = os.path.abspath(args.aa_input)
mtime_aa = time.ctime(os.path.getmtime(args.aa_input))
file_size_aa = str(os.path.getsize(args.aa_input))
sha256_aa = CalSha256(args.aa_input)
output_log.append('-aa_input_name: '+input_name_aa)
output_log.append('-aa_full_path: '+full_path_aa)
output_log.append('-aa_mtime: '+mtime_aa)
output_log.append('-aa_file size: '+ file_size_aa)
output_log.append('-aa_sha256 '+ sha256_aa)
output_dict['#Inputs'] = {}
output_dict['#Inputs']['aa_input_name'] = input_name_aa
output_dict['#Inputs']['aa_full_path'] = full_path_aa
output_dict['#Inputs']['aa_mtime'] = str(mtime_aa)
output_dict['#Inputs']['aa_file size'] = file_size_aa
output_dict['#Inputs']['aa_sha256'] = sha256_aa



with safeout(out+'/runlog.txt', 'wt') as ofile:
for s in output_log:
ofile.write(s+'\n')

with safeout(out+'/runlog.json','wt') as ofile:
output_json = json.dumps(output_dict)
ofile.write(output_json)



Expand Down
5 changes: 3 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from setuptools import setup
from setuptools import setup,find_packages

exec(compile(open('gmgc_finder/gmgc_finder_version.py').read(),
'gmgc_finder/gmgc_finder_version.py', 'exec'))
Expand All @@ -22,7 +22,8 @@
'safeout',
'tqdm',
],
include_package_data=True,
package_data={
'docs': ['*.md']},
zip_safe=False,
entry_points={
'console_scripts': ['gmgc-finder=gmgc_finder.main:main'],
Expand Down

0 comments on commit ab7c570

Please sign in to comment.