Enhancements in vfdb_parser.py for VFDB full dataset support #320

Open · wants to merge 4 commits into master
146 changes: 141 additions & 5 deletions ariba/vfdb_parser.py
@@ -1,9 +1,15 @@
import re
import pyfastaq
import gzip
import pandas as pd
import subprocess
import os
from ariba import common

class Error (Exception): pass

-name_regex = re.compile(r'^(?P<vfdb_id>V\S+\)) \((?P<name>\S+)\) (?P<description>.*\]) \[(?P<genus_etc>.*)\]$')
+name_regex = re.compile(r'^(?P<vfdb_id>V\S+) \((?P<name>.*?)\) (?P<description>.*\]) \[(?P<genus_etc>.*)\]$')
desc_regex = re.compile(r'^[^[]*\[.*\((?P<vf_id>V\S+)\) .*\]$')
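# Illustrative full-dataset header of the shape both patterns target (ids made up):
#   VFG000001(gb|AAA00001) (yopJ) effector YopJ [YopJ (VF0001) - Effector delivery system (VFC0086)] [Yersinia pestis CO92]
# name_regex splits this into vfdb_id, name, description and genus_etc; the relaxed
# pattern no longer forces the id to end in ')' and lets the name contain spaces.
# desc_regex then pulls the VF id ('VF0001' here) out of the description's brackets.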

class VfdbParser:
    def __init__(self, infile, outprefix):
@@ -27,22 +33,152 @@ def _fa_header_to_name_and_metadata(fa_header):
            return fa_header, '.'
        else:
            vfdb_id, name, description, genus_etc = name_data
-            return name + '.' + vfdb_id + '.' + genus_etc.replace(' ', '_'), description
+            vf_id = VfdbParser._name_piece_description_to_vfid(description)
+            return name.replace(' ', '_') + '.' + vfdb_id + '.' + genus_etc.replace(' ', '_'), vf_id


    @classmethod
    def _name_piece_description_to_vfid(cls, description):
        n = desc_regex.search(description)
        if n is None:
            return description
        else:
            return n.group('vf_id')
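    # e.g. _name_piece_description_to_vfid('effector YopJ [YopJ (VF0001) - Effector delivery system (VFC0086)]')
    # returns 'VF0001' (made-up ids); a description with no parenthesised VF id is returned unchanged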


    @classmethod
    def _load_VFs_xls_metadata(cls, outprefix):
        filename = 'VFs.xls.gz'
        # use the same tmpdir naming as ref_genes_getter.py
        outprefix = os.path.abspath(outprefix)
        tmpdir = outprefix + '.tmp.download'

        if not os.path.exists(tmpdir):
            try:
                os.mkdir(tmpdir)
            except OSError:
                raise Error('Error mkdir ' + tmpdir)

        zipfile = os.path.join(tmpdir, filename)
        common.download_file('http://www.mgc.ac.cn/VFs/Down/' + filename, zipfile)

        retcode = subprocess.call('gunzip -t ' + zipfile, shell=True)
        if retcode != 0:
            raise Error("Error opening for reading gzipped file '" + filename + "'")

        with gzip.open(zipfile, 'r') as handle:
            vfdb_meta_df = pd.read_excel(handle, header=1, usecols=['VFID', 'VF_Name', 'VFcategory', 'Function', 'Mechanism'], keep_default_na=False)
        vfdb_meta_df['description'] = vfdb_meta_df['VF_Name'] + ' (' + vfdb_meta_df['VFcategory'] + '). ' + vfdb_meta_df['Function'] + ' ' + vfdb_meta_df['Mechanism']
        vfid_dict = dict(zip(vfdb_meta_df['VFID'], vfdb_meta_df['description']))
        del vfdb_meta_df

        return vfid_dict
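    # vfid_dict then maps a VF id to a one-line summary, roughly
    # 'VF0001' -> 'YopJ (Effector delivery system). <Function text> <Mechanism text>'
    # (illustrative entry; the real text comes from the VFs.xls spreadsheet)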


    @classmethod
    def _vfid_to_metadata(cls, vfid_dict, vf_id):
        if vf_id is None:
            return ''
        return vfid_dict.get(vf_id, '')


    def run(self):
        file_reader = pyfastaq.sequences.file_reader(self.infile)
        fa_out = pyfastaq.utils.open_file_write(self.outprefix + '.fa')
        tsv_out = pyfastaq.utils.open_file_write(self.outprefix + '.tsv')

        vfid_dict = self._load_VFs_xls_metadata(self.outprefix)
        seqid_list = []

        # reporting variables
        count_noncoding = 0
        max_length_coding = 0
        max_length_noncoding = 0

        for seq in file_reader:
            original_id = seq.id
            # 'description' is normally the VF id pulled from the header, or '.' when the header does not match
            seq.id, description = self._fa_header_to_name_and_metadata(seq.id)
            vf_metadata = self._vfid_to_metadata(vfid_dict, description)
            if description == '.':
                # header did not match the VFDB pattern: keep the original id, write
                # the sequence out as-is and skip the checks below so it is not written twice
                seq.id = original_id.split()[0]
                print(seq.id, '1', '0', '.', '.', 'Original name: ' + original_id, sep='\t', file=tsv_out)
                print(seq, file=fa_out)
                continue
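            # rows in the tsv are tab-separated ariba metadata, e.g. (made-up id):
            #   AAA00001    1    0    .    .    Original name: AAA00001 unparsed header text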

            # remove sequences with duplicate ids
            seqid = seq.id.split()[0]

            if seqid in seqid_list:
                print('Duplicate entry ' + seqid + ' removed from downloaded dataset.')
                continue
            seqid_list.append(seqid)

            seq_length = len(seq)

            # check that the sequence really is coding (table 11 is the NCBI
            # bacterial/archaeal genetic code)
            pyfastaq.sequences.genetic_code = 11
            got = seq.make_into_gene()
            if got is None:
                # sequence is not coding

                # check whether the sequence contains redundant (ambiguous) nucleotides
                # and, if so, expand it into all unambiguous versions
                to_expand = False
                for nt in pyfastaq.sequences.redundant_nts:
                    hits = seq.search(nt)
                    if hits == []:
                        continue
                    print('Found redundant nts in ' + seqid + '\texpanding...')
                    to_expand = True
                    break

                if to_expand:
                    expand_list = seq.expand_nucleotides()
                    for expanded_seq in expand_list:
                        # check if this expanded version is coding
                        got = expanded_seq.make_into_gene()
                        if got is None:
                            # this expanded version is still not coding
                            count_noncoding += 1
                            if seq_length > max_length_noncoding:
                                max_length_noncoding = seq_length

                            print(expanded_seq.id, '0', '0', '.', '.', description + ': ' + vf_metadata + ' Original name: ' + original_id, sep='\t', file=tsv_out)

                        else:
                            # this expanded version is coding
                            if seq_length > max_length_coding:
                                max_length_coding = seq_length

                            print(expanded_seq.id, '1', '0', '.', '.', description + ': ' + vf_metadata + ' Original name: ' + original_id, sep='\t', file=tsv_out)

                        print(expanded_seq, file=fa_out)

                else:
                    # no redundant nucleotides to expand: record as noncoding
                    count_noncoding += 1
                    if seq_length > max_length_noncoding:
                        max_length_noncoding = seq_length

                    print(seq.id, '0', '0', '.', '.', description + ': ' + vf_metadata + ' Original name: ' + original_id, sep='\t', file=tsv_out)
                    print(seq, file=fa_out)

            else:
                # sequence is coding
                if seq_length > max_length_coding:
                    max_length_coding = seq_length

                print(seq.id, '1', '0', '.', '.', description + ': ' + vf_metadata + ' Original name: ' + original_id, sep='\t', file=tsv_out)
                print(seq, file=fa_out)

        pyfastaq.utils.close(fa_out)
        pyfastaq.utils.close(tsv_out)


        print('A total of ' + str(count_noncoding) + ' sequences in the dataset were classified as noncoding in ' + self.outprefix + '.tsv')
        # report the maximum sequence lengths if they exceed the ariba prepareref defaults
        if max_length_coding > 10000:
            print('Observed long coding sequences, ariba prepareref should be run with --max_gene_length ' + str(max_length_coding))
        if max_length_noncoding > 20000:
            print('Observed long noncoding sequences, ariba prepareref should be run with --max_noncoding_length ' + str(max_length_noncoding))
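For reviewers who want to try the branch locally, a minimal sketch of how the parser is driven (the input and output names here are placeholders; ariba's ref_genes_getter passes the downloaded VFDB fasta and an output prefix the same way):

from ariba import vfdb_parser

# parse the full-dataset fasta; run() writes out.vfdb.fa and out.vfdb.tsv and
# downloads VFs.xls.gz next to the output prefix for the VF metadata
vp = vfdb_parser.VfdbParser('VFDB_setB_nt.fas.gz', 'out.vfdb')
vp.run()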