Skip to content

Commit

Permalink
IMP: add protein types and formats
Browse files Browse the repository at this point in the history
  • Loading branch information
misialq committed Oct 12, 2020
1 parent af9e2be commit b1e026a
Show file tree
Hide file tree
Showing 10 changed files with 509 additions and 15 deletions.
16 changes: 12 additions & 4 deletions q2_types/feature_data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,17 @@
TSVTaxonomyDirectoryFormat, DNAFASTAFormat, DNASequencesDirectoryFormat,
PairedDNASequencesDirectoryFormat, AlignedDNAFASTAFormat,
AlignedDNASequencesDirectoryFormat, DifferentialFormat,
DifferentialDirectoryFormat)
DifferentialDirectoryFormat, AlignedProteinSequencesDirectoryFormat,
ProteinSequencesDirectoryFormat, ProteinFASTAFormat,
AlignedProteinFASTAFormat)
from ._type import (
FeatureData, Taxonomy, Sequence, PairedEndSequence, AlignedSequence,
Differential)
Differential, ProteinSequence, AlignedProteinSequence)

# TODO remove these imports when tests are rewritten. Remove from __all__ too
from ._transformer import DNAIterator, PairedDNAIterator, AlignedDNAIterator
from ._transformer import (
DNAIterator, PairedDNAIterator, AlignedDNAIterator,
ProteinIterator, AlignedProteinIterator)

__all__ = [
'TaxonomyFormat', 'TaxonomyDirectoryFormat', 'HeaderlessTSVTaxonomyFormat',
Expand All @@ -30,6 +34,10 @@
'AlignedDNAFASTAFormat', 'AlignedDNASequencesDirectoryFormat',
'FeatureData', 'Taxonomy', 'Sequence', 'PairedEndSequence',
'AlignedSequence', 'DNAIterator', 'PairedDNAIterator',
'AlignedDNAIterator', 'Differential', 'DifferentialDirectoryFormat']
'AlignedDNAIterator', 'Differential', 'DifferentialDirectoryFormat',
'ProteinFASTAFormat', 'AlignedProteinFASTAFormat',
'ProteinSequence', 'ProteinSequencesDirectoryFormat',
'AlignedProteinSequence', 'AlignedProteinSequencesDirectoryFormat',
'ProteinIterator', 'AlignedProteinIterator']

importlib.import_module('q2_types.feature_data._transformer')
54 changes: 52 additions & 2 deletions q2_types/feature_data/_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,11 @@
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

import itertools
import re

import qiime2.plugin.model as model
import skbio
from qiime2.plugin import ValidationError
import qiime2

Expand Down Expand Up @@ -299,11 +300,60 @@ def validate(self, *args):
'DifferentialDirectoryFormat', 'differentials.tsv', DifferentialFormat)


class ProteinFASTAFormat(model.TextFileFormat):
    """Text file format for protein sequences stored as FASTA.

    Validation parses the file with scikit-bio using ``skbio.Protein`` as
    the record constructor; malformed records surface as ``ValueError``
    raised by the parser.
    """

    def _validate_(self, level):
        """Validate the file at the requested ``level``.

        ``'min'`` checks only the first five records, ``'max'`` checks every
        record. Any other level raises ``KeyError`` from the lookup below.
        """
        record_count_map = {'min': 5, 'max': None}
        self._validate(record_count_map[level])

    def _validate(self, n_records=None):
        """Parse up to ``n_records`` records (all of them when ``None``).

        Raises
        ------
        ValueError
            Raised by scikit-bio for records it cannot parse as protein.
        """
        records = self._read_protein_fasta(str(self))
        if n_records is not None:
            records = itertools.islice(records, n_records)
        # The records only need to be consumed for the parser to check them;
        # iterating without storing avoids keeping every parsed sequence in
        # memory when the whole file is validated.
        for _ in records:
            pass

    def _read_protein_fasta(self, path):
        """Return scikit-bio's FASTA reader output for ``path``."""
        return skbio.read(path, format='fasta', constructor=skbio.Protein)


# Single-file directory format: the directory holds one file named
# ``protein-sequences.fasta`` that is handled as ProteinFASTAFormat.
ProteinSequencesDirectoryFormat = model.SingleFileDirectoryFormat(
    'ProteinSequencesDirectoryFormat',
    'protein-sequences.fasta',
    ProteinFASTAFormat)


class AlignedProteinFASTAFormat(model.TextFileFormat):
    """Text file format for aligned protein sequences stored as FASTA.

    Validation reads the file with scikit-bio into a ``skbio.TabularMSA``
    of ``skbio.Protein`` records; problems surface as ``ValueError`` raised
    by scikit-bio.
    """

    def _validate_(self, level):
        """Validate the file at the requested ``level``.

        ``'min'`` iterates only the first five records, ``'max'`` iterates
        every record. Any other level raises ``KeyError`` from the lookup
        below.
        """
        record_count_map = {'min': 5, 'max': None}
        self._validate(record_count_map[level])

    def _validate(self, n_records=None):
        """Read the alignment and iterate up to ``n_records`` of its records.

        Raises
        ------
        ValueError
            Raised by scikit-bio when the file cannot be read as a protein
            alignment.
        """
        alignment = self._read_protein_alignment_fasta(str(self))
        records = alignment
        if n_records is not None:
            records = itertools.islice(alignment, n_records)
        # Consume the records without collecting them into a list.
        for _ in records:
            pass

    def _read_protein_alignment_fasta(self, path):
        """Read ``path`` as FASTA into a ``skbio.TabularMSA`` of proteins."""
        return skbio.read(path, format='fasta',
                          constructor=skbio.Protein, into=skbio.TabularMSA)


# Single-file directory format: the directory holds one file named
# ``aligned-protein-sequences.fasta`` that is handled as
# AlignedProteinFASTAFormat.
AlignedProteinSequencesDirectoryFormat = model.SingleFileDirectoryFormat(
    'AlignedProteinSequencesDirectoryFormat',
    'aligned-protein-sequences.fasta',
    AlignedProteinFASTAFormat)


plugin.register_formats(
TSVTaxonomyFormat, TSVTaxonomyDirectoryFormat,
HeaderlessTSVTaxonomyFormat, HeaderlessTSVTaxonomyDirectoryFormat,
TaxonomyFormat, TaxonomyDirectoryFormat, DNAFASTAFormat,
DNASequencesDirectoryFormat, PairedDNASequencesDirectoryFormat,
AlignedDNAFASTAFormat, AlignedDNASequencesDirectoryFormat,
DifferentialFormat, DifferentialDirectoryFormat
DifferentialFormat, DifferentialDirectoryFormat, ProteinFASTAFormat,
AlignedProteinFASTAFormat, ProteinSequencesDirectoryFormat,
AlignedProteinSequencesDirectoryFormat,
)
127 changes: 124 additions & 3 deletions q2_types/feature_data/_transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@
from ..feature_table import BIOMV210Format
from . import (TaxonomyFormat, HeaderlessTSVTaxonomyFormat, TSVTaxonomyFormat,
DNAFASTAFormat, PairedDNASequencesDirectoryFormat,
AlignedDNAFASTAFormat, DifferentialFormat)
AlignedDNAFASTAFormat, DifferentialFormat, ProteinFASTAFormat,
AlignedProteinFASTAFormat)


# Taxonomy format transformers
Expand Down Expand Up @@ -262,10 +263,14 @@ def _dnafastaformats_to_metadata(ff):
return qiime2.Metadata(df)


def _series_to_fasta_format(ff, data, sequence_type="DNA"):
    """Write the sequences of a pandas Series into a FASTA file format.

    Parameters
    ----------
    ff
        Destination file format object; it is opened with ``ff.open()`` and
        each record is written to the resulting handle.
    data : pd.Series
        Sequences keyed by sequence ID (the Series index).
    sequence_type : str, optional
        ``"protein"`` builds ``skbio.Protein`` records; any other value
        builds ``skbio.DNA`` records (the default).
    """
    constructor = skbio.Protein if sequence_type == "protein" else skbio.DNA
    with ff.open() as f:
        # Series.items() replaces Series.iteritems(), which was removed in
        # pandas 2.0; both yield (index, value) pairs.
        for id_, seq in data.items():
            sequence = constructor(seq, metadata={'id': id_})
            skbio.io.write(sequence, format='fasta', into=f)


Expand Down Expand Up @@ -402,3 +407,119 @@ def _224(data: pd.DataFrame) -> DifferentialFormat:
ff = DifferentialFormat()
qiime2.Metadata(data).save(str(ff))
return ff


# Protein FASTA transformers

class ProteinIterator(collections.abc.Iterable):
    """Iterable wrapper around a generator of protein sequence records."""

    def __init__(self, generator):
        # Stored as-is; nothing is consumed until iteration starts.
        self.generator = generator

    def __iter__(self):
        """Yield each item of the wrapped generator in order."""
        for record in self.generator:
            yield record


class AlignedProteinIterator(ProteinIterator):
    """Marker subclass of ProteinIterator used for aligned protein records.

    Adds no behavior of its own.
    """


def _read_protein_fasta(path):
    """Read the FASTA file at ``path`` with ``skbio.Protein`` records."""
    protein_records = skbio.read(
        path, constructor=skbio.Protein, format='fasta')
    return protein_records


def _proteinfastaformats_to_series(ff):
    """Load a protein FASTA file format into a Series keyed by sequence ID.

    Raises
    ------
    ValueError
        If the same sequence ID occurs more than once in the file.
    """
    sequences_by_id = {}
    for record in _read_protein_fasta(str(ff)):
        seq_id = record.metadata['id']
        if seq_id not in sequences_by_id:
            sequences_by_id[seq_id] = record
            continue
        # A repeated ID would silently overwrite an earlier record.
        raise ValueError("FASTA format sequence IDs must be unique. The "
                         "following ID was found more than once: %s."
                         % seq_id)
    return pd.Series(sequences_by_id)


def _proteinfastaformats_to_metadata(ff):
    """Convert a protein FASTA file format into ``qiime2.Metadata``.

    The result has a 'Feature ID' index and a single 'Sequence' column
    holding each sequence as a string.
    """
    frame = _proteinfastaformats_to_series(ff).to_frame().astype(str)
    frame.index.name = 'Feature ID'
    frame.columns = ['Sequence']
    return qiime2.Metadata(frame)


@plugin.register_transformer
def _37(ff: ProteinFASTAFormat) -> ProteinIterator:
    """Wrap the protein records read from ``ff`` in a ProteinIterator."""
    return ProteinIterator(_read_protein_fasta(str(ff)))


@plugin.register_transformer
def _38(data: ProteinIterator) -> ProteinFASTAFormat:
    """Write the records of a ProteinIterator to a new ProteinFASTAFormat."""
    ff = ProteinFASTAFormat()
    records = iter(data)
    skbio.io.write(records, into=str(ff), format='fasta')
    return ff


@plugin.register_transformer
def _39(ff: AlignedProteinFASTAFormat) -> skbio.TabularMSA:
    """Read ``ff`` as FASTA into a TabularMSA of ``skbio.Protein`` records."""
    path = str(ff)
    return skbio.TabularMSA.read(path, format='fasta',
                                 constructor=skbio.Protein)


@plugin.register_transformer
def _40(data: skbio.TabularMSA) -> AlignedProteinFASTAFormat:
    """Write a TabularMSA as FASTA to a new AlignedProteinFASTAFormat."""
    ff = AlignedProteinFASTAFormat()
    destination = str(ff)
    data.write(destination, format='fasta')
    return ff


@plugin.register_transformer
def _41(ff: ProteinFASTAFormat) -> pd.Series:
    """Load the protein sequences in ``ff`` as a Series keyed by ID."""
    series = _proteinfastaformats_to_series(ff)
    return series


@plugin.register_transformer
def _42(ff: ProteinFASTAFormat) -> qiime2.Metadata:
    """Expose the protein sequences in ``ff`` as ``qiime2.Metadata``."""
    metadata = _proteinfastaformats_to_metadata(ff)
    return metadata


@plugin.register_transformer
def _43(data: pd.Series) -> ProteinFASTAFormat:
    """Write a Series of sequences to a new ProteinFASTAFormat as protein."""
    fasta = ProteinFASTAFormat()
    _series_to_fasta_format(fasta, data, "protein")
    return fasta


@plugin.register_transformer
def _44(ff: AlignedProteinFASTAFormat) -> AlignedProteinIterator:
    """Wrap the protein records read from ``ff`` in an AlignedProteinIterator.

    Records are read one by one with the plain protein FASTA reader.
    """
    return AlignedProteinIterator(_read_protein_fasta(str(ff)))


@plugin.register_transformer
def _45(data: AlignedProteinIterator) -> AlignedProteinFASTAFormat:
    """Write the records of an AlignedProteinIterator to a new FASTA format."""
    ff = AlignedProteinFASTAFormat()
    records = iter(data)
    skbio.io.write(records, into=str(ff), format='fasta')
    return ff


@plugin.register_transformer
def _46(ff: AlignedProteinFASTAFormat) -> qiime2.Metadata:
    """Expose the aligned protein sequences in ``ff`` as ``qiime2.Metadata``."""
    metadata = _proteinfastaformats_to_metadata(ff)
    return metadata


@plugin.register_transformer
def _47(ff: AlignedProteinFASTAFormat) -> pd.Series:
    """Load the aligned protein sequences in ``ff`` as a Series keyed by ID."""
    series = _proteinfastaformats_to_series(ff)
    return series


@plugin.register_transformer
def _48(data: pd.Series) -> AlignedProteinFASTAFormat:
    """Write a Series of sequences to a new AlignedProteinFASTAFormat."""
    fasta = AlignedProteinFASTAFormat()
    _series_to_fasta_format(fasta, data, "protein")
    return fasta


@plugin.register_transformer
def _49(fmt: AlignedProteinFASTAFormat) -> ProteinIterator:
    """Wrap the records of an aligned protein FASTA in a ProteinIterator."""
    return ProteinIterator(_read_protein_fasta(str(fmt)))
18 changes: 16 additions & 2 deletions q2_types/feature_data/_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
from . import (TSVTaxonomyDirectoryFormat, DNASequencesDirectoryFormat,
PairedDNASequencesDirectoryFormat,
AlignedDNASequencesDirectoryFormat,
DifferentialDirectoryFormat)
DifferentialDirectoryFormat, ProteinSequencesDirectoryFormat,
AlignedProteinSequencesDirectoryFormat)


FeatureData = SemanticType('FeatureData', field_names='type')
Expand All @@ -30,9 +31,16 @@
Differential = SemanticType('Differential',
variant_of=FeatureData.field['type'])

# Semantic type for protein sequences, declared as a variant of the
# ``type`` field of FeatureData.
ProteinSequence = SemanticType('ProteinSequence',
                               variant_of=FeatureData.field['type'])

# Semantic type for aligned protein sequences, declared as a variant of the
# ``type`` field of FeatureData.
AlignedProteinSequence = SemanticType('AlignedProteinSequence',
                                      variant_of=FeatureData.field['type'])

plugin.register_semantic_types(FeatureData, Taxonomy, Sequence,
PairedEndSequence, AlignedSequence,
Differential)
Differential, ProteinSequence,
AlignedProteinSequence)


plugin.register_semantic_type_to_format(
Expand All @@ -49,3 +57,9 @@
artifact_format=AlignedDNASequencesDirectoryFormat)
plugin.register_semantic_type_to_format(
FeatureData[Differential], DifferentialDirectoryFormat)
# Associate each protein semantic type with the directory format used for
# its artifacts.
plugin.register_semantic_type_to_format(
    FeatureData[ProteinSequence],
    artifact_format=ProteinSequencesDirectoryFormat)
plugin.register_semantic_type_to_format(
    FeatureData[AlignedProteinSequence],
    artifact_format=AlignedProteinSequencesDirectoryFormat)
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
>sequence1
------------------------VDFSATWCGPCKMIKPFFHSLSEKYSNVIFLEVDVDDCQD
VASECEVKCMPTFQFFKKGQKVGEFSGAN
>sequence2
MVKQIESKTAFQEALDAAGDKLVVVDFSATWCGPCKMIKPFFHSLSEKYSNVIFLEVDVDDCQD
VASECEVKCMPTFQ-------VGEFSGAN
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
>sequence1
MTTRDLTAAQFNETIQSSDMVLVDYWASWCGPCRAFAPTFAESSEKHPDVVHAKVDTEAERELA
AAAQIR
>sequence2
MVKQIESKTAFQEALDAAGDKLVVVDFSATWCGPCKMIKPFFHSLSEKYSNVIFLEVDVDDCQD
VASECEVKCMPTFQFFKKGQKVGEFSGAN
>sequence1
AFQEALDAAGDKLVVVDFSATWCGPCKMIKPFFHSLSEKYSNVIFLEVDVDDCQDQKVGEFSGA
6 changes: 6 additions & 0 deletions q2_types/feature_data/tests/data/protein-sequences.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
>sequence1
MTTRDLTAAQFNETIQSSDMVLVDYWASWCGPCRAFAPTFAESSEKHPDVVHAKVDTEAERELA
AAAQIR
>sequence2
MVKQIESKTAFQEALDAAGDKLVVVDFSATWCGPCKMIKPFFHSLSEKYSNVIFLEVDVDDCQD
VASECEVKCMPTFQFFKKGQKVGEFSGAN
62 changes: 61 additions & 1 deletion q2_types/feature_data/tests/test_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@
HeaderlessTSVTaxonomyDirectoryFormat, TSVTaxonomyFormat,
TSVTaxonomyDirectoryFormat, DNAFASTAFormat, DNASequencesDirectoryFormat,
PairedDNASequencesDirectoryFormat, AlignedDNAFASTAFormat,
AlignedDNASequencesDirectoryFormat, DifferentialDirectoryFormat
AlignedDNASequencesDirectoryFormat, DifferentialDirectoryFormat,
ProteinFASTAFormat, AlignedProteinFASTAFormat,
AlignedProteinSequencesDirectoryFormat, ProteinSequencesDirectoryFormat
)
from qiime2.plugin.testing import TestPluginBase
from qiime2.plugin import ValidationError
Expand Down Expand Up @@ -342,5 +344,63 @@ def test_differential_format_bad_type(self):
format.validate()


class TestProteinFASTAFormats(TestPluginBase):
    """Validation tests for the protein FASTA file and directory formats."""

    package = 'q2_types.feature_data.tests'

    def test_protein_fasta_format_validate_positive(self):
        """A well-formed protein FASTA file validates without error."""
        path = self.get_data_path('protein-sequences.fasta')
        fmt = ProteinFASTAFormat(path, mode='r')

        fmt.validate()

    def test_protein_fasta_format_invalid_characters(self):
        """Validation reports sequences containing invalid characters."""
        path = self.get_data_path('not-dna-sequences.fasta')
        fmt = ProteinFASTAFormat(path, mode='r')

        with self.assertRaisesRegex(
                ValueError, "Invalid characters in sequence"):
            fmt.validate()

    def test_protein_fasta_format_empty_file(self):
        """A file holding only a newline validates without error."""
        path = os.path.join(self.temp_dir.name, 'empty')
        with open(path, 'w') as fh:
            fh.write('\n')
        fmt = ProteinFASTAFormat(path, mode='r')

        fmt.validate()

    def test_protein_sequences_directory_format(self):
        """A directory holding protein-sequences.fasta validates."""
        source = self.get_data_path('protein-sequences.fasta')
        target = os.path.join(self.temp_dir.name, 'protein-sequences.fasta')
        shutil.copy(source, target)
        fmt = ProteinSequencesDirectoryFormat(self.temp_dir.name, mode='r')

        fmt.validate()

    def test_aligned_protein_fasta_format_validate_positive(self):
        """A well-formed aligned protein FASTA file validates."""
        path = self.get_data_path('aligned-protein-sequences.fasta')
        fmt = AlignedProteinFASTAFormat(path, mode='r')

        fmt.validate()

    def test_aligned_protein_fasta_format_unaligned(self):
        """Unaligned sequences are rejected by the aligned format."""
        path = self.get_data_path('protein-sequences.fasta')
        fmt = AlignedProteinFASTAFormat(path, mode='r')

        with self.assertRaisesRegex(ValueError,
                                    'length must match.* 93.* 70'):
            fmt.validate()

    def test_aligned_protein_sequences_directory_format(self):
        """A directory holding aligned-protein-sequences.fasta validates."""
        source = self.get_data_path('aligned-protein-sequences.fasta')
        temp_dir = self.temp_dir.name
        target = os.path.join(temp_dir, 'aligned-protein-sequences.fasta')
        shutil.copy(source, target)
        fmt = AlignedProteinSequencesDirectoryFormat(temp_dir, mode='r')

        fmt.validate()


# Run the tests in this module when it is executed directly as a script.
if __name__ == '__main__':
    unittest.main()
Loading

0 comments on commit b1e026a

Please sign in to comment.